# OTT Compliance Events - Data Analysis & Exploration

## Comprehensive Data Science Exploration

This notebook covers:
- Event data exploration and statistics
- Distribution analysis across regions and regulations
- Compliance violation patterns
- User behavior segmentation
- Correlation and risk analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation warnings from the
# plotting/data libraries — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Notebook-wide plot defaults: seaborn dark-grid theme, then the default figure
# size (set after the theme so the theme call cannot override it).
sns.set_theme(style='darkgrid')
plt.rcParams['figure.figsize'] = (14, 6)

print("Environment initialized for compliance data analysis")

## 1. Generate Realistic Compliance Event Dataset

In [None]:
# Generate a synthetic compliance-events dataset (no real user data involved).
# The global NumPy seed fixes every random draw below. Draws are consumed in the
# order the dict entries are evaluated, so reordering entries changes the data.
np.random.seed(42)
n_events = 5000

# Category vocabularies the event attributes are sampled from
regions = ['US', 'EU', 'ASIA', 'LATAM', 'AFRICA']
regulations = ['GDPR', 'CCPA', 'PIPL', 'LGPD', 'PDPA']
event_types = ['login', 'stream_start', 'stream_pause', 'stream_end', 'download', 'profile_update']
statuses = ['compliant', 'warning', 'violation']

# Anchor every timestamp to ONE instant. Re-evaluating datetime.now() per row
# gave each event a slightly different base time.
# NOTE: the anchor is still wall-clock time, so the 'hour' column (and the
# hourly analysis built on it) shifts with the time of day the notebook is run.
reference_time = datetime.now()

# Create events: each timestamp is a whole number of hours (0-719, i.e. up to
# ~30 days) before the reference instant.
events = {
    'event_id': np.arange(n_events),
    'timestamp': [reference_time - timedelta(hours=np.random.randint(0, 720)) for _ in range(n_events)],
    'user_id': np.random.randint(1000, 5000, n_events),
    'region': np.random.choice(regions, n_events, p=[0.35, 0.25, 0.20, 0.15, 0.05]),
    'regulation': np.random.choice(regulations, n_events),
    'event_type': np.random.choice(event_types, n_events),
    'status': np.random.choice(statuses, n_events, p=[0.80, 0.15, 0.05]),
    'user_age': np.random.randint(13, 75, n_events),
    'session_duration_minutes': np.random.exponential(scale=45, size=n_events),
    'concurrent_streams': np.random.randint(1, 6, n_events),
    'data_shared': np.random.choice(['none', 'partial', 'full'], n_events, p=[0.70, 0.20, 0.10])
}

df = pd.DataFrame(events)

# Age is a property of the user, not of an individual event. The per-event draw
# above can give one user_id several ages, so propagate each user's first drawn
# age to all of that user's events (keeps the random stream unchanged).
df['user_age'] = df.groupby('user_id')['user_age'].transform('first')

# Derived calendar fields used by the temporal analysis
df['hour'] = df['timestamp'].dt.hour
df['date'] = df['timestamp'].dt.date

print(f"Dataset shape: {df.shape}")
print(f"\nDataset Overview:")
print(df.head(10))
print(f"\nData Info:")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() emitted a stray "None" line.
df.info()

## 2. Compliance Status Distribution Analysis

In [None]:
# Compliance status breakdown.
# Colours are keyed by status name and the display order is pinned. Relying on
# value_counts() ordering (by frequency) only matched green/orange/red to
# compliant/warning/violation while the frequencies happened to rank that way.
status_colors = {'compliant': '#2ecc71', 'warning': '#f39c12', 'violation': '#e74c3c'}
status_order = list(status_colors)

# fill_value=0 keeps every status present (and indexable in the summary below)
# even if it never occurs in the data.
compliance_counts = df['status'].value_counts().reindex(status_order, fill_value=0)
compliance_pct = df['status'].value_counts(normalize=True).reindex(status_order, fill_value=0) * 100
colors = [status_colors[status] for status in status_order]

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Pie chart
axes[0].pie(compliance_counts, labels=compliance_counts.index, autopct='%1.1f%%',
             colors=colors, startangle=90)
axes[0].set_title('Compliance Status Distribution', fontweight='bold', fontsize=12)

# Bar chart with counts (value label drawn slightly above each bar)
axes[1].bar(compliance_counts.index, compliance_counts.values, color=colors, alpha=0.7)
axes[1].set_title('Compliance Events Count', fontweight='bold', fontsize=12)
axes[1].set_ylabel('Number of Events')
for i, v in enumerate(compliance_counts.values):
    axes[1].text(i, v + 50, str(v), ha='center', fontweight='bold')

# Percentage breakdown (value label drawn just right of each bar)
axes[2].barh(compliance_pct.index, compliance_pct.values, color=colors, alpha=0.7)
axes[2].set_title('Compliance Status Percentage', fontweight='bold', fontsize=12)
axes[2].set_xlabel('Percentage (%)')
for i, v in enumerate(compliance_pct.values):
    axes[2].text(v + 1, i, f'{v:.1f}%', va='center', fontweight='bold')

plt.tight_layout()
plt.show()

print("Compliance Status Summary:")
print(compliance_counts)
print(f"\nCompliance Rate: {(compliance_counts['compliant'] / len(df) * 100):.2f}%")
print(f"Violation Rate: {(compliance_counts['violation'] / len(df) * 100):.2f}%")

## 3. Regional Compliance Analysis

In [None]:
# Regional analysis: per-region share of each compliance status, plus volume.
# pd.crosstab sorts its columns alphabetically (compliant, violation, warning),
# so a positional green/orange/red list painted violations orange and warnings
# red. Pin the column order by severity and derive colours from the column names.
status_colors = {'compliant': '#2ecc71', 'warning': '#f39c12', 'violation': '#e74c3c'}

regional_compliance = pd.crosstab(df['region'], df['status'], normalize='index')
regional_compliance = regional_compliance.reindex(columns=list(status_colors), fill_value=0) * 100
regional_counts = df['region'].value_counts().sort_values(ascending=False)

fig, axes = plt.subplots(1, 2, figsize=(16, 5))

# Stacked bar chart - compliance by region (each bar sums to 100%)
regional_compliance.plot(kind='bar', stacked=True, ax=axes[0],
                         color=[status_colors[status] for status in regional_compliance.columns],
                         alpha=0.8)
axes[0].set_title('Compliance Distribution by Region (%)', fontweight='bold', fontsize=12)
axes[0].set_xlabel('Region')
axes[0].set_ylabel('Percentage (%)')
axes[0].legend(title='Status')
axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=45)

# Event counts by region
regional_counts.plot(kind='bar', ax=axes[1], color='steelblue', alpha=0.7)
axes[1].set_title('Event Volume by Region', fontweight='bold', fontsize=12)
axes[1].set_xlabel('Region')
axes[1].set_ylabel('Number of Events')
axes[1].set_xticklabels(axes[1].get_xticklabels(), rotation=45)

plt.tight_layout()
plt.show()

print("Regional Compliance Analysis:")
print(regional_compliance.round(2))
print(f"\nHighest Risk Region: {regional_compliance['violation'].idxmax()} ({regional_compliance['violation'].max():.2f}% violations)")
print(f"Best Compliance Region: {regional_compliance['compliant'].idxmax()} ({regional_compliance['compliant'].max():.2f}% compliant)")

## 4. Regulation-Specific Insights

In [None]:
# Regulation analysis: event counts per regulation/status and violation rates.
# pd.crosstab returns status columns in alphabetical order (compliant,
# violation, warning); reorder them by severity so the green/orange/red palette
# lines up with compliant/warning/violation in the grouped bar chart.
status_colors = {'compliant': '#2ecc71', 'warning': '#f39c12', 'violation': '#e74c3c'}

regulation_compliance = pd.crosstab(df['regulation'], df['status'])
regulation_compliance = regulation_compliance.reindex(columns=list(status_colors), fill_value=0)
regulation_violation_rate = (regulation_compliance['violation'] / (regulation_compliance.sum(axis=1))) * 100

fig, axes = plt.subplots(1, 2, figsize=(16, 5))

# Compliance by regulation (grouped bars, raw counts)
regulation_compliance.plot(kind='bar', ax=axes[0],
                          color=[status_colors[status] for status in regulation_compliance.columns],
                          alpha=0.7)
axes[0].set_title('Events by Regulation and Status', fontweight='bold', fontsize=12)
axes[0].set_xlabel('Regulation')
axes[0].set_ylabel('Number of Events')
axes[0].legend(title='Status')
axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=45)

# Violation rate by regulation (sorted so the ranking is readable)
violation_rate_sorted = regulation_violation_rate.sort_values(ascending=False)
axes[1].barh(violation_rate_sorted.index, violation_rate_sorted.values, color='#e74c3c', alpha=0.7)
axes[1].set_title('Violation Rate by Regulation', fontweight='bold', fontsize=12)
axes[1].set_xlabel('Violation Rate (%)')
for i, v in enumerate(violation_rate_sorted.values):
    axes[1].text(v + 0.2, i, f'{v:.1f}%', va='center', fontweight='bold')

plt.tight_layout()
plt.show()

print("Regulation Compliance Summary:")
print(regulation_compliance)
print(f"\nMost Violated Regulation: {violation_rate_sorted.idxmax()} ({violation_rate_sorted.max():.2f}%)")

## 5. Event Type and Behavioral Analysis

In [None]:
# Event type analysis: status mix per event type, volumes, and two behavioural
# distributions (session duration, concurrent streams).
# pd.crosstab orders the status columns alphabetically (compliant, violation,
# warning); pin them to severity order so each status gets its intended colour.
status_colors = {'compliant': '#2ecc71', 'warning': '#f39c12', 'violation': '#e74c3c'}

event_compliance = pd.crosstab(df['event_type'], df['status'], normalize='index')
event_compliance = event_compliance.reindex(columns=list(status_colors), fill_value=0) * 100
event_counts = df['event_type'].value_counts()

fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# Stacked bar - compliance by event type (each bar sums to 100%)
event_compliance.plot(kind='bar', stacked=True, ax=axes[0, 0],
                     color=[status_colors[status] for status in event_compliance.columns],
                     alpha=0.8)
axes[0, 0].set_title('Compliance by Event Type (%)', fontweight='bold')
axes[0, 0].set_ylabel('Percentage (%)')
axes[0, 0].set_xticklabels(axes[0, 0].get_xticklabels(), rotation=45)
axes[0, 0].legend(title='Status')

# Event count
event_counts.plot(kind='bar', ax=axes[0, 1], color='steelblue', alpha=0.7)
axes[0, 1].set_title('Event Volume by Type', fontweight='bold')
axes[0, 1].set_ylabel('Count')
axes[0, 1].set_xticklabels(axes[0, 1].get_xticklabels(), rotation=45)

# Session duration distribution, with the mean marked for reference
mean_session_duration = df['session_duration_minutes'].mean()
axes[1, 0].hist(df['session_duration_minutes'], bins=50, color='teal', alpha=0.7, edgecolor='black')
axes[1, 0].set_title('Session Duration Distribution', fontweight='bold')
axes[1, 0].set_xlabel('Duration (minutes)')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].axvline(mean_session_duration, color='red', linestyle='--',
                   linewidth=2, label=f"Mean: {mean_session_duration:.1f} min")
axes[1, 0].legend()

# Concurrent streams distribution (sorted by stream count, not by frequency)
concurrent_stats = df['concurrent_streams'].value_counts().sort_index()
axes[1, 1].bar(concurrent_stats.index, concurrent_stats.values, color='orange', alpha=0.7)
axes[1, 1].set_title('Concurrent Streams Distribution', fontweight='bold')
axes[1, 1].set_xlabel('Number of Concurrent Streams')
axes[1, 1].set_ylabel('Count')

plt.tight_layout()
plt.show()

print(f"Session Duration Statistics:")
print(df['session_duration_minutes'].describe())
print(f"\nConcurrent Streams Distribution:")
print(concurrent_stats)

## 6. Time Series and Temporal Patterns

In [None]:
# Hourly pattern analysis: total events and violations per hour of day.
# 'event_id' is counted to get the volume; 'status' is reduced to the number of
# rows equal to 'violation'.
hourly_stats = df.groupby('hour').agg({
    'event_id': 'count',
    'status': lambda x: (x == 'violation').sum()
}).rename(columns={'event_id': 'total_events', 'status': 'violations'})
hourly_stats['violation_rate'] = (hourly_stats['violations'] / hourly_stats['total_events']) * 100

fig, axes = plt.subplots(2, 1, figsize=(16, 10))

# Events per hour
axes[0].plot(hourly_stats.index, hourly_stats['total_events'], marker='o',
             linewidth=2, markersize=8, color='steelblue')
axes[0].fill_between(hourly_stats.index, hourly_stats['total_events'], alpha=0.3, color='steelblue')
axes[0].set_title('Event Volume by Hour of Day', fontweight='bold', fontsize=12)
axes[0].set_ylabel('Number of Events')
axes[0].grid(True, alpha=0.3)

# Violation rate per hour
axes[1].plot(hourly_stats.index, hourly_stats['violation_rate'], marker='s',
             linewidth=2, markersize=8, color='#e74c3c')
axes[1].fill_between(hourly_stats.index, hourly_stats['violation_rate'], alpha=0.3, color='#e74c3c')
axes[1].set_title('Violation Rate by Hour of Day', fontweight='bold', fontsize=12)
axes[1].set_ylabel('Violation Rate (%)')
axes[1].set_xlabel('Hour of Day')
axes[1].grid(True, alpha=0.3)
# Keep the 0-10% window as a floor, but grow it when an hour exceeds 10% so the
# peak reported below is never clipped off the chart.
violation_rate_ceiling = max(10, hourly_stats['violation_rate'].max() * 1.1)
axes[1].set_ylim([0, violation_rate_ceiling])

plt.tight_layout()
plt.show()

print("Hourly Statistics:")
print(hourly_stats)
print(f"\nPeak Traffic Hour: {hourly_stats['total_events'].idxmax()}:00 ({hourly_stats['total_events'].max()} events)")
print(f"Highest Violation Hour: {hourly_stats['violation_rate'].idxmax()}:00 ({hourly_stats['violation_rate'].max():.2f}%)")

## 7. User Behavior Segmentation

In [None]:
# User segmentation analysis: one row per user_id with activity and violation
# aggregates, then a coarse risk bucket derived from the violation rate.
user_stats = df.groupby('user_id').agg({
    'event_id': 'count',
    'status': lambda x: (x == 'violation').sum(),
    'session_duration_minutes': 'mean',
    'concurrent_streams': 'mean',
    'user_age': 'first'
}).rename(columns={
    'event_id': 'total_events',
    'status': 'violations',
    'session_duration_minutes': 'avg_session_duration',
    'concurrent_streams': 'avg_concurrent_streams'
})

user_stats['violation_rate'] = (user_stats['violations'] / user_stats['total_events']) * 100

# Risk buckets: Low [0, 2], Medium (2, 5], High (5, 100].
# include_lowest=True is required: pd.cut's intervals are open on the left by
# default, so a rate of exactly 0 (users with no violations) fell outside every
# bin and was labelled NaN instead of 'Low'.
risk_labels = ['Low', 'Medium', 'High']
risk_colors = {'Low': '#2ecc71', 'Medium': '#f39c12', 'High': '#e74c3c'}
user_stats['risk_level'] = pd.cut(user_stats['violation_rate'],
                                   bins=[0, 2, 5, 100],
                                   labels=risk_labels,
                                   include_lowest=True)

fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# Risk level distribution, shown in severity order so each level keeps its own
# colour (value_counts() alone orders by frequency).
risk_dist = user_stats['risk_level'].value_counts().reindex(risk_labels, fill_value=0)
risk_dist.plot(kind='bar', ax=axes[0, 0],
               color=[risk_colors[level] for level in risk_labels], alpha=0.7)
axes[0, 0].set_title('User Risk Level Distribution', fontweight='bold')
axes[0, 0].set_ylabel('Number of Users')
axes[0, 0].set_xticklabels(axes[0, 0].get_xticklabels(), rotation=0)

# Violation rate distribution
axes[0, 1].hist(user_stats['violation_rate'], bins=30, color='#e74c3c', alpha=0.7, edgecolor='black')
axes[0, 1].set_title('User Violation Rate Distribution', fontweight='bold')
axes[0, 1].set_xlabel('Violation Rate (%)')
axes[0, 1].set_ylabel('Frequency')

# Session duration vs violation rate
scatter = axes[1, 0].scatter(user_stats['avg_session_duration'], user_stats['violation_rate'],
                             c=user_stats['user_age'], cmap='viridis', s=100, alpha=0.6)
axes[1, 0].set_title('Session Duration vs Violation Rate (colored by age)', fontweight='bold')
axes[1, 0].set_xlabel('Average Session Duration (minutes)')
axes[1, 0].set_ylabel('Violation Rate (%)')
plt.colorbar(scatter, ax=axes[1, 0], label='User Age')

# Age distribution
axes[1, 1].hist(user_stats['user_age'], bins=20, color='steelblue', alpha=0.7, edgecolor='black')
axes[1, 1].set_title('User Age Distribution', fontweight='bold')
axes[1, 1].set_xlabel('Age')
axes[1, 1].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

print("User Segmentation Summary:")
print(f"Total Unique Users: {len(user_stats)}")
print(f"\nRisk Level Distribution:")
print(user_stats['risk_level'].value_counts())
print(f"\nHigh Risk Users: {len(user_stats[user_stats['risk_level'] == 'High'])}")
print(f"\nTop 5 High-Risk Users:")
print(user_stats.nlargest(5, 'violation_rate')[['total_events', 'violations', 'violation_rate', 'user_age']])

## 8. Data Sharing and Privacy Insights

In [None]:
# Data sharing analysis: status mix and event volume per data-sharing level.
# pd.crosstab sorts both axes alphabetically. Columns are re-pinned to severity
# order so the green/orange/red palette matches compliant/warning/violation,
# and rows are re-pinned to none -> partial -> full so both panels list the
# sharing levels in the same, meaningful order.
status_colors = {'compliant': '#2ecc71', 'warning': '#f39c12', 'violation': '#e74c3c'}
sharing_levels = ['none', 'partial', 'full']

sharing_compliance = pd.crosstab(df['data_shared'], df['status'], normalize='index')
sharing_compliance = sharing_compliance.reindex(index=sharing_levels,
                                                columns=list(status_colors),
                                                fill_value=0) * 100

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Stacked bar (each bar sums to 100%)
sharing_compliance.plot(kind='bar', stacked=True, ax=axes[0],
                       color=[status_colors[status] for status in sharing_compliance.columns],
                       alpha=0.8)
axes[0].set_title('Compliance by Data Sharing Level (%)', fontweight='bold')
axes[0].set_ylabel('Percentage (%)')
axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=45)
axes[0].legend(title='Status')

# Count (one colour per sharing level: none=green, partial=orange, full=red)
sharing_counts = df['data_shared'].value_counts().reindex(sharing_levels, fill_value=0)
sharing_counts.plot(kind='bar', ax=axes[1], color=['#2ecc71', '#f39c12', '#e74c3c'], alpha=0.7)
axes[1].set_title('Events by Data Sharing Level', fontweight='bold')
axes[1].set_ylabel('Count')
axes[1].set_xticklabels(axes[1].get_xticklabels(), rotation=45)

plt.tight_layout()
plt.show()

print("Data Sharing Analysis:")
print(sharing_compliance.round(2))
print(f"\nHighest Risk Sharing Level: {sharing_compliance['violation'].idxmax()} ({sharing_compliance['violation'].max():.2f}% violations)")

## 9. Comprehensive Summary Statistics

In [None]:
# Final text report. Relies on results computed by earlier cells:
# hourly_stats, user_stats, violation_rate_sorted, sharing_compliance and
# event_counts must already exist in the kernel (run the notebook top to bottom).
# The section-header emoji were previously mojibake (UTF-8 bytes mis-decoded);
# they are restored to the intended characters here.
total_events = len(df)

print("="*80)
print("COMPREHENSIVE OTT COMPLIANCE EVENTS ANALYSIS SUMMARY")
print("="*80)
print(f"\n📊 DATASET OVERVIEW:")
print(f"  Total Events: {total_events:,}")
print(f"  Unique Users: {df['user_id'].nunique():,}")
print(f"  Time Period: {df['timestamp'].min().date()} to {df['timestamp'].max().date()}")
print(f"  Regions Covered: {', '.join(df['region'].unique())}")
print(f"  Regulations Tracked: {', '.join(df['regulation'].unique())}")

print(f"\n✅ COMPLIANCE METRICS:")
# Count each status once instead of re-scanning the column twice per line.
status_totals = df['status'].value_counts()
for label, status in [('Compliant', 'compliant'), ('Warning', 'warning'), ('Violation', 'violation')]:
    status_count = status_totals.get(status, 0)
    print(f"  {label} Events: {status_count:,} ({status_count/total_events*100:.1f}%)")

print(f"\n🌍 REGIONAL INSIGHTS:")
for region in df['region'].unique():
    # Filter once per region and derive both numbers from the same slice.
    region_status = df.loc[df['region'] == region, 'status']
    region_violations = (region_status == 'violation').sum()
    region_total = len(region_status)
    print(f"  {region}: {region_violations}/{region_total} violations ({region_violations/region_total*100:.1f}%)")

print(f"\n👥 USER BEHAVIOR:")
print(f"  Average Session Duration: {df['session_duration_minutes'].mean():.1f} minutes")
print(f"  Average Concurrent Streams: {df['concurrent_streams'].mean():.1f}")
print(f"  Median User Age: {df['user_age'].median():.0f} years")

print(f"\n⏰ TEMPORAL PATTERNS:")
print(f"  Peak Traffic Hour: {hourly_stats['total_events'].idxmax()}:00 ({hourly_stats['total_events'].max()} events)")
print(f"  Highest Violation Hour: {hourly_stats['violation_rate'].idxmax()}:00 ({hourly_stats['violation_rate'].max():.2f}%)")

print(f"\n🎯 KEY FINDINGS:")
print(f"  - {len(user_stats[user_stats['risk_level'] == 'High'])} high-risk users identified")
print(f"  - Highest violation regulation: {violation_rate_sorted.idxmax()} ({violation_rate_sorted.max():.2f}%)")
# The quantity is a difference between two percentages, i.e. percentage points,
# and it can be negative. Report it signed and neutrally instead of asserting
# that sharing "increases" risk.
sharing_delta = sharing_compliance.loc['full', 'violation'] - sharing_compliance.loc['none', 'violation']
print(f"  - Violation rate with full vs. no data sharing: {sharing_delta:+.1f} percentage points")
print(f"  - Most common event type: {event_counts.idxmax()} ({event_counts.max():,} events)")

print("\n" + "="*80)