# Netflix Viewing Pattern Analysis for Digital Attention Span

This notebook analyzes personal Netflix viewing patterns to understand changes in attention span over time. The analysis includes:

1. Basic viewing statistics and patterns
2. Time-based analysis (daily, weekly, monthly)
3. Binge-watching detection
4. Predictive analysis
5. Statistical tests and trend analysis

## 1. Data Loading and Initial Processing

First, we'll load the Netflix viewing history data from the CSV file and process it into a suitable format for analysis.

In [None]:
import csv
from datetime import datetime

# Input file and the profile whose viewing history is analysed.
VIEWING_ACTIVITY_PATH = 'data/ViewingActivity.csv'
PROFILE_NAME = "C"

dates = []      # start timestamp of each kept row (datetime)
durations = []  # length of each kept row in minutes, aligned with `dates`

# newline='' is what the csv module requires so that quoted fields containing
# line breaks are parsed correctly. utf-8-sig also strips a leading BOM, which
# would otherwise end up inside the first header name.
# NOTE(review): assumes the export is UTF-8 encoded - confirm.
with open(VIEWING_ACTIVITY_PATH, 'r', newline='', encoding='utf-8-sig') as file:
    reader = csv.DictReader(file)

    for row in reader:
        if row["Profile Name"] != PROFILE_NAME:
            continue

        # Keep the time of day: the binge-detection step compares the gap in
        # minutes between consecutive 'date' values, which is only meaningful
        # when the clock time has not been truncated away.
        start_text = row["Start Time"]
        try:
            date = datetime.strptime(start_text, '%Y-%m-%d %H:%M:%S')
        except ValueError:
            # Fall back to the date part alone for values that do not carry
            # an HH:MM:SS clock time.
            date = datetime.strptime(start_text.split(" ")[0], '%Y-%m-%d')

        # Duration is "H:MM:SS"; convert it to fractional minutes.
        duration_parts = row["Duration"].split(':')
        duration_minutes = (int(duration_parts[0]) * 60 +
                            int(duration_parts[1]) +
                            int(duration_parts[2]) / 60)

        dates.append(date)
        durations.append(duration_minutes)

## 2. Basic Viewing Statistics

Let's calculate and display some fundamental statistics about viewing patterns.

In [None]:
# Summary statistics over every loaded session.
sessions = len(dates)
sum_durations = sum(durations)
# An empty history (for example, no rows for the selected profile) would
# otherwise raise ZeroDivisionError; report an average of 0 in that case.
avg_duration = sum_durations / sessions if sessions else 0.0

print(f"Total Viewing Sessions: {sessions}")
print(f"Total Viewing Duration: {sum_durations:.2f} minutes")
print(f"Average Session Duration: {avg_duration:.2f} minutes")

## 3. Daily Viewing Patterns Analysis

Let's analyze how viewing patterns vary by day of the week and create visualizations to better understand these patterns.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Create DataFrame with one row per viewing session
df = pd.DataFrame({
    'date': dates,
    'duration': durations,
})

# Add day of week (English day names)
df['day_of_week'] = df['date'].dt.day_name()

# Calculate average duration by day. groupby sorts its keys alphabetically
# (Friday, Monday, Saturday, ...), so put the result back into calendar order,
# keeping only the days that actually occur in the data.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
daily_avg = df.groupby('day_of_week')['duration'].mean()
daily_avg = daily_avg.reindex(
    [day for day in weekday_order if day in daily_avg.index]
)

# Create bar plot
plt.figure(figsize=(10, 6))
daily_avg.plot(kind='bar')
plt.title('Average Viewing Duration by Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Average Duration (minutes)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 4. Monthly Viewing Analysis

Now let's examine how viewing patterns change month over month.

In [None]:
# Add month column (calendar month as a pandas Period)
df['month'] = df['date'].dt.to_period('M')

# Calculate monthly statistics: session count, mean and total duration.
# The result has two-level columns, e.g. monthly_stats['duration']['count'].
monthly_stats = df.groupby('month').agg({
    'duration': ['count', 'mean', 'sum']
}).reset_index()

# Periods are converted to strings once so both plots share the same x values.
month_labels = monthly_stats['month'].astype(str)

# Create subplot for monthly trends
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

# Plot 1: Number of sessions per month
ax1.plot(month_labels, monthly_stats['duration']['count'], marker='o')
ax1.set_title('Number of Viewing Sessions per Month')
ax1.set_ylabel('Sessions')
# Rotate the existing tick labels instead of overwriting them with
# set_xticklabels, which depends on the tick positions matching the labels.
ax1.tick_params(axis='x', labelrotation=45)

# Plot 2: Average duration per month
ax2.plot(month_labels, monthly_stats['duration']['mean'], marker='o', color='green')
ax2.set_title('Average Session Duration per Month')
ax2.set_ylabel('Average Duration (minutes)')
ax2.tick_params(axis='x', labelrotation=45)

plt.tight_layout()
plt.show()

## 5. Binge-Watching Detection

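Let's identify and analyze binge-watching sessions: runs of consecutive viewing entries in which each entry starts within a maximum gap of the previous one (by default, at least 3 entries with gaps of no more than 30 minutes).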

In [None]:
from datetime import timedelta

def detect_binge_sessions(df, max_gap_minutes=30, min_episodes=3):
    """Detect binge-watching sessions based on time gaps between episodes.

    Rows are ordered by their 'date' value and split into runs wherever the
    gap between two consecutive 'date' values exceeds ``max_gap_minutes``.
    Runs with at least ``min_episodes`` rows are reported as binges.

    Args:
        df: DataFrame with a 'date' column of datetime values.
        max_gap_minutes: Largest gap, in minutes, between consecutive 'date'
            values that still counts as the same session.
        min_episodes: Minimum number of rows a run needs to count as a binge.

    Returns:
        A list of binges, each a list of row positions in the date-sorted
        order (not labels of the input index).
    """
    # A plain list avoids a per-row DataFrame lookup inside the loop and also
    # works for an empty frame.
    starts = df.sort_values('date')['date'].tolist()

    binge_sessions = []
    if not starts:
        return binge_sessions

    current_session = [0]
    for position, (previous, current) in enumerate(zip(starts, starts[1:]), start=1):
        gap_minutes = (current - previous).total_seconds() / 60

        if gap_minutes <= max_gap_minutes:
            current_session.append(position)
        else:
            if len(current_session) >= min_episodes:
                binge_sessions.append(current_session)
            current_session = [position]

    # Flush the final run: without this a binge that lasts until the very
    # last row would never be reported.
    if len(current_session) >= min_episodes:
        binge_sessions.append(current_session)

    return binge_sessions

# Run the detector with its default thresholds
binge_sessions = detect_binge_sessions(df)

# Summarise the result: number of binges and their average length
total_binge_sessions = len(binge_sessions)
if total_binge_sessions > 0:
    episode_counts = [len(session) for session in binge_sessions]
    avg_episodes_per_binge = sum(episode_counts) / total_binge_sessions
else:
    # No binges found; avoid dividing by zero
    avg_episodes_per_binge = 0

print(f"Total number of binge-watching sessions: {total_binge_sessions}")
print(f"Average episodes per binge: {avg_episodes_per_binge:.2f}")

## 6. Weekly Patterns and Rolling Averages

Analyze weekly viewing patterns and calculate rolling averages to identify trends.

In [None]:
# Calculate weekly statistics
iso_calendar = df['date'].dt.isocalendar()
df['week'] = iso_calendar['week']
df['year'] = df['date'].dt.year

# Group by ISO year together with ISO week. Pairing the ISO week number with
# the calendar year misfiles days around New Year: a late-December day that
# belongs to ISO week 1 of the next year would be merged with the first week
# of its own calendar year, and an early-January day in ISO week 52/53 would
# land at the end of the wrong year.
weekly_stats = df.groupby([
    iso_calendar['year'].rename('year'),
    iso_calendar['week'].rename('week'),
])['duration'].agg([
    'count',
    'mean',
    'sum'
]).reset_index()

# Calculate rolling averages over 4 consecutive rows of weekly_stats.
# Weeks without any viewing have no row, so the window spans the last four
# weeks that contain data.
rolling_avg = weekly_stats['mean'].rolling(window=4).mean()

# Plot weekly trends with rolling average
plt.figure(figsize=(12, 6))
plt.plot(weekly_stats['mean'], label='Weekly Average', alpha=0.5)
plt.plot(rolling_avg, label='4-Week Rolling Average', linewidth=2)
plt.title('Weekly Viewing Duration with Rolling Average')
plt.xlabel('Week Number')
plt.ylabel('Average Duration (minutes)')
plt.legend()
plt.tight_layout()
plt.show()

## 7. Statistical Analysis and Attention Span Metrics

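Calculate key attention-span metrics for the first and second halves of the viewing history (split at the median date), and compare session durations between the two halves with a two-sample t-test.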

In [None]:
from scipy import stats

# Calculate attention span metrics
def calculate_attention_metrics(df, episode_minutes=40):
    """Summarise session durations into attention-span metrics.

    Args:
        df: DataFrame with a numeric 'duration' column (minutes).
        episode_minutes: Duration a session must reach to count as
            "completed". Defaults to 40, assumed to be a typical episode
            length.

    Returns:
        dict with:
            median_duration: median of 'duration'.
            duration_std: sample standard deviation of 'duration'.
            session_consistency: 1 - std / mean (1 minus the coefficient
                of variation).
            completion_rate: fraction of rows with
                duration >= episode_minutes; NaN when df has no rows.
    """
    session_lengths = df['duration']
    total = len(session_lengths)
    spread = session_lengths.std()

    # An empty frame has no meaningful rate; return NaN instead of raising
    # ZeroDivisionError so callers can still format the value.
    if total:
        completion_rate = int((session_lengths >= episode_minutes).sum()) / total
    else:
        completion_rate = float('nan')

    metrics = {
        'median_duration': session_lengths.median(),
        'duration_std': spread,
        'session_consistency': 1 - (spread / session_lengths.mean()),
        'completion_rate': completion_rate,
    }
    return metrics

# Split the history at the median date: rows on or before it form the first
# half, rows strictly after it form the second half.
mid_point = df['date'].median()
first_half = df.loc[df['date'] <= mid_point]
second_half = df.loc[df['date'] > mid_point]

# Same metric set for each half
metrics_first = calculate_attention_metrics(first_half)
metrics_second = calculate_attention_metrics(second_half)

# Independent two-sample t-test on the session durations of the two halves
t_stat, p_value = stats.ttest_ind(first_half['duration'], second_half['duration'])

# Report both halves with a shared formatting loop
print("Attention Span Metrics:")
for heading, half_metrics in (
    ("\nFirst Half of Data:", metrics_first),
    ("\nSecond Half of Data:", metrics_second),
):
    print(heading)
    for metric, value in half_metrics.items():
        print(f"{metric}: {value:.2f}")

print(f"\nT-test p-value: {p_value:.4f}")