# YouTube Watch History Analysis

This notebook parses a YouTube watch-history HTML export (`watch_history.html`), enriches each record with temporal features (date, time, day of week, hour, and time-of-day bin), and explores the data through 15 visualizations drawn with hand-picked color palettes.

## 1. Parse YouTube Watch History HTML

In [None]:
from lxml import etree
import pandas as pd
from dateutil import parser

# Load the exported watch-history page into memory.
with open('watch_history.html', 'r', encoding='utf-8') as f:
    html = f.read()

# Parse the HTML and build one record per content cell that carries a
# title link, a channel link and a parseable timestamp.
tree = etree.HTML(html)
# etree.HTML returns None for an empty document; treat that as "no cells".
cells = [] if tree is None else tree.xpath(
    '//div[contains(@class,"content-cell") and contains(@class,"mdl-cell--6-col")]'
)

records = []
for div in cells:
    title_el   = div.xpath('.//a[1]/text()')
    channel_el = div.xpath('.//a[2]/text()')
    date_el    = div.xpath('.//span/text() | .//text()[normalize-space()][last()]')

    title    = title_el[0].strip()   if title_el   else None
    channel  = channel_el[0].strip() if channel_el else None
    # The union above is returned in document order and also matches the text
    # inside the <a> links, so the FIRST node is usually the title, not the
    # date. The timestamp is the trailing text of the cell, hence [-1].
    # NOTE(review): assumes the layout "link, link, timestamp" - confirm
    # against a real export.
    date_str = date_el[-1].strip()   if date_el    else None

    # Cells missing any of the three fields are skipped.
    if not (title and channel and date_str):
        continue

    try:
        watched_at = parser.parse(date_str)
    except (ValueError, OverflowError):
        # Not a recognisable date: skip this cell, but let unrelated
        # errors (e.g. KeyboardInterrupt) propagate.
        continue

    records.append({
        'video_title':  title,
        'channel_name': channel,
        'watched_at':   watched_at
    })

# Create the DataFrame. Explicit columns keep the schema intact even when no
# record was parsed, so the datetime conversion below cannot raise KeyError.
df = pd.DataFrame(records, columns=['video_title', 'channel_name', 'watched_at'])
df['watched_at'] = pd.to_datetime(df['watched_at'])
df.head()

## 2. Feature Engineering

In [None]:
# Derive calendar and clock features from the watch timestamp.
watched = df['watched_at'].dt
df['date'], df['time'] = watched.date, watched.time
df['day_of_week'] = watched.day_name()
df['hour'] = watched.hour

def time_bin(h):
    """Map an hour value to a coarse part-of-day label.

    Hours in [6, 12) are 'Morning', [12, 18) 'Afternoon' and [18, 24)
    'Evening'. Every other value falls through to 'Night'.
    """
    bands = (
        (6, 12, 'Morning'),
        (12, 18, 'Afternoon'),
        (18, 24, 'Evening'),
    )
    for lower, upper, label in bands:
        if lower <= h < upper:
            return label
    return 'Night'

# Label every row with its part of day, then preview the enriched frame.
df['time_bin'] = df['hour'].map(time_bin)
df.head()

## 3. Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt

# Calendar order for weekday axes, and one colour per weekday.
dow_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]
palette_dow = [
    '#4E79A7', '#59A14F', '#F28E2B', '#E15759',
    '#76B7B2', '#EDC949', '#AF7AA1',
]

# Count watches per weekday and lay them out Monday-first.
dow_counts = df['day_of_week'].value_counts().reindex(dow_order)

fig, ax = plt.subplots()
dow_counts.plot(kind='bar', color=palette_dow, ax=ax)
ax.set_title('Watch Count by Day of Week')
ax.set_xlabel('')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

In [None]:
# One colour per part-of-day bin (also used by the pie chart).
palette_tb = ['#4E79A7', '#F28E2B', '#59A14F', '#E15759']

# Count watches per bin, ordered from morning through night.
tb_counts = df['time_bin'].value_counts().reindex(
    ['Morning', 'Afternoon', 'Evening', 'Night']
)

fig, ax = plt.subplots()
tb_counts.plot(kind='bar', color=palette_tb, ax=ax)
ax.set_title('Watch Count by Time of Day')
ax.set_xlabel('')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

In [None]:
# Single colour repeated for each of the ten bars.
palette_chan = ['#4E79A7'] * 10

# value_counts sorts by frequency, so the first ten rows are the top channels.
top_channels = df['channel_name'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(8, 5))
top_channels.plot(kind='barh', color=palette_chan, ax=ax)
ax.set_title('Top 10 Watched Channels')
ax.set_xlabel('Count')
fig.tight_layout()
plt.show()

In [None]:
# Number of watched videos per calendar day; reused by later cells.
by_timestamp = df.set_index('watched_at')
daily = by_timestamp.resample('D').size()

fig, ax = plt.subplots()
daily.plot(color='#59A14F', ax=ax)
ax.set_title('Daily Watch Count')
ax.set_xlabel('Date')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

In [None]:
# Smooth the daily series with a 7-day window; the first six points are
# NaN because the window is not yet full.
rolling7 = daily.rolling(window=7).mean()

fig, ax = plt.subplots()
rolling7.plot(color='#F28E2B', ax=ax)
ax.set_title('7-Day Rolling Average of Daily Count')
ax.set_xlabel('Date')
ax.set_ylabel('Avg Count')
fig.tight_layout()
plt.show()

In [None]:
import numpy as np
# Watch counts as a weekday x hour grid.
pivot_dh = df.pivot_table(index='day_of_week', columns='hour', aggfunc='size', fill_value=0)
# Force the full 7 x 24 grid. Without the column reindex, an hour that never
# occurs in the data is simply absent, so column i would no longer be hour i
# and the 0-23 tick labels below would point at the wrong cells. Weekdays or
# hours with no data are shown as zero counts.
pivot_dh = pivot_dh.reindex(index=dow_order, columns=range(24), fill_value=0)

fig, ax = plt.subplots(figsize=(10, 4))
im = ax.imshow(pivot_dh, aspect='auto', cmap='viridis')
ax.set_xticks(range(24))
ax.set_xticklabels(range(24))
ax.set_yticks(range(len(dow_order)))
ax.set_yticklabels(dow_order)
fig.colorbar(im, ax=ax, label='Count')
ax.set_title('Heatmap: Day of Week vs Hour')
fig.tight_layout()
plt.show()

In [None]:
# One bin per clock hour. Explicit integer edges 0..24 keep every bar aligned
# with its hour; a bare bins=24 would spread 24 bins over the observed
# min..max range, so the edges would not fall on whole hours.
hour_edges = list(range(25))

fig, ax = plt.subplots()
df['hour'].plot(kind='hist', bins=hour_edges, edgecolor='black', color='#76B7B2', ax=ax)
ax.set_title('Distribution of Watch Hours')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

In [None]:
# Weekly totals for Monday-to-Sunday weeks, labelled by the Monday that starts
# each week. The 'W-MON' anchor defaults to right-closed, right-labelled bins
# (Tuesday..Monday, stamped with the closing Monday), which contradicts the
# title and x-label below; closed/label='left' gives [Monday, next Monday).
weekly = (
    df.set_index('watched_at')
      .resample('W-MON', closed='left', label='left')
      .size()
)

fig, ax = plt.subplots()
weekly.plot(marker='o', linestyle='-', color='#E15759', ax=ax)
ax.set_title('Weekly Watch Count (Mon–Sun)')
ax.set_xlabel('Week Starting')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

In [None]:
# Monthly totals. 'MS' (month start) is used instead of the 'M' alias, which
# recent pandas releases deprecate in favour of 'ME'; 'MS' is accepted by old
# and new versions alike. The counts per calendar month are the same; only
# the label moves from the last to the first day of the month.
monthly = df.set_index('watched_at').resample('MS').size()

fig, ax = plt.subplots()
monthly.plot(marker='s', linestyle='-', color='#AF7AA1', ax=ax)
ax.set_title('Monthly Watch Count')
ax.set_xlabel('Month')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

In [None]:
# Share of watches in each part of day, drawn in morning-to-night order.
tb_share = df['time_bin'].value_counts().reindex(
    ['Morning', 'Afternoon', 'Evening', 'Night']
)

fig, ax = plt.subplots()
tb_share.plot(
    kind='pie',
    autopct='%1.1f%%',
    startangle=90,
    colors=palette_tb,
    ax=ax,
)
ax.set_title('Proportion by Time Bin')
ax.set_ylabel('')
fig.tight_layout()
plt.show()

In [None]:
# Tag each row with its month name, then count watches per weekday/month.
df['month'] = df['watched_at'].dt.month_name()
pivot_dm = df.pivot_table(index='day_of_week', columns='month', aggfunc='size', fill_value=0)

# Arrange rows Monday-first and columns in calendar order.
month_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
pivot_dm = pivot_dm.reindex(index=dow_order, columns=month_order)

fig, ax = plt.subplots(figsize=(10, 6))
im = ax.imshow(pivot_dm, aspect='auto', cmap='plasma')
ax.set_xticks(np.arange(12))
ax.set_xticklabels(pivot_dm.columns, rotation=90)
ax.set_yticks(np.arange(len(dow_order)))
ax.set_yticklabels(dow_order)
fig.colorbar(im, ax=ax, label='Count')
ax.set_title('Heatmap: Day of Week vs Month')
fig.tight_layout()
plt.show()

In [None]:
# Spread of watch hours for each weekday, Monday first.
data_dw = [df.loc[df['day_of_week'] == d, 'hour'] for d in dow_order]

fig, ax = plt.subplots(figsize=(8, 5))
ax.boxplot(data_dw)
# Boxes sit at positions 1..N by default. Labelling the ticks directly avoids
# boxplot's `labels=` keyword, which newer Matplotlib releases deprecate in
# favour of `tick_labels=`; this form works on both.
ax.set_xticks(range(1, len(dow_order) + 1))
ax.set_xticklabels(dow_order)
ax.set_title('Hour Distribution by Day of Week')
ax.set_ylabel('Hour')
fig.tight_layout()
plt.show()

In [None]:
# Spread of watch hours inside each part-of-day bin.
bin_names = ['Morning', 'Afternoon', 'Evening', 'Night']
data_tb2 = [df.loc[df['time_bin'] == tb, 'hour'] for tb in bin_names]

fig, ax = plt.subplots()
ax.boxplot(data_tb2)
# Boxes sit at positions 1..N by default. Labelling the ticks directly avoids
# boxplot's `labels=` keyword, which newer Matplotlib releases deprecate in
# favour of `tick_labels=`; this form works on both.
ax.set_xticks(range(1, len(bin_names) + 1))
ax.set_xticklabels(bin_names)
ax.set_title('Hour Distribution by Time Bin')
ax.set_ylabel('Hour')
fig.tight_layout()
plt.show()

In [None]:
# Watches per calendar month, split by part of day.
mtb = df.groupby([df['watched_at'].dt.to_period('M'),'time_bin']).size().unstack(fill_value=0)
# Guarantee all four bins exist, in display order; selecting a bin that never
# occurs in the data with [[...]] would raise KeyError.
mtb = mtb.reindex(columns=['Morning','Afternoon','Evening','Night'], fill_value=0)
mtb.index = mtb.index.to_timestamp()

# Bar plots label ticks with the index values as text, so plot a copy with
# compact 'YYYY-MM' labels instead of full timestamps.
mtb_labeled = mtb.copy()
mtb_labeled.index = mtb.index.strftime('%Y-%m')

# DataFrame.plot opens its own figure unless an Axes is passed in. The
# original plt.figure(figsize=...) call therefore stayed blank and its size
# was never applied; drawing on an explicit Axes fixes both.
fig, ax = plt.subplots(figsize=(10, 6))
mtb_labeled.plot(kind='bar', stacked=True, colormap='tab20', width=0.8, ax=ax)
ax.set_title('Monthly Distribution of Time Bins')
ax.set_xlabel('Month')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

In [None]:
# Pair each day's count with the following day's count.
daily_vals = daily.values
today, tomorrow = daily_vals[:-1], daily_vals[1:]

fig, ax = plt.subplots()
ax.scatter(today, tomorrow, color='#EDC949', alpha=0.6)
ax.set_title('Lag Plot: Day N vs Day N+1')
ax.set_xlabel('Count Day N')
ax.set_ylabel('Count Day N+1')
fig.tight_layout()
plt.show()