# Analysis of YouTube Content Creation Evolution

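This notebook explores the frequency and consistency of content creation across different YouTube channel categories over time, using the YouNiverse dataset. Frequency is measured as the number of videos uploaded per category per year; consistency is measured through the mean and standard deviation of the number of days between consecutive uploads of the same channel, aggregated per category and year.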

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Locations of the two input files, relative to the notebook's working directory
channels_path = "data/df_channels_en.tsv.gz"
videos_path = "data/yt_metadata_helper.feather"

# Channel table: tab-separated text; 'infer' lets pandas choose the
# decompression from the file name
channel_data = pd.read_csv(channels_path, sep='\t', compression='infer')
# Video table: stored in Feather format
video_data = pd.read_feather(videos_path)

In [5]:
# Show the first ten rows of the channel table
channel_data.head(10)

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
0,Gaming,2010-04-29,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,101000000,3956,3.0,2.087
1,Education,2006-09-01,UCbCmjCuTUZos6Inko4u57UQ,Cocomelon - Nursery ...,60100000,458,7.0,2.087
2,Entertainment,2006-09-20,UCpEhnqL0y41EpW2TvWAHD7Q,SET India,56018869,32661,8.0,2.087
3,Howto & Style,2016-11-15,UC295-Dw_tDNtZXFeAPAW6Aw,5-Minute Crafts,60600000,3591,9.0,2.087
4,Sports,2007-05-11,UCJ5v_MCY6GNUBTO8-D3XoAg,WWE,48400000,43421,11.0,2.087
5,Entertainment,2007-01-15,UCIwFjwMjI0y7PDBVEO9-bkQ,Justin Bieber,46574085,134,12.0,2.087
6,Music,2014-03-12,UCFFbwnve3yF62-tVXkTyHqg,Zee Music Company,43451109,4241,13.0,2.087
7,Sports,2009-03-17,UCRijo3ddMTht_IHyNSNXpNQ,Dude Perfect,45800000,213,14.0,2.087
8,Entertainment,2015-05-12,UCk8GzjMOrta8yxDcKfylJYw,✿ Kids Diana Show,34700000,599,16.0,2.087
9,Music,2006-08-08,UC0C-w0YjGpqDXGB8IHb662A,Ed Sheeran,42000000,154,17.0,2.087


In [6]:
# Show the first ten rows of the video table
video_data.head(10)

Unnamed: 0,categories,channel_id,dislike_count,display_id,duration,like_count,upload_date,view_count
0,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,1.0,SBqSc91Hn9g,1159,8.0,2016-09-28,1057.0
1,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,1.0,UuugEl86ESY,2681,23.0,2016-09-28,12894.0
2,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,779.0,oB4c-yvnbjs,1394,1607.0,2016-09-28,1800602.0
3,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,24.0,ZaV-gTCMV8E,5064,227.0,2016-09-28,57640.0
4,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,13.0,cGvL7AvMfM0,3554,105.0,2016-09-28,86368.0
5,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,1.0,iVADSRjaLtQ,1066,11.0,2016-09-27,3426.0
6,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,2.0,ypVcK9mldPc,2450,11.0,2016-09-27,7652.0
7,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,0.0,qA1NayP9cNY,1539,9.0,2016-09-27,479.0
8,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,1.0,CWdSl9ta4Rg,2123,10.0,2016-09-27,3856.0
9,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,0.0,64dCTkxCHpY,1130,7.0,2016-09-27,1730.0


In [11]:
# Convert the date column of each table to pandas datetimes so that the
# `.dt` accessor and date arithmetic can be used on them later
for frame, date_col in ((channel_data, 'join_date'), (video_data, 'upload_date')):
    frame[date_col] = pd.to_datetime(frame[date_col])

In [12]:
# Count the missing values in each column of the channel table
channel_data.isna().sum()

category_cc           128
join_date               1
channel                 0
name_cc                10
subscribers_cc          0
videos_cc               0
subscriber_rank_sb      0
weights                 0
dtype: int64

In [13]:
# (number of rows, number of columns) of the channel table
channel_data.shape

(136470, 8)

In [14]:
# Count the missing values in each column of the video table
video_data.isna().sum()

categories             0
channel_id             0
dislike_count    1631458
display_id             0
duration               0
like_count       1631458
upload_date            0
view_count           654
dtype: int64

In [10]:
# (number of rows, number of columns) of the video table
video_data.shape

(72924794, 8)

## Step 1: Preprocess and Merge Data

In [None]:
# Derive the time keys used by the later cells.
# The datetime conversion is repeated here so this cell also works on a
# freshly loaded `video_data`.
video_data['upload_date'] = pd.to_datetime(video_data['upload_date'])
video_data['year'] = video_data['upload_date'].dt.year
video_data['month'] = video_data['upload_date'].dt.to_period('M')  # Monthly period

# Attach the channel-level category to every video as `category`.
#
# Column names differ between the two tables: `channel_data` identifies a
# channel in `channel` and stores its category in `category_cc`, while
# `video_data` refers to the channel through `channel_id`. The per-video
# category stays available, untouched, in `categories`.
#
# A lookup Series + `map` is used instead of `merge` so that
#   * the number of rows in `video_data` can never change,
#   * re-running the cell overwrites `category` instead of adding
#     suffixed duplicate columns.
# `drop_duplicates` guarantees a unique index, which `Series.map` requires.
category_by_channel = (
    channel_data
    .drop_duplicates(subset='channel')
    .set_index('channel')['category_cc']
)

# Videos whose channel is absent from `channel_data`, or whose channel has no
# category, are grouped under 'Unknown'.
video_data['category'] = (
    video_data['channel_id']
    .map(category_by_channel)
    .fillna('Unknown')
)

## Step 2: Frequency Analysis

In [None]:
# Count the videos uploaded in each (category, year) pair and return the
# counts as a flat table with a `video_count` column
freq_per_year = (
    video_data
    .groupby(['category', 'year'])
    .size()
    .rename('video_count')
    .reset_index()
)

## Step 3: Consistency Analysis

In [None]:
# Order each channel's videos chronologically so that neighbouring rows of a
# channel are consecutive uploads
video_data = video_data.sort_values(by=['channel_id', 'upload_date'])

# Whole days elapsed since the same channel's previous upload; the first
# video of every channel has no predecessor and therefore gets a missing value
video_data['upload_interval'] = (
    video_data
    .groupby('channel_id')['upload_date']
    .diff()
    .dt.days
)

# Consistency metric: mean and standard deviation of the upload intervals
# within each (category, year) pair
consistency = (
    video_data
    .groupby(['category', 'year'])['upload_interval']
    .agg(mean_interval='mean', std_interval='std')
    .reset_index()
)

## Step 4: Visualization

In [None]:
# Yearly upload counts, one line per category
fig, ax = plt.subplots(figsize=(12, 6))
for cat_name in freq_per_year['category'].unique():
    yearly_counts = freq_per_year.loc[freq_per_year['category'] == cat_name]
    ax.plot(yearly_counts['year'], yearly_counts['video_count'], label=cat_name)

ax.set_title("Video Upload Frequency by Category Over Time")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Videos")
ax.legend()
plt.show()

In [None]:
# Spread of the upload intervals per year, one line per category
# (a lower standard deviation means more regular uploads)
fig, ax = plt.subplots(figsize=(12, 6))
for cat_name in consistency['category'].unique():
    yearly_spread = consistency.loc[consistency['category'] == cat_name]
    ax.plot(yearly_spread['year'], yearly_spread['std_interval'], label=cat_name)

ax.set_title("Consistency of Content Creation by Category Over Time")
ax.set_xlabel("Year")
ax.set_ylabel("Standard Deviation of Upload Intervals (Days)")
ax.legend()
plt.show()

## Step 5: Insights and Comparison

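This section is a placeholder. Add comparative metrics (for example, the year-over-year change in uploads per category, or the ratio of the standard deviation to the mean upload interval) or additional aggregations as needed.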