# Analysis of YouTube Content Creation Evolution

This notebook explores the frequency and consistency of content creation across different YouTube channel categories over time, using the YouNiverse dataset.

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the three YouNiverse inputs: channel metadata (CSV), channel time
# series (gzipped TSV) and per-video metadata (Feather).
channel_data = pd.read_csv(
    "data/df_channels_en_with_monetization.csv",
    # Nullable 'boolean' dtype lets the monetization flags hold missing values.
    dtype=dict.fromkeys(
        ['has_affiliate', 'has_sponsorships', 'has_merchandise'], 'boolean'
    ),
)

timeseries_data = pd.read_csv(
    "data/df_timeseries_en.tsv.gz", sep="\t", compression="infer"
)
video_data = pd.read_feather("data/yt_metadata_helper.feather")

In [17]:
# Preview the first five rows of the channel table.
channel_data.head()

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights,lowest_monthly_earnings,highest_monthly_earnings,lowest_yearly_earnings,highest_yearly_earnings,has_affiliate,has_sponsorships,has_merchandise
0,Gaming,2010-04-29,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,101000000,3956,3.0,2.087,9800.0,156700.0,117600.0,1900000.0,,,
1,Education,2006-09-01,UCbCmjCuTUZos6Inko4u57UQ,Cocomelon - Nursery ...,60100000,458,7.0,2.087,,,,,,,
2,Entertainment,2006-09-20,UCpEhnqL0y41EpW2TvWAHD7Q,SET India,56018869,32661,8.0,2.087,,,,,False,False,True
3,Howto & Style,2016-11-15,UC295-Dw_tDNtZXFeAPAW6Aw,5-Minute Crafts,60600000,3591,9.0,2.087,0.0,0.0,0.0,0.05,False,False,False
4,Sports,2007-05-11,UCJ5v_MCY6GNUBTO8-D3XoAg,WWE,48400000,43421,11.0,2.087,178700.0,2900000.0,2100000.0,34300000.0,False,False,False


In [18]:
# Preview the first five rows of the time-series table.
timeseries_data.head()

Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity
0,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-03 00:00:00,202494.6,0.0,650.222222,0.0,5,0,3
1,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-10 00:00:00,394085.7,191591.111111,1046.0,395.777778,6,1,1
2,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-17 00:00:00,835393.8,441308.083333,1501.5,455.5,6,0,1
3,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-24 00:00:00,1104577.0,269183.25,1750.0,248.5,6,0,0
4,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-31 00:00:00,1284406.0,179828.6,2008.3,258.3,6,0,0


In [19]:
# Preview the first five rows of the video metadata table.
video_data.head()

Unnamed: 0,categories,channel_id,dislike_count,display_id,duration,like_count,upload_date,view_count
0,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,1.0,SBqSc91Hn9g,1159,8.0,2016-09-28,1057.0
1,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,1.0,UuugEl86ESY,2681,23.0,2016-09-28,12894.0
2,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,779.0,oB4c-yvnbjs,1394,1607.0,2016-09-28,1800602.0
3,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,24.0,ZaV-gTCMV8E,5064,227.0,2016-09-28,57640.0
4,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,13.0,cGvL7AvMfM0,3554,105.0,2016-09-28,86368.0


In [20]:
# Convert the date columns of all three tables to pandas datetimes so the
# .dt accessor and time-based grouping can be used on them later.
channel_data['join_date'] = channel_data['join_date'].pipe(pd.to_datetime)
video_data['upload_date'] = video_data['upload_date'].pipe(pd.to_datetime)
timeseries_data['datetime'] = timeseries_data['datetime'].pipe(pd.to_datetime)

In [21]:
# Count missing values in each column of the channel table.
channel_data.isna().sum()

category_cc                    128
join_date                        1
channel                          0
name_cc                         10
subscribers_cc                   0
videos_cc                        0
subscriber_rank_sb               0
weights                          0
lowest_monthly_earnings     136175
highest_monthly_earnings    136175
lowest_yearly_earnings      136175
highest_yearly_earnings     136175
has_affiliate               101062
has_sponsorships            101062
has_merchandise             101062
dtype: int64

Channels with a missing category (128 rows) or a missing name (10 rows) cannot be used reliably in a per-category analysis, so we remove them.

In [22]:
# (rows, columns) of the channel table before any rows are dropped.
channel_data.shape

(136470, 15)

In [23]:
# Keep only channels for which both the category and the name are present.
has_category_and_name = channel_data[['category_cc', 'name_cc']].notna().all(axis=1)
channel_data = channel_data[has_category_and_name]

In [24]:
# (rows, columns) of the channel table after dropping channels that lack a
# category or a name.
channel_data.shape

(136332, 15)

In [27]:
# Count missing values in each column of the time-series table.
timeseries_data.isna().sum()

channel             0
category        20584
datetime            0
views               0
delta_views         0
subs                0
delta_subs          0
videos              0
delta_videos        0
activity            0
dtype: int64

In [28]:
# (rows, columns) of the time-series table before rows with missing values
# are dropped.
timeseries_data.shape

(18872499, 10)

In [30]:
# Discard every time-series row that has a missing value in any column.
timeseries_data = timeseries_data.dropna(axis=0, how='any')

In [None]:
# (rows, columns) of the time-series table after dropping rows with missing
# values.
timeseries_data.shape

(18851915, 10)

In [25]:
# Count missing values in each column of the video metadata table.
video_data.isna().sum()

categories             0
channel_id             0
dislike_count    1631458
display_id             0
duration               0
like_count       1631458
upload_date            0
view_count           654
dtype: int64

The only missing values are in the dislike, like and view counts. These columns are not used to measure the frequency and consistency of content creation, so we keep every row of the video metadata.

In [26]:
# (rows, columns) of the video metadata table.
video_data.shape

(72924794, 8)

In [32]:
# Derive a calendar-year column and a year-month period column from each
# video's upload date, so uploads can be grouped over time.
upload_dates = video_data['upload_date']
video_data['year'] = upload_dates.dt.year
video_data['month'] = upload_dates.dt.to_period('M')

In [35]:
# Attach channel metadata to every video row. The inner join keeps only videos
# whose channel_id has a matching row in channel_data.
# validate='many_to_one' makes pandas raise MergeError if a channel id occurs
# more than once in channel_data; without it, such duplicates would silently
# multiply the video rows and inflate every per-category count.
channel_video_data = video_data.merge(
    channel_data,
    left_on='channel_id',   # channel identifier in the video table
    right_on='channel',     # channel identifier in the channel table
    how='inner',
    validate='many_to_one',
)

In [40]:
# After the join on channel_id == channel, 'channel' repeats 'channel_id';
# keep a single identifier column.
channel_video_data = channel_video_data.drop(columns=['channel'])

In [41]:
# Make explicit which category label belongs to the video and which to the channel.
channel_video_data = channel_video_data.rename(
    columns=dict(categories='video_category', category_cc='channel_category')
)

In [42]:
# List the columns of the merged video/channel table.
channel_video_data.columns

Index(['video_category', 'channel_id', 'dislike_count', 'display_id',
       'duration', 'like_count', 'upload_date', 'view_count', 'year', 'month',
       'channel_category', 'join_date', 'name_cc', 'subscribers_cc',
       'videos_cc', 'subscriber_rank_sb', 'weights', 'lowest_monthly_earnings',
       'highest_monthly_earnings', 'lowest_yearly_earnings',
       'highest_yearly_earnings', 'has_affiliate', 'has_sponsorships',
       'has_merchandise'],
      dtype='object')

In [43]:
# Preview the first five rows of the merged video/channel table.
channel_video_data.head()

Unnamed: 0,video_category,channel_id,dislike_count,display_id,duration,like_count,upload_date,view_count,year,month,...,videos_cc,subscriber_rank_sb,weights,lowest_monthly_earnings,highest_monthly_earnings,lowest_yearly_earnings,highest_yearly_earnings,has_affiliate,has_sponsorships,has_merchandise
0,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,1.0,SBqSc91Hn9g,1159,8.0,2016-09-28,1057.0,2016,2016-09,...,2398,54617.0,3.5155,,,,,False,False,False
1,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,1.0,UuugEl86ESY,2681,23.0,2016-09-28,12894.0,2016,2016-09,...,2398,54617.0,3.5155,,,,,False,False,False
2,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,779.0,oB4c-yvnbjs,1394,1607.0,2016-09-28,1800602.0,2016,2016-09,...,2398,54617.0,3.5155,,,,,False,False,False
3,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,24.0,ZaV-gTCMV8E,5064,227.0,2016-09-28,57640.0,2016,2016-09,...,2398,54617.0,3.5155,,,,,False,False,False
4,Film & Animation,UCzWrhkg9eK5I8Bm3HfV-unA,13.0,cGvL7AvMfM0,3554,105.0,2016-09-28,86368.0,2016,2016-09,...,2398,54617.0,3.5155,,,,,False,False,False


In [44]:
# (rows, columns) of the merged video/channel table.
channel_video_data.shape

(72882785, 24)

In [48]:
# Flag the videos whose own category differs from the category of the channel
# that uploaded them, and keep those rows for inspection.
is_mismatch = channel_video_data['channel_category'] != channel_video_data['video_category']
mismatched_categories = channel_video_data.loc[is_mismatch]

# Absolute number of mismatched videos.
num_mismatched = len(mismatched_categories)
print(f"Number of samples with mismatched categories: {num_mismatched}")

# Share of mismatched videos among all merged rows, in percent.
total_rows = len(channel_video_data)
percentage_mismatched = (num_mismatched / total_rows) * 100
print(f"Percentage of samples with mismatched categories: {percentage_mismatched:.2f}%")

Number of samples with mismatched categories: 13341605
Percentage of samples with mismatched categories: 18.31%


Since 18.31% of the videos carry a category different from the one assigned to the channel that uploaded them, it is worth running the frequency and consistency analyses twice: once grouped by channel category and once grouped by video category.

## Step 1: Channel Analysis

### Frequency Analysis

In [None]:
# Count the uploaded videos per video category and calendar year.
# video_data stores the category in the 'categories' column; it has no
# 'category' column, so grouping by 'category' raised KeyError. The key is
# renamed to 'category' in the result because that is the column name the
# plotting code reads.
# NOTE(review): this section is titled "Channel Analysis". To group by the
# channel's category instead, use channel_video_data with 'channel_category'
# -- confirm which grouping is intended.
freq_per_year = (
    video_data
    .groupby(['categories', 'year'])
    .size()
    .reset_index(name='video_count')
    .rename(columns={'categories': 'category'})
)

## Step 2: Consistency Analysis

In [None]:
# Days between consecutive uploads of the same channel. Rows are sorted by
# channel and upload date first so that diff() compares each video with the
# previous upload of that channel; a channel's first video has no predecessor
# and therefore gets NaN.
video_data = video_data.sort_values(by=['channel_id', 'upload_date'])
video_data['upload_interval'] = (
    video_data.groupby('channel_id')['upload_date'].diff().dt.days
)

# Consistency metric: mean and standard deviation of the upload intervals per
# video category and year. NaN intervals are skipped by mean/std.
# video_data stores the category in 'categories' (there is no 'category'
# column, so grouping by 'category' raised KeyError); the key is renamed to
# 'category' in the result. Named aggregation yields flat column names
# directly instead of overwriting a MultiIndex by position.
consistency = (
    video_data
    .groupby(['categories', 'year'], as_index=False)
    .agg(
        mean_interval=('upload_interval', 'mean'),
        std_interval=('upload_interval', 'std'),
    )
    .rename(columns={'categories': 'category'})
)

## Step 3: Timeseries Analysis

In [None]:
# Line chart of yearly upload counts, one line per category.
fig, ax = plt.subplots(figsize=(12, 6))
for cat_name in freq_per_year['category'].unique():
    yearly_counts = freq_per_year.loc[freq_per_year['category'] == cat_name]
    ax.plot(yearly_counts['year'], yearly_counts['video_count'], label=cat_name)

ax.set_title("Video Upload Frequency by Category Over Time")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Videos")
ax.legend()
plt.show()

In [None]:
# Line chart of the yearly standard deviation of upload intervals, one line
# per category.
fig, ax = plt.subplots(figsize=(12, 6))
for cat_name in consistency['category'].unique():
    yearly_spread = consistency.loc[consistency['category'] == cat_name]
    ax.plot(yearly_spread['year'], yearly_spread['std_interval'], label=cat_name)

ax.set_title("Consistency of Content Creation by Category Over Time")
ax.set_xlabel("Year")
ax.set_ylabel("Standard Deviation of Upload Intervals (Days)")
ax.legend()
plt.show()

## Step 4: Comparing the Three Analyses

Add any comparative metrics or additional aggregations as needed.

## Step 5: Some ML Algorithms