In [127]:
import numpy as np
import pandas as pd

import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.stats import diagnostic

from scipy import stats
import scipy.fft as sf

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Route DataFrame.plot / Series.plot through plotly instead of the default backend.
pd.set_option("plotting.backend", "plotly")

# 1 - ***Channels***

In [128]:
# Channel metadata table (tab-separated; compression is inferred from the file extension).
channels_df = pd.read_csv(
    'data/df_channels_en.tsv.gz',
    compression='infer',
    sep='\t',
)
# Random preview of a few rows.
channels_df.sample(n=5)

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
125857,Gaming,2016-04-21,UCEk-ZgOUlbB4FDInMY14CMw,Keith Redacted,12000,45,807220.0,11.371
90946,Entertainment,2014-11-25,UCfFffiJALT8u5FeYFwF7HnQ,Brulas Reacts,25700,422,471386.0,7.6665
29457,Education,2017-05-07,UCD3KVjbb7aq2OiOffuungzw,DarkCode,168631,180,95834.0,3.716
78938,People & Blogs,2014-02-17,UCMz4CdBnsxbvMx7iDUOOLlg,Kyle Nicole,36237,48,379295.0,8.093
87749,Entertainment,2010-12-22,UCbYfeZ7gvIClUzbDcJadFTg,DentalTech1000,26900,72,443939.0,7.51


In [129]:
# Hand-labelled sample of gaming channels: channel name (name_cc) and its game_category.
sample_channels = pd.read_parquet(path='data/gaming_channels_labelized.parquet')
# Random preview of a few rows.
sample_channels.sample(n=5)

Unnamed: 0,name_cc,game_category
33,TheGamingRevolution,COD
5,IGN,Gaming News
21,Trick2G,LOL
29,MrDalekJD,COD
7,League of Legends,Gaming News


In [130]:
# Report how many rows the labelled sample contains.
print(f'{sample_channels.shape[0]} channels sampled')

56 channels sampled


In [131]:
# Attach the game_category label to the channel metadata.
# The join key is spelled out on purpose: without `on=`, pandas joins on every
# column name the two frames happen to share, so an extra shared column in either
# input file would silently change which rows match.
sample_channels_df = pd.merge(
    channels_df,
    sample_channels,
    how='inner',
    on='name_cc',
)
# Random preview of the merged rows.
sample_channels_df.sample(5)

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights,game_category
47,Gaming,2013-06-15,UCaMi81Bt9geDCcow-hHrP_Q,RaidAway,773000,1888,20149.0,2.869,COD
38,Gaming,2011-03-08,UCL7vy7MDOq9-tE-r6taQBlw,AA9skillz,1850000,3273,7384.0,2.535,FIFA
24,Gaming,2014-10-26,UCZyxY8Q7xgUCXfFViWkjrSw,Ceeday,6480000,384,1007.0,2.087,Fortnite
18,Gaming,2008-02-17,UCWZmCMB7mmKWcXJSIPRhzZw,Miniminter,8210000,1659,734.0,2.087,FIFA
37,Gaming,2009-07-23,UCMUP9j-QidC_S0KLlO06uCg,MrDalekJD,2190000,3145,6036.0,2.4615,COD


In [132]:
# Persist the merged frame (channel metadata + game_category), not `sample_channels`:
# the latter is just the two-column label table loaded from
# 'data/gaming_channels_labelized.parquet', so writing it back out would drop the
# channel ids and statistics gathered by the merge.
sample_channels_df.to_parquet('data/df_sample_channels.parquet')
print('Sampled channels df saved!')

Sampled channels df saved!


# 2 - ***Timeseries***

In [133]:
# Load the timeseries table and convert its `datetime` column from text to
# pandas datetimes in a single chained expression.
timeseries_df = pd.read_csv(
    'data/df_timeseries_en.tsv.gz',
    compression='infer',
    sep='\t',
).assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
timeseries_df.head()

Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity
0,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-03,202494.6,0.0,650.222222,0.0,5,0,3
1,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-10,394085.7,191591.111111,1046.0,395.777778,6,1,1
2,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-17,835393.8,441308.083333,1501.5,455.5,6,0,1
3,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-24,1104577.0,269183.25,1750.0,248.5,6,0,0
4,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-31,1284406.0,179828.6,2008.3,258.3,6,0,0


In [134]:
# Keep only the timeseries rows of the sampled channels and attach their name and
# game_category. The key (`channel`) and join type are explicit so the result does
# not depend on which column names the two frames happen to share.
sample_timeseries_df = pd.merge(
    left=timeseries_df,
    right=sample_channels_df[['channel', 'name_cc', 'game_category']],
    how='inner',
    on='channel',
)
# `name_cc` is exposed as `channel_name` in the timeseries frame.
sample_timeseries_df = sample_timeseries_df.rename(columns={'name_cc': 'channel_name'})
sample_timeseries_df.head()

Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity,channel_name,game_category
0,UCpGdL9Sn3Q5YWUH2DVUW1Ug,Gaming,2016-10-10 00:00:00,6708559000.0,13590290.0,9150939.0,0.0,3255,0,29,PopularMMOs,Minecraft
1,UCpGdL9Sn3Q5YWUH2DVUW1Ug,Gaming,2016-10-17 00:00:00,6756017000.0,47457620.0,9224396.0,73457.375,3266,11,26,PopularMMOs,Minecraft
2,UCpGdL9Sn3Q5YWUH2DVUW1Ug,Gaming,2016-10-24 00:00:00,6802824000.0,46807210.0,9284667.0,60270.875,3280,14,29,PopularMMOs,Minecraft
3,UCpGdL9Sn3Q5YWUH2DVUW1Ug,Gaming,2016-10-30 23:00:00,6849790000.0,46965860.0,9333509.0,48841.865285,3292,12,31,PopularMMOs,Minecraft
4,UCpGdL9Sn3Q5YWUH2DVUW1Ug,Gaming,2016-11-06 23:00:00,6890571000.0,40780410.0,9375919.0,42410.384715,3304,12,28,PopularMMOs,Minecraft


In [135]:
# N: number of rows in the labelled sample, used as the expected row count per timestamp.
N = len(sample_channels)
# dt: how many timeseries rows exist for each distinct timestamp.
dt = sample_timeseries_df['datetime'].value_counts()
# valid_dt: timestamps whose row count equals N exactly.
valid_dt = dt.loc[dt.eq(N)].index
len(valid_dt)

154

In [136]:
# Drop every row whose timestamp is not in `valid_dt`.
sample_timeseries_df = sample_timeseries_df.loc[
    sample_timeseries_df['datetime'].isin(valid_dt)
]
sample_timeseries_df.shape

(8624, 12)

In [137]:
# Write the filtered timeseries frame to parquet.
sample_timeseries_df.to_parquet(path='data/df_sample_timeseries.parquet')
print('Sampled timeseries df saved!')

Sampled timeseries df saved!
