In [150]:
import numpy as np
import pandas as pd

import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.stats import diagnostic

from scipy import stats
import scipy.fft as sf

import matplotlib.pyplot as plt


# 0 - ***Selecting the Hand-Picked Sample***

In [151]:
# Hand-picked gaming channels, grouped under the game category each one is labelled with.
# Keeping names next to their category avoids aligning two parallel lists by count.
# Insertion order matters: it fixes the row order of the resulting DataFrame.
channels_by_category = {
    'Mobile Games': ['Klaus Gaming', 'Past Amazing', 'I JACK SPARROW COC', 'Galadon Gaming',
                     'Orange Juice Gaming'],
    'Gaming News': ['IGN', 'The Game Theorists', 'League of Legends', 'Fortnite', 'PlayStation',
                    'Rockstar Games', 'Clash Royale', 'Clash of Clans'],
    'GTA': ['LispyJimmy', 'speedyw03', 'KjraGaming', 'XpertThief', 'DarkViperAU'],
    'LOL': ['Pianta', 'SkinSpotlights', 'KingStix', 'Trick2G', 'MagikarpUsedFly'],
    'FIFA': ['Miniminter', 'Castro1021', 'NepentheZ', 'AA9skillz', 'ZwebackHD', 'bateson87'],
    'COD': ['MrDalekJD', 'FaZe Clan', 'RaidAway', 'TheXclusiveAce', 'TheGamingRevolution'],
    'Fortnite': ['Ninja', 'Ali-A', 'Lachlan', 'LazarBeam', 'Tfue', 'BCC Trolling', 'Muselk',
                 'aLexBY11', 'Ceeday', 'Shroud', 'Fitz'],
    'Minecraft': ['DanTDM', 'PopularMMOs', 'SSundee', 'Logdotzip', 'Sky Does Everything',
                  'CaptainSparklez', 'TheSyndicateProject', 'stampylonghead', 'Mumbo Jumbo',
                  'Bajan Canadian', 'TheAtlanticCraft'],
}

# Flatten the mapping into two aligned columns: one row per channel.
channels_data = {
    'name_cc': [
        channel_name
        for channel_names in channels_by_category.values()
        for channel_name in channel_names
    ],
    'game_category': [
        category
        for category, channel_names in channels_by_category.items()
        for _ in channel_names
    ],
}

# Build the label table (columns: name_cc, game_category) and preview it.
sample_channels = pd.DataFrame(channels_data)
sample_channels.head()

Unnamed: 0,name_cc,game_category
0,Klaus Gaming,Mobile Games
1,Past Amazing,Mobile Games
2,I JACK SPARROW COC,Mobile Games
3,Galadon Gaming,Mobile Games
4,Orange Juice Gaming,Mobile Games


In [153]:
# Persist the hand-picked label table as a parquet file.
sample_channels_path = 'data/sample_channels.parquet'
sample_channels.to_parquet(sample_channels_path)

# 1 - ***Channels***

In [154]:
# Load the channel metadata table (tab-separated, gzip-compressed) and preview 5 random rows.
channels_path = 'data/df_channels_en.tsv.gz'
channels_df = pd.read_csv(
    channels_path,
    sep='\t',
    compression='infer',  # compression is deduced from the file extension
)
channels_df.sample(5)

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
85694,Comedy,2015-05-30,UCMBtpPE5yfcfQEst1bXE9Qw,AlphaMaleBlake,27500,128,427309.0,8.865
111650,Entertainment,2016-04-15,UCYesO8TBChbsbdSafIHHbPg,amare vine,15616,17,659557.0,8.0475
118090,Music,2017-01-02,UCnP69OBnhoGKXmb646JZG5Q,Damian Castillo,13700,72,722877.0,11.9615
112601,Music,2008-05-03,UCuRGV-VLazMp3xQf9YTd4uA,Vespa,15800,253,663976.0,9.0535
92522,Gaming,2011-11-26,UCo3WFJ_lboEYIpJdBfgnTng,Charlie Pryor,23800,1442,485372.0,10.123


In [155]:
# Reload the hand-picked label table from the file this notebook writes in section 0
# ('data/sample_channels.parquet'), so a fresh "Restart & Run All" does not depend on
# a parquet file that no cell in this notebook produces.
sample_channels = pd.read_parquet('data/sample_channels.parquet')
sample_channels.sample(5)

Unnamed: 0,name_cc,game_category
18,Pianta,LOL
31,RaidAway,COD
47,SSundee,Minecraft
43,Shroud,Fortnite
39,BCC Trolling,Fortnite


In [156]:
# Report how many channels are in the hand-picked label table.
n_sampled_channels = sample_channels.shape[0]
print(f'{n_sampled_channels} channels sampled')

56 channels sampled


In [157]:
# Attach the game_category label to the channel metadata, keeping only hand-picked channels.
# 'name_cc' is the only column the two tables share, so it is the join key.
sample_channels_df = channels_df.merge(sample_channels, on='name_cc', how='inner')
sample_channels_df.sample(5)

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights,game_category
24,Gaming,2014-10-26,UCZyxY8Q7xgUCXfFViWkjrSw,Ceeday,6480000,384,1007.0,2.087,Fortnite
3,Gaming,2006-09-13,UCYVinkwSX7szARULgYpvhLw,Ali-A,16500000,3263,196.0,2.087,Fortnite
48,Gaming,2012-09-18,UCaN1rig0bL7SUod2WN0P8XA,TheXclusiveAce,470964,2398,31459.0,3.142,COD
17,Gaming,2011-07-29,UCj5i58mCkAREDqFWlhaQbOw,stampylonghead,9270000,3470,623.0,2.087,Minecraft
43,Gaming,2008-12-22,UCJ47W_WzuzbHaONWB5a9i7w,bateson87,1330000,4885,11095.0,2.6335,FIFA


In [158]:
# Persist the merged table (channel metadata + game_category label).
# Fix: the original wrote `sample_channels` (the bare two-column label table) here,
# so the merged `sample_channels_df` was never saved despite the file name and message.
sample_channels_df.to_parquet('data/df_sample_channels.parquet')
print('Sampled channels df saved!')

Sampled channels df saved!


# 2 - ***Timeseries***

In [159]:
# Load the timeseries table (tab-separated, gzip-compressed).
timeseries_path = 'data/df_timeseries_en.tsv.gz'
timeseries_df = pd.read_csv(
    timeseries_path,
    sep='\t',
    compression='infer',  # compression is deduced from the file extension
)

# Convert the 'datetime' column from text to pandas datetimes so it can be compared and grouped.
timeseries_df['datetime'] = pd.to_datetime(timeseries_df['datetime'])
timeseries_df.head()

In [None]:
# Keep only the timeseries rows of the hand-picked channels and attach their name and
# game category. 'channel' (the channel id) is the only column shared by both sides.
channel_labels = sample_channels_df[['channel', 'name_cc', 'game_category']]
sample_timeseries_df = (
    timeseries_df
    .merge(channel_labels, on='channel', how='inner')
    .rename(columns={'name_cc': 'channel_name'})
)
sample_timeseries_df.head()

Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity,channel_name,game_category
0,UCpGdL9Sn3Q5YWUH2DVUW1Ug,Gaming,2016-10-10 00:00:00,6708559000.0,13590290.0,9150939.0,0.0,3255,0,29,PopularMMOs,Minecraft
1,UCpGdL9Sn3Q5YWUH2DVUW1Ug,Gaming,2016-10-17 00:00:00,6756017000.0,47457620.0,9224396.0,73457.375,3266,11,26,PopularMMOs,Minecraft
2,UCpGdL9Sn3Q5YWUH2DVUW1Ug,Gaming,2016-10-24 00:00:00,6802824000.0,46807210.0,9284667.0,60270.875,3280,14,29,PopularMMOs,Minecraft
3,UCpGdL9Sn3Q5YWUH2DVUW1Ug,Gaming,2016-10-30 23:00:00,6849790000.0,46965860.0,9333509.0,48841.865285,3292,12,31,PopularMMOs,Minecraft
4,UCpGdL9Sn3Q5YWUH2DVUW1Ug,Gaming,2016-11-06 23:00:00,6890571000.0,40780410.0,9375919.0,42410.384715,3304,12,28,PopularMMOs,Minecraft


In [None]:
# Number of hand-picked channels.
N = len(sample_channels)

# Row count per timestamp in the sampled timeseries.
dt = sample_timeseries_df['datetime'].value_counts()

# Timestamps with exactly N rows, i.e. one data point for every sampled channel.
# NOTE(review): assumes at most one row per channel per timestamp - confirm.
valid_dt = dt.index[(dt == N).to_numpy()]
len(valid_dt)

154

In [None]:
# Drop every row whose timestamp is not in valid_dt.
is_valid_datetime = sample_timeseries_df['datetime'].isin(valid_dt)
sample_timeseries_df = sample_timeseries_df.loc[is_valid_datetime]
sample_timeseries_df.shape

(8624, 12)

In [None]:
# Persist the filtered timeseries of the hand-picked channels.
sample_timeseries_path = 'data/df_sample_timeseries.parquet'
sample_timeseries_df.to_parquet(sample_timeseries_path)
print('Sampled timeseries df saved!')

Sampled timeseries df saved!
