In [1]:
import numpy as np
import pandas as pd

import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.stats import diagnostic

from scipy import stats
import scipy.fft as sf

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

pd.options.plotting.backend = "plotly"

In [2]:
channels_df = pd.read_csv('data/df_channels_en.tsv.gz', sep='\t', compression='infer')
channels_df.sample(5)

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
10820,Entertainment,2016-09-02,UCLbJsr8-afJDyMKy30YQ_uQ,DC SQUAAD,627000,813,28607.0,2.9785
62431,Gaming,2015-02-01,UCJIYgIBaQt0YcGO73VCLPyg,,57000,180,266879.0,6.148
30319,Film and Animation,2014-07-30,UC8IHAQMuiJdY6ALuhG7iU8Q,FilmRise Movies,94900,90,99136.0,3.692
6484,Entertainment,2006-11-07,UCzofNVHFCdD_4Jxs5dVqtAA,TEAMSUPERTRAMP,970000,486,15912.0,2.8155
84159,Film and Animation,2011-10-08,UCxUaG88UPVP_0fc9g_shnLQ,LydiaColdGem,28025,27,418182.0,6.172


In [3]:
timeseries_df = pd.read_csv('data/df_timeseries_en.tsv.gz', sep='\t', compression='infer')
timeseries_df['datetime'] = pd.to_datetime(timeseries_df['datetime'])
timeseries_df.head()

Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity
0,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-03,202494.6,0.0,650.222222,0.0,5,0,3
1,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-10,394085.7,191591.111111,1046.0,395.777778,6,1,1
2,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-17,835393.8,441308.083333,1501.5,455.5,6,0,1
3,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-24,1104577.0,269183.25,1750.0,248.5,6,0,0
4,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-31,1284406.0,179828.6,2008.3,258.3,6,0,0


In [4]:
def get_timeseries(channels, labels=None):
    sample_channels_df = channels_df[channels_df.name_cc.isin(channels)]
    if labels is not None:
        sample_channels_df['game_category'] = sample_channels_df['name_cc'].apply(lambda row: labels[row])
    sample_timeseries_df = pd.merge(left=timeseries_df, right=sample_channels_df)
    sample_timeseries_df = sample_timeseries_df.rename(columns={'name_cc': 'channel_name'})
    
    N = sample_channels_df.shape[0]
    dt = sample_timeseries_df.datetime.value_counts()
    valid_dt = dt[dt == N].index
    sample_timeseries_df = sample_timeseries_df[sample_timeseries_df.datetime.isin(valid_dt)]

    return sample_timeseries_df.sort_values(by='datetime')


In [5]:
mg = ['Klaus Gaming', 'Past Amazing', 'I JACK SPARROW COC', 'Galadon Gaming', 'Orange Juice Gaming']
gta = ['LispyJimmy', 'speedyw03', 'KjraGaming', 'XpertThief', 'DarkViperAU']
lol = ['Pianta', 'SkinSpotlights', 'KingStix', 'Trick2G', 'MagikarpUsedFly']
fifa = ['Miniminter', 'Castro1021', 'NepentheZ', 'AA9skillz', 'ZwebackHD']
cod = ['MrDalekJD', 'FaZe Clan', 'RaidAway', 'TheXclusiveAce', 'TheGamingRevolution']
fortnite = ['Ninja', 'Ali-A', 'Lachlan', 'LazarBeam', 'Tfue']
minecraft = ['DanTDM', 'PopularMMOs', 'SSundee', 'Logdotzip', 'Sky Does Everything']

channels = mg + gta + lol + fifa + cod + fortnite + minecraft
labels = np.array([['Mobile Games'] * 5 +
        ['GTA'] * 5 +
        ['LOL'] * 5 +
        ['FIFA'] * 5 +
        ['COD'] * 5 +
        ['Fortnite'] * 5 +
        ['Minecraft'] * 5]).flatten()

sample_df = get_timeseries(channels, dict(zip(channels,labels)))
sample_df.to_parquet('data/df_sample_timeseries.parquet')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_channels_df['game_category'] = sample_channels_df['name_cc'].apply(lambda row: labels[row])


In [6]:
mf_df = get_timeseries(['Marshmello', 'Fortnite'], labels={'Marshmello': 'Marshmello', 'Fortnite': 'Fortnite Official Channel'})
mf_df.to_parquet('data/mf_timeseries.parquet')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_channels_df['game_category'] = sample_channels_df['name_cc'].apply(lambda row: labels[row])


In [7]:
pubg_df = get_timeseries(['Ninja', 'Dynamo'], labels={'Ninja': 'Fortnite', 'Dynamo': 'PUBG'})
pubg_df.to_parquet('data/pubg_timeseries.parquet')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_channels_df['game_category'] = sample_channels_df['name_cc'].apply(lambda row: labels[row])


In [8]:
pubg_df

Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity,category_cc,join_date,channel_name,subscribers_cc,videos_cc,subscriber_rank_sb,weights,game_category
0,UC7SDsqJba5428-EOBZWOn3w,Entertainment,2016-10-10,6.480921e+06,5.599125e+03,1.640824e+05,0.000000,31,0,0,Entertainment,2011-07-09,Dynamo,256000,58,65935.0,3.999,PUBG
155,UCAW-NpUFkMyCNrvRSSGIvDQ,Gaming,2016-10-10,7.047610e+06,9.470625e+03,5.225712e+04,0.000000,278,0,0,Gaming,2011-11-11,Ninja,22400000,929,104.0,2.087,Fortnite
156,UCAW-NpUFkMyCNrvRSSGIvDQ,Gaming,2016-10-17,7.061097e+06,1.348738e+04,5.252300e+04,265.875000,278,0,0,Gaming,2011-11-11,Ninja,22400000,929,104.0,2.087,Fortnite
1,UC7SDsqJba5428-EOBZWOn3w,Entertainment,2016-10-17,6.495188e+06,1.426712e+04,1.651748e+05,1092.375000,31,0,0,Entertainment,2011-07-09,Dynamo,256000,58,65935.0,3.999,PUBG
2,UC7SDsqJba5428-EOBZWOn3w,Entertainment,2016-10-24,6.512070e+06,1.688144e+04,1.667127e+05,1537.908031,31,0,0,Entertainment,2011-07-09,Dynamo,256000,58,65935.0,3.999,PUBG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,UC7SDsqJba5428-EOBZWOn3w,Entertainment,2019-09-09,7.743163e+06,8.664875e+03,2.565035e+05,0.000000,54,0,0,Entertainment,2011-07-09,Dynamo,256000,58,65935.0,3.999,PUBG
308,UCAW-NpUFkMyCNrvRSSGIvDQ,Gaming,2019-09-16,1.941802e+09,2.897669e+06,2.240384e+07,0.000000,925,1,3,Gaming,2011-11-11,Ninja,22400000,929,104.0,2.087,Fortnite
153,UC7SDsqJba5428-EOBZWOn3w,Entertainment,2019-09-16,7.751091e+06,7.928375e+03,2.563045e+05,0.000000,54,0,0,Entertainment,2011-07-09,Dynamo,256000,58,65935.0,3.999,PUBG
154,UC7SDsqJba5428-EOBZWOn3w,Entertainment,2019-09-23,7.776039e+06,2.494761e+04,2.561429e+05,0.000000,54,0,0,Entertainment,2011-07-09,Dynamo,256000,58,65935.0,3.999,PUBG


In [9]:
fortnite_df = get_timeseries(fortnite, {f: f for f in fortnite})
fortnite_df.to_parquet('data/fortnite_timeseries.parquet')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_channels_df['game_category'] = sample_channels_df['name_cc'].apply(lambda row: labels[row])
