In [1]:
import numpy as np
import pandas as pd

import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.stats import diagnostic

from scipy import stats
import scipy.fft as sf

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

pd.options.plotting.backend = "plotly"

In [2]:
channels_df = pd.read_csv('data/df_channels_en.tsv.gz', sep='\t', compression='infer')
channels_df.sample(5)

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights
82043,Sports,2017-11-03,UCOoibxKsToPLr58kHzVhnDA,Barça World,27354,116,401130.0,6.9785
135393,Entertainment,2013-12-14,UCC-jLLbBcBY7wqb1G6-E5tg,Sisi Gaga,10700,224,939683.0,33.133
14640,Howto & Style,2008-09-27,UC0Cc-m348L1pyb1rlsCKf8Q,Evelina Forsell,453464,371,40723.0,3.3615
94568,Science & Technology,2018-02-26,UCZp2qyycTugXM9v4GRMv3hQ,Ubex AI,24700,44,501045.0,6.1375
117161,Gaming,2015-06-08,UC_G4uW-U0LcCsWis37D6xHg,"Hi,Its Vivian!",15200,73,710939.0,11.4745


In [3]:
timeseries_df = pd.read_csv('data/df_timeseries_en.tsv.gz', sep='\t', compression='infer')
timeseries_df['datetime'] = pd.to_datetime(timeseries_df['datetime'])
timeseries_df.head()

Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity
0,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-03,202494.6,0.0,650.222222,0.0,5,0,3
1,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-10,394085.7,191591.111111,1046.0,395.777778,6,1,1
2,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-17,835393.8,441308.083333,1501.5,455.5,6,0,1
3,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-24,1104577.0,269183.25,1750.0,248.5,6,0,0
4,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-31,1284406.0,179828.6,2008.3,258.3,6,0,0


In [4]:
def get_timeseries(channels, labels=None):
    sample_channels_df = channels_df[channels_df.name_cc.isin(channels)]
    if labels is not None:
        sample_channels_df['game_category'] = sample_channels_df['name_cc'].apply(lambda row: labels[row])
    sample_timeseries_df = pd.merge(left=timeseries_df, right=sample_channels_df)
    sample_timeseries_df = sample_timeseries_df.rename(columns={'name_cc': 'channel_name'})
    
    N = sample_channels_df.shape[0]
    dt = sample_timeseries_df.datetime.value_counts()
    valid_dt = dt[dt == N].index
    sample_timeseries_df = sample_timeseries_df[sample_timeseries_df.datetime.isin(valid_dt)]

    return sample_timeseries_df.sort_values(by='datetime')


In [10]:
mg = ['Klaus Gaming', 'Past Amazing', 'I JACK SPARROW COC', 'Galadon Gaming', 'Orange Juice Gaming']
gta = ['LispyJimmy', 'speedyw03', 'KjraGaming', 'XpertThief', 'DarkViperAU']
lol = ['Pianta', 'SkinSpotlights', 'KingStix', 'Trick2G', 'MagikarpUsedFly']
fifa = ['Miniminter', 'Castro1021', 'NepentheZ', 'AA9skillz', 'ZwebackHD']
cod = ['MrDalekJD', 'FaZe Clan', 'RaidAway', 'TheXclusiveAce', 'TheGamingRevolution']
fortnite = ['Ninja', 'Ali-A', 'Lachlan', 'LazarBeam', 'Tfue']
minecraft = ['DanTDM', 'PopularMMOs', 'SSundee', 'Logdotzip', 'Sky Does Everything']

channels = mg + gta + lol + fifa + cod + fortnite + minecraft
labels = np.array([['Mobile Games'] * 5 +
        ['GTA'] * 5 +
        ['LOL'] * 5 +
        ['FIFA'] * 5 +
        ['COD'] * 5 +
        ['Fortnite'] * 5 +
        ['Minecraft'] * 5]).flatten()

sample_df = get_timeseries(channels, dict(zip(channels,labels)))
sample_df.to_parquet('data/df_sample_timeseries.parquet')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_channels_df['game_category'] = sample_channels_df['name_cc'].apply(lambda row: labels[row])


In [11]:
mf_df = get_timeseries(['Marshmello', 'Fortnite'], labels={'Marshmello': 'Marshmello', 'Fortnite': 'Fortnite Official Channel'})
mf_df.to_parquet('data/mf_timeseries.parquet')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_channels_df['game_category'] = sample_channels_df['name_cc'].apply(lambda row: labels[row])


In [12]:
pubg_df = get_timeseries(['Ninja', 'Dynamo'], labels={'Ninja': 'Fortnite', 'Dynamo': 'PUBG'})
pubg_df.to_parquet('data/pubg_timeseries.parquet')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_channels_df['game_category'] = sample_channels_df['name_cc'].apply(lambda row: labels[row])


In [13]:
fortnite_df = get_timeseries(fortnite, {f: f for f in fortnite})
fortnite_df.to_parquet('data/fortnite_timeseries.parquet')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_channels_df['game_category'] = sample_channels_df['name_cc'].apply(lambda row: labels[row])
