# ***Preprocessing Notebook of the channels and time-series dataset***

>In this notebook, we take a hand-selected set of channels, each labeled by game type, and build a filtered version of the channels and time-series datasets. The filtered version contains only those channels and stores a **`game_category`** tag indicating the type of game each channel plays most frequently.

In [12]:
from pathlib import Path

import numpy as np
import pandas as pd

import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.stats import diagnostic

from scipy import stats
import scipy.fft as sf

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Route DataFrame/Series .plot() calls through plotly instead of matplotlib.
pd.set_option("plotting.backend", "plotly")

## ***Labels-Channels Dataset creation***

In [13]:
# Hand-curated mapping: game category -> channels labelled with that category.
# Keeping every channel next to its label removes the positional coupling of
# two parallel lists, where one added or removed name would silently shift
# every label that follows it.
channels_by_category = {
    'Mobile Games': ['Klaus Gaming', 'Past Amazing', 'I JACK SPARROW COC', 'Galadon Gaming',
                     'Orange Juice Gaming'],
    'Gaming News': ['IGN', 'The Game Theorists', 'League of Legends', 'Fortnite', 'PlayStation',
                    'Rockstar Games', 'Clash Royale', 'Clash of Clans'],
    'GTA': ['LispyJimmy', 'speedyw03', 'KjraGaming', 'XpertThief', 'DarkViperAU'],
    'LOL': ['Pianta', 'SkinSpotlights', 'KingStix', 'Trick2G', 'MagikarpUsedFly'],
    'FIFA': ['Miniminter', 'Castro1021', 'NepentheZ', 'AA9skillz', 'ZwebackHD', 'bateson87'],
    'COD': ['MrDalekJD', 'FaZe Clan', 'RaidAway', 'TheXclusiveAce', 'TheGamingRevolution'],
    'Fortnite': ['Ninja', 'Ali-A', 'Lachlan', 'LazarBeam', 'Tfue', 'BCC Trolling', 'Muselk',
                 'aLexBY11', 'Ceeday', 'Shroud', 'Fitz'],
    'Minecraft': ['DanTDM', 'PopularMMOs', 'SSundee', 'Logdotzip', 'Sky Does Everything',
                  'CaptainSparklez', 'TheSyndicateProject', 'stampylonghead', 'Mumbo Jumbo',
                  'Bajan Canadian', 'TheAtlanticCraft'],
}

# Flatten the mapping into two aligned columns (one row per channel).
# Dict insertion order is preserved, so rows come out grouped by category
# in the order written above.
channels_data = {
    'name_cc': [
        channel
        for channels in channels_by_category.values()
        for channel in channels
    ],
    'game_category': [
        category
        for category, channels in channels_by_category.items()
        for _ in channels
    ],
}

# Create the DataFrame: columns `name_cc` (channel name) and `game_category` (label)
gaming_channels_df = pd.DataFrame(channels_data)

# Each channel must carry exactly one label; a duplicated name means it was
# listed under two categories (or twice in one).
assert gaming_channels_df['name_cc'].is_unique, "a channel is listed more than once"

# Render the labelled channel table (name_cc, game_category) inline in the notebook
display(gaming_channels_df)


Unnamed: 0,name_cc,game_category
0,Klaus Gaming,Mobile Games
1,Past Amazing,Mobile Games
2,I JACK SPARROW COC,Mobile Games
3,Galadon Gaming,Mobile Games
4,Orange Juice Gaming,Mobile Games
5,IGN,Gaming News
6,The Game Theorists,Gaming News
7,League of Legends,Gaming News
8,Fortnite,Gaming News
9,PlayStation,Gaming News


In [14]:
# Count the channels in each category. Naming the index and the value column
# explicitly yields the columns ['game_category', 'count'] on every pandas
# version; before pandas 2.0 a bare reset_index() produced
# ['index', 'game_category'], so the y='count' lookup below would fail.
count_per_game = (
    gaming_channels_df['game_category']
    .value_counts()
    .rename_axis('game_category')
    .reset_index(name='count')
)

# The object returned by .plot() is used as a plotly Figure below
# (update_layout / show), not a matplotlib Axes, hence the name `fig`.
fig = count_per_game.plot(kind='bar', x='game_category', y='count', color='game_category')
fig.update_layout(
    title='Number of Channels per Game Category',
    xaxis_title='Game category',
    yaxis_title='Number of channels',
)
fig.show()

In [15]:
# Save the labelled channels as a parquet file.
# Create the target directory first so the write does not fail when `data/`
# does not exist yet (e.g. on a fresh checkout).
output_path = Path('data/gaming_channels_labelized.parquet')
output_path.parent.mkdir(parents=True, exist_ok=True)
gaming_channels_df.to_parquet(output_path)