In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import feather
import plotly.subplots as sp

from tqdm import tqdm
import plotly.graph_objects as go
import plotly.io as pio
from scipy import stats
import scipy.fft as sf

import networkx as nx
from ipywidgets import interact, IntSlider
import math

# First Step - Introduction and quick timeseries overview

In [24]:
df_channels = pd.read_csv( "data/df_channels_en.tsv.gz", sep="\t")
df_timeseries = pd.read_csv("data/df_timeseries_en.tsv.gz", sep="\t")

In [25]:
color_palette = px.colors.qualitative.Set2

In [26]:
# Visualization 1: Line graph showing the year-on-year growth of gaming channels, videos, and viewership
# Preprocess the 'df_timeseries' for year-on-year analysis
# Example code for preprocessing
df_timeseries['datetime'] = pd.to_datetime(df_timeseries['datetime'])
df_timeseries['month_year'] = df_timeseries['datetime'].dt.to_period('Q')

# Example aggregation for monthly data
monthly_stats = df_timeseries.groupby('month_year').agg({
    'channel': 'nunique', 
    'videos': 'sum', 
    'views': 'sum'
}).reset_index()

# Creating the subplots
fig1 = make_subplots(rows=1, cols=3, subplot_titles=('Channels', 'Videos', 'Views'))

# Adding traces
fig1.add_trace(go.Scatter(x=monthly_stats['month_year'].astype(str), y=monthly_stats['channel'], mode='lines', name='Channels'), row=1, col=1)
fig1.add_trace(go.Scatter(x=monthly_stats['month_year'].astype(str), y=monthly_stats['videos'], mode='lines', name='Videos'), row=1, col=2)
fig1.add_trace(go.Scatter(x=monthly_stats['month_year'].astype(str), y=monthly_stats['views'], mode='lines', name='Views'), row=1, col=3)

# Updating layout
fig1.update_layout(title='Quarterly Growth of Youtube communities Channels, Videos, and Viewership on YouTube', showlegend=False)
fig1.update_xaxes(title_text='Month-Year', row=3, col=1)
fig1.update_yaxes(title_text='Count')

# Display the figure
fig1.show()

fig1.write_html("quarterly_growth_gaming.html")

In [27]:
# Visualization 2: Demographic pie charts and engagement bar graphs
# For this visualization, assuming 'category_cc' represents demographics and 'subscribers_cc' represents engagement
# Calculating demographics and engagement
demographics = df_channels['category_cc'].value_counts()
engagement = df_channels.groupby('category_cc')['subscribers_cc'].sum()

extended_color_palette = px.colors.qualitative.Set2 + px.colors.qualitative.Pastel1 + px.colors.qualitative.Dark2

# Trimming or extending the palette to match the number of categories
if len(extended_color_palette) > len(demographics.index):
    extended_color_palette = extended_color_palette[:len(demographics.index)]
elif len(extended_color_palette) < len(demographics.index):
    extended_color_palette.extend(px.colors.qualitative.Plotly[len(extended_color_palette) - len(demographics.index):])

# Matching the extended, more varied color palette to the categories
colors = {category: extended_color_palette[i] for i, category in enumerate(demographics.index)}

# Recreating the pie chart and bar graph with the new color palette
fig2 = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'bar'}]])

fig2.add_trace(
    go.Pie(
        labels=demographics.index, 
        values=demographics.values, 
        name='Demographics', 
        marker=dict(colors=[colors[label] for label in demographics.index])
    ), 
    1, 1
)

fig2.add_trace(
    go.Bar(
        x=engagement.index, 
        y=engagement.values, 
        name='Engagement', 
        marker=dict(color=[colors[label] for label in engagement.index])
    ), 
    1, 2
)

fig2.update_layout(title='Demographic Distribution and Engagement in Youtube communities')

fig2.show()
fig2.write_html("demographics_engagement_gaming.html")

In [28]:
df = df_timeseries[df_timeseries['category'] == 'Gaming']
channels = df_channels
pewdiepie_id = 'UC-lHJZR3Gqxm24_Vd_AJ5Yw'
markiplier_id = 'UC7_YxT-KID8kRbqZo7MyscQ'
jacksepticeye_id = 'UCYzPXprvl5Y-Sf0g4vX-m6g'
vanossgaming_id = 'UCq-fj5jknLsUf-MWSy4_brA'
df = df[df.channel.isin([pewdiepie_id, markiplier_id, jacksepticeye_id, vanossgaming_id])]
df.loc[df.channel == pewdiepie_id, 'channel'] = 'PewDiePie'
df.loc[df.channel == markiplier_id, 'channel'] = 'Markiplier'
df.loc[df.channel == jacksepticeye_id, 'channel'] = 'jacksepticeye'

In [29]:
timeseries = df[df.datetime >= '2016-01-01']
timeseries.head()
color_palette = px.colors.qualitative.Set2
fig1 = px.line(timeseries, x='datetime', y='delta_views', color='channel', title='Delta Views For Big Gaming Channels', 
                   color_discrete_sequence=color_palette  # Set the color palette
)
fig1.update_xaxes(title='Datetime')
fig1.update_yaxes(title='Delta Views')

# Create a figure for Delta Subs
fig2 = px.line(timeseries, x='datetime', y='delta_subs', color='channel', title='Delta Subs For Big Gaming Channels',
                   color_discrete_sequence=color_palette ) # Set the color palette)
fig2.update_xaxes(title='Datetime')
fig2.update_yaxes(title='Delta Subs')

# Display the figures
fig1.show()
fig2.show()

# Part 2 - analysing the comments

LOAD DATASETS

In [2]:

COMMENTS = 'DATA/youtube_comments.tsv.gz'
HELPERMD = 'data/yt_metadata_helper.feather'
METADATA = 'data/yt_metadata_en.jsonl.gz'
MERGEDHELPERCOMM = 'data/mergedmetadatacomments.csv'
MERGEDTAGSCOMM = 'data/finalgamingtagcomment.parquet'
METADATAFILTERED = 'data/allmtdt.parquet'

In [3]:
df_helper = feather.read_dataframe(HELPERMD)
merged_commtags = pd.read_parquet(MERGEDTAGSCOMM)


In [6]:
df_merged = pd.read_csv(MERGEDHELPERCOMM)


In [4]:
num_unique_authors = merged_commtags['author'].nunique()
print(f"The number of unique authors in mergedmetadatacomments is: {num_unique_authors}")
vids = merged_commtags['display_id'].nunique()
print(f"The number of unique videos in mergedmetadatacomments is: {vids}")
print(f"The number of comments in mergedmetadatacomments is: {len(merged_commtags)}")


The number of unique authors in mergedmetadatacomments is: 1566342
The number of unique videos in mergedmetadatacomments is: 3538946
The number of comments in mergedmetadatacomments is: 25658704


In [9]:
filtered_vids = df_helper

category_counts = filtered_vids['categories'].value_counts().reset_index()
category_counts.columns = ['Category', 'Count']

fig = px.pie(category_counts, values='Count', names='Category', title='Distribution of Videos by Category')
fig.update_traces(textposition='inside', textinfo='percent+label')

html_file = "category_counts_pie_chart.html"
fig.write_html(html_file)
pio.show(fig)

print(f"The pie chart has been saved as {html_file}")

The pie chart has been saved as category_counts_pie_chart.html


Show distribution of comments per game for some authors

In [8]:
df_mergednew = df_merged.loc[:, ['display_id', 'author', 'replies','likes','categories']]
df_author_categories = df_mergednew.groupby(['author', 'categories']).size().reset_index(name='count')
df_author_categories['count_sum'] = df_author_categories.groupby('author')['count'].transform('sum')
df_author_categories['count_ratio'] = df_author_categories['count'] / df_author_categories['count_sum']

df_author_categories = df_author_categories.drop(columns=[ 'count_sum'])
categories_to_keep = ['Gaming', 'Music', 'News & Politics', 'Science & Technology', 'Sports','Howto & Style']
df_author_categories_filtered = df_author_categories[df_author_categories['categories'].isin(categories_to_keep)]

colors = {'call of duty':'black','fifa': 'blue', 'pes': 'green', 'fortnite': 'red', 'league of legends': 'orange', 'minecraft': 'purple'}

#store in each tage the conns related to it
frames = {tag: merged_commtags[merged_commtags['tags'].str.contains(tag, case=False)] for tag in tqdm(colors)}


author_counts = merged_commtags['author'].value_counts().sort_values(ascending=False)
sorted_authors = author_counts.index.tolist()
authors = sorted_authors[:30]

results = {}

# Loop through each category
for tag, color in tqdm(colors.items()):
    # Filter the data to keep only rows with the current category
    df = frames[tag]
    # Count the number of occurrences of each author
    counts = df['author'].value_counts()
    # Keep only the counts for the specified authors
    counts = counts.reindex(authors, fill_value=0)
    # Store the counts in the results dictionary
    results[tag] = counts



# Create an empty dataframe to store the results
df = pd.DataFrame(columns=colors.keys(), index=authors)

# Loop through each author
for author in authors:
    # Loop through each category
    for tag in colors.keys():
        # Get the count for the current author and category
        count = results[tag][author]
        # Add the count to the dataframe
        df.at[author, tag] = count

for index, row in df.iterrows():
    # Divide each value in the row by the sum of the row
    row_sum = row.sum()
    df.loc[index] = row / row_sum


# Colors for the plot
colorss = ['blue', 'green', 'red', 'purple', 'orange', 'pink']
df.index = df.index.astype(str)  # Convert index to string to treat as categorical data

fig = go.Figure()

for tag in df.columns:
    fig.add_trace(go.Bar(
        x=df.index,
        y=df[tag],
        name=tag
    ))

fig.update_layout(
    barmode='stack',
    title='Fraction of Comments in Each Tag per Author',
    xaxis=dict(title='Author', type='category'),  # Set x-axis type to category
    yaxis=dict(title='Fraction of Comments'),
    legend_title_text='Tags'
)
fig.show()

# Save the plot as an HTML file
html_file = "fract of comm in each tag.html"
pio.write_html(fig, file=html_file)



0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
100%|██████████| 6/6 [03:25<00:00, 34.33s/it]
100%|██████████| 6/6 [00:00<00:00, 45.32it/s]


In [10]:

# Define the tags to consider
tags_to_consider = ['call of duty', 'fifa', 'grand theft auto', 'fortnite', 'league of legends', 'minecraft']

# Filter the comments based on the specified tags
filtered_comments = merged_commtags[merged_commtags['tags'].str.contains('|'.join(tags_to_consider), case=False)]

# Transform the tags column to only contain the specific tag
filtered_comments['tags'] = filtered_comments['tags'].apply(lambda x: [tag for tag in tags_to_consider if tag.lower() in x.lower()][0])

# Group the filtered comments by tag and count the number of comments for each tag
tag_counts = filtered_comments['tags'].value_counts().reset_index()
tag_counts.columns = ['Tag', 'Count']

# Plotting using Plotly
fig = px.pie(tag_counts, values='Count', names='Tag', title='Fraction of Comments for Each Tag')
fig.update_traces(textposition='inside', textinfo='percent+label')

# Exporting to HTML
html_file = "tag_counts_pie_chart.html"
fig.write_html(html_file)

# Display the pie chart in the notebook
pio.show(fig)

print(f"The pie chart has been saved as {html_file}")




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



The pie chart has been saved as tag_counts_pie_chart.html


In [1]:
# Define the tags to plot
tags = ['call of duty', 'fifa', 'pes', 'fortnite', 'league of legends', 'minecraft']
usefuld = merged_commtags[merged_commtags['upload_date']>='2018-01-01']

# Create subplots
fig = sp.make_subplots(rows=3, cols=2, subplot_titles=[f'{tag} Videos' for tag in tags])

# Loop through each tag
for i, tag in enumerate(tags):
    # Filter the data
    a = usefuld[usefuld['tags'].str.contains(tag, case=False)]
    a = a.sort_values(by='upload_date')
    a['upload_date'] = pd.to_datetime(a['upload_date'])
    
    # Create histogram trace
    hist_trace = go.Histogram(x=a['upload_date'], nbinsx=50, name=tag)
    
    # Add trace to the subplot
    fig.add_trace(hist_trace, row=(i // 2) + 1, col=(i % 2) + 1)

# Update layout
fig.update_layout(height=600, width=900, title_text="Number of comments per Interval of Upload Date in Different Tags Videos")
fig.update_xaxes(title_text="Comment Date")
fig.update_yaxes(title_text="# Comments")
pio.show(fig)
# Save the plot as an HTML file
pio.write_html(fig, file='nbcommpertag.html')


In [4]:
# Define the tags to plot
tags = [ 'fortnite','minecraft']
usefuld = merged_commtags[merged_commtags['upload_date']>='2018-01-01']

# Create subplots
fig = sp.make_subplots(rows=1, cols=2, subplot_titles=[f'{tag} Videos' for tag in tags])

# Loop through each tag
for i, tag in enumerate(tags):
    # Filter the data
    a = usefuld[usefuld['tags'].str.contains(tag, case=False)]
    a = a.sort_values(by='upload_date')
    a['upload_date'] = pd.to_datetime(a['upload_date'])
    
    # Create histogram trace
    hist_trace = go.Histogram(x=a['upload_date'], nbinsx=50, name=tag)
    
    # Add trace to the subplot
    fig.add_trace(hist_trace, row=(i // 2) + 1, col=(i % 2) + 1)

# Update layout
fig.update_layout(height=400, width=900, title_text="Number of comments per Interval of Upload Date in Different Tags Videos")
fig.update_xaxes(title_text="Comment Date")
fig.update_yaxes(title_text="# Comments")
pio.show(fig)
# Save the plot as an HTML file
pio.write_html(fig, file='fortnite_minecraft.html')


In [11]:
merged_commtags['id'] = range(1, len(merged_commtags) + 1)

# Filter the comments based on the specified conditions
filtered_comments = merged_commtags[(merged_commtags['tags'].str.contains('fortnite', case=False))]

first_comment_dates = filtered_comments.groupby('author').agg({'upload_date': 'min', 'id': 'first'})

merged_comments = pd.merge(filtered_comments, first_comment_dates, on=['id'], how='inner')

# Calculate the number of authors in mergedcommtags that appear in merged_comments
common_authors = len(set(merged_commtags['author']).intersection(set(merged_comments['author'])))

# Calculate the number of authors in mergedcommtags that do not appear in merged_comments
unique_authors = len(set(merged_commtags['author'])) - common_authors

# Create a pie chart
labels = ['1st Comm. in Fortnite', '1st Comm. Not in Fortnite']
values = [common_authors, unique_authors]
colors = ['blue', 'red']

fig = go.Figure(data=[go.Pie(labels=labels, values=values, marker=dict(colors=colors))])
fig.update_layout(title='Authors origin distribution in the Fortnite community', showlegend=True)

# Add legend for colors
fig.update_traces(marker=dict(colors=colors), showlegend=True)

# Save the plot as an HTML file
html_file = "authors_fraction_firstcomm_fornite.html"
pio.write_html(fig, file=html_file)

fig.show()

In [None]:
# Define the tags to plot
tags = ['virtual reality']
usefuld = merged_commtags[merged_commtags['upload_date']>='2018-01-01']

# Create subplots
fig = sp.make_subplots(rows=1, cols=1, subplot_titles=[f'{tag} Videos' for tag in tags])

# Loop through each tag
for i, tag in enumerate(tags):
    # Filter the data
    a = usefuld[usefuld['tags'].str.contains(tag, case=False)]
    a = a.sort_values(by='upload_date')
    a['upload_date'] = pd.to_datetime(a['upload_date'])
    
    # Create histogram trace
    hist_trace = go.Histogram(x=a['upload_date'], nbinsx=50, name=tag)
    
    # Add trace to the subplot
    fig.add_trace(hist_trace, row=(i // 2) + 1, col=(i % 2) + 1)

# Update layout
fig.update_layout(height=600, width=900, title_text="Number of comments per Interval of Upload Date in VR vids")
fig.update_xaxes(title_text="Comment Date")
fig.update_yaxes(title_text="# Comments")
pio.show(fig)
# Save the plot as an HTML file
pio.write_html(fig, file='plotvirtualreality.html')


# Part 3 - In depth analysis of Fortnite

In [101]:
import numpy as np
import pandas as pd

import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.stats import diagnostic

from scipy import stats
import scipy.fft as sf

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

import pandas as pd
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt
from ipywidgets import interact, IntSlider
import math

pd.options.plotting.backend = "plotly" 

In [102]:
def ft(arr):
    N = arr.shape[0]

    # Fourier transform
    fft = np.abs(sf.rfft(arr))/N
    freqs = sf.rfftfreq(N, d=0.1)[::-1]

    return freqs, fft

# ***Timeseries***

### *Sample Analysis*

In [103]:
sample_df = pd.read_parquet('data/df_sample_timeseries.parquet')
sample_df.shape

(5390, 18)

In [104]:
def group_by_game(df):
    grouped_df = df.groupby(['game_category', 'datetime']).mean().reset_index().groupby(['game_category'])
    games = list(grouped_df.groups.keys())
    return games, grouped_df


def timeseries_correlations(df, groups, feature, verbose=False):
    corrs = np.zeros(shape=(len(groups),len(groups)))
    for i, (category_a, category_a_df) in enumerate(df):
        for j, (category_b, category_b_df) in enumerate(df):   
            corr = stats.pearsonr(category_a_df[feature], category_b_df[feature])
            corrs[i,j] = round(corr[0], 2)
            if verbose:
                print('-'*30)
                print(f"{category_a}-{category_b}")
                print(f"Delta subs Corr. : {corr}")
    return corrs

In [105]:
def remove_games(df, games):
    if type(games) == str:
        games = [games]
    return df[~df.game_category.isin(games)]

def select_channels(df, channels):
    if type(channels) == str:
        channels = [channels]
    return df[df.channel_name.isin(channels)]

In [106]:
# Fortnite Release Date
F_date = '2017-07-21'

sample_df = remove_games(sample_df, 'Gaming News')

# Samples
sample_beforeF = remove_games(sample_df, 'Fortnite')[sample_df.datetime < F_date]
sample_withF = sample_df[sample_df.datetime >= F_date]

games_beforeF, sample_beforeF = group_by_game(sample_beforeF)
games_withF, sample_withF = group_by_game(sample_withF)


Boolean Series key will be reindexed to match DataFrame index.



##### ***Before Fortnite***

In [107]:
# Create a color palette for each category
colors = dict(zip(games_withF, px.colors.qualitative.Plotly[:len(games_withF)]))

In [108]:
fig = go.Figure()

for (game, game_df) in sample_beforeF:
    fig.add_trace(go.Scatter(
        x=game_df['datetime'],
        y=game_df['delta_views'],
        mode='lines',
        name=game,
        line=dict(color=colors[game]),
    ))

fig.update_layout(
        title=f"Main Games: Mean Weekly Delta Views Timeseries Per Game",
        xaxis_title='Date',
        yaxis_title='Delta Views',
        showlegend=True
    )

fig.write_html(f"plots/dviews_beforeF.html")

fig.show()

In [109]:
fig = go.Figure()

for (game, game_df) in sample_withF:
    fig.add_trace(go.Scatter(
        x=game_df['datetime'],
        y=game_df['delta_views'],
        mode='lines',
        name=game,
        line=dict(color=colors[game]),
    ))

max_y = 25_000_000
br_out = pd.to_datetime('2017-09-26')

# Add a vertical line for Fortnite Battle-Royale Mode
fig.add_vline(
        x=br_out,
        line_dash='dash',
        line_color='grey',
    )
fig.add_annotation(
        x=br_out, 
        y=max_y*0.8,
        text="Battle-Royale Mode Out",
        font=dict(color='grey', size=10),
        showarrow=False,
        xshift=-10,
        textangle=-90
    )

# Plot Titles & Axis Settings
fig.update_layout(
        title=f"Main Games: Mean Weekly Delta Views Timeseries Per Game",
        xaxis_title='Date',
        yaxis_title='Delta Views',
        showlegend=True
    )

fig.write_html(f"plots/dviews_withF.html")

fig.show()

In [110]:
sample_withF['delta_views'].describe().T

game_category,COD,FIFA,Fortnite,GTA,LOL,Minecraft,Mobile Games
count,114.0,114.0,114.0,114.0,114.0,114.0,114.0
mean,1304813.0,3054835.0,19892140.0,4061399.0,1473028.0,22004490.0,879616.0
std,623199.7,750682.8,10783920.0,715877.2,376969.3,4221552.0,364275.5
min,431852.7,1961598.0,1772967.0,2357766.0,786062.3,12892360.0,349793.1
25%,829062.8,2560916.0,12597560.0,3688860.0,1225581.0,19123730.0,596659.2
50%,1069899.0,2891925.0,21541530.0,4144359.0,1419759.0,21442680.0,845819.9
75%,1731089.0,3334220.0,26838180.0,4583120.0,1634407.0,23853710.0,1051523.0
max,3346936.0,6698777.0,42411450.0,5459258.0,3214658.0,34672880.0,1972783.0


In [111]:
fig = px.imshow(timeseries_correlations(sample_beforeF, games_beforeF, 'delta_views'), 
                x=list(games_beforeF),
                y=list(games_beforeF),
                text_auto=True,
                title=f'Main Games: Weekly Delta Views Correlation')

fig.write_html('plots/dviews_corr_beforeF.html')

fig.show()

In [112]:
fig = px.imshow(timeseries_correlations(sample_withF, games_withF, 'delta_views')[games_withF.index('Fortnite'), np.newaxis], 
                x=list(games_withF),
                y=['Fortnite'],
                text_auto=True,
                title=f'Fortnite - Main Games: Weekly Delta Views Correlation')

fig.write_html('plots/dviews_corr_withF.html')

fig.show()

In [113]:
sample_df.channel_name.unique()

array(['PopularMMOs', 'Klaus Gaming', 'Sky Does Everything', 'Castro1021',
       'AA9skillz', 'KjraGaming', 'LispyJimmy', 'Pianta', 'speedyw03',
       'Miniminter', 'Orange Juice Gaming', 'FaZe Clan', 'SSundee',
       'TheGamingRevolution', 'Past Amazing', 'NepentheZ', 'Logdotzip',
       'Ali-A', 'MrDalekJD', 'KingStix', 'I JACK SPARROW COC',
       'MagikarpUsedFly', 'XpertThief', 'Ninja', 'ZwebackHD', 'DanTDM',
       'TheXclusiveAce', 'Lachlan', 'Galadon Gaming', 'LazarBeam', 'Tfue',
       'Trick2G', 'RaidAway', 'DarkViperAU', 'SkinSpotlights'],
      dtype=object)

In [114]:
games = sample_df.game_category.unique()
titles = np.array([['# Subscribers', '# Videos', 'Rank'] for _ in games]).flatten()

fig = make_subplots(1, 2, subplot_titles=titles)

for i, game_category in enumerate(sample_df.game_category.unique()):
    subset = sample_df[sample_df['game_category'] == game_category].drop_duplicates(subset=['channel_name']).sort_values(by='subscriber_rank_sb')

    # Simple bar plot
    fig.add_trace(
        go.Bar(
            x=subset['channel_name'], 
            y=subset['subscribers_cc'],
            name=game_category,
            visible=game_category=='Fortnite'
        ),
        row=1,
        col=1
    )

    # Simple bar plot
    fig.add_trace(
        go.Bar(
            x=subset['channel_name'], 
            y=subset['videos_cc'],
            name=game_category,
            visible=game_category=='Fortnite'
        ),
        row=1,
        col=2,
    )

# Update layout to add dropdown button
fig.update_layout(
    updatemenus=[
        dict(
            active=list(games).index('Fortnite'),
            type='dropdown',
            x=1,
            y=1.27,
            buttons=[
                dict(label=game, method='update',
                     args=[{'visible': [game == trace.name for trace in fig.data]},
                           {'title': f'{game} Top 5 YouTubers Statistics'}])
                for game in games
            ],
        ),
    ]
)

fig.update_layout(
    title='Fortnite Top 5 YouTubers Statistics',
    showlegend=False
)

fig.write_html(f"plots/top5_stats.html")

fig.show()

In [115]:
fig = go.Figure()
metrics = ['subscriber_rank_sb', 'subscribers_cc', 'videos_cc']
metrics_txt = {'subscriber_rank_sb': 'Rank', 'subscribers_cc': '# Subscribers', 'videos_cc': '# Videos'}

for i, metric in enumerate(metrics):
    fig.add_trace(
        go.Bar(
            x=games_withF,
            y=sample_withF[metric].mean(),
            width=0.75,
            name=metric,
        )
    )


# Update layout to add dropdown button
fig.update_layout(
    updatemenus=[
        dict(
            active=1,
            type='dropdown',
            x=1,
            y=1.27,
            buttons=[
                dict(label=metrics_txt[metric], method='update',
                     args=[{'visible': [metric == trace.name for trace in fig.data]},
                           {'title': f'YouTuber Mean {metrics_txt[metric]} Per Game'}])
                for metric in metrics
            ],
        ),
    ]
)

fig.update_layout(
    title='YouTuber Mean # Subscribers Per Game',
    showlegend=False
)

fig.write_html(f"plots/game_mean_stats.html")

fig.show()

In [116]:
sample_df

Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity,category_cc,join_date,channel_name,subscribers_cc,videos_cc,subscriber_rank_sb,weights,game_category
1,UCpGdL9Sn3Q5YWUH2DVUW1Ug,Gaming,2016-10-17,6.756017e+09,4.745762e+07,9.224396e+06,73457.375,3266,11,26,Gaming,2012-04-17,PopularMMOs,16900000,4521,187.0,2.0870,Minecraft
4527,UCN35DM_vPpMz6zPBxeBRWIA,Gaming,2016-10-17,9.580426e+05,4.905162e+04,9.970750e+03,829.500,260,2,5,Gaming,2015-11-03,Klaus Gaming,427000,1400,40344.0,3.3215,Mobile Games
2971,UCKlhpmbHGxBE6uw9B_uLeqQ,Gaming,2016-10-17,3.388057e+09,2.498308e+06,1.208029e+07,0.000,1692,0,14,Gaming,2011-02-28,Sky Does Everything,11600000,1947,400.0,2.0870,Minecraft
1250,UCJk0D9no2q29C2FGaPXrG4g,Gaming,2016-10-17,2.468123e+07,2.452740e+05,4.842880e+05,0.000,141,0,3,Gaming,2014-02-08,Castro1021,1450000,602,9074.0,2.5455,FIFA
941,UCL7vy7MDOq9-tE-r6taQBlw,Gaming,2016-10-17,2.760783e+08,1.830575e+06,1.214710e+06,8924.875,2154,7,16,Gaming,2011-03-08,AA9skillz,1850000,3273,7384.0,2.5350,FIFA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3903,UCuSrv3qgQA7SSi6R9bWag5A,Gaming,2019-09-23,3.966678e+08,1.728653e+06,1.080000e+06,0.000,2023,4,9,Gaming,2009-03-23,Trick2G,1080000,2025,14445.0,2.8430,LOL
1559,UCEe076nFuVobN0bAsXK7ICw,Gaming,2019-09-23,1.571503e+09,3.555959e+05,6.400000e+06,0.000,1869,0,0,Gaming,2006-06-11,speedyw03,6400000,1872,1186.0,2.0870,GTA
3748,UCaMi81Bt9geDCcow-hHrP_Q,Gaming,2019-09-23,1.100189e+08,8.298482e+05,7.745139e+05,3402.875,1889,5,9,Gaming,2013-06-15,RaidAway,773000,1888,20149.0,2.8690,COD
3592,UCWBQKvXstY4yCLQpGvfB1Ug,Gaming,2019-09-23,4.834162e+07,6.241375e+03,1.531202e+05,0.000,840,0,0,Gaming,2013-11-15,I JACK SPARROW COC,153981,841,117055.0,4.4920,Mobile Games


In [117]:
sample_df['join_year'] = pd.to_datetime(sample_df['join_date']).dt.year

df_counts = sample_df.drop_duplicates(subset=['channel_name']).groupby(['join_year', 'game_category']).size().reset_index(name='count')

all_years = list(range(2005, 2017))

color_palette = px.colors.qualitative.Plotly

fig = px.bar(df_counts, x='join_year', y='count', color='game_category',
             labels={'join_year': 'Join Year', 'count': 'Frequency'},
             text='count',
             title='Distribution of Join Dates by Game Category (Per Year)',
             color_discrete_sequence=color_palette)

fig.update_xaxes(categoryorder='array', categoryarray=all_years)

fig.write_html(f"plots/join_dates.html")

fig.show()

## Sep

In [118]:
mf, mf_df = group_by_game(pd.read_parquet('data/mf_timeseries.parquet'))
mf

['Fortnite Official Channel', 'Marshmello']

In [119]:
fig = go.Figure()

for (game, game_df) in mf_df:
    fig.add_trace(go.Scatter(
        x=game_df['datetime'],
        y=game_df['delta_views'],
        mode='lines',
        name=game,
    ))

date = '2019-02-02'

# Add a vertical line for Fortnite Battle-Royale Mode
fig.add_vline(
        x=date,
        line_dash='dash',
        line_color='grey',
    )
fig.add_annotation(
        x=date, 
        y=150_000_000,
        text="Marshmello Concert",
        font=dict(color='grey', size=10),
        showarrow=False,
        xshift=-10,
        textangle=-90
    )


fig.update_layout(
        title=f"Marshmello: Weekly Delta Views Timeseries",
        xaxis_title='Date',
        yaxis_title='Delta Views',
        showlegend=True
    )

fig.write_html(f"plots/marshmello.html")

fig.show()

In [120]:
pubg, pubg_df = group_by_game(pd.read_parquet('data/pubg_timeseries.parquet'))

In [121]:
fig = go.Figure()

for (game, game_df) in pubg_df:
    fig.add_trace(go.Scatter(
        x=game_df['datetime'],
        y=game_df['delta_subs'],
        mode='lines',
        name=game,
    ))

fig.update_layout(
        title=f"Main Games: Weekly Delta Views Timeseries",
        xaxis_title='Date',
        yaxis_title='Delta Views',
        showlegend=True
    )

#fig.write_html(f"plots/dviews_beforeF.html")

fig.show()

In [122]:
fortnite, fortnite_df = group_by_game(pd.read_parquet('data/fortnite_timeseries.parquet'))
fortnite

['Ali-A', 'Lachlan', 'LazarBeam', 'Ninja', 'Tfue']

In [123]:
fig = go.Figure()

for (game, game_df) in fortnite_df:
    fig.add_trace(go.Scatter(
        x=game_df['datetime'],
        y=game_df['delta_views'],
        mode='lines',
        name=game,
    ))

fig.update_layout(
        title=f"Top 5 Fortnite YouTubers: Weekly Delta Views Timeseries",
        xaxis_title='Date',
        yaxis_title='Delta Views',
        showlegend=True
    )

fig.write_html(f"plots/fortnite_timeseries.html")

fig.show()

In [124]:
mf_df = pd.read_parquet('data/mf_timeseries.parquet')
mf_df = mf_df[mf_df.game_category == 'Fortnite Official Channel']
mf_df

Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity,category_cc,join_date,channel_name,subscribers_cc,videos_cc,subscriber_rank_sb,weights,game_category
160,UClG8odDC8TS6Zpqk9CGVQiQ,Gaming,2016-10-10 00:00:00,9.571610e+05,6.852500e+03,1.024162e+04,88.125000,30,0,0,Gaming,2014-01-17,Fortnite,7200000,322,774.0,2.087,Fortnite Official Channel
161,UClG8odDC8TS6Zpqk9CGVQiQ,Gaming,2016-10-17 00:00:00,9.640365e+05,6.875500e+03,1.031850e+04,76.875000,30,0,0,Gaming,2014-01-17,Fortnite,7200000,322,774.0,2.087,Fortnite Official Channel
162,UClG8odDC8TS6Zpqk9CGVQiQ,Gaming,2016-10-24 00:00:00,9.710920e+05,7.055500e+03,1.038225e+04,63.750000,30,0,0,Gaming,2014-01-17,Fortnite,7200000,322,774.0,2.087,Fortnite Official Channel
163,UClG8odDC8TS6Zpqk9CGVQiQ,Gaming,2016-10-30 23:00:00,9.781435e+05,7.051477e+03,1.044813e+04,65.884715,30,0,0,Gaming,2014-01-17,Fortnite,7200000,322,774.0,2.087,Fortnite Official Channel
164,UClG8odDC8TS6Zpqk9CGVQiQ,Gaming,2016-11-06 23:00:00,9.845627e+05,6.419258e+03,1.052267e+04,74.531952,30,0,0,Gaming,2014-01-17,Fortnite,7200000,322,774.0,2.087,Fortnite Official Channel
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,UClG8odDC8TS6Zpqk9CGVQiQ,Gaming,2019-08-26 00:00:00,8.097804e+08,8.140717e+06,7.187415e+06,8847.500000,298,10,12,Gaming,2014-01-17,Fortnite,7200000,322,774.0,2.087,Fortnite Official Channel
311,UClG8odDC8TS6Zpqk9CGVQiQ,Gaming,2019-09-02 00:00:00,8.214093e+08,1.162890e+07,7.195676e+06,8261.000000,304,6,16,Gaming,2014-01-17,Fortnite,7200000,322,774.0,2.087,Fortnite Official Channel
312,UClG8odDC8TS6Zpqk9CGVQiQ,Gaming,2019-09-09 00:00:00,8.349836e+08,1.357433e+07,7.202027e+06,6350.750000,310,6,13,Gaming,2014-01-17,Fortnite,7200000,322,774.0,2.087,Fortnite Official Channel
313,UClG8odDC8TS6Zpqk9CGVQiQ,Gaming,2019-09-16 00:00:00,8.386207e+08,3.637114e+06,7.200253e+06,0.000000,319,9,15,Gaming,2014-01-17,Fortnite,7200000,322,774.0,2.087,Fortnite Official Channel


In [125]:
fig = go.Figure()

# for (game, game_df) in fortnite_df:
#     wc_df = game_df[('2018-09-15' <= game_df.datetime) & (game_df.datetime <= '2019-09-15')]
#     fig.add_trace(go.Scatter(
#         x=wc_df['datetime'],
#         y=wc_df['delta_views'],
#         mode='lines',
#         name=game,
#     ))


wc_df = mf_df[('2019-06-15' <= mf_df.datetime) & (mf_df.datetime <= '2019-09-15')]
fig.add_trace(go.Scatter(
    x=wc_df['datetime'],
    y=wc_df['delta_views'],
    mode='lines',
    name='Fortnite Official Channel',
))

d0 = '2019-07-26' 
d1 = '2019-07-28' 

# Add a vertical line for Fortnite Battle-Royale Mode
fig.add_vrect(
        x0=d0,
        x1=d1,
        line_width=0,
        fillcolor='blue',
        opacity=.2
    )
fig.add_annotation(
        x=d0, 
        y=28_000_000,
        text="Fortnite 2019 WC",
        font=dict(color='grey', size=10),
        showarrow=False,
        xshift=-8,
        textangle=-90
    )


fig.update_layout(
        title=f"Impact of Fortnite World Cup on Weekly Delta Views Timeseries",
        xaxis_title='Date',
        yaxis_title='Delta Views',
        showlegend=False
    )

fig.write_html(f"plots/wc_fortnite_timeseries.html")

fig.show()

## Last but not least, the migration graph.
unfortunately, you would need to run the notebook to produce the graph, it is not viewable since it is an interactive graph

In [4]:
MERGEDTAGSCOMM = 'data/finalgamingtagcomment.parquet'


In [5]:
merged_commtags = pd.read_parquet(MERGEDTAGSCOMM)
merged_commtags.head()

Unnamed: 0,author,display_id,likes,replies,tags,upload_date
0,2,9pQILRT42Cg,0,0,"faze,fazeclan,banks,faze banks,my new girlfrie...",2017-06-22 00:00:00
1,2,PWWRzCyuiFU,0,0,,2018-06-03 00:00:00
2,5,9MuGpmXGlsY,0,0,"fortnite,fortnite gameplay,fortnite gotta chil...",2019-07-13 00:00:00
3,5,UvZPbfUkMGw,0,0,"fortnite,fortnite top 10,top 10 crazy ways peo...",2019-06-17 00:00:00
4,11,qj9sjQjQ19M,0,0,"مقلب ببجي,ببجي موبايل,pubg mobile,مقلب ببجي مو...",2019-02-22 00:00:00


In [1]:
colors = {'call of duty':'black','fifa': 'blue', 'Grand Theft Auto': 'green', 'fortnite': 'red', 'league of legends': 'orange', 'minecraft': 'purple'}

frames = {tag: merged_commtags[merged_commtags['tags'].str.contains(tag, case=False)] for tag in tqdm(colors)}

In [7]:
for game in frames.keys():
    frames[game]['game'] = game

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frames[game]['game'] = game
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frames[game]['game'] = game
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frames[game]['game'] = game
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instea

In [8]:
df = pd.concat(frames.values()).drop(columns=['tags', 'likes', 'replies']).sort_values(by='upload_date')

In [10]:
uniqueauth = df['author'].unique()
author_dict = {author: None for author in uniqueauth}

upload_dates = df['upload_date'].unique()
dates_dict = {date: [] for date in upload_dates}

In [11]:
print(uniqueauth.shape)

(845479,)


In [12]:
authors_minecraft = {}
authors_fortnite = {}

authors_games = {game: {} for game in frames.keys()}

In [13]:
games = frames.keys()

In [14]:
for index, row in tqdm(df.iterrows()):
    # Get the tags of the current author
    author_tags = row['game']
    
    # Iterate over each tag in tags
    for tag in frames.keys():
        # Check if the current tag is in the author_tags
        if tag in author_tags:
            # Perform your desired action here
            if author_dict[row['author']] == None:
                author_dict[row['author']] = tag
            else:
                dates_dict[row['upload_date']].append((author_dict[row['author']],tag))
                author_dict[row['author']] = tag

0it [00:00, ?it/s]

8813372it [02:47, 52483.63it/s]


In [16]:
from itertools import permutations
from collections import Counter
combs = list(permutations(list(frames.keys()), 2))

In [17]:
migrations = {}
for date in tqdm(dates_dict.keys()):
    occs = Counter(dates_dict[date])
    for combination in combs:
        if combination not in occs.keys():
            occs[combination] = 0
    date_dict = {
            'migrations': list(occs.items())
        }
    migrations[date] = date_dict

100%|██████████| 3743/3743 [00:00<00:00, 12162.04it/s]


In [18]:
import networkx as nx

In [19]:
dates_list = list(dates_dict.keys())
dates_list.sort()
selected_date = dates_list[1]
selected_date


Timestamp('2008-04-04 00:00:00')

In [20]:
print(dates_list)

[Timestamp('2007-08-16 00:00:00'), Timestamp('2008-04-04 00:00:00'), Timestamp('2008-04-15 00:00:00'), Timestamp('2008-05-14 00:00:00'), Timestamp('2008-06-14 00:00:00'), Timestamp('2008-06-19 00:00:00'), Timestamp('2008-06-26 00:00:00'), Timestamp('2008-06-29 00:00:00'), Timestamp('2008-07-13 00:00:00'), Timestamp('2008-07-20 00:00:00'), Timestamp('2008-07-25 00:00:00'), Timestamp('2008-08-05 00:00:00'), Timestamp('2008-08-15 00:00:00'), Timestamp('2008-08-23 00:00:00'), Timestamp('2008-09-01 00:00:00'), Timestamp('2008-09-16 00:00:00'), Timestamp('2008-09-22 00:00:00'), Timestamp('2008-09-23 00:00:00'), Timestamp('2008-09-26 00:00:00'), Timestamp('2008-10-11 00:00:00'), Timestamp('2008-10-28 00:00:00'), Timestamp('2008-11-03 00:00:00'), Timestamp('2008-11-18 00:00:00'), Timestamp('2008-11-19 00:00:00'), Timestamp('2008-11-30 00:00:00'), Timestamp('2008-12-05 00:00:00'), Timestamp('2008-12-11 00:00:00'), Timestamp('2008-12-15 00:00:00'), Timestamp('2008-12-18 00:00:00'), Timestamp('20

In [22]:
dates_list = list(dates_dict.keys())
dates_list.sort()  


G = nx.DiGraph()
@interact(date=IntSlider(min=1200, max=len(dates_list)-1, step=1))
def plot(date):
    if date>1:
         G.remove_edges_from(list(G.edges()))
    
    
    selected_date = dates_list[date]
    m = migrations[selected_date]
    migr = m['migrations']
    weights = []
    num_users = {game: 0 for game in frames.keys()}
            
    for migration in migr:
        origin = migration[0][0]
        destination = migration[0][1]
        count = migration[1]
        weights.append(count)
        if (origin == 'fortnite' or destination == 'fortnite') and date < 2800:
            continue
        if origin == destination:
            num_users[origin] += count
        else: 
            if count >10:
                G.add_edge(origin, destination, weight=count)

        # Get edge weights  
    edge_weights = [weight for weight in weights]
    scaled_weights = [math.log(1 + weight)**3 for weight in edge_weights]

    # Normalize edge weights for visualization purposes
    
    min_scaled_weight = min(scaled_weights)
    max_scaled_weight = max(scaled_weights)

    normalized_weights = [2+ 7 * (scaled_weight - min_scaled_weight) / (1+max_scaled_weight - min_scaled_weight) for scaled_weight in scaled_weights]
    node_sizes = [1500+ 5 * num_users[node] for node in G.nodes]
    scaled_node_sizes = [math.log(1 + size) for size in node_sizes]
    node_colors = ['skyblue' if node == 'fortnite' else 'lightgray' for node in G.nodes()]

    # Draw the graph
    pos = nx.circular_layout(G)
    plt.figure(figsize=(15, 8))  # Increase the dimensions as needed
    nx.draw(G, pos, with_labels=True, node_size=node_sizes, node_color=node_colors, font_size=11, font_color="black", width=normalized_weights, edge_color='gray', arrowsize=20, connectionstyle="arc3,rad=0.1")

    # Show the graph
    plt.title("Migration of users between games on " + str(selected_date))
    plt.show()

interactive(children=(IntSlider(value=1200, description='date', max=3742, min=1200), Output()), _dom_classes=(…