In [None]:
import pandas as pd
import numpy as np
import polars as pl
from matplotlib import pyplot as plt
from scipy import stats
from analysis_functions import *

# Filter out a certain event

In [None]:
# Read every input table from disk. Per the original note, all of them are already
# restricted to the news channels except the channel table, which is filtered here.
filtered_df_vid = pl.read_csv("./../data/filtered_yt_metadata_helper.feather.csv")
filtered_df_timeseries = pl.read_csv("./../data/filtered_df_timeseries_en.tsv", separator='\t')
num_comments = pl.read_csv("./../data/num_comments.tsv", separator='\t')

# Channel table: keep the raw frame (df_ch) and a "News & Politics"-only frame.
df_ch = pl.read_csv("./../data/df_channels_en.tsv", separator='\t')
filtered_df_ch = df_ch.filter(
    pl.col("category_cc") == "News & Politics"
)

In [None]:
# Give the channel identifier column the same name ('channel_id') in every dataframe.
# Both frames are rebound under their own name, so the rename is guarded: re-running
# this cell after 'channel' has already been renamed is a no-op instead of an error.
if 'channel' in filtered_df_ch.columns:
    filtered_df_ch = filtered_df_ch.rename({'channel': 'channel_id'})
if 'channel' in filtered_df_timeseries.columns:
    filtered_df_timeseries = filtered_df_timeseries.rename({'channel': 'channel_id'})

In [None]:
# Restrict every dataset to the [min_date, max_date] window (both bounds inclusive).
min_date = pl.datetime(2017,1,1)
max_date = pl.datetime(2018,1,1)

# The date columns are parsed from strings before being compared with the bounds.
filtered_df_timeseries = (
    filtered_df_timeseries
    .with_columns(pl.col('datetime').str.to_datetime())
    .filter(pl.col('datetime').is_between(min_date, max_date, closed='both'))
)
filtered_df_vid = (
    filtered_df_vid
    .with_columns(pl.col('upload_date').str.to_datetime())
    .filter(pl.col('upload_date').is_between(min_date, max_date, closed='both'))
)

# Keep only the channels and comment counts that still have a video inside the window.
filtered_df_ch = filtered_df_ch.filter(
    pl.col('channel_id').is_in(filtered_df_vid['channel_id'])
)
filtered_num_comments = num_comments.filter(
    pl.col('display_id').is_in(filtered_df_vid['display_id'])
)

In [None]:
# Lookup tables to move between a channel's name and its id:
#   channel_dict     : name_cc -> channel id
#   inv_channel_dict : channel id -> name_cc
# Built from the unfiltered channel table (df_ch).
channel_dict = {
    name: channel
    for name, channel in df_ch.select(['name_cc', 'channel']).iter_rows()
}
inv_channel_dict = {channel: name for name, channel in channel_dict.items()}

# Identify holes in the data
    - channels that don’t report for specific events
    - videos with too few comments (videos with fewer than 50 comments are not in the comment dataset)

In [None]:
# Datasets describing the first event.
# For now they simply alias the date-filtered frames.
timeseries_1, videos_1 = filtered_df_timeseries, filtered_df_vid
num_comments_1, channels_1 = filtered_num_comments, filtered_df_ch

In [None]:
# Datasets describing the second event.
# For now they simply alias the date-filtered frames.
timeseries_2, videos_2 = filtered_df_timeseries, filtered_df_vid
num_comments_2, channels_2 = filtered_num_comments, filtered_df_ch

### Looking for channels that do not report on certain events

In [None]:
# Channels present in the event-1 channel table but absent from the event-2 one,
# i.e. channels that report on event 1 but not on event 2.
channels_1.filter(
    ~pl.col('channel_id').is_in(channels_2['channel_id'])
)

In [None]:
# Channels present in the event-2 channel table but absent from the event-1 one,
# i.e. channels that report on event 2 but not on event 1.
channels_2.filter(
    ~pl.col('channel_id').is_in(channels_1['channel_id'])
)

### Filtering out videos with too few comments

In [None]:
# Videos with fewer comments than this threshold are considered to have too few
# comments and can be excluded by filtering. The value is an arbitrary choice.
comment_threshold = 100

too_few_comments = filtered_num_comments.filter(
    pl.col('num_comms') < comment_threshold
)

# Display the videos whose comment count is below the threshold.
filtered_df_vid.filter(
    pl.col('display_id').is_in(too_few_comments['display_id'])
)

# Compare channels
    - this channel's videos have these characteristics, or perform well with these subjects

### Channels with correlated video performances (view count, likes, dislikes, number of comments)

In [None]:
# Attach the comment count to every video, then compute the general statistics of
# each channel's videos (the helper returns count / mean / std / median results,
# judging by the names they are unpacked into).
grouped_vids = filtered_df_vid.join(filtered_num_comments, on='display_id')
vid_count, vid_mean, vid_std, vid_med = get_general_ch_statistics(
    grouped_vids,
    cols_to_keep=['dislike_count','like_count','view_count','num_comms','duration'],
)

In [None]:
# 'duration' is removed so that only the performance columns are passed to the plot.
cov = plot_covariance(
    vid_mean.drop('duration'),
    'Covariance matrix between channels',
    'Histogram of covariances',
)

In [None]:
# Channels considered correlated on video performance, using 1e10 as the threshold
# passed to the helper (presumably a covariance cut-off — confirm in analysis_functions).
# 'duration' is dropped so the selection runs on the same performance-only columns
# (views, likes, dislikes, comments) as the covariance plot of this section; duration is
# a video characteristic and is analysed separately.
corrolated_channels = get_correlated_channels(vid_mean.drop('duration'), 1e10)
corrolated_channels

### Channels with correlated video characteristics (length, key words?, ...)

##### !!! Note : Keyword analysis has not been added yet.

In [None]:
# Drop the performance columns so that only the video characteristics remain.
cov = plot_covariance(
    vid_mean.drop(['num_comms','like_count','dislike_count','view_count']),
    'Covariance matrix',
    'Histogram of covariances',
)

In [None]:
# Same column selection as the covariance plot of this section: performance columns removed.
get_correlated_channels(
    vid_mean.drop(['num_comms','like_count','dislike_count','view_count']),
    1e10,
)

### More in-depth comparison between two given channels

Optional procedure to analyse the relation between two channels in more depth

##### Based on video dataframe

In [None]:
# t-test between ABC News and CNN on the number of comments.
# Null hypothesis: two independent channels have an identical mean (number of views,
# likes, etc.). Used to check whether the two sample means differ significantly.
ttest_between_two_channels(
    grouped_vids,
    channel_dict['ABC News'],
    channel_dict["CNN"],
    'num_comms',
)

In [None]:
# F-test between ABC News and CNN on the number of comments.
# Null hypothesis: the two channels have the same variance. Used to check whether
# the two sample variances differ significantly.
Ftest_between_two_channels(
    grouped_vids,
    channel_dict['ABC News'],
    channel_dict["CNN"],
    'num_comms',
)

In [None]:
# Plot the video characteristics of a single channel (CNN).
plot_video_characteristics_for_given_channel(
    grouped_vids,
    channel_dict['CNN'],
)

##### Based on timeseries dataframe

In [None]:
# General per-channel statistics computed on the timeseries columns listed below.
ts_count, ts_mean, ts_std, ts_med = get_general_ch_statistics(
    filtered_df_timeseries,
    cols_to_keep=['views', 'delta_views', 'subs','delta_subs','videos','delta_videos','activity'],
)

In [None]:
# t-test between ABC News and CNN on the timeseries 'activity' column.
ttest_between_two_channels(
    filtered_df_timeseries,
    channel_dict['ABC News'],
    channel_dict["CNN"],
    'activity',
)

In [None]:
# F-test between ABC News and CNN on the timeseries 'activity' column.
Ftest_between_two_channels(
    filtered_df_timeseries,
    channel_dict['ABC News'],
    channel_dict["CNN"],
    'activity',
)

### Compare channels' video performance when normalized by size (number of subscribers or number of views)

##### Normalize by subs

In [None]:
# Normalize the video frame with the timeseries frame, using the 'subs' column.
normalized_df_vid = normalize_vids_with_timeseries(
    filtered_df_vid,
    filtered_df_timeseries,
    'subs',
)

In [None]:
# Attach the comment count to every normalized video, then recompute the general
# per-channel statistics. Note: this rebinds grouped_vids and the vid_* results.
grouped_vids = normalized_df_vid.join(filtered_num_comments, on='display_id')
vid_count, vid_mean, vid_std, vid_med = get_general_ch_statistics(
    grouped_vids,
    cols_to_keep=['dislike_count','like_count','view_count','num_comms','duration'],
)

In [None]:
# 'duration' is removed so that only the performance columns are passed to the plot.
cov = plot_covariance(
    vid_mean.drop('duration'),
    'Covariance matrix between channels normalized by subscribers',
    'Histogram of covariances',
)

In [None]:
# Channels considered correlated once video performance is normalized by subscribers,
# using 25000 as the threshold passed to the helper (presumably a covariance cut-off —
# confirm in analysis_functions).
# 'duration' is dropped so the selection runs on the same performance-only columns as
# the covariance plot of this section.
corrolated_channels = get_correlated_channels(vid_mean.drop('duration'), 25000)
corrolated_channels

##### Normalize by views

In [None]:
# Normalize the video frame with the timeseries frame, using the 'views' column.
normalized_df_vid = normalize_vids_with_timeseries(
    filtered_df_vid,
    filtered_df_timeseries,
    'views',
)

In [None]:
# Attach the comment count to every normalized video, then recompute the general
# per-channel statistics. Note: this rebinds grouped_vids and the vid_* results.
grouped_vids = normalized_df_vid.join(filtered_num_comments, on='display_id')
vid_count, vid_mean, vid_std, vid_med = get_general_ch_statistics(
    grouped_vids,
    cols_to_keep=['dislike_count','like_count','view_count','num_comms','duration'],
)

In [None]:
# 'duration' is removed so that only the performance columns are passed to the plot.
cov = plot_covariance(
    vid_mean.drop('duration'),
    'Covariance matrix between channels normalized by views',
    'Histogram of covariances',
)

In [None]:
# Channels considered correlated once video performance is normalized by views,
# using 25000 as the threshold passed to the helper (presumably a covariance cut-off —
# confirm in analysis_functions).
# 'duration' is dropped so the selection runs on the same performance-only columns as
# the covariance plot of this section.
corrolated_channels = get_correlated_channels(vid_mean.drop('duration'), 25000)
corrolated_channels

### Compare channel performance across events

In [None]:
# Datasets describing the first event.
# For now they simply alias the date-filtered frames.
timeseries_1, videos_1 = filtered_df_timeseries, filtered_df_vid
num_comments_1, channels_1 = filtered_num_comments, filtered_df_ch

In [None]:
# Datasets describing the second event.
# For now they simply alias the date-filtered frames.
timeseries_2, videos_2 = filtered_df_timeseries, filtered_df_vid
num_comments_2, channels_2 = filtered_num_comments, filtered_df_ch

In [None]:
# General statistics for both events.

# Video side: attach the comment counts to each event's videos, then summarize per channel.
grouped_vids_1 = videos_1.join(num_comments_1, on='display_id')
grouped_vids_2 = videos_2.join(num_comments_2, on='display_id')

vid_count_1, vid_mean_1, vid_std_1, vid_med_1 = get_general_ch_statistics(
    grouped_vids_1,
    cols_to_keep=['dislike_count','like_count','view_count','num_comms'],
)
vid_count_2, vid_mean_2, vid_std_2, vid_med_2 = get_general_ch_statistics(
    grouped_vids_2,
    cols_to_keep=['dislike_count','like_count','view_count','num_comms'],
)

# Timeseries side: same summary over the channel timeseries columns.
ts_count_1, ts_mean_1, ts_std_1, ts_med_1 = get_general_ch_statistics(
    timeseries_1,
    cols_to_keep=['views', 'delta_views', 'subs','delta_subs','videos','delta_videos','activity'],
)
ts_count_2, ts_mean_2, ts_std_2, ts_med_2 = get_general_ch_statistics(
    timeseries_2,
    cols_to_keep=['views', 'delta_views', 'subs','delta_subs','videos','delta_videos','activity'],
)

##### Compare general channel performance between multiple events

In [None]:
# Average each event's per-channel means column by column, then stack the two
# results so that each event contributes its own row(s) to event_performance.
df_1, df_2 = vid_mean_1.mean(), vid_mean_2.mean()
event_performance = pl.concat([df_1, df_2])

In [None]:
# Covariance matrix of the channel performance, used to identify channels that
# perform similarly for a given event.
cov = plot_covariance(
    event_performance,
    'Covariance across the mean performance of all channels for different events',
    'Histogram of the covariance between events',
)

##### Compare a given channel statistic between two events

In [None]:
# t-test between the two events on the per-channel mean 'activity'.
# Null hypothesis: the parameter has the same mean in both events. Used to check
# whether the two means differ significantly.
ttest_between_events(
    ts_mean_1['activity'],
    ts_mean_2['activity'],
)

In [None]:
# F-test between the two events on the per-channel mean 'activity'.
# Null hypothesis: the parameter has the same variance in both events. Used to check
# whether the two variances differ significantly.
Ftest_between_events(
    ts_mean_1['activity'],
    ts_mean_2['activity'],
)

# Compare between kinds of events and where events are from
    - how many videos
    - how many views
    - interactions: likes, comments

In [None]:
# General statistics for each event.
# Note: this rebinds the vid_*_1 / vid_*_2 and ts_*_1 / ts_*_2 names computed earlier;
# the video statistics now come from the raw video frames with the helper's default columns.

# Video statistics.
vid_count_1, vid_mean_1, vid_std_1, vid_med_1 = get_general_ch_statistics(videos_1)
vid_count_2, vid_mean_2, vid_std_2, vid_med_2 = get_general_ch_statistics(videos_2)

# Timeseries statistics.
ts_count_1, ts_mean_1, ts_std_1, ts_med_1 = get_general_ch_statistics(
    timeseries_1,
    ['views', 'delta_views', 'subs','delta_subs','videos','delta_videos','activity'],
)
ts_count_2, ts_mean_2, ts_std_2, ts_med_2 = get_general_ch_statistics(
    timeseries_2,
    ['views', 'delta_views', 'subs','delta_subs','videos','delta_videos','activity'],
)

### Compare number of videos

In [None]:
# Overall comparison of the number of videos between the two events.
compare_overall_vid_count_between_events(
    vid_count_1,
    vid_count_2,
)

In [None]:
# t-test: does the average number of videos per channel differ between the two events?
ttest_between_events(
    vid_count_1['counts'],
    vid_count_2['counts'],
)

In [None]:
# F-test: does the variance of the number of videos per channel differ between the two events?
Ftest_between_events(
    vid_count_1['counts'],
    vid_count_2['counts'],
)

### Analyse each event by videos (number of views, number of likes/dislikes)

In [None]:
# Video-level statistics (means, standard deviations, medians) for each event.
v_means_1, v_stdevs_1, v_medians_1 = get_general_vid_statistics(videos_1)
v_means_2, v_stdevs_2, v_medians_2 = get_general_vid_statistics(videos_2)

In [None]:
# Stack the per-event mean frames and prepend a label column identifying the event.
# The label Series is given an explicit name: an unnamed Series would be inserted as a
# column with an empty header. The two labels require the stacked frame to have exactly
# two rows (assumes each v_means frame holds a single row — TODO confirm).
pl.concat([v_means_1, v_means_2]).insert_column(
    0,
    pl.Series('event', ['event_1', 'event_2']),
)

In [None]:
# Compare the video statistics of the two events.
compare_video_statistics_between_events(
    videos_1,
    videos_2,
)

In [None]:
# t-test on the per-video view counts of the two events.
ttest_between_events(
    videos_1['view_count'],
    videos_2['view_count'],
)

# How many comments have replies in each video
    - See distributions across different channels/topics
    - LET JEFF KNOW IF YOU NEED HELP WITH THIS