In [None]:
import pandas as pd
import numpy as np
import polars as pl
from matplotlib import pyplot as plt
from scipy import stats
from functions import *

# Filter out a certain event

In [None]:
# Load every dataframe used in this notebook. Per the original note, the files
# are already restricted to news channels, except the channel table, which is
# narrowed to the "News & Politics" category right after loading.
def _read_tsv(path):
    """Read a tab-separated file into a polars DataFrame."""
    return pl.read_csv(path, separator='\t')

filtered_df_vid = pl.read_csv("./../data/filtered_yt_metadata_helper.feather.csv")
df_ch = _read_tsv("./../data/df_channels_en.tsv")
filtered_df_ch = df_ch.filter(pl.col("category_cc") == "News & Politics")
filtered_df_timeseries = _read_tsv("./../data/filtered_df_timeseries_en.tsv")
num_comments = _read_tsv("./../data/num_comments.tsv")

In [None]:
# Harmonise the channel key so every frame exposes it as 'channel_id'.
# Each rename is guarded: the frames are rebound under the same names, so
# re-running this cell would otherwise raise because 'channel' no longer exists.
if 'channel' in filtered_df_ch.columns:
    filtered_df_ch = filtered_df_ch.rename({'channel': 'channel_id'})
if 'channel' in filtered_df_timeseries.columns:
    filtered_df_timeseries = filtered_df_timeseries.rename({'channel': 'channel_id'})

In [None]:
# Select the channels with a high upload frequency.
# A mean 'activity' above 56 corresponds to 4 videos a day (per the original note).
activity_threshold = 56

# Mean activity of each channel over its timeseries rows.
grouped_df = (
    filtered_df_timeseries
    .group_by('channel_id')
    .agg(pl.col('activity').mean().alias('mean_activity'))
)

# Ids of the channels above the threshold.
high_activity_ids = grouped_df.filter(pl.col("mean_activity") > activity_threshold)["channel_id"]

# Channel rows for those ids, enriched with their 'mean_activity' column.
high_activity_channels = (
    filtered_df_ch
    .filter(pl.col("channel_id").is_in(high_activity_ids))
    .join(grouped_df, on="channel_id", how="inner")
)

In [None]:
# Restrict every frame to the high-frequency channels selected above.
kept_channel_ids = high_activity_channels['channel_id']

def _keep_high_activity(frame):
    """Keep only the rows whose 'channel_id' is one of the kept channels."""
    return frame.filter(pl.col('channel_id').is_in(kept_channel_ids))

filtered_df_vid = _keep_high_activity(filtered_df_vid)
filtered_df_ch = _keep_high_activity(filtered_df_ch)
filtered_df_timeseries = _keep_high_activity(filtered_df_timeseries)

# The comment counts are keyed by video, so they follow the remaining videos.
num_comments = num_comments.filter(pl.col('display_id').is_in(filtered_df_vid['display_id']))

In [None]:
# Drop everything that is not tagged with the news category.
# Each frame stores the category under a different column name.
news_label = 'News & Politics'

filtered_df_vid = filtered_df_vid.filter(pl.col('categories') == news_label)
filtered_df_ch = filtered_df_ch.filter(pl.col('category_cc') == news_label)
filtered_df_timeseries = filtered_df_timeseries.filter(pl.col('category') == news_label)

# Comment counts of the videos that survived the category filter.
remaining_video_ids = filtered_df_vid['display_id']
filtered_num_comments = num_comments.filter(pl.col('display_id').is_in(remaining_video_ids))

In [None]:
# Restrict every frame to the time window [min_date, max_date], both bounds included.
# NOTE(review): the upper bound is inclusive, so rows stamped exactly
# 2018-01-01 00:00:00 are kept — confirm this is intended for a "2017" window.
min_date = pl.datetime(2017,1,1)
max_date = pl.datetime(2018,1,1)

# Parse the date columns only while they are still strings. The frames are
# rebound under the same names, so without this guard a second run of the cell
# would call the string namespace on an already-parsed datetime column and fail.
if filtered_df_timeseries.schema['datetime'] == pl.Utf8:
    filtered_df_timeseries = filtered_df_timeseries.with_columns(pl.col('datetime').str.to_datetime())
if filtered_df_vid.schema['upload_date'] == pl.Utf8:
    filtered_df_vid = filtered_df_vid.with_columns(pl.col('upload_date').str.to_datetime())

# Keep the timeseries points and the videos that fall inside the window.
filtered_df_timeseries = filtered_df_timeseries.filter((pl.col('datetime') >= min_date) & (pl.col('datetime') <= max_date))
filtered_df_vid = filtered_df_vid.filter((pl.col('upload_date') >= min_date) & (pl.col('upload_date') <= max_date))

# Channels and comment counts follow the videos that remain.
filtered_df_ch = filtered_df_ch.filter(pl.col('channel_id').is_in(filtered_df_vid['channel_id']))
filtered_num_comments = num_comments.filter(pl.col('display_id').is_in(filtered_df_vid['display_id']))

In [None]:
# Lookup tables to navigate between channel names and channel ids.
# They are built from the full channel table (df_ch), not from the filtered one.
# NOTE(review): if several rows share the same 'name_cc', the last row wins in
# channel_dict — confirm that the names used below are unique.
channel_dict = {
    name: channel
    for name, channel in df_ch.select(['name_cc', 'channel']).iter_rows()
}
inv_channel_dict = dict(zip(channel_dict.values(), channel_dict.keys()))

# Compare channels
    - this channel's videos have these characteristics, or perform well with these subjects
### General statistics

In [None]:
# General per-channel statistics of the video frame. The helper's four results
# are unpacked below (presumably count, mean, std and median, going by the
# variable names — see functions.py).
channel_video_stats = get_general_ch_statistics(filtered_df_vid)
vid_count, vid_mean, vid_std, vid_med = channel_video_stats

In [None]:
# t-test: the null hypothesis is that two independent channels have an
# identical mean for the chosen column (number of views, likes, ...).
# It tells whether the two sample means differ significantly or not.
abc_news_id = channel_dict['ABC News']
cnn_id = channel_dict["CNN"]

ttest_between_two_channels(filtered_df_vid, abc_news_id, cnn_id, 'like_count')

In [None]:
# F-test: the null hypothesis is that the two channels have the same variance
# for the chosen column. It tells whether the two sample variances differ
# significantly or not.
abc_news_id = channel_dict['ABC News']
cnn_id = channel_dict["CNN"]

Ftest_between_two_channels(filtered_df_vid, abc_news_id, cnn_id, 'view_count')

In [None]:
# Plot the video variables of a single channel (CNN) from the filtered video frame.
cnn_id = channel_dict['CNN']
plot_video_variables_for_video_dataset(filtered_df_vid, cnn_id)

### Compare two channels normalized by size (number of subscribers, number of views)

In [None]:
# Normalize the video frame using the timeseries frame and its 'subs' column
# (presumably the subscriber count — see normalize_vids_with_timeseries).
normalization_feature = 'subs'
normalized_df_vid = normalize_vids_with_timeseries(
    filtered_df_vid,
    filtered_df_timeseries,
    normalization_feature,
)

In [None]:
# t-test between ABC News and CNN on 'view_count' of the normalized video frame.
abc_news_id = channel_dict['ABC News']
cnn_id = channel_dict["CNN"]

ttest_between_two_channels(normalized_df_vid, abc_news_id, cnn_id, 'view_count')

In [None]:
# F-test between ABC News and CNN on 'view_count' of the normalized video frame.
abc_news_id = channel_dict['ABC News']
cnn_id = channel_dict["CNN"]

Ftest_between_two_channels(normalized_df_vid, abc_news_id, cnn_id, 'view_count')

### Compare timeseries data

In [None]:
# General per-channel statistics of the timeseries frame, limited to the
# columns listed here.
timeseries_columns = ['views', 'delta_views', 'subs','delta_subs','videos','delta_videos','activity']

ts_count, ts_mean, ts_std, ts_med = get_general_ch_statistics(
    filtered_df_timeseries,
    cols_to_keep=timeseries_columns,
)

In [None]:
# t-test between ABC News and CNN on the 'activity' column of the timeseries frame.
abc_news_id = channel_dict['ABC News']
cnn_id = channel_dict["CNN"]

ttest_between_two_channels(filtered_df_timeseries, abc_news_id, cnn_id, 'activity')

In [None]:
# F-test between ABC News and CNN on the 'activity' column of the timeseries frame.
abc_news_id = channel_dict['ABC News']
cnn_id = channel_dict["CNN"]

Ftest_between_two_channels(filtered_df_timeseries, abc_news_id, cnn_id, 'activity')

# Identify holes in the data
    - channels that don’t report for specific events
    - videos with too few comments (videos with fewer than 50 comments are not in the comment dataset)

In [None]:
# Datasets for the first event.
# NOTE: no event-specific filter is applied here; the names are bound to the
# current filtered frames as they are.
timeseries_1, videos_1, num_comments_1, channels_1 = (
    filtered_df_timeseries,
    filtered_df_vid,
    filtered_num_comments,
    filtered_df_ch,
)

In [None]:
# Datasets for the second event.
# NOTE: no event-specific filter is applied here; the names are bound to the
# current filtered frames as they are.
timeseries_2, videos_2, num_comments_2, channels_2 = (
    filtered_df_timeseries,
    filtered_df_vid,
    filtered_num_comments,
    filtered_df_ch,
)

### Channels that don't report on both events

In [None]:
# Channels present in the event-1 channel frame whose id does not appear in
# the event-2 channel frame.
event_2_channel_ids = channels_2['channel_id']
channels_1.filter(~pl.col('channel_id').is_in(event_2_channel_ids))

In [None]:
# Channels present in the event-2 channel frame whose id does not appear in
# the event-1 channel frame.
event_1_channel_ids = channels_1['channel_id']
channels_2.filter(~pl.col('channel_id').is_in(event_1_channel_ids))

### Filtering out videos with too few comments

In [None]:
# Videos with too few comments can be excluded by filtering on the comment count.
min_comments = 100  # arbitrary cut-off
too_few_comments = filtered_num_comments.filter(pl.col('num_comms') < min_comments)

# Show the videos that fall below the cut-off.
sparse_video_ids = too_few_comments['display_id']
filtered_df_vid.filter(pl.col('display_id').is_in(sparse_video_ids))

# Compare between kinds of events and where events are from
    - how many videos
    - how many views
    - interactions: likes, comments

### Analyse each event by channel

In [None]:
# General per-channel statistics for each event: first on the video frames,
# then on the timeseries frames (restricted to the listed columns).
timeseries_columns = ['views', 'delta_views', 'subs','delta_subs','videos','delta_videos','activity']

vid_count_1, vid_mean_1, vid_std_1, vid_med_1 = get_general_ch_statistics(videos_1)
vid_count_2, vid_mean_2, vid_std_2, vid_med_2 = get_general_ch_statistics(videos_2)

ts_count_1, ts_mean_1, ts_std_1, ts_med_1 = get_general_ch_statistics(timeseries_1, timeseries_columns)
ts_count_2, ts_mean_2, ts_std_2, ts_med_2 = get_general_ch_statistics(timeseries_2, timeseries_columns)

In [None]:
# Plot the video variables of CNN for each of the two events.
cnn_id = channel_dict['CNN']
plot_video_variables_for_video_dataset(videos_1, cnn_id)
plot_video_variables_for_video_dataset(videos_2, cnn_id)

In [None]:
# Compare the average number of videos per channel between the two events.
video_counts_event_1 = vid_count_1['counts']
video_counts_event_2 = vid_count_2['counts']
ttest_between_events(video_counts_event_1, video_counts_event_2)

In [None]:
# Compare the variance of the number of videos per channel between the two events.
video_counts_event_1 = vid_count_1['counts']
video_counts_event_2 = vid_count_2['counts']
Ftest_between_events(video_counts_event_1, video_counts_event_2)

In [None]:
# Compare, between the two events, the average of the per-channel mean
# 'delta_subs' (subscribers gained) taken from the timeseries statistics.
mean_delta_subs_event_1 = ts_mean_1['delta_subs']
mean_delta_subs_event_2 = ts_mean_2['delta_subs']
ttest_between_events(mean_delta_subs_event_1, mean_delta_subs_event_2)

In [None]:
# Compare, between the two events, the variance of the per-channel mean
# 'delta_subs' (subscribers gained) taken from the timeseries statistics.
mean_delta_subs_event_1 = ts_mean_1['delta_subs']
mean_delta_subs_event_2 = ts_mean_2['delta_subs']
Ftest_between_events(mean_delta_subs_event_1, mean_delta_subs_event_2)

### Analyse each event by videos

In [None]:
# General per-video statistics of the filtered video frame; the helper's three
# results are unpacked and the first one is displayed.
video_stats = get_general_vid_statistics(filtered_df_vid)
v_means, v_stdevs, v_medians = video_stats
v_means

In [None]:
# Compare the 'view_count' of the videos between the two events.
views_event_1 = videos_1['view_count']
views_event_2 = videos_2['view_count']
ttest_between_events(views_event_1, views_event_2)

# How many comments have replies in each video
    - See distributions across different channels/topics
    - LET JEFF KNOW IF YOU NEED HELP WITH THIS