# Generating clean data to use in duration analysis and other notebooks

In [None]:
# Imports
%load_ext autoreload
%autoreload 2
from config import DRIVE_PATH_VIVA, label_clustering, content_categories
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import os
import math
import matplotlib.patches as mpatches
import matplotlib.lines as mlines
from sklearn.feature_selection import mutual_info_regression
from scipy.stats import pearsonr, spearmanr

In [None]:
# Loading raw data
# Every entry of the 'extracted_Education' folder is read as one CSV batch.
general_path = os.path.join(DRIVE_PATH_VIVA, 'extracted_Education')
dir_list = os.listdir(general_path)

# Read all batches first and concatenate once: calling pd.concat inside the loop
# re-copies every previously loaded row on each iteration.
batch_frames = [
    pd.read_csv(os.path.join(general_path, batch_name))
    for batch_name in dir_list
]
# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder contains no batches (same result as the original accumulation loop).
total_df = pd.concat(batch_frames) if batch_frames else pd.DataFrame()
print(len(total_df))

# Channel metadata (tab-separated); join_date is parsed to datetime.
channels_df = pd.read_csv('data/raw/df_channels_en.tsv', sep = "\t")
channels_df["join_date"] = pd.to_datetime(channels_df["join_date"])

# Classified with keyword search
classified_df = pd.read_csv('data/derivatives/subcategories_18_12_w_spam.csv')


### Processing

> - Print number of videos
> - Check duration
> - Get duration in minutes
> - Visualize the duration distribution
> - Then remove videos that are too short or too long
> - Adding label column (classified / channel)
> - Add upload year column
> - Create duration bins and add duration bin column
> - Select period
> - Add likes per view and dislikes per view for each video (function in basic file)
> - Drop nans
> - Remove the videos that the keyword search labelled as trash
> - Plot hist of duration bins

In [None]:
# Derived column: the raw 'duration' divided by 60, so that durations read in
# minutes (presumably 'duration' is stored in seconds — confirm against the data).
total_df['duration_in_min'] = total_df['duration'].div(60)

In [None]:
# Visualise the durations distribution
# Each histogram gets its own figure: seaborn draws on the current axes, so
# without a fresh figure the zoomed histogram below would be drawn on top of
# the full-range one inside the same cell.
plt.figure()
sns.histplot(data = total_df, x = 'duration_in_min', bins = 50)
plt.title('Distribution of video duration in labelled data')
plt.xlabel('Duration [min]')
plt.yscale('log')  # log-scaled counts

# Zoom on small durations (videos shorter than 10 minutes)
plt.figure()
sns.histplot(data = total_df[total_df['duration_in_min']< 10], x = 'duration_in_min', bins = 20)
plt.title('Distribution of video duration in labelled data')
plt.xlabel('Duration [min]')
plt.yscale('log')


In [None]:
def find_key(category):
    """Return the first key of ``label_clustering`` whose values contain *category*.

    Entries are checked in the mapping's iteration order; ``'trash'`` is
    returned when no entry contains *category*.
    """
    matching_keys = (
        cluster_name
        for cluster_name, members in label_clustering.items()
        if category in members
    )
    return next(matching_keys, 'trash')

def get_metric_per_view(row, metric):
    """Return ``row[metric]`` divided by ``row['view_count']``.

    *row* only needs to support item access for *metric* and ``'view_count'``.
    """
    metric_value = row[metric]
    views = row['view_count']
    return metric_value / views

In [None]:
# Select durations and view limits
# Both bounds are compared against 'duration_in_min' and are exclusive.
min_duration = 0.5 # in minutes
max_duration = 1000
total_df = total_df[(total_df['duration_in_min'] > min_duration) & (total_df['duration_in_min'] < max_duration)]

min_views = 1000
# .copy() makes the filtered frame independent, so the column assignments below
# act on a real frame instead of a slice (avoids SettingWithCopyWarning).
total_df = total_df[total_df['view_count'] > min_views].copy()

# Add duration bin column
duration_bins = [0, 5, 10, 15, 20, 25, 30, 40, 50, 60, 90, 120, float('inf')]
duration_bin_titles = ['0-5', '5-10', '10-15', '15-20', '20-25', '25-30', '30-40',
                       '40-50', '50-60', '60-90', '90-120', '>120']

# Create duration_bin column
# right=False: each bin includes its lower edge and excludes its upper edge.
total_df['duration_bin'] = pd.cut(total_df['duration_in_min'], bins=duration_bins, labels=duration_bin_titles, right=False)

# Add label and year columns
category_ch_map = channels_df.set_index('channel')['category_cc'].to_dict()
# Raw category per display_id; not used by the cells visible here, kept so the
# notebook namespace stays unchanged.
category_lab_int_map = classified_df.set_index('display_id')['category']
# Translate each raw category to its cluster name ('trash' when unmatched).
classified_df['category_name'] = classified_df['category'].apply(find_key)
category_lab_str_map = classified_df.set_index('display_id')['category_name']

total_df['channel_category'] = total_df['channel_id'].map(category_ch_map)
total_df['classified_category'] = total_df['display_id'].map(category_lab_str_map)
# The first four characters of upload_date are taken as the year.
total_df['upload_year'] = total_df['upload_date'].apply(lambda x : int(x[0:4]))

# Drop NaN + 'nan' in channel categories
# dropna() removes rows with a missing value in ANY column, which includes
# videos that found no match in either category mapping above.
total_df = total_df.dropna()
total_df = total_df[total_df['channel_category'] != 'nan']

# select time period
period = [2015, 2016, 2017, 2018]
period_data = total_df[total_df['upload_year'].isin(period)]

# Remove trash videos
# .copy() again, because new columns are assigned to period_data right after.
period_data = period_data[period_data['classified_category'] != 'trash'].copy()

# Add metric/view
# get_metric_per_view only indexes and divides, so passing the whole frame
# computes the ratio column-wise: no per-row Python call, and it also works
# when period_data is empty.
period_data['likes_per_view'] = get_metric_per_view(period_data, 'like_count')
period_data['dislikes_per_view'] = get_metric_per_view(period_data, 'dislike_count')

print(len(period_data))

In [None]:
# Write the cleaned, period-filtered data to data/derivatives/clean_viva.csv
clean_csv_path = os.path.join('data', 'derivatives', 'clean_viva.csv')
period_data.to_csv(clean_csv_path)

In [None]:
# Plot bin histogram
# Number of videos in each duration bin of the cleaned period data.
fig = plt.figure(figsize=(10, 5))
sns.histplot(x='duration_bin', data=period_data)
plt.gca().set(
    title='Distribution of Video Duration in Total Data',
    xlabel='Duration [min]',
    ylabel='Video Count',
)
# Optional alternatives (disabled): log-scaled counts, or counts in units of 10^5
# plt.yscale('log')
# plt.ylabel('Video Count (x10^5)')
# plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f'{x / 1e5:.1f}'))

fig.tight_layout()