# Video Classification - HOT or NOT

### Import stuff

In [124]:

# Check the versions of libraries
 
# Python version
import sys
print('Python: {}'.format(sys.version))
# scipy
import scipy
print('scipy: {}'.format(scipy.__version__))
# numpy
import numpy as np
print('numpy: {}'.format(np.__version__))
# matplotlib
import matplotlib
print('matplotlib: {}'.format(matplotlib.__version__))
# pandas
import pandas as pd
print('pandas: {}'.format(pd.__version__))
# scikit-learn
import sklearn
print('sklearn: {}'.format(sklearn.__version__))


# Import more stuff
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import seaborn as sb
import os
import datetime as dt
import random as rd

Python: 3.8.3 (default, Jul  2 2020, 17:30:36) [MSC v.1916 64 bit (AMD64)]
scipy: 1.5.0
numpy: 1.18.5
matplotlib: 3.2.2
pandas: 1.0.5
sklearn: 0.23.1


### Import Data

In [125]:
path_folder = r"..\\data\\"

df_vc = pd.read_csv(path_folder+'video_count.csv')#.set_index('video_id')
df_vf = pd.read_csv(path_folder+'video_features.csv')#.set_index('video_id')

# Convert Quality Value for str to int
df_vf['video_quality'] = df_vf['video_quality'].apply(lambda x: int(x[:-1]))

df_vc.head(3)

Unnamed: 0,video_id,date,count
0,3,27/01/2018,6
1,3,01/02/2018,6
2,3,31/01/2018,6


### Clean and format

In [126]:
# Convert str into datetime type
df_vc['date'] = pd.to_datetime(df_vc.date, format='%d/%m/%Y')
df_vf['video_upload_date'] = pd.to_datetime(df_vf.video_upload_date, format='%d/%m/%Y')

# Create Calculated Columns
df_vc['week_start_date'] = df_vc.date - pd.to_timedelta((df_vc['date'].dt.dayofweek + 1) % 7, unit='d')


df_vf.head()

Unnamed: 0,video_id,video_length,video_language,video_upload_date,video_quality
0,1,16,chineese,2017-09-11,480
1,2,27,spanish,2017-10-03,480
2,3,30,spanish,2017-10-06,240
3,4,15,spanish,2017-10-12,720
4,5,19,chineese,2017-09-14,720


### Add Running Total

In [127]:
df_vc_rt = df_vc[['video_id', 'date', 'count']].sort_values('date').groupby(['video_id', 'date']).sum().groupby(level=0).cumsum().reset_index()
df_vc_rt.rename(columns={'count':'Running_Total'}, inplace=True)
df_vc_rt.tail(200)

df_vc = df_vc.merge(df_vc_rt, left_on=['video_id', 'date'], right_on=['video_id', 'date'])
df_vc.head()

Unnamed: 0,video_id,date,count,week_start_date,Running_Total
0,3,2018-01-27,6,2018-01-21,3396
1,3,2018-02-01,6,2018-01-28,3439
2,3,2018-01-31,6,2018-01-28,3433
3,3,2018-01-25,7,2018-01-21,3378
4,3,2018-02-03,7,2018-01-28,3454


### Calculate days from upload to each day

In [128]:
df_vc_upload = df_vc.merge(df_vf[['video_id', 'video_upload_date']], left_on='video_id', right_on='video_id')
df_vc_upload['days_from_upload'] = (df_vc_upload.date - df_vc_upload.video_upload_date).dt.days
# df_vc_upload.to_excel(path_folder+'video_vc_RT.xlsx', index=False)
df_vc = df_vc_upload
df_vc_upload.head(1)

Unnamed: 0,video_id,date,count,week_start_date,Running_Total,video_upload_date,days_from_upload
0,3,2018-01-27,6,2018-01-21,3396,2017-10-06,113


# Adding Calculated Dimensions

## 0 - Offset from weekly average

### 0.1 - Weekly count per Video

In [102]:
df_weekly = df_vc.groupby(['week_start_date', 'video_id'])['count'].sum()
df_weekly = pd.DataFrame(df_weekly).reset_index()
print(df_weekly.shape)
df_weekly.head()

(1813, 3)


Unnamed: 0,week_start_date,video_id,count
0,2017-09-03,13,736
1,2017-09-03,16,260
2,2017-09-03,32,177
3,2017-09-03,70,838
4,2017-09-03,73,538


### 0.2 - Calculate Weekly AVG

In [103]:
df_wk_avg = df_weekly[['week_start_date', 'count']].groupby('week_start_date')['count'].mean()
df_wk_avg.head(3)

week_start_date
2017-09-03    391.625000
2017-09-10    487.347826
2017-09-17    578.162162
Name: count, dtype: float64

### 0.3 - Calculate Weekly AVGs offset

In [105]:
df_joined_with_avg = df_weekly.merge(df_wk_avg, right_on='week_start_date', left_on='week_start_date', suffixes=('', '_week_AVG'))

# Create Calculated Column: Difference from WAVG
df_joined_with_avg['Avg_Offset'] = df_joined_with_avg['count'] - df_joined_with_avg.count_week_AVG.apply(lambda x: int(x))
df_joined_with_avg['Prc_Avg_Offset'] = (df_joined_with_avg['Avg_Offset'] / df_joined_with_avg['count_week_AVG'])
df_joined_with_avg['is_Above_Avg'] = (df_joined_with_avg['Prc_Avg_Offset'] > 0).apply(lambda x: int(x))
print(df_joined_with_avg.shape)
df_joined_with_avg.head(3)
df_joined_with_avg.sort_values('video_id').head(5)


(1813, 7)


Unnamed: 0,week_start_date,video_id,count,count_week_AVG,Avg_Offset,Prc_Avg_Offset,is_Above_Avg
1240,2017-12-24,1,621,451.74,170,0.376323,1
840,2017-11-26,1,647,509.93,138,0.270625,1
343,2017-10-22,1,805,594.989691,211,0.354628,1
740,2017-11-19,1,775,549.1,226,0.411583,1
1140,2017-12-17,1,628,475.11,153,0.322031,1


### 0.4 Avg the Prc Offset

In [106]:
new_measure_name = 'Avg_Offset_Prc'

df = df_joined_with_avg[['video_id', 'Prc_Avg_Offset']]
df = df.groupby(['video_id'])['Prc_Avg_Offset'].mean()
df = pd.DataFrame(df).reset_index()[['video_id', 'Prc_Avg_Offset']]
df.rename(columns={'Prc_Avg_Offset':new_measure_name}, inplace=True)
df[f'{new_measure_name}_Rnk'] = df[new_measure_name].rank(method='dense', ascending=False).astype(int)

df_avg_offset = df
print(df.shape)
df.head()

(100, 3)


Unnamed: 0,video_id,Avg_Offset_Prc,Avg_Offset_Prc_Rnk
0,1,0.375732,15
1,2,-0.3212,78
2,3,-0.626815,100
3,4,0.679774,4
4,5,0.264207,26


### 1. Total count per Video

In [107]:
new_measure_name = 'Total_Count'

df = df_vc
df = df.groupby(['video_id'])['count'].sum()
df = pd.DataFrame(df).reset_index()[['video_id', 'count']]
df.rename(columns={'count':new_measure_name}, inplace=True)
df[f'{new_measure_name}_Rnk'] = df[new_measure_name].rank(method='dense', ascending=False).astype(int)

df_total_cnt = df
print(df.shape)
df.head()

(100, 3)


Unnamed: 0,video_id,Total_Count,Total_Count_Rnk
0,1,13197,10
1,2,6163,79
2,3,3454,100
3,4,14192,5
4,5,12083,19


### 2. Daily views Avg per Video

In [108]:
new_measure_name = 'Daily_views_Avg'

df = df_vc
df = df.groupby(['video_id'])['count'].mean()
df = pd.DataFrame(df).reset_index()[['video_id', 'count']]
df.rename(columns={'count':new_measure_name}, inplace=True)
df[f'{new_measure_name}_Rnk'] = df[new_measure_name].rank(method='dense', ascending=False).astype(int)

df_daily_avg = df
print(df.shape)
df.head()

(100, 3)


Unnamed: 0,video_id,Daily_views_Avg,Daily_views_Avg_Rnk
0,1,109.975,10
1,2,51.789916,78
2,3,29.02521,100
3,4,119.260504,7
4,5,101.537815,19


### 3. Viewed days per Video

In [109]:
new_measure_name = 'Viewed_Days_Count'

df = df_vc
df = df.groupby(['video_id']).count()
df = pd.DataFrame(df).reset_index()[['video_id', 'count']]
df.rename(columns={'count':new_measure_name}, inplace=True)
df[f'{new_measure_name}_Rnk'] = df[new_measure_name].rank(method='dense', ascending=False).astype(int)

df_days_cnt = df
print(df.shape)
df.head()

(100, 3)


Unnamed: 0,video_id,Viewed_Days_Count,Viewed_Days_Count_Rnk
0,1,120,2
1,2,119,3
2,3,119,3
3,4,119,3
4,5,119,3


### 4. Above Avg Prc Video

In [110]:
new_measure_name = 'Above_Avg_Prc'

df = df_joined_with_avg
df = df.groupby(['video_id'])['is_Above_Avg'].sum() / df.groupby(['video_id'])['is_Above_Avg'].count()
df = pd.DataFrame(df).reset_index()
df.rename(columns={'is_Above_Avg':new_measure_name}, inplace=True)
df[f'{new_measure_name}_Rnk'] = df[new_measure_name].rank(method='dense', ascending=False).astype(int)

df_above_avg = df
print(df.shape)
df.head()

(100, 3)


Unnamed: 0,video_id,Above_Avg_Prc,Above_Avg_Prc_Rnk
0,1,0.944444,2
1,2,0.0,23
2,3,0.0,23
3,4,0.944444,2
4,5,0.944444,2


# Merge to Features

In [117]:
feats_df_lst = [df_total_cnt, df_daily_avg, df_days_cnt, df_above_avg, df_avg_offset]

# Merge
df_features_enriched = df_vf
for feat in feats_df_lst:
    df_features_enriched = df_features_enriched.merge(feat, left_on='video_id', right_on='video_id', how='left')

df_features_enriched.head()

Unnamed: 0,video_id,video_length,video_language,video_upload_date,video_quality,Total_Count,Total_Count_Rnk,Daily_views_Avg,Daily_views_Avg_Rnk,Viewed_Days_Count,Viewed_Days_Count_Rnk,Above_Avg_Prc,Above_Avg_Prc_Rnk,Avg_Offset_Prc,Avg_Offset_Prc_Rnk
0,1,16,chineese,2017-09-11,480,13197,10,109.975,10,120,2,0.944444,2,0.375732,15
1,2,27,spanish,2017-10-03,480,6163,79,51.789916,78,119,3,0.0,23,-0.3212,78
2,3,30,spanish,2017-10-06,240,3454,100,29.02521,100,119,3,0.0,23,-0.626815,100
3,4,15,spanish,2017-10-12,720,14192,5,119.260504,7,119,3,0.944444,2,0.679774,4
4,5,19,chineese,2017-09-14,720,12083,19,101.537815,19,119,3,0.944444,2,0.264207,26


In [119]:
df_features_enriched.to_excel(path_folder+'Features Enriched.xlsx', index=False)