## 0.3. Interim Data Initial Exploration

This notebook provides a brief exploration of the interim dataset obtained from `youtube_trends/dataset.py` and saved in `data/interim/dataset.csv`. This exploration was performed to determine the techniques and tools to use during data processing for future analysis. The data processing stage can also be found in `youtube_trends/dataset.py`.

In [1]:
import re
import torch
import warnings
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm
from IPython.display import HTML
from IPython.display import display
from scipy.stats import gaussian_kde
from sklearn.decomposition import PCA
from plotly.subplots import make_subplots
from sklearn.preprocessing import MinMaxScaler
from dateutil.relativedelta import relativedelta
from concurrent.futures import ThreadPoolExecutor
from youtube_trends.config import INTERIM_DATA_DIR, PROCESSED_DATA_DIR

warnings.filterwarnings('ignore')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[32m2025-05-16 21:26:38.976[0m | [1mINFO    [0m | [36myoutube_trends.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\eddel\OneDrive\Documents\MCD\AAA\youtube_trends\venv\src\youtube-trends[0m


Checking the quiality of data in `df_train`, `df_val` and `df_test`.

In [33]:
df_train = pd.read_csv(INTERIM_DATA_DIR / "train_dataset.csv", low_memory=False)
df_val = pd.read_csv(INTERIM_DATA_DIR / "val_dataset.csv", low_memory=False)
df_test = pd.read_csv(INTERIM_DATA_DIR / "test_dataset.csv", low_memory=False)

In [3]:
display(df_train)

Unnamed: 0,video_published_at,video_duration,video_view_count,video_like_count,video_comment_count,channel_view_count,channel_subscriber_count,published_dayofweek,published_hour,days_to_trend,...,video_title_language_sv,video_title_language_sw,video_title_language_tl,video_title_language_tr,video_title_language_unknown,video_title_language_vi,video_category_pca_0,video_category_pca_1,video_category_pca_2,video_category_pca_3
0,2025-03-19 12:30:12,60.0,9075151.0,228504.0,119.0,5.082500e+09,9620000.0,2,12,10,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.473435,0.714817,-0.309635,-0.096183
1,2025-03-19 12:30:12,60.0,9345171.0,233878.0,121.0,5.085156e+09,9620000.0,2,12,12,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.473435,0.714817,-0.309635,-0.096183
2,2025-03-19 12:30:12,60.0,5704659.0,138572.0,72.0,5.070373e+09,9610000.0,2,12,3,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.473435,0.714817,-0.309635,-0.096183
3,2025-03-19 12:30:12,60.0,3459131.0,79531.0,46.0,5.067204e+09,9600000.0,2,12,2,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.473435,0.714817,-0.309635,-0.096183
4,2025-03-19 12:30:12,60.0,10228951.0,253555.0,129.0,5.096864e+09,9630000.0,2,12,19,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.473435,0.714817,-0.309635,-0.096183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271640,2025-04-10 21:18:05,660.0,128084.0,3259.0,454.0,1.601386e+08,358000.0,3,21,2,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.282365,-0.088570,0.739837,-0.551516
271641,2025-04-10 21:18:05,660.0,98587.0,2777.0,389.0,1.599977e+08,358000.0,3,21,0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.282365,-0.088570,0.739837,-0.551516
271642,2025-04-10 21:18:05,660.0,98412.0,2776.0,389.0,1.599977e+08,358000.0,3,21,0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.282365,-0.088570,0.739837,-0.551516
271643,2025-04-10 21:18:05,660.0,120471.0,3122.0,440.0,1.600908e+08,358000.0,3,21,1,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.282365,-0.088570,0.739837,-0.551516


In [4]:
display(df_val)

Unnamed: 0,video_published_at,video_duration,video_view_count,video_like_count,video_comment_count,channel_view_count,channel_subscriber_count,published_dayofweek,published_hour,days_to_trend,...,video_title_language_sv,video_title_language_sw,video_title_language_tl,video_title_language_tr,video_title_language_unknown,video_title_language_vi,video_category_pca_0,video_category_pca_1,video_category_pca_2,video_category_pca_3
0,2025-04-10 21:18:05,660.0,134505.0,3342.0,461.0,160443341.0,358000.0,3,21,5,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.282365,-0.088570,0.739837,-0.551516
1,2025-04-10 21:18:05,660.0,134505.0,3342.0,461.0,160443341.0,358000.0,3,21,5,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.282365,-0.088570,0.739837,-0.551516
2,2025-04-10 21:18:05,660.0,128080.0,3259.0,454.0,160138604.0,358000.0,3,21,2,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.282365,-0.088570,0.739837,-0.551516
3,2025-04-10 21:18:37,171.0,359909.0,9059.0,825.0,85008981.0,525000.0,3,21,13,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.411025,-0.690400,-0.454210,-0.118053
4,2025-04-10 21:18:37,171.0,297385.0,8654.0,794.0,84050367.0,521000.0,3,21,8,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.411025,-0.690400,-0.454210,-0.118053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58211,2025-04-17 12:01:27,28.0,1304269.0,30991.0,104.0,901086919.0,2070000.0,3,12,5,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.245858,-0.062659,0.329247,0.743482
58212,2025-04-17 12:01:33,62.0,361452.0,19420.0,117.0,478672644.0,1730000.0,3,12,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.738773,0.031086,-0.069742,-0.032040
58213,2025-04-17 12:01:33,62.0,544466.0,28039.0,144.0,479543684.0,1730000.0,3,12,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.738773,0.031086,-0.069742,-0.032040
58214,2025-04-17 12:01:33,62.0,553162.0,28363.0,144.0,479693133.0,1730000.0,3,12,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.738773,0.031086,-0.069742,-0.032040


In [5]:
display(df_test)

Unnamed: 0,video_published_at,video_duration,video_view_count,video_like_count,video_comment_count,channel_view_count,channel_subscriber_count,published_dayofweek,published_hour,days_to_trend,...,video_title_language_sv,video_title_language_sw,video_title_language_tl,video_title_language_tr,video_title_language_unknown,video_title_language_vi,video_category_pca_0,video_category_pca_1,video_category_pca_2,video_category_pca_3
0,2025-04-17 12:01:33,62.0,300117.0,16302.0,104.0,478357091.0,1730000.0,3,12,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.738773,0.031086,-0.069742,-0.03204
1,2025-04-17 12:01:33,62.0,511110.0,26760.0,135.0,479297744.0,1730000.0,3,12,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.738773,0.031086,-0.069742,-0.03204
2,2025-04-17 12:01:33,62.0,198056.0,11222.0,68.0,477915510.0,1730000.0,3,12,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.738773,0.031086,-0.069742,-0.03204
3,2025-04-17 12:01:35,1131.0,334192.0,19842.0,421.0,77013861.0,215000.0,3,12,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.738773,0.031086,-0.069742,-0.03204
4,2025-04-17 12:01:35,1131.0,361084.0,20553.0,426.0,77194021.0,216000.0,3,12,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.738773,0.031086,-0.069742,-0.03204
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58214,2025-04-29 23:39:17,1200.0,494390.0,27101.0,1610.0,190363872.0,3270000.0,1,23,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.738773,0.031086,-0.069742,-0.03204
58215,2025-04-29 23:39:17,1200.0,494385.0,27085.0,1610.0,190363872.0,3270000.0,1,23,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.738773,0.031086,-0.069742,-0.03204
58216,2025-04-29 23:39:17,1200.0,494385.0,27085.0,1610.0,190363872.0,3270000.0,1,23,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.738773,0.031086,-0.069742,-0.03204
58217,2025-04-29 23:39:17,1200.0,494385.0,27082.0,1610.0,190363872.0,3270000.0,1,23,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.738773,0.031086,-0.069742,-0.03204


In [None]:
for column in df_train.columns:
    print(column)

Removing duplicated values and recreating `df_train`, `df_val` and `df_test`.

In [6]:
df = pd.concat([df_train, df_val, df_test], axis=0, ignore_index=True)
df = df.drop_duplicates()

In [7]:
df = df.sort_values(by='video_published_at')

train_end = int(len(df) * 0.7)
val_end = int(len(df) * 0.85)

df_train = df.iloc[:train_end]
df_val = df.iloc[train_end:val_end]
df_test = df.iloc[val_end:]

df_train.reset_index(drop=True, inplace=True)
df_val.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [8]:
display(df_train)

Unnamed: 0,video_published_at,video_duration,video_view_count,video_like_count,video_comment_count,channel_view_count,channel_subscriber_count,published_dayofweek,published_hour,days_to_trend,...,video_title_language_sv,video_title_language_sw,video_title_language_tl,video_title_language_tr,video_title_language_unknown,video_title_language_vi,video_category_pca_0,video_category_pca_1,video_category_pca_2,video_category_pca_3
0,2025-03-19 12:30:12,60.0,9075151.0,228504.0,119.0,5.082500e+09,9620000.0,2,12,10,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.473435,0.714817,-0.309635,-0.096183
1,2025-03-19 12:30:12,60.0,9431677.0,235578.0,122.0,5.086626e+09,9620000.0,2,12,13,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.473435,0.714817,-0.309635,-0.096183
2,2025-03-19 12:30:12,60.0,8578672.0,217814.0,112.0,5.077640e+09,9620000.0,2,12,6,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.473435,0.714817,-0.309635,-0.096183
3,2025-03-19 12:30:12,60.0,10305804.0,255262.0,129.0,5.098385e+09,9630000.0,2,12,20,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.473435,0.714817,-0.309635,-0.096183
4,2025-03-19 12:30:12,60.0,9546141.0,238542.0,123.0,5.087882e+09,9620000.0,2,12,14,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.473435,0.714817,-0.309635,-0.096183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130222,2025-04-11 09:00:07,135.0,853255.0,15395.0,7145.0,1.220683e+08,162000.0,4,9,17,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.411025,-0.690400,-0.454210,-0.118053
130223,2025-04-11 09:00:07,143.0,506394.0,6333.0,264.0,9.929313e+08,822000.0,4,9,5,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.411025,-0.690400,-0.454210,-0.118053
130224,2025-04-11 09:00:07,143.0,646582.0,6938.0,277.0,9.936595e+08,822000.0,4,9,8,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.411025,-0.690400,-0.454210,-0.118053
130225,2025-04-11 09:00:07,143.0,506386.0,6333.0,264.0,9.929313e+08,822000.0,4,9,5,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.411025,-0.690400,-0.454210,-0.118053


In [9]:
display(df_val)

Unnamed: 0,video_published_at,video_duration,video_view_count,video_like_count,video_comment_count,channel_view_count,channel_subscriber_count,published_dayofweek,published_hour,days_to_trend,...,video_title_language_sv,video_title_language_sw,video_title_language_tl,video_title_language_tr,video_title_language_unknown,video_title_language_vi,video_category_pca_0,video_category_pca_1,video_category_pca_2,video_category_pca_3
0,2025-04-11 09:00:07,135.0,190418.0,11268.0,6282.0,116893584.0,159000.0,4,9,0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.411025,-0.690400,-0.454210,-0.118053
1,2025-04-11 09:00:07,135.0,517706.0,14028.0,7044.0,119496239.0,160000.0,4,9,5,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.411025,-0.690400,-0.454210,-0.118053
2,2025-04-11 09:00:07,135.0,429518.0,13454.0,6898.0,118139416.0,160000.0,4,9,3,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.411025,-0.690400,-0.454210,-0.118053
3,2025-04-11 09:00:07,143.0,799801.0,7468.0,285.0,994502818.0,822000.0,4,9,12,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.411025,-0.690400,-0.454210,-0.118053
4,2025-04-11 09:00:07,143.0,799811.0,7468.0,285.0,994502818.0,822000.0,4,9,12,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.411025,-0.690400,-0.454210,-0.118053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27902,2025-04-17 17:00:06,166.0,89108.0,4467.0,630.0,81058112.0,142000.0,3,17,2,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.411025,-0.690400,-0.454210,-0.118053
27903,2025-04-17 17:00:06,4334.0,87576.0,2972.0,135.0,57241349.0,128000.0,3,17,7,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.231205,-0.054801,0.258836,0.345655
27904,2025-04-17 17:00:06,4334.0,92139.0,3032.0,137.0,57311462.0,128000.0,3,17,10,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.231205,-0.054801,0.258836,0.345655
27905,2025-04-17 17:00:06,4334.0,89306.0,2989.0,135.0,57269882.0,128000.0,3,17,8,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.231205,-0.054801,0.258836,0.345655


In [10]:
display(df_test)

Unnamed: 0,video_published_at,video_duration,video_view_count,video_like_count,video_comment_count,channel_view_count,channel_subscriber_count,published_dayofweek,published_hour,days_to_trend,...,video_title_language_sv,video_title_language_sw,video_title_language_tl,video_title_language_tr,video_title_language_unknown,video_title_language_vi,video_category_pca_0,video_category_pca_1,video_category_pca_2,video_category_pca_3
0,2025-04-17 17:00:06,166.0,169198.0,5478.0,729.0,81905201.0,142000.0,3,17,11,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.411025,-0.690400,-0.454210,-0.118053
1,2025-04-17 17:00:06,4334.0,94562.0,3066.0,139.0,57348201.0,128000.0,3,17,12,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.231205,-0.054801,0.258836,0.345655
2,2025-04-17 17:00:08,171.0,1108951.0,13607.0,313.0,394182210.0,393000.0,3,17,11,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.411025,-0.690400,-0.454210,-0.118053
3,2025-04-17 17:00:08,171.0,1182751.0,13976.0,319.0,394182210.0,393000.0,3,17,12,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.411025,-0.690400,-0.454210,-0.118053
4,2025-04-17 17:00:08,171.0,234037.0,7686.0,213.0,390087497.0,391000.0,3,17,1,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.411025,-0.690400,-0.454210,-0.118053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27901,2025-04-29 23:38:45,2019.0,72380.0,5500.0,387.0,23682586.0,166000.0,1,23,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.738773,0.031086,-0.069742,-0.032040
27902,2025-04-29 23:39:17,1200.0,494385.0,27085.0,1610.0,190363872.0,3270000.0,1,23,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.738773,0.031086,-0.069742,-0.032040
27903,2025-04-29 23:39:17,1200.0,494385.0,27085.0,1610.0,190363872.0,3270000.0,1,23,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.738773,0.031086,-0.069742,-0.032040
27904,2025-04-29 23:39:17,1200.0,494390.0,27101.0,1610.0,190363872.0,3270000.0,1,23,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.738773,0.031086,-0.069742,-0.032040


#### Title columns

In [11]:
df_train_title = df_train.filter(like='video_title_')
for column in list(df_train_title.columns):
    print(column)

video_title_length
video_title_clean
video_title_translated
video_title_language_af
video_title_language_ca
video_title_language_cs
video_title_language_cy
video_title_language_da
video_title_language_de
video_title_language_en
video_title_language_es
video_title_language_et
video_title_language_fi
video_title_language_fr
video_title_language_hr
video_title_language_hu
video_title_language_id
video_title_language_it
video_title_language_lt
video_title_language_lv
video_title_language_nl
video_title_language_no
video_title_language_pl
video_title_language_pt
video_title_language_ro
video_title_language_sk
video_title_language_sl
video_title_language_so
video_title_language_sq
video_title_language_sv
video_title_language_sw
video_title_language_tl
video_title_language_tr
video_title_language_unknown
video_title_language_vi


In [12]:
df_train = df_train.drop(['video_title_clean', 'video_title_translated'], axis=1)
df_val = df_val.drop(['video_title_clean', 'video_title_translated'], axis=1)
df_test = df_test.drop(['video_title_clean', 'video_title_translated'], axis=1)

In [13]:
df_train_title = df_train.filter(like='video_title_')
for column in list(df_train_title.columns):
    print(column)

video_title_length
video_title_language_af
video_title_language_ca
video_title_language_cs
video_title_language_cy
video_title_language_da
video_title_language_de
video_title_language_en
video_title_language_es
video_title_language_et
video_title_language_fi
video_title_language_fr
video_title_language_hr
video_title_language_hu
video_title_language_id
video_title_language_it
video_title_language_lt
video_title_language_lv
video_title_language_nl
video_title_language_no
video_title_language_pl
video_title_language_pt
video_title_language_ro
video_title_language_sk
video_title_language_sl
video_title_language_so
video_title_language_sq
video_title_language_sv
video_title_language_sw
video_title_language_tl
video_title_language_tr
video_title_language_unknown
video_title_language_vi


In [14]:
def plot_distribution(df, column, color="#636EFA"):
    values = df[column].dropna()
        
    kde = gaussian_kde(values)
    x_range = np.linspace(values.min(), values.max(), 200)
    y_values = kde(x_range)

    fig = make_subplots(
        rows=2, cols=1,
        shared_xaxes=True,
        row_heights=[0.7, 0.3],
        vertical_spacing=0.05,
        subplot_titles=(f"{column} - KDE", f"{column} - Boxplot")
    )

    fig.add_trace(go.Histogram(
        x=values,
        name=f"{column} Histograma",
        marker_color=color,
        opacity=0.75
    ), row=1, col=1)

    fig.add_trace(go.Box(
        x=values,
        name=f"{column} Boxplot",
        marker_color=color,
        boxmean=True,
        orientation='h'
    ), row=2, col=1)

    fig.update_layout(
        height=500,
        width=700,
        title_text=f"Distribution and Boxplot for {column}",
        template="plotly_white"
    )

    fig.update_xaxes(title_text="Value", row=2, col=1)
    fig.update_yaxes(title_text="Density", row=1, col=1)

    fig.show()

In [15]:
plot_distribution(df_train_title, 'video_title_length')

In [16]:
df_train_title = df_train_title.drop(['video_title_length'], axis=1)

In [17]:
train_lang_videos = df_train_title.sum(numeric_only=True).copy()
train_lang_videos = train_lang_videos[train_lang_videos.index.str.startswith('video_title_language_')]
train_lang_videos.index = train_lang_videos.index.str.replace('video_title_language_', '', regex=False)
df_train_lang = train_lang_videos.reset_index()
df_train_lang.columns = ['language', 'videos']
df_train_lang = df_train_lang.sort_values(by='videos', ascending=False)

fig = px.bar(df_train_lang, x='language', y='videos', title='Amount of videos per detected language')
fig.update_layout(xaxis_title='Language', yaxis_title='Videos', template='plotly_white')
fig.show()

In [18]:
df_train_lang['percentage'] = round(df_train_lang['videos'] / len(df_train_title), 2)

fig = px.bar(df_train_lang, x='language', y='percentage', title='Percentage of videos in each detected language')
fig.update_layout(xaxis_title='Language', yaxis_title='Videos', template='plotly_white')
fig.show()

PCA will be applied to reduced dimensionality, since there are some detected languages that are neglilible.

In [None]:
def reduce_language_pca(df_train, df_val, df_test, pca_variance_target=0.8, pca_max_components=10):
    lang_cols = [col for col in df_train.columns if str(col).startswith('video_title_language_')]

    df_train = df_train.dropna(subset=lang_cols)
    df_val = df_val.dropna(subset=lang_cols)
    df_test = df_test.dropna(subset=lang_cols)

    X_train = df_train[lang_cols].values
    X_val = df_val[lang_cols].values
    X_test = df_test[lang_cols].values

    cumulative = np.cumsum(PCA().fit(X_train).explained_variance_ratio_)
    n_components = np.argmax(cumulative >= pca_variance_target) + 1
    
    n_components = min(pca_max_components, n_components)

    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_val_pca = pca.transform(X_val)
    X_test_pca = pca.transform(X_test)

    pca_cols = [f'lang_pca_{i}' for i in range(n_components)]

    df_train_pca = pd.DataFrame(X_train_pca, columns=pca_cols, index=df_train.index)
    df_val_pca = pd.DataFrame(X_val_pca, columns=pca_cols, index=df_val.index)
    df_test_pca = pd.DataFrame(X_test_pca, columns=pca_cols, index=df_test.index)

    df_train = pd.concat([df_train.drop(columns=lang_cols), df_train_pca], axis=1)
    df_val = pd.concat([df_val.drop(columns=lang_cols), df_val_pca], axis=1)
    df_test = pd.concat([df_test.drop(columns=lang_cols), df_test_pca], axis=1)

    return df_train, df_val, df_test, pca

In [23]:
df_train, df_val, df_test, language_pca = reduce_language_pca(df_train, df_val, df_test)

In [26]:
display(df_train.filter(like='lang_pca_'))

Unnamed: 0,lang_pca_0,lang_pca_1,lang_pca_2,lang_pca_3,lang_pca_4,lang_pca_5,lang_pca_6,lang_pca_7,lang_pca_8,lang_pca_9
0,0.630230,0.082127,0.011908,0.000667,-0.001832,-0.003664,0.004847,0.003126,0.001352,0.000007
1,-0.645268,0.691893,0.044201,0.002208,-0.006004,-0.011748,0.015098,0.009484,0.004019,0.000021
2,-0.645268,0.691893,0.044201,0.002208,-0.006004,-0.011748,0.015098,0.009484,0.004019,0.000021
3,-0.645268,0.691893,0.044201,0.002208,-0.006004,-0.011748,0.015098,0.009484,0.004019,0.000021
4,-0.645268,0.691893,0.044201,0.002208,-0.006004,-0.011748,0.015098,0.009484,0.004019,0.000021
...,...,...,...,...,...,...,...,...,...,...
130222,0.630230,0.082127,0.011908,0.000667,-0.001832,-0.003664,0.004847,0.003126,0.001352,0.000007
130223,-0.333936,-0.285876,-0.177380,-0.034585,0.122728,0.717304,0.586194,0.149858,0.044043,0.000200
130224,-0.333936,-0.285876,-0.177380,-0.034585,0.122728,0.717304,0.586194,0.149858,0.044043,0.000200
130225,-0.333936,-0.285876,-0.177380,-0.034585,0.122728,0.717304,0.586194,0.149858,0.044043,0.000200


In [27]:
display(df_val.filter(like='lang_pca_'))

Unnamed: 0,lang_pca_0,lang_pca_1,lang_pca_2,lang_pca_3,lang_pca_4,lang_pca_5,lang_pca_6,lang_pca_7,lang_pca_8,lang_pca_9
0,0.630230,0.082127,0.011908,0.000667,-0.001832,-0.003664,0.004847,0.003126,0.001352,0.000007
1,0.630230,0.082127,0.011908,0.000667,-0.001832,-0.003664,0.004847,0.003126,0.001352,0.000007
2,0.630230,0.082127,0.011908,0.000667,-0.001832,-0.003664,0.004847,0.003126,0.001352,0.000007
3,-0.333936,-0.285876,-0.177380,-0.034585,0.122728,0.717304,0.586194,0.149858,0.044043,0.000200
4,-0.333936,-0.285876,-0.177380,-0.034585,0.122728,0.717304,0.586194,0.149858,0.044043,0.000200
...,...,...,...,...,...,...,...,...,...,...
27902,-0.325397,-0.265472,-0.139378,-0.018144,0.056656,0.163949,-0.554448,0.768687,0.096090,0.000366
27903,-0.645268,0.691893,0.044201,0.002208,-0.006004,-0.011748,0.015098,0.009484,0.004019,0.000021
27904,-0.645268,0.691893,0.044201,0.002208,-0.006004,-0.011748,0.015098,0.009484,0.004019,0.000021
27905,-0.645268,0.691893,0.044201,0.002208,-0.006004,-0.011748,0.015098,0.009484,0.004019,0.000021


In [28]:
display(df_test.filter(like='lang_pca_'))

Unnamed: 0,lang_pca_0,lang_pca_1,lang_pca_2,lang_pca_3,lang_pca_4,lang_pca_5,lang_pca_6,lang_pca_7,lang_pca_8,lang_pca_9
0,-0.325397,-0.265472,-0.139378,-0.018144,0.056656,0.163949,-0.554448,0.768687,0.096090,0.000366
1,-0.645268,0.691893,0.044201,0.002208,-0.006004,-0.011748,0.015098,0.009484,0.004019,0.000021
2,0.630230,0.082127,0.011908,0.000667,-0.001832,-0.003664,0.004847,0.003126,0.001352,0.000007
3,0.630230,0.082127,0.011908,0.000667,-0.001832,-0.003664,0.004847,0.003126,0.001352,0.000007
4,0.630230,0.082127,0.011908,0.000667,-0.001832,-0.003664,0.004847,0.003126,0.001352,0.000007
...,...,...,...,...,...,...,...,...,...,...
27901,-0.343326,-0.310771,-0.247795,-0.622538,-0.569655,-0.284901,0.186440,0.081487,0.028151,0.000135
27902,0.630230,0.082127,0.011908,0.000667,-0.001832,-0.003664,0.004847,0.003126,0.001352,0.000007
27903,0.630230,0.082127,0.011908,0.000667,-0.001832,-0.003664,0.004847,0.003126,0.001352,0.000007
27904,0.630230,0.082127,0.011908,0.000667,-0.001832,-0.003664,0.004847,0.003126,0.001352,0.000007


#### Thumbnail stats columns

In [29]:
df_train_stats = df_train.filter(like='thumbnail_')
display(df_train_stats)

Unnamed: 0,thumbnail_brightness,thumbnail_contrast,thumbnail_saturation
0,0.401444,0.490360,0.173942
1,0.401444,0.490360,0.173942
2,0.401444,0.490360,0.173942
3,0.401444,0.490360,0.173942
4,0.401444,0.490360,0.173942
...,...,...,...
130222,0.383354,0.563104,0.406289
130223,0.107356,0.373082,0.279782
130224,0.107356,0.373082,0.279782
130225,0.107356,0.373082,0.279782


In [30]:
df_train_stats.describe()

Unnamed: 0,thumbnail_brightness,thumbnail_contrast,thumbnail_saturation
count,130227.0,130227.0,130227.0
mean,0.40542,0.569546,0.401801
std,0.130601,0.12562,0.153554
min,0.0,0.0,0.0
25%,0.327867,0.489312,0.293722
50%,0.405627,0.568635,0.401346
75%,0.480718,0.659145,0.50284
max,1.0,1.0,1.0


In [31]:
for stat in df_train_stats.columns:
    plot_distribution(df_train_stats, stat)

Columns remained

In [32]:
for column in df_train.columns:
    print(column)

video_published_at
video_duration
video_view_count
video_like_count
video_comment_count
channel_view_count
channel_subscriber_count
published_dayofweek
published_hour
days_to_trend
video_title_length
video_tag_count
thumbnail_brightness
thumbnail_contrast
thumbnail_saturation
thumb_pca_0
thumb_pca_1
thumb_pca_2
thumb_pca_3
thumb_pca_4
thumb_pca_5
thumb_pca_6
thumb_pca_7
thumb_pca_8
thumb_pca_9
thumb_pca_10
thumb_pca_11
thumb_pca_12
thumb_pca_13
thumb_pca_14
thumb_pca_15
thumb_pca_16
thumb_pca_17
thumb_pca_18
thumb_pca_19
thumb_pca_20
thumb_pca_21
thumb_pca_22
thumb_pca_23
thumb_pca_24
thumb_pca_25
thumb_pca_26
thumb_pca_27
thumb_pca_28
thumb_pca_29
thumb_pca_30
thumb_pca_31
thumb_pca_32
thumb_pca_33
thumb_pca_34
thumb_pca_35
thumb_pca_36
thumb_pca_37
thumb_pca_38
thumb_pca_39
10
100
1000
10000
100000
11
12
14
1446
15
150
19
20
202425
2025
2026
21
210m
22
24
2425
25
26
27
28
30
41
4k
abandonado
action
acts
actually
ad
aerated
afford
after
ahead
al
all
an
and
andreygrechka
animation
anti