## Import Libraries

In [73]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupShuffleSplit, GridSearchCV, GroupKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor

## Data Loading

In [74]:
df = pd.read_csv("youtube_final_csv.csv")

In [75]:
df.head()

Unnamed: 0,video_id,title,topic,category_id,published_at,duration_seconds,video_definition,captions_flag,channel_id,channel_title,subscriber_count,total_videos,channel_age_days,views,likes,comments
0,0BkEej0H1VU,*SMALL* Apartment Makeover w/ Surprise REVEAL!...,Hobby,22,2025-05-15T20:50:21Z,1493,hd,False,UCtwzsvzRCfPddq2gPlgKW1w,Lone Fox,1770000,485,2810,312359,19516,949
1,nL6DN0CWRWI,transforming my apartment üè°üå∑ i got a new couch...,Hobby,26,2024-05-26T17:00:34Z,1508,hd,False,UCFGQGLGZU1TgX-PhawwxjJg,annika's leaf,1070000,308,3314,353355,14823,727
2,gAgAYyo1O0o,Scandi-Style Studio Apartment Makeover With Po...,Hobby,26,2025-10-25T15:01:20Z,1798,hd,True,UCrh9tOpAY2-Ev5pRssXq2Wg,Alexandra Gater,888000,422,4020,350306,26864,1018
3,3EJu45Tkbyc,*196 sq ft* Organic Modern Studio Apartment Ma...,Hobby,26,2025-09-20T15:00:07Z,1821,hd,True,UCrh9tOpAY2-Ev5pRssXq2Wg,Alexandra Gater,888000,422,4020,458367,33976,1458
4,cShkxcg_la0,Small APARTMENT MAKEOVER 2024 :: Budget AirBnB...,Hobby,26,2024-03-18T17:00:31Z,2203,hd,False,UCbQj1aJiioDM8g0tmGmtC_w,This Crazy Life,457000,787,3009,490393,17309,1610


In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50160 entries, 0 to 50159
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   video_id          50160 non-null  object
 1   title             50160 non-null  object
 2   topic             50160 non-null  object
 3   category_id       50160 non-null  int64 
 4   published_at      50160 non-null  object
 5   duration_seconds  50160 non-null  int64 
 6   video_definition  50160 non-null  object
 7   captions_flag     50160 non-null  bool  
 8   channel_id        50160 non-null  object
 9   channel_title     50160 non-null  object
 10  subscriber_count  50160 non-null  int64 
 11  total_videos      50160 non-null  int64 
 12  channel_age_days  50160 non-null  int64 
 13  views             50160 non-null  int64 
 14  likes             50160 non-null  int64 
 15  comments          50160 non-null  int64 
dtypes: bool(1), int64(8), object(7)
memory usage: 5.8+ MB


In [77]:
df.describe()

Unnamed: 0,category_id,duration_seconds,subscriber_count,total_videos,channel_age_days,views,likes,comments
count,50160.0,50160.0,50160.0,50160.0,50160.0,50160.0,50160.0,50160.0
mean,22.571531,2422.810825,2977674.0,5172.923405,3236.794597,2709607.0,30100.9,1119.361643
std,6.164992,6201.971156,10610330.0,28809.301109,1826.909349,32784820.0,186156.1,6548.073127
min,1.0,240.0,0.0,1.0,3.0,1000.0,0.0,0.0
25%,22.0,630.0,53500.0,213.0,1809.0,20331.25,425.0,29.0
50%,24.0,1199.0,323000.0,565.0,3121.0,138537.0,2630.0,141.0
75%,27.0,1990.0,1580000.0,1505.0,4581.0,813255.5,13872.25,596.0
max,29.0,367231.0,450000000.0,643411.0,7446.0,3580474000.0,12997250.0,550394.0


## Data Cleaning

In [78]:
df.isnull().sum()

video_id            0
title               0
topic               0
category_id         0
published_at        0
duration_seconds    0
video_definition    0
captions_flag       0
channel_id          0
channel_title       0
subscriber_count    0
total_videos        0
channel_age_days    0
views               0
likes               0
comments            0
dtype: int64

In [79]:
df.duplicated().sum()

np.int64(90)

In [80]:
# Drop duplicates
df = df.drop_duplicates(subset=['video_id'])

In [81]:
df.duplicated().sum()

np.int64(0)

In [82]:
# Basic Filters
df = df[df['views'] > 0]
df = df[df['likes'] <= df['views']]

## Feature Engineering

In [83]:
# Computing target variable (Engagement Rate)
df['engagement_rate'] = (np.log1p(df['likes']) + np.log1p(df['comments'])) / np.log1p(df['views'])

In [84]:
# Datetime conversion
df['published_at'] = pd.to_datetime(df['published_at'], errors='coerce')

df['publish_hour'] = df['published_at'].dt.hour
df['publish_dayofweek'] = df['published_at'].dt.dayofweek

df['publish_hour_sin'] = np.sin(2*np.pi*df['publish_hour']/24)
df['publish_hour_cos'] = np.cos(2*np.pi*df['publish_hour']/24)
df['publish_dow_sin'] = np.sin(2*np.pi*df['publish_dayofweek']/7)
df['publish_dow_cos'] = np.cos(2*np.pi*df['publish_dayofweek']/7)

In [85]:
# Categorical conversions
df['category_id'] = df['category_id'].astype(str)
df['video_definition'] = df['video_definition'].astype(str)
df['captions_flag'] = df['captions_flag'].astype(str)

In [86]:
# Convert Duration in seconds to minutes
df['duration_seconds'] = df['duration_seconds'].clip(5, 7200)
df['duration_minutes'] = df['duration_seconds'] / 60

In [87]:
# Resolving noisiness of subscriber count by Log-Transformation
df['subscriber_count_log'] = np.log1p(df['subscriber_count'])

# Additinal Features
df['title_length'] = df['title'].fillna("").str.len()
df['title_words'] = df['title'].fillna("").str.split().str.len()
df['title_exclamation'] = df['title'].fillna("").str.count('!')
df['title_question'] = df['title'].fillna("").str.count(r'\?')

df['is_hd'] = (df['video_definition'] == "hd").astype(int)

In [88]:
# Drop uneccessary/ leaky columns
drop_cols = [
    'video_id', 'topic', 'title', 'published_at', 'channel_title',
    'duration_seconds', 'views', 'likes', 'comments', 'subscriber_count',
    'publish_hour', 'publish_dayofweek','video_definition'
]
df.drop(columns=drop_cols, inplace=True, errors="ignore")

In [89]:
# Group Train-Test Split
target = 'engagement_rate'
categorical_features = ['category_id', 'captions_flag']

numeric_features = [
    'total_videos','channel_age_days','title_length','title_words',
    'title_exclamation','title_question',
    'duration_minutes','publish_hour_sin','publish_hour_cos',
    'publish_dow_sin','publish_dow_cos',
    'subscriber_count_log','is_hd'
]

all_features = categorical_features + numeric_features

X = df[all_features].copy()
y = df[target].copy()

groups = df['channel_id']

gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups))

X_train, X_test = X.iloc[train_idx].copy(), X.iloc[test_idx].copy()
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

groups_train = groups.iloc[train_idx].copy()

train_means = y_train.groupby(X_train['category_id']).mean()
X_train['category_te'] = X_train['category_id'].map(train_means)
X_test['category_te']  = X_test['category_id'].map(train_means)
numeric_features.append('category_te')

In [90]:
# Clipping
def robust_clip(train_col, test_col):
    q_low = train_col.quantile(0.01)
    q_high = train_col.quantile(0.99)
    
    Q1, Q3 = train_col.quantile(0.25), train_col.quantile(0.75)
    IQR = Q3 - Q1
    
    lower = max(q_low, Q1 - 1.5*IQR)
    upper = min(q_high, Q3 + 1.5*IQR)
    
    return train_col.clip(lower, upper), test_col.clip(lower, upper)

for col in ['total_videos', 'channel_age_days']:
    X_train[col], X_test[col] = robust_clip(X_train[col], X_test[col])

## Model Training

In [91]:
# Train Catboost
base_model = CatBoostRegressor(
    loss_function='RMSE',
    cat_features=categorical_features,
    random_seed=42,
    verbose=0
)

param_grid = {
    "depth": [4, 6],
    "learning_rate": [0.02, 0.03, 0.05],
    "iterations": [1200, 2000],
    "l2_leaf_reg": [5, 20],
    "subsample": [0.7, 0.9]
}

# GroupKFold for grouped cross-validation
group_cv = GroupKFold(n_splits=3)

grid = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=group_cv,
    verbose=2,
    n_jobs=-1
)

grid.fit(X_train, y_train, groups=groups_train)

# Train final model using Best Parameters
print("\n BEST PARAMS FROM GRID SEARCH ")
print(grid.best_params_)

best_params = grid.best_params_

model = CatBoostRegressor(
    **best_params,
    loss_function='RMSE',
    cat_features=categorical_features,
    random_seed=42,
    early_stopping_rounds=150,
    verbose=200
)

model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test)
)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] END depth=4, iterations=1200, l2_leaf_reg=5, learning_rate=0.02, subsample=0.7; total time=  21.7s
[CV] END depth=4, iterations=1200, l2_leaf_reg=5, learning_rate=0.02, subsample=0.7; total time=  21.8s
[CV] END depth=4, iterations=1200, l2_leaf_reg=5, learning_rate=0.02, subsample=0.7; total time=  21.9s
[CV] END depth=4, iterations=1200, l2_leaf_reg=5, learning_rate=0.02, subsample=0.9; total time=  23.0s
[CV] END depth=4, iterations=1200, l2_leaf_reg=5, learning_rate=0.03, subsample=0.7; total time=  22.1s
[CV] END depth=4, iterations=1200, l2_leaf_reg=5, learning_rate=0.02, subsample=0.9; total time=  22.3s
[CV] END depth=4, iterations=1200, l2_leaf_reg=5, learning_rate=0.03, subsample=0.7; total time=  21.4s
[CV] END depth=4, iterations=1200, l2_leaf_reg=5, learning_rate=0.02, subsample=0.9; total time=  22.7s
[CV] END depth=4, iterations=1200, l2_leaf_reg=5, learning_rate=0.03, subsample=0.7; total time=  22.3s
[C

<catboost.core.CatBoostRegressor at 0x135e0c550>

In [92]:
# Evaluation
y_pred = model.predict(X_test)

print("\n TEST PERFORMANCE ")
print("Test MAE:", mean_absolute_error(y_test, y_pred))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("Test R2:", r2_score(y_test, y_pred))

y_train_pred = model.predict(X_train)
print("\n TRAIN PERFORMANCE ")
print("Train RMSE:", np.sqrt(mean_squared_error(y_train, y_train_pred)))
print("Train MAE:", mean_absolute_error(y_train, y_train_pred))
print("Train R2:", r2_score(y_train, y_train_pred))


 TEST PERFORMANCE 
Test MAE: 0.15204589042670164
Test RMSE: 0.20823041251799765
Test R2: 0.28300364071088824

 TRAIN PERFORMANCE 
Train RMSE: 0.18483668126343047
Train MAE: 0.13370693441005715
Train R2: 0.4220652859186843


## Save the Model

In [93]:
import joblib
joblib.dump(model, "Catboost_final.pkl")

print("Model saved successfully!")

Model saved successfully!
