## Import library

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from lightgbm import LGBMRegressor

## Load the dataset

In [5]:
df["category_id"].nunique()

15

In [2]:
df = pd.read_csv("youtube_final_csv.csv")
df.head()

Unnamed: 0,video_id,title,topic,category_id,published_at,duration_seconds,video_definition,captions_flag,channel_id,channel_title,subscriber_count,total_videos,channel_age_days,views,likes,comments
0,0BkEej0H1VU,*SMALL* Apartment Makeover w/ Surprise REVEAL!...,Hobby,22,2025-05-15T20:50:21Z,1493,hd,False,UCtwzsvzRCfPddq2gPlgKW1w,Lone Fox,1770000,485,2810,312359,19516,949
1,nL6DN0CWRWI,transforming my apartment üè°üå∑ i got a new couch...,Hobby,26,2024-05-26T17:00:34Z,1508,hd,False,UCFGQGLGZU1TgX-PhawwxjJg,annika's leaf,1070000,308,3314,353355,14823,727
2,gAgAYyo1O0o,Scandi-Style Studio Apartment Makeover With Po...,Hobby,26,2025-10-25T15:01:20Z,1798,hd,True,UCrh9tOpAY2-Ev5pRssXq2Wg,Alexandra Gater,888000,422,4020,350306,26864,1018
3,3EJu45Tkbyc,*196 sq ft* Organic Modern Studio Apartment Ma...,Hobby,26,2025-09-20T15:00:07Z,1821,hd,True,UCrh9tOpAY2-Ev5pRssXq2Wg,Alexandra Gater,888000,422,4020,458367,33976,1458
4,cShkxcg_la0,Small APARTMENT MAKEOVER 2024 :: Budget AirBnB...,Hobby,26,2024-03-18T17:00:31Z,2203,hd,False,UCbQj1aJiioDM8g0tmGmtC_w,This Crazy Life,457000,787,3009,490393,17309,1610


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50160 entries, 0 to 50159
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   video_id          50160 non-null  object
 1   title             50160 non-null  object
 2   topic             50160 non-null  object
 3   category_id       50160 non-null  int64 
 4   published_at      50160 non-null  object
 5   duration_seconds  50160 non-null  int64 
 6   video_definition  50160 non-null  object
 7   captions_flag     50160 non-null  bool  
 8   channel_id        50160 non-null  object
 9   channel_title     50160 non-null  object
 10  subscriber_count  50160 non-null  int64 
 11  total_videos      50160 non-null  int64 
 12  channel_age_days  50160 non-null  int64 
 13  views             50160 non-null  int64 
 14  likes             50160 non-null  int64 
 15  comments          50160 non-null  int64 
dtypes: bool(1), int64(8), object(7)
memory usage: 5.8+ MB


In [5]:
df.describe()

Unnamed: 0,category_id,duration_seconds,subscriber_count,total_videos,channel_age_days,views,likes,comments
count,50160.0,50160.0,50160.0,50160.0,50160.0,50160.0,50160.0,50160.0
mean,22.571531,2422.810825,2977674.0,5172.923405,3236.794597,2709607.0,30100.9,1119.361643
std,6.164992,6201.971156,10610330.0,28809.301109,1826.909349,32784820.0,186156.1,6548.073127
min,1.0,240.0,0.0,1.0,3.0,1000.0,0.0,0.0
25%,22.0,630.0,53500.0,213.0,1809.0,20331.25,425.0,29.0
50%,24.0,1199.0,323000.0,565.0,3121.0,138537.0,2630.0,141.0
75%,27.0,1990.0,1580000.0,1505.0,4581.0,813255.5,13872.25,596.0
max,29.0,367231.0,450000000.0,643411.0,7446.0,3580474000.0,12997250.0,550394.0


## Data cleaning

In [6]:
df.isnull().sum()

video_id            0
title               0
topic               0
category_id         0
published_at        0
duration_seconds    0
video_definition    0
captions_flag       0
channel_id          0
channel_title       0
subscriber_count    0
total_videos        0
channel_age_days    0
views               0
likes               0
comments            0
dtype: int64

In [7]:
df.duplicated().sum()

np.int64(90)

In [10]:
# Date conversion
df['published_at'] = pd.to_datetime(df['published_at'], errors='coerce')

# drop duplicates
df = df.drop_duplicates(subset=['video_id'])

# Basic filters
df = df[df['views'] > 0]
df = df[df['likes'] <= df['views']]

# Convert categorical
df['captions_flag'] = df['captions_flag'].astype(str)
df['video_definition'] = df['video_definition'].astype(str)
df['category_id'] = df['category_id'].astype(str)

# Duration cleaning
df['duration_seconds'] = df['duration_seconds'].clip(5, 7200)
df['duration_minutes'] = df['duration_seconds'] / 60

## Feature Engineering

In [11]:
df['engagement_rate_log'] = (np.log1p(df['likes']) + np.log1p(df['comments'])) / np.log1p(df['views'])
target = 'engagement_rate_log'

In [12]:
#Log transform subscriber count
df['subscriber_count_log'] = np.log1p(df['subscriber_count'])

In [13]:
#Extract cyclic time features
df['publish_hour'] = df['published_at'].dt.hour
df['publish_dayofweek'] = df['published_at'].dt.dayofweek

df['publish_hour_sin'] = np.sin(2*np.pi*df['publish_hour']/24)
df['publish_hour_cos'] = np.cos(2*np.pi*df['publish_hour']/24)
df['publish_dow_sin'] = np.sin(2*np.pi*df['publish_dayofweek']/7)
df['publish_dow_cos'] = np.cos(2*np.pi*df['publish_dayofweek']/7)

In [15]:
#Additional features
df['is_hd'] = (df['video_definition'] == "hd").astype(int)
df['category_id_int'] = df['category_id'].astype(int)
df['subscribers_per_category'] = df['subscriber_count_log'] * df['category_id_int']

df['title_length'] = df['title'].str.len()
df['title_words'] = df['title'].str.split().str.len()
df['title_exclamation'] = df['title'].str.count('!')
df['title_question'] = df['title'].str.count('\\?')

In [16]:
#drop unnecessary columns
drop_cols = [
    'video_id','topic','title','published_at','channel_title',
    'duration_seconds','views','likes','comments','subscriber_count',
    'publish_hour','publish_dayofweek','video_definition','category_id_int'
]
df.drop(columns=drop_cols, inplace=True, errors="ignore")

## Group split

In [18]:
categorical_features = ['category_id', 'captions_flag']
numeric_features = [
    'total_videos','channel_age_days','title_length','title_words',
    'title_exclamation','title_question',
    'duration_minutes','publish_hour_sin','publish_hour_cos',
    'publish_dow_sin','publish_dow_cos',
    'subscriber_count_log','is_hd','subscribers_per_category'
]

all_features = categorical_features + numeric_features

X = df[all_features].copy()
y = df[target].copy()
groups = df['channel_id']

gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups))
X_train, X_test = X.iloc[train_idx].copy(), X.iloc[test_idx].copy()
y_train, y_test = y.iloc[train_idx].copy(), y.iloc[test_idx].copy()


In [19]:
#Target Encoding
train_means = y_train.groupby(X_train['category_id']).mean()
X_train['category_te'] = X_train['category_id'].map(train_means)
X_test['category_te']  = X_test['category_id'].map(train_means)
numeric_features.append('category_te')


In [20]:
#outlier clipping
def robust_clip(t, v):
    q1, q3 = t.quantile([0.25,0.75])
    iqr = q3 - q1
    lo, hi = q1 - 1.5*iqr, q3 + 1.5*iqr
    return t.clip(lo, hi), v.clip(lo, hi)

for col in ['total_videos','channel_age_days','subscribers_per_category']:
    X_train[col], X_test[col] = robust_clip(X_train[col], X_test[col])

In [22]:
#prep categories
for col in categorical_features:
    X_train[col] = X_train[col].astype('category')
    X_test[col] = X_test[col].astype('category')


In [24]:
base_model = LGBMRegressor(
    objective='regression',
    metric='mae',
    bagging_fraction=0.85,
    bagging_freq=5,
    lambda_l1=5,
    lambda_l2=10,
    verbosity=-1,
    random_state=42
)

param_grid = {
    "learning_rate": [0.03, 0.05],
    "num_leaves": [15, 31],
    "max_depth": [4, 5],
    "min_child_samples": [50, 100],
    "feature_fraction": [0.7, 0.8]
}

search = HalvingGridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    factor=2,
    scoring='neg_mean_absolute_error',
    cv=3,
    verbose=1
)

search.fit(X_train, y_train, categorical_feature=categorical_features)
print("BEST PARAMS:", search.best_params_)

n_iterations: 6
n_required_iterations: 6
n_possible_iterations: 6
min_resources_: 1233
max_resources_: 39475
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 32
n_resources: 1233
Fitting 3 folds for each of 32 candidates, totalling 96 fits
----------
iter: 1
n_candidates: 16
n_resources: 2466
Fitting 3 folds for each of 16 candidates, totalling 48 fits
----------
iter: 2
n_candidates: 8
n_resources: 4932
Fitting 3 folds for each of 8 candidates, totalling 24 fits
----------
iter: 3
n_candidates: 4
n_resources: 9864
Fitting 3 folds for each of 4 candidates, totalling 12 fits
----------
iter: 4
n_candidates: 2
n_resources: 19728
Fitting 3 folds for each of 2 candidates, totalling 6 fits
----------
iter: 5
n_candidates: 1
n_resources: 39456
Fitting 3 folds for each of 1 candidates, totalling 3 fits
BEST PARAMS: {'feature_fraction': 0.8, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_samples': 50, 'num_leaves': 31}


In [25]:
final_model = LGBMRegressor(
    **search.best_params_,
    objective='regression',
    metric='mae',
    random_state=42
)

final_model.fit(X_train, y_train, categorical_feature=categorical_features)

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,5
,learning_rate,0.05
,n_estimators,100
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [27]:
y_pred = np.clip(final_model.predict(X_test), y_train.min(), y_train.max())
y_train_pred = np.clip(final_model.predict(X_train), y_train.min(), y_train.max())

print("====== TEST PERFORMANCE ======")
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2:", r2_score(y_test, y_pred))

# Train diagnostics
y_train_pred = final_model.predict(X_train)
print("\n====== TRAIN PERFORMANCE ======")
print("MAE:", mean_absolute_error(y_train, y_train_pred))
print("Train RMSE:", np.sqrt(mean_squared_error(y_train, y_train_pred)))
print("Train R2:", r2_score(y_train, y_train_pred))


MAE: 0.15365056808249036
RMSE: 0.21045154780423053
R2: 0.26762606435984604

MAE: 0.13852243171442824
Train RMSE: 0.1905519605433147
Train R2: 0.38577243854307386


In [28]:
import joblib

# save the best model from your halving search
joblib.dump(final_model, "LIGHTGBM.pkl")

print("Model saved successfully!")

Model saved successfully!
