# Train CTR Prediction Model

## 1. Load Data

In [1]:
import json
import pandas as pd
from pathlib import Path

# Location of the raw CTR dataset, relative to the notebook's working directory.
DATA_PATH = Path('../../data/input/ctr_dataset.json')

# Decode explicitly as UTF-8: without `encoding=` Python falls back to the
# platform's locale encoding, which can mis-read (or fail on) non-ASCII text
# on systems whose default is not UTF-8.
with DATA_PATH.open(encoding='utf-8') as f:
    data = json.load(f)

df = pd.DataFrame(data)
print(f'Rows: {len(df)}')
# Unweighted mean of the per-row `ctr` column, shown as a percentage.
print(f'CTR mean: {df["ctr"].mean()*100:.2f}%')
df.head()

Rows: 5000
CTR mean: 4.70%


Unnamed: 0,garment_type,color,fit,gender,style,lighting,background,pose,expression,angle,ctr,impressions
0,shorts,light grey,loose,unisex,casual_lifestyle,golden_hour,studio_white,walking,serious,side,0.0552,3069
1,jeans,black,regular,unisex,lifestyle_outdoor,dramatic,park,standing,neutral,3/4,0.0402,4300
2,sweatshirt,purple,oversized,female,lifestyle_indoor,studio,urban_street,action,confident,front,0.0614,2556
3,zip-up hoodie,beige,tight,unisex,lifestyle_indoor,golden_hour,busy_pattern,dynamic,confident,back,0.0484,3574
4,jeans,beige,regular,unisex,urban_outdoor,golden_hour,urban_street,action,confident,side,0.0384,4549


## 2. Data Preparation

In [2]:
from sklearn.model_selection import train_test_split

# Input columns fed to the model; every other column of `df` is ignored here.
ALL_FEATURES = [
    'garment_type', 'color', 'fit', 'gender', 'style',
    'lighting', 'background', 'pose', 'expression', 'angle',
]

# Target is the `ctr` column; inputs are the dummy-encoded feature columns.
y = df['ctr']
X = pd.get_dummies(df[ALL_FEATURES])

# Keep the encoded column names; they are printed below and reused later.
feature_columns = X.columns.tolist()

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f'Features: {len(feature_columns)}')
print(f'Train: {len(X_train)}  Test: {len(X_test)}')

Features: 76
Train: 4000  Test: 1000


## 3. Train

In [3]:
from sklearn.ensemble import RandomForestRegressor

# Build the forest (200 trees, fixed seed, all available cores) and fit it on
# the training split in one expression; `fit` returns the estimator itself.
rf = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)
print('Training complete.')

Training complete.


## 4. Evaluate

In [4]:
from sklearn.metrics import mean_absolute_error, r2_score

# Score the fitted model on the held-out split.
y_pred = rf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Reference point: predict the training-set mean CTR for every test row.
constant_pred = [y_train.mean()] * len(y_test)
baseline = mean_absolute_error(y_test, constant_pred)

# Errors are scaled by 100 so they read as CTR percentage points.
print(f'MAE:          {mae*100:.3f}% CTR')
print(f'Baseline MAE: {baseline*100:.3f}% CTR')
print(f'R\u00b2:           {r2:.3f}')

MAE:          0.667% CTR
Baseline MAE: 0.940% CTR
R²:           0.469


## 5. Save Model

In [5]:
import joblib

# Output directory for the serialized artifacts; created if it does not exist.
MODELS_DIR = Path('../../data/models')
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Persist the fitted forest first, then the encoded column names, so both
# files sit side by side in the same directory.
for artifact, filename in (
    (rf, 'rf_ctr_model.pkl'),
    (feature_columns, 'feature_columns.pkl'),
):
    joblib.dump(artifact, MODELS_DIR / filename)

print(f'Saved to {MODELS_DIR.resolve()}')

Saved to /Users/edvinrunhellen/Documents/MAI24HA/Exjobb/skejl/data/models
