# EDA: Ads data

In [35]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix



In [2]:
# Load the synthetic ad data
df = pd.read_csv("../data/ads_data.csv")
# Quick look at the data
df.head()

Unnamed: 0,user_id,timestamp,device_type,ad_id,campaign_id,site_id,ad_category,clicked
0,user_1825,2025-01-02 05:08:22,tablet,ad_032,camp_04,site_05,tech,0
1,user_8936,2025-01-05 05:17:27,mobile,ad_005,camp_01,site_03,fashion,1
2,user_3812,2025-01-25 12:46:53,tablet,ad_072,camp_04,site_18,finance,0
3,user_3612,2025-01-22 19:21:59,mobile,ad_001,camp_03,site_14,sports,0
4,user_4553,2025-01-08 13:08:48,mobile,ad_098,camp_06,site_04,tech,0


In [3]:
# Check the class distirbution/balance
df['clicked'].value_counts(normalize=True)

clicked
0    0.950633
1    0.049367
Name: proportion, dtype: float64

In [4]:
# Per column check number of unique values
df.nunique()

user_id         9967
timestamp      59322
device_type        3
ad_id            100
campaign_id       10
site_id           20
ad_category        5
clicked            2
dtype: int64

In [None]:
sns.countplot(data=df, x='clicked')
plt.title("Click Distribution")
plt.show()

In [6]:
# Check the relation between categorical columns and the target
# categorical_cols =df.select_dtypes(include='object').columns
# categorical_cols = [col for col in categorical_cols if col != 'clicked']

# for col in categorical_cols:
#     click_rate = df.groupby(col)['clicked'].mean().sort_values(ascending=False)
#     print(click_rate)

#     plt.figure(figsize=(6,3))
#     sns.barplot(x=click_rate.index, y=click_rate.values)
#     plt.title(f"Click-through Rate by {col}")
#     plt.ylabel("CTR")
#     plt.xticks(rotation=45)
#     plt.tight_layout()
#     plt.show()

In [11]:
# Create date time fetaures
df['timestamp'] = pd.to_datetime(df['timestamp'])

df['hour']=df['timestamp'].dt.hour
df['day_of_week']=df['timestamp'].dt.day_of_week
df['weekend']=df['day_of_week'].isin([5,6]).astype(int)

# time of day
def map_time_of_day(hour):
    if 5 <=hour < 12:
        return 'morning'
    elif 12 <=hour<17:
        return 'afternoon'
    elif 17 <= hour < 21:
        return 'evening'
    else:
        return 'night'

df['time_of_day'] =df['hour'].apply(map_time_of_day)

# Drop timestamp to avoid leakage
df =df.drop(columns=['timestamp'])

In [29]:
# Define target and features
X = df.drop(columns=['clicked'])
y =df[['clicked']]

# Identify column types
categorical_colums = X.select_dtypes(include='object').columns.tolist()
numeric_cols = ['hour', 'day_of_week', 'weekend']


In [None]:
print("Numerical columns:", numeric_cols)
print("Categorical columns:", categorical_colums)

In [31]:
# 3. --- Preprocessing ---
preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'),categorical_colums),
    ]
)


In [32]:
# 4. --- Train test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=32, stratify=y
)

In [33]:
# 5.--- Model pipeline ---
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=32))
    ]
)

In [34]:
# 6. --- Train and evaluate ---
model.fit(X_train, y_train)
y_pred= model.predict(X_test)

print("Confusion matrix")
print(confusion_matrix(y_test, y_pred))

print("n\Classification Report:")
print(classification_report(y_test, y_pred))

  print("n\Classification Report:")
  return fit_method(estimator, *args, **kwargs)


Confusion matrix
[[11338    70]
 [  587     5]]
n\Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     11408
           1       0.07      0.01      0.01       592

    accuracy                           0.95     12000
   macro avg       0.51      0.50      0.49     12000
weighted avg       0.91      0.95      0.92     12000



In [38]:
X.columns

Index(['user_id', 'device_type', 'ad_id', 'campaign_id', 'site_id',
       'ad_category', 'hour', 'day_of_week', 'weekend', 'time_of_day'],
      dtype='object')

In [37]:
# Serialize the model
joblib.dump(model,'/Users/juanfe/Documents/Datascience/Projects/real-time-ad-bidding-pipeline/models/ad_click_pipeline.pkl')

['/Users/juanfe/Documents/Datascience/Projects/real-time-ad-bidding-pipeline/models/ad_click_pipeline.pkl']