In [53]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [55]:
# Load the tables
email_df = pd.read_csv('email_table.csv')           # Main email data
opened_df = pd.read_csv('email_opened_table.csv')   # Emails that were opened
clicked_df = pd.read_csv('link_clicked_table.csv')  # Emails where link was clicked

In [57]:
# Add 'opened' and 'clicked' columns based on email IDs present in the other tables
email_df['opened'] = email_df['email_id'].isin(opened_df['email_id']).astype(int)
email_df['clicked'] = email_df['email_id'].isin(clicked_df['email_id']).astype(int)

In [61]:
# Calculate and display open and click-through rates
total_emails = len(email_df)
open_rate = email_df['opened'].mean() * 100
click_rate = email_df['clicked'].mean() * 100

print(f"Open Rate: {open_rate:.2f}%")
print(f"Click Rate: {click_rate:.2f}%")

Open Rate: 10.35%
Click Rate: 2.12%


In [63]:
# Prepare data for modeling
df = email_df.copy()

# Encode categorical features to numerical values for ML
label_cols = ['email_text', 'email_version', 'weekday', 'user_country']
for col in label_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

In [67]:
# Define features (X) and target variable (y)
features = ['email_text', 'email_version', 'hour', 'weekday', 'user_country', 
            'user_past_purchases']
target = 'clicked'

X = df[features]
y = df[target]

In [73]:
# Split data into training and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [43]:
# Train a Random Forest classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [69]:
# Evaluate model performance
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     19547
           1       0.06      0.01      0.02       453

    accuracy                           0.97     20000
   macro avg       0.52      0.50      0.50     20000
weighted avg       0.96      0.97      0.97     20000



In [71]:
# Predict click probability for each email in the full dataset
df['click_proba'] = model.predict_proba(df[features])[:, 1]

In [75]:
# Simulate model-based targeting:
# Assume company only sends to top 20% users most likely to click
top_n = int(0.2 * len(df))
df_sorted = df.sort_values(by='click_proba', ascending=False)
model_selected = df_sorted.head(top_n)

In [77]:
# Calculate simulated CTR and compare with baseline
simulated_ctr = model_selected['clicked'].mean() * 100
print(f"Simulated CTR (Top 20% model-selected users): {simulated_ctr:.2f}%")
print(f"Baseline CTR (All users): {click_rate:.2f}%")
print(f"Estimated Improvement: {simulated_ctr - click_rate:.2f}%")

Simulated CTR (Top 20% model-selected users): 9.21%
Baseline CTR (All users): 2.12%
Estimated Improvement: 7.09%
