In [12]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score, 
                             recall_score, f1_score, roc_auc_score, classification_report)
import itertools

# Load the modelling table from the pickled file under data/04_fct.
# Forward slashes keep this relative path valid on Windows, macOS and Linux;
# a backslash-separated path is treated as a single odd file name on POSIX
# systems and fails with FileNotFoundError.
# NOTE(review): unpickling can execute arbitrary code - only read this file
# if it was produced by a trusted step of this project.
df = pd.read_pickle('data/04_fct/fct_demographic_offers_and_transactions.pkl')

In [13]:
# Predictor columns fed to the classifier (taken from the full, unfiltered df).
features = [
    'age', 'income', 'days_as_member',
    'gender_F', 'gender_M',
    'is_bogo', 'is_discount',
    'reward', 'difficulty', 'duration_hrs',
]
y = df['offer_completed']   # target column
X = df[features]            # feature matrix, columns in the order listed above

# Class counts of the target; stored for inspection, not displayed by this cell.
target_distribution_new = y.value_counts()

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Random forest with class weights adjusted to the class frequencies
# (class_weight='balanced'); seeded so refits give the same trees.
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

In [14]:
# Demonstration inputs: four made-up customers and four made-up offers,
# written one row per customer / offer.
new_customer_profile = pd.DataFrame(
    [
        (30, 50000, 200, 0, 1),
        (40, 60000, 400, 1, 0),
        (50, 70000, 600, 0, 1),
        (60, 80000, 800, 1, 0),
    ],
    columns=['age', 'income', 'days_as_member', 'gender_F', 'gender_M'],
)

offers = pd.DataFrame(
    [
        (0, 1, 2, 10, 168),
        (1, 0, 5, 5, 120),
        (0, 1, 3, 7, 240),
        (1, 0, 10, 10, 168),
    ],
    columns=['is_bogo', 'is_discount', 'reward', 'difficulty', 'duration_hrs'],
)

# Every (customer, offer) combination, with the attributes of both sides
# attached by looking the two index columns up in their source frames.
customer_offer_pairs = (
    pd.DataFrame(
        list(itertools.product(new_customer_profile.index, offers.index)),
        columns=['customer_idx', 'offer_idx'],
    )
    .merge(new_customer_profile, left_on='customer_idx', right_index=True)
    .merge(offers, left_on='offer_idx', right_index=True)
)

# Score each pair. Column 1 of predict_proba is the second entry of
# model.classes_ - presumably offer_completed == 1; verify against the data.
X_new = customer_offer_pairs[features]
customer_offer_pairs['response_probability'] = model.predict_proba(X_new)[:, 1]

# Best offer per customer: order each customer's pairs from highest to lowest
# probability, then keep only the first row seen for every customer.
grouped = customer_offer_pairs.groupby('customer_idx')  # not used further in this cell
sorted_pairs = customer_offer_pairs.sort_values(
    by=['customer_idx', 'response_probability'],
    ascending=[True, False],
)
top_per_group = sorted_pairs.drop_duplicates(subset=['customer_idx'], keep='first')
top_recommendations = top_per_group.reset_index(drop=True)
top_recommendations

Unnamed: 0,customer_idx,offer_idx,age,income,days_as_member,gender_F,gender_M,is_bogo,is_discount,reward,difficulty,duration_hrs,response_probability
0,0,1,30,50000,200,0,1,1,0,5,5,120,0.65
1,1,2,40,60000,400,1,0,0,1,3,7,240,1.0
2,2,2,50,70000,600,0,1,0,1,3,7,240,1.0
3,3,1,60,80000,800,1,0,1,0,5,5,120,0.99


In [15]:
# Hard class predictions for the held-out test rows.
y_test_pred = model.predict(X_test)

# Confusion matrix (rows = true class, columns = predicted class).
conf_matrix = confusion_matrix(y_test, y_test_pred)

# A binary target gives a 2x2 matrix: first row is (TN, FP), second is (FN, TP).
(tn, fp), (fn, tp) = conf_matrix

# Label-based scores, each computed from the same (truth, prediction) pair.
accuracy, precision, recall, f1 = (
    score_fn(y_test, y_test_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# ROC-AUC is ranked on predicted probabilities (column 1 of predict_proba),
# not on the hard labels.
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

# 5-fold cross-validated F1. Note this runs over the full X / y, so the folds
# include the rows that were held out as the test set above.
cv_scores = cross_val_score(model, X, y, scoring='f1', cv=5)
mean_cv_score, std_cv_score = cv_scores.mean(), cv_scores.std()

In [16]:
# Per-class precision / recall / F1 / support as a nested dict keyed by the
# class label rendered as a string.
class_report = classification_report(y_test, y_test_pred, output_dict=True)

# Flatten the per-class entries: all four statistics for class '0', then the
# same four for class '1'.
metrics_values = [
    class_report[label][stat]
    for label in ('0', '1')
    for stat in ('precision', 'recall', 'f1-score', 'support')
]

# Number of evaluated rows = support of class '0' plus support of class '1'.
total_support = sum(class_report[label]['support'] for label in ('0', '1'))

# Two-column summary table; the 'Metric' labels and the 'Value' entries are
# listed in the same order so they line up row by row.
metrics_df = pd.DataFrame({
    'Metric': [
        'Confusion Matrix TN', 'Confusion Matrix FP',
        'Confusion Matrix FN', 'Confusion Matrix TP',
        'Accuracy', 'Precision', 'Recall', 'F1-Score',
        'ROC-AUC Score', 'Cross-Validation Mean F1', 'Cross-Validation Std F1',
        'Support (Total)',
        *(
            f'{stat} ({label})'
            for label in ('0', '1')
            for stat in ('Precision', 'Recall', 'F1-Score', 'Support')
        ),
    ],
    'Value': [
        tn, fp, fn, tp,
        accuracy, precision, recall, f1,
        roc_auc, mean_cv_score, std_cv_score,
        total_support,
        *metrics_values,
    ],
})

metrics_df

Unnamed: 0,Metric,Value
0,Confusion Matrix TN,6051.0
1,Confusion Matrix FP,2133.0
2,Confusion Matrix FN,1744.0
3,Confusion Matrix TP,6639.0
4,Accuracy,0.765981
5,Precision,0.75684
6,Recall,0.79196
7,F1-Score,0.774002
8,ROC-AUC Score,0.854687
9,Cross-Validation Mean F1,0.770713
