In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score, 
                             recall_score, f1_score, roc_auc_score, classification_report)
import itertools

# Import Data
# Forward slashes keep this relative path valid on Windows, macOS and Linux;
# a backslash is only treated as a path separator on Windows.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
df = pd.read_pickle('data/04_fct/fct_demographic_offers_and_transactions.pkl')

In [2]:
# Columns used as model inputs; the dataset is taken as loaded, without filtering.
features = [
    'age', 'income', 'days_as_member', 'gender_F', 'gender_M',
    'is_bogo', 'is_discount', 'reward', 'difficulty', 'duration_hrs',
]
y = df['offer_completed']
X = df[features]

# Class counts of the target, kept for inspection of class balance.
target_distribution_new = y.value_counts()

# Hold out 30% of the rows for testing; stratify on the target and fix the
# seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Fit a random forest with balanced class weights and a fixed seed.
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

In [3]:
# Define a new customer profile and offers (for demonstration)
new_customer_profile = pd.DataFrame({
    'age': [30, 40, 50, 60],
    'income': [50000, 60000, 70000, 80000],
    'days_as_member': [200, 400, 600, 800],
    'gender_F': [0, 1, 0, 1],
    'gender_M': [1, 0, 1, 0]
})

offers = pd.DataFrame({
    'is_bogo': [0, 1, 0, 1],
    'is_discount': [1, 0, 1, 0],
    'reward': [2, 5, 3, 10],
    'difficulty': [10, 5, 7, 10],
    'duration_hrs': [168, 120, 240, 168]
})

# Create a combined dataset for prediction: one row per (customer, offer)
# combination, then attach the customer and offer columns by joining each
# index column against the corresponding frame's index.
customer_offer_pairs = pd.DataFrame(
    itertools.product(new_customer_profile.index, offers.index),
    columns=['customer_idx', 'offer_idx'],
)
customer_offer_pairs = customer_offer_pairs.merge(new_customer_profile, left_on='customer_idx', right_index=True)
customer_offer_pairs = customer_offer_pairs.merge(offers, left_on='offer_idx', right_index=True)

# Predict response probability.
# Column 1 of predict_proba is the model's second class — presumably
# offer_completed == 1; confirm against model.classes_.
X_new = customer_offer_pairs[features]
customer_offer_pairs['response_probability'] = model.predict_proba(X_new)[:, 1]

# Calculate top recommendations: sort each customer's offers from highest to
# lowest probability, then keep only the first (best) row per customer.
sorted_pairs = customer_offer_pairs.sort_values(by=['customer_idx', 'response_probability'], ascending=[True, False])
top_per_group = sorted_pairs.drop_duplicates(subset=['customer_idx'])
top_recommendations = top_per_group.reset_index(drop=True)
top_recommendations

Unnamed: 0,customer_idx,offer_idx,age,income,days_as_member,gender_F,gender_M,is_bogo,is_discount,reward,difficulty,duration_hrs,response_probability
0,0,1,30,50000,200,0,1,1,0,5,5,120,0.65
1,1,2,40,60000,400,1,0,0,1,3,7,240,1.0
2,2,2,50,70000,600,0,1,0,1,3,7,240,1.0
3,3,1,60,80000,800,1,0,1,0,5,5,120,0.99


In [16]:
# Predict on the test set
y_test_pred = model.predict(X_test)

### Model Validation Metrics ###
# Calculate ROC AUC Score from the predicted probability of the model's
# second class (column 1 of predict_proba).
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print(f'The ROC-AUC score is: {roc_auc}')

# Calculate False Positive and False Negative Rates
# Unpacking four values assumes a binary (2x2) confusion matrix, flattened
# row by row into tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
fpr = fp / (fp + tn)
print(f'The False Positive Rate is: {fpr}')
fnr = fn / (fn + tp)
print(f'The False Negative Rate is: {fnr}')

# Calculate Cross-Validation Score
# 5-fold F1 evaluated over the full X, y (not only the training split).
cv_scores = cross_val_score(model, X, y, cv=5, scoring='f1')
mean_cv_score = cv_scores.mean()
print(f'The mean cross-validation F1 score is: {mean_cv_score}')
std_cv_score = cv_scores.std()
print(f'The standard deviation of the cross-validation F1 scores is: {std_cv_score}')

### Generate Classification Report ###
class_report = classification_report(y_test, y_test_pred, output_dict=True)
# Build the transposed report once (one row per class/average), then move the
# 'accuracy' row to the bottom of the table.
report_frame = pd.DataFrame(class_report).transpose()
df_main = report_frame.drop(['accuracy'])
overall_metrics = report_frame.loc[['accuracy']]
df_final = pd.concat([df_main, overall_metrics])
df_final

The ROC-ACU score is: 0.8546874484378092
The False Positive Rate is: 0.26063049853372433
The False Negative Rate is: 0.2080400811165454
The mean cross-validation F1 score is: 0.7707134823605762
The standard deviation of the cross-validation F1 scores is: 0.003420447570060341


Unnamed: 0,precision,recall,f1-score,support
0,0.776267,0.73937,0.757369,8184.0
1,0.75684,0.79196,0.774002,8383.0
macro avg,0.766553,0.765665,0.765685,16567.0
weighted avg,0.766437,0.765981,0.765785,16567.0
accuracy,0.765981,0.765981,0.765981,0.765981


In [19]:
# Collect the headline validation metrics under display-friendly labels.
f1_scores = df_final['f1-score']
metrics_dict = {
    'ROC-AUC Score': roc_auc,
    'False Positive Rate': fpr,
    'False Negative Rate': fnr,
    'F1 Macro Avg': f1_scores.loc['macro avg'],
    'F1 Weighted Avg': f1_scores.loc['weighted avg'],
    'Mean Cross-Validation F1 Score': mean_cv_score,
    'STD Cross-Validation F1 Score': std_cv_score,
}

# Lay the metrics out as a two-column table, one row per metric, in the
# order they were added above.
metrics_df = pd.DataFrame({
    'Metric': list(metrics_dict.keys()),
    'Value': list(metrics_dict.values()),
})
metrics_df

Unnamed: 0,Metric,Value
0,ROC-AUC Score,0.854687
1,False Positive Rate,0.26063
2,False Negative Rate,0.20804
3,F1 Macro Avg,0.765685
4,F1 Weighted Avg,0.765785
5,Mean Cross-Validation F1 Score,0.770713
6,STD Cross-Validation F1 Score,0.00342
