In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score, 
                             recall_score, f1_score, roc_auc_score, classification_report)
import itertools

# Import Data
# Forward slashes keep this relative path valid on Windows, Linux and macOS
# (the original backslash-separated path only resolved on Windows) and match
# the separator style used for the CSV export later in the notebook.
# NOTE(review): read_pickle can execute arbitrary code while unpickling;
# only load pickle files that come from a trusted source.
df = pd.read_pickle('data/04_fct/fct_demographic_offers_and_transactions.pkl')

# Keep only rows that are BOGO or discount offers (is_bogo != 0 or is_discount != 0)
# because we're only interested in offers that can be completed.
# is_discount is then dropped to avoid collinearity with is_bogo.
# drop() without inplace returns a new frame, so the column assignment below
# operates on an independent DataFrame rather than on a filtered slice.
df = (
    df.loc[(df['is_bogo'] != 0) | (df['is_discount'] != 0)]
    .drop(columns=['is_discount'])
)

# Create a new target variable: 1 when the offer was completed AND it was viewed
# before completion, otherwise 0. The vectorized comparison yields the same 0/1
# values as a row-wise apply, without the per-row Python overhead.
df['offer_completed_viewed'] = (
    (df['offer_completed'] == 1) & (df['viewed_before_completion'] == 1)
).astype('int64')
df.head()

Unnamed: 0_level_0,age,income,days_as_member,gender_F,gender_M,offer_viewed,offer_completed,viewed_before_completion,difficulty,reward,duration_hrs,mobile,social,web,is_bogo,total_transactions,total_transaction_amount,offer_completed_viewed
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0610b486422d4921ae7d2bf64640c50b,55,112000,376,1,0,0,1,0,5,5,168,1,0,1,1,1,23.22,0
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,1,1,5,5,168,1,0,1,1,1,19.89,1
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,1,1,10,10,168,1,1,0,1,1,21.72,1
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,1,0,5,5,120,1,1,1,1,1,21.72,0
e2127556f4f64592b11af22de27a7932,68,70000,91,0,1,1,0,0,10,2,168,1,0,1,0,0,0.0,0


In [30]:
# Build the feature matrix and target from `df` as prepared in the first cell
# (i.e. after the BOGO/discount row filter has been applied).
features = [
    'age', 'income', 'days_as_member', 'gender_F', 'gender_M',
    'is_bogo', 'reward', 'difficulty', 'duration_hrs',
]
X = df.loc[:, features]
y = df.loc[:, 'offer_completed_viewed']

# Class counts of the target, kept for inspection of the class balance
target_distribution_new = y.value_counts()

# 70/30 train/test split, stratified on the target so both partitions keep
# the same class proportions; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Random Forest with class_weight='balanced' to handle class imbalance.
# fit() returns the estimator itself, so `model` is the fitted classifier.
model = RandomForestClassifier(class_weight='balanced', random_state=42).fit(X_train, y_train)
model

In [31]:
# Demonstration inputs: four hypothetical customers and four candidate offers
new_customer_profile = pd.DataFrame(
    {
        'age': [30, 40, 50, 60],
        'income': [50000, 60000, 70000, 80000],
        'days_as_member': [200, 400, 600, 800],
        'gender_F': [0, 1, 0, 1],
        'gender_M': [1, 0, 1, 0],
    }
)

# is_discount is carried along for readability of the result table only;
# it is not part of `features`, so it is never passed to the model.
offers = pd.DataFrame(
    {
        'is_bogo': [0, 1, 0, 1],
        'is_discount': [1, 0, 1, 0],
        'reward': [2, 5, 3, 10],
        'difficulty': [10, 5, 7, 10],
        'duration_hrs': [168, 120, 240, 168],
    }
)

# Pair every customer row with every offer row (cartesian product of the two
# indexes), then attach each side's attribute columns by joining on its index
customer_offer_pairs = (
    pd.DataFrame(
        itertools.product(new_customer_profile.index, offers.index),
        columns=['customer_idx', 'offer_idx'],
    )
    .merge(new_customer_profile, left_on='customer_idx', right_index=True)
    .merge(offers, left_on='offer_idx', right_index=True)
)

# Score each pair: column 1 of predict_proba is the probability of the
# positive class (offer viewed and then completed)
X_new = customer_offer_pairs[features]
customer_offer_pairs['response_probability'] = model.predict_proba(X_new)[:, 1]

# Pick the best offer per customer: order by customer, highest probability
# first, then keep only the first row seen for each customer.
# NOTE(review): `grouped` is not read again in the cells visible here.
grouped = customer_offer_pairs.groupby('customer_idx')
sorted_pairs = customer_offer_pairs.sort_values(
    by=['customer_idx', 'response_probability'],
    ascending=[True, False],
)
top_per_group = sorted_pairs.drop_duplicates(subset=['customer_idx'])
top_recommendations = top_per_group.reset_index(drop=True)
top_recommendations

Unnamed: 0,customer_idx,offer_idx,age,income,days_as_member,gender_F,gender_M,is_bogo,is_discount,reward,difficulty,duration_hrs,response_probability
0,0,2,30,50000,200,0,1,0,1,3,7,240,0.62
1,1,3,40,60000,400,1,0,1,0,10,10,168,0.88
2,2,2,50,70000,600,0,1,0,1,3,7,240,0.89
3,3,3,60,80000,800,1,0,1,0,10,10,168,0.91


In [32]:
# Predict on the test set: hard labels for the confusion-matrix based metrics,
# positive-class probabilities (column 1) for the ROC AUC
y_test_pred = model.predict(X_test)
y_test_proba = model.predict_proba(X_test)[:, 1]

### Model Validation Metrics ###
# Calculate ROC AUC Score
roc_auc = roc_auc_score(y_test, y_test_proba)
print(f'The ROC-AUC score is: {roc_auc}')

# Calculate False Positive and False Negative Rates
# labels=[0, 1] pins the matrix to 2x2 in (negative, positive) order so the
# four-way unpacking below is always valid, even if one class were absent
# from y_test or from the predictions.
tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred, labels=[0, 1]).ravel()
fpr = fp / (fp + tn)
print(f'The False Positive Rate is: {fpr}')
fnr = fn / (fn + tp)
print(f'The False Negative Rate is: {fnr}')

# Calculate Cross-Validation Score
# 5-fold CV on the full X / y (not just the training split), scored with F1
# on the positive class
cv_scores = cross_val_score(model, X, y, cv=5, scoring='f1')
mean_cv_score = cv_scores.mean()
print(f'The mean cross-validation F1 score is: {mean_cv_score}')
std_cv_score = cv_scores.std()
print(f'The standard deviation of the cross-validation F1 scores is: {std_cv_score}')

### Generate Classification Report ###
# Build the transposed report once (one row per class / average) and move the
# 'accuracy' row to the bottom of the table.
# Note: accuracy is a single scalar in the report dict, so its row repeats the
# same value in every column, including 'support'.
class_report = classification_report(y_test, y_test_pred, output_dict=True)
report_df = pd.DataFrame(class_report).transpose()
df_main = report_df.drop(['accuracy'])
overall_metrics = report_df.loc[['accuracy']]
df_final = pd.concat([df_main, overall_metrics])
df_final

The ROC-ACU score is: 0.7202790822936419
The False Positive Rate is: 0.2701259583789704
The False Negative Rate is: 0.40446983700218453
The mean cross-validation F1 score is: 0.6129654453336695
The standard deviation of the cross-validation F1 scores is: 0.006161520146434323


Unnamed: 0,precision,recall,f1-score,support
0,0.688938,0.729874,0.708815,7304.0
1,0.642378,0.59553,0.618068,5951.0
macro avg,0.665658,0.662702,0.663441,13255.0
weighted avg,0.668034,0.669559,0.668073,13255.0
accuracy,0.669559,0.669559,0.669559,0.669559


In [33]:
# Collect the headline evaluation numbers computed in the previous cell.
# The two F1 averages are read from the classification-report table.
f1_by_average = df_final['f1-score']
metrics_dict = {
    'ROC-AUC Score': roc_auc,
    'False Positive Rate': fpr,
    'False Negative Rate': fnr,
    'F1 Macro Avg': f1_by_average.loc['macro avg'],
    'F1 Weighted Avg': f1_by_average.loc['weighted avg'],
    'Mean Cross-Validation F1 Score': mean_cv_score,
    'STD Cross-Validation F1 Score': std_cv_score,
}

# Two-column table (metric name, value) in dict insertion order,
# written to CSV without the row index and then displayed
metrics_df = pd.DataFrame(
    {
        'Metric': list(metrics_dict.keys()),
        'Value': list(metrics_dict.values()),
    }
)
metrics_df.to_csv(r'data/04_fct/fct_personalized_evaluation_results.csv', index=False)
metrics_df

Unnamed: 0,Metric,Value
0,ROC-AUC Score,0.720279
1,False Positive Rate,0.270126
2,False Negative Rate,0.40447
3,F1 Macro Avg,0.663441
4,F1 Weighted Avg,0.668073
5,Mean Cross-Validation F1 Score,0.612965
6,STD Cross-Validation F1 Score,0.006162
