In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import itertools

# Load the modelling dataset from the project-relative data folder.
# Forward slashes keep this relative path valid on Windows, Linux and macOS alike;
# the previous backslash-separated form only resolved on Windows.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that were produced by a trusted source.
data_new = pd.read_pickle('data/04_fct/fct_demographic_offers_and_transactions.pkl')

In [2]:
# Feature columns, in the order the model is trained on. Defined once so the
# training matrix and the prediction matrix below cannot drift apart.
feature_columns = [
    'age', 'income', 'days_as_member', 'gender_F', 'gender_M',
    'is_bogo', 'is_discount', 'reward', 'difficulty', 'duration_hrs',
]

# Define the feature matrix and target variable using the original dataset without filtering
X = data_new[feature_columns]
y = data_new['offer_completed']

# Class counts of the target variable (kept available for inspection)
target_distribution_new = y.value_counts()

# Split data into training and testing sets; stratify=y keeps the class
# proportions of the target the same in both partitions
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train a Random Forest model (fixed seed for reproducibility)
model = RandomForestClassifier(random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

# Define a new customer profile and offers (for demonstration)
new_customer_profile = pd.DataFrame({
    'age': [30, 40, 50, 60],
    'income': [50000, 60000, 70000, 80000],
    'days_as_member': [200, 400, 600, 800],
    'gender_F': [0, 1, 0, 1],
    'gender_M': [1, 0, 1, 0]
})

offers = pd.DataFrame({
    'is_bogo': [0, 1, 0, 1],
    'is_discount': [1, 0, 1, 0],
    'reward': [2, 5, 3, 10],
    'difficulty': [10, 5, 7, 10],
    'duration_hrs': [168, 120, 240, 168]
})

# Create a combined dataset for prediction: one row per (customer, offer) pair,
# with the customer attributes and the offer attributes joined on by position
customer_offer_pairs = pd.DataFrame(itertools.product(new_customer_profile.index, offers.index), columns=['customer_idx', 'offer_idx'])
customer_offer_pairs = customer_offer_pairs.merge(new_customer_profile, left_on='customer_idx', right_index=True)
customer_offer_pairs = customer_offer_pairs.merge(offers, left_on='offer_idx', right_index=True)

# Predict response probability (column 1 of predict_proba is the second class
# in model.classes_)
X_new = customer_offer_pairs[feature_columns]
customer_offer_pairs['response_probability'] = model.predict_proba(X_new)[:, 1]

# Display the top recommendation for each customer.
# idxmax returns, per customer, the row label of the first highest-probability
# pair; selecting those labels avoids groupby.apply operating on the grouping
# column, which recent pandas versions deprecate. This relies on the row labels
# of customer_offer_pairs being unique.
best_pair_labels = customer_offer_pairs.groupby('customer_idx')['response_probability'].idxmax()
top_recommendations = customer_offer_pairs.loc[best_pair_labels].reset_index(drop=True)
top_recommendations


  top_recommendations = customer_offer_pairs.groupby('customer_idx').apply(lambda x: x.nlargest(1, 'response_probability')).reset_index(drop=True)


Unnamed: 0,customer_idx,offer_idx,age,income,days_as_member,gender_F,gender_M,is_bogo,is_discount,reward,difficulty,duration_hrs,response_probability
0,0,1,30,50000,200,0,1,1,0,5,5,120,0.65
1,1,2,40,60000,400,1,0,0,1,3,7,240,1.0
2,2,2,50,70000,600,0,1,0,1,3,7,240,1.0
3,3,1,60,80000,800,1,0,1,0,5,5,120,0.99


In [3]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

# Hard class predictions and second-class probabilities for the held-out test rows
y_test_pred = model.predict(X_test)
positive_class_scores = model.predict_proba(X_test)[:, 1]

# Confusion matrix and the per-class text report
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_report = classification_report(y_test, y_test_pred)

# Accuracy, precision, recall and F1 all share the (y_true, y_pred) signature,
# so they are evaluated in a single pass over the scorer functions
accuracy, precision, recall, f1 = (
    scorer(y_test, y_test_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# ROC-AUC is computed from probabilities rather than from hard labels
roc_auc = roc_auc_score(y_test, positive_class_scores)

(conf_matrix, accuracy, precision, recall, f1, roc_auc, class_report)


(array([[6051, 2133],
        [1744, 6639]], dtype=int64),
 0.7659805637713527,
 0.7568399452804377,
 0.7919599188834546,
 0.7740017487612941,
 0.8546874484378092,
 '              precision    recall  f1-score   support\n\n           0       0.78      0.74      0.76      8184\n           1       0.76      0.79      0.77      8383\n\n    accuracy                           0.77     16567\n   macro avg       0.77      0.77      0.77     16567\nweighted avg       0.77      0.77      0.77     16567\n')

In [4]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy on the full feature matrix and target
cv_scores = cross_val_score(model, X, y, scoring='accuracy', cv=5)

# Summarise the per-fold scores by their mean and (population) standard deviation
mean_cv_score, std_cv_score = np.mean(cv_scores), np.std(cv_scores)

(mean_cv_score, std_cv_score)

(0.76129092256842, 0.004709980326464351)

Interpretation

    Confusion Matrix: The model correctly predicted 6051 true negatives and 6639 true positives, but it also had 2133 false positives and 1744 false negatives. This indicates a fairly balanced performance between both classes.

    Accuracy: The overall accuracy of the model is 76.60%, which is reasonable but indicates there is room for improvement.

    Precision, Recall, and F1-Score:
        Precision is slightly higher for class 0 (not completed offers) compared to class 1 (completed offers), indicating the model is more precise when predicting non-completion.
        Recall is higher for class 1, suggesting the model is better at identifying completed offers than non-completed ones.
        The F1-score balances precision and recall, showing a good performance for both classes.

    ROC-AUC Score: The ROC-AUC score of 0.8547 indicates that the model has a good ability to distinguish between the two classes (0.5 corresponds to random guessing and 1.0 to perfect separation).

    Cross-Validation: The mean 5-fold cross-validation accuracy (76.13%) is consistent with the hold-out test accuracy (76.60%), and the low standard deviation (about 0.47 percentage points) indicates stable performance across different subsets of the data.

Conclusion

The model performs reasonably well with a good balance between precision and recall for both classes and a high ROC-AUC score. However, there is still room for improvement, especially in reducing false positives and false negatives. Potential improvements could include:

    Further tuning the model parameters.
    Trying different algorithms.
    Including additional relevant features.
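    Addressing class imbalance through techniques like oversampling or undersampling, should it arise. In the current test split the classes are nearly balanced (8,184 vs. 8,383 samples) and the model already uses class_weight='balanced', so the gain from resampling is likely to be small.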