In [7]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [8]:
# Load the dataset used by every later cell in this notebook.
# Forward slashes keep this relative path valid on Windows, macOS and Linux;
# the previous backslash-separated raw string only resolved on Windows.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
data_new = pd.read_pickle('data/04_fct/fct_demographic_offers_and_transactions.pkl')

In [9]:
### Use a decision tree classifier to determine channel importance ###
# Inputs are the three channel columns; the label is the offer_viewed column
channel_columns = ['mobile', 'social', 'web']
features = data_new[channel_columns]
target = data_new['offer_viewed']

# Hold out 30% of the rows for evaluation (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training are chained
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Pair each channel with the importance the fitted tree assigned to it
feature_importances = clf.feature_importances_
importance_df = pd.DataFrame({
    'Feature': channel_columns,
    'Importance': feature_importances,
})

# Cell output: (accuracy, importance table)
accuracy, importance_df

(0.8011710025955212,
   Feature  Importance
 0  mobile    0.055459
 1  social    0.932559
 2     web    0.011982)

The decision tree classifier achieved an accuracy of approximately 80.12%. The social channel was the most important factor in determining whether an offer was viewed, followed by the mobile channel and then the web channel.

In [10]:
# Train a random forest on the existing training split
# (fit() returns the estimator, so construction and training are chained)
rf_clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and measure accuracy on them
y_rf_pred = rf_clf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_rf_pred)

# Tabulate the importance the fitted forest assigned to each channel
rf_feature_importances = rf_clf.feature_importances_
rf_importance_df = pd.DataFrame(
    {'Feature': ['mobile', 'social', 'web'], 'Importance': rf_feature_importances}
)

# Cell output: (accuracy, importance table)
rf_accuracy, rf_importance_df


(0.8011710025955212,
   Feature  Importance
 0  mobile    0.204546
 1  social    0.757655
 2     web    0.037800)

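Using a Random Forest classifier, we achieved an accuracy of approximately 80.12%, identical to the decision tree. The social channel remains the most significant factor (importance ≈ 0.76), followed by the mobile channel (≈ 0.20) and then the web channel (≈ 0.04).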

In [12]:
# --- Confusion matrix: raw array, then a labelled table ---
conf_matrix = confusion_matrix(y_test, y_rf_pred)
conf_matrix_df = pd.DataFrame(
    conf_matrix,
    index=['Actual Negative', 'Actual Positive'],
    columns=['Predicted Negative', 'Predicted Positive'],
)

# --- Classification report: plain-text form and tabular form ---
class_report = classification_report(y_test, y_rf_pred)
# output_dict=True yields a nested dict; transposing the resulting frame
# gives one row per class / average and one column per metric
report_dict = classification_report(y_test, y_rf_pred, output_dict=True)
class_report_df = pd.DataFrame(report_dict).T

# --- ROC-AUC from the second predict_proba column ---
positive_class_scores = rf_clf.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, positive_class_scores)
roc_auc_df = pd.DataFrame({'ROC-AUC Score': [roc_auc]})

# Print each table under its heading, in the same order as before
for heading, table in (
    ("Confusion Matrix:", conf_matrix_df),
    ("\nClassification Report:", class_report_df),
    ("\nROC-AUC Score:", roc_auc_df),
):
    print(heading)
    print(table)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative                1121                2691
Actual Positive                 603               12152

Classification Report:
              precision    recall  f1-score       support
0              0.650232  0.294071  0.404986   3812.000000
1              0.818702  0.952724  0.880644  12755.000000
accuracy       0.801171  0.801171  0.801171      0.801171
macro avg      0.734467  0.623398  0.642815  16567.000000
weighted avg   0.779938  0.801171  0.771197  16567.000000

ROC-AUC Score:
   ROC-AUC Score
0       0.833973


Interpretation

Confusion Matrix: The model performs well in identifying true positives, correctly flagging 12,152 of the 12,755 offers that were viewed. However, it struggles with false positives: 2,691 of the 3,812 offers that were not viewed are predicted as viewed.

Classification Report:
    Precision for class 1 (offers viewed) is 0.82, indicating that when the model predicts an offer was viewed, it is correct 82% of the time.
    Recall for class 1 is 0.95, meaning the model successfully identifies 95% of the viewed offers.
    Class 0 (offers not viewed) has lower precision (0.65) and much lower recall (0.29), indicating the model is less reliable at predicting non-viewed offers.

ROC-AUC Score: The ROC-AUC score of 0.834 indicates good model performance, as a score closer to 1 is better.

**Conclusion**
The model is fairly accurate, particularly for predicting offers that were viewed. However, it has room for improvement in predicting offers that were not viewed (class 0). Considering the majority-class baseline (always predicting "viewed" would be correct for 12,755 of the 16,567 test offers, about 77%), 80.12% accuracy together with a good ROC-AUC score suggests the model is performing reasonably well, but the next step should be improving the balance between precision and recall for the less frequent class.