In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Import data
# Forward slashes are used so the relative path resolves on every platform
# (a backslash-separated path only works on Windows) and matches the style of
# the output path written later in this notebook.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle('data/04_fct/fct_demographic_offers_and_transactions.pkl')

In [3]:
def process_data(data, gender_column, feature_columns=('mobile', 'social', 'web'),
                 target_column='offer_viewed', test_size=0.3, random_state=42):
    """Select one gender's rows and split them into train and test sets.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``gender_column``, every name in ``feature_columns``
        and ``target_column``.
    gender_column : str
        Indicator column; only rows where it equals 1 are kept.
    feature_columns : sequence of str, optional
        Columns used as model inputs. Defaults to the three offer channels.
    target_column : str, optional
        Column used as the label. Defaults to ``'offer_viewed'``.
    test_size : float, optional
        Forwarded to ``train_test_split``.
    random_state : int, optional
        Forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # Keep only the rows flagged for the requested gender.
    gender_data = data.loc[data[gender_column] == 1]

    # Only the listed feature columns and the target are read below, so the
    # gender indicator columns never need to be dropped explicitly (doing so
    # would fail on frames that lack one of them).
    features = gender_data[list(feature_columns)]
    target = gender_data[target_column]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

def train_evaluate_model(X_train, X_test, y_train, y_test):
    """Fit a class-balanced random forest and evaluate it on the test split.

    Parameters
    ----------
    X_train, X_test
        Training and test feature matrices.
    y_train, y_test
        Training and test labels.

    Returns
    -------
    dict
        ``"Accuracy"``, ``"Confusion Matrix"``, ``"Classification Report"``
        (the text report), ``"ROC-AUC Score"`` and ``"Permutation Importance"``
        (a DataFrame with ``Feature``, ``Importance`` and ``Importance STD``
        columns, one row per feature of ``X_test``).
    """
    # Initialize and train the random forest classifier with balanced class weights
    rf_clf = RandomForestClassifier(random_state=42, class_weight='balanced')
    rf_clf.fit(X_train, y_train)

    # Hard predictions plus the probability column at index 1, which is used
    # as the score for ROC-AUC.
    y_pred = rf_clf.predict(X_test)
    y_prob = rf_clf.predict_proba(X_test)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_prob)

    # Calculate permutation importance on the held-out split
    perm_importance = permutation_importance(rf_clf, X_test, y_test, n_repeats=10, random_state=42)

    # Label the importances with the actual column names of X_test rather than
    # a fixed list, so the table stays correct if the feature set or its order
    # changes. Inputs without column names get positional labels.
    if hasattr(X_test, 'columns'):
        feature_names = list(X_test.columns)
    else:
        feature_names = [f'feature_{i}' for i in range(len(perm_importance.importances_mean))]

    perm_importance_df = pd.DataFrame({'Feature': feature_names,
                                       'Importance': perm_importance.importances_mean,
                                       'Importance STD': perm_importance.importances_std})
    return {
        "Accuracy": accuracy,
        "Confusion Matrix": conf_matrix,
        "Classification Report": class_report,
        "ROC-AUC Score": roc_auc,
        "Permutation Importance": perm_importance_df
    }

# Build the train/test split for each gender subgroup.
split_female = process_data(df, 'gender_F')
split_male = process_data(df, 'gender_M')

# Expose the individual pieces under their per-gender names.
X_train_female, X_test_female, y_train_female, y_test_female = split_female
X_train_male, X_test_male, y_train_male, y_test_male = split_male

# Fit and score one model per subgroup.
results_female = train_evaluate_model(*split_female)
results_male = train_evaluate_model(*split_male)

In [9]:
### Parse model's evaluation results ###
# Scalar metrics are looked up by their result-dict key; the confusion matrix
# is flattened row by row and labelled with the four cell names below.
scalar_metric_names = ["Accuracy", "ROC-AUC Score"]
confusion_cell_names = ["Confusion Matrix TN", "Confusion Matrix FP", "Confusion Matrix FN", "Confusion Matrix TP"]

# One list per output column; later steps append further rows to these lists.
data = {
    "Metric": scalar_metric_names + confusion_cell_names,
    "Female": (
        [results_female[name] for name in scalar_metric_names]
        + results_female["Confusion Matrix"].flatten().tolist()
    ),
    "Male": (
        [results_male[name] for name in scalar_metric_names]
        + results_male["Confusion Matrix"].flatten().tolist()
    ),
}

# Pull the macro- and weighted-average F1 values out of a text classification report
def extract_f1_scores(report):
    """Return the averaged F1 entries found in a text classification report.

    Each line containing "macro avg" or "weighted avg" contributes its
    second-to-last whitespace-separated token, stored as a string under
    'F1-Score (Macro)' or 'F1-Score (Weighted)' respectively. Keys whose
    row is not present are omitted.
    """
    # (substring identifying the row, key used in the returned dict);
    # order matters: a line is matched against "macro avg" first.
    row_markers = (
        ("macro avg", 'F1-Score (Macro)'),
        ("weighted avg", 'F1-Score (Weighted)'),
    )
    extracted = {}
    for row in report.split('\n'):
        for marker, key in row_markers:
            if marker in row:
                # The F1 column sits just before the trailing support count.
                extracted[key] = row.split()[-2]
                break
    return extracted

# Extract F1-scores for macro and weighted averages
f1_scores_female = extract_f1_scores(results_female['Classification Report'])
f1_scores_male = extract_f1_scores(results_male['Classification Report'])

# Report-derived metrics to add as rows. The values are parsed from the text
# report, so they arrive as strings and are converted to numbers further down.
classification_metrics = ['F1-Score (Macro)', 'F1-Score (Weighted)']

# Loop through each metric and append the results to the data dictionary
for metric_name in classification_metrics:
    data["Metric"].append(metric_name)
    data["Female"].append(f1_scores_female[metric_name])
    data["Male"].append(f1_scores_male[metric_name])

# Add permutation importance. Both tables are keyed by feature name so the
# female and male values are paired by feature, not by row position.
importance_female = results_female['Permutation Importance'].set_index('Feature')
importance_male = results_male['Permutation Importance'].set_index('Feature')
for feature in importance_female.index:
    for column, label in (('Importance', 'Importance of'), ('Importance STD', 'Importance STD of')):
        data["Metric"].append(f'{label} {feature}')
        data["Female"].append(importance_female.loc[feature, column])
        data["Male"].append(importance_male.loc[feature, column])

# Create the DataFrame
# NOTE(review): this rebinds `df`, which until now held the dataset loaded at
# the top of the notebook; re-run the load cell before re-running the
# modelling cell.
df = pd.DataFrame(data)

# Convert 'Female' and 'Male' columns to numeric (float) so the arithmetic
# below works; anything unparseable becomes NaN.
df['Female'] = pd.to_numeric(df['Female'], errors='coerce')
df['Male'] = pd.to_numeric(df['Male'], errors='coerce')

# Symmetric percent difference: (Female - Male) relative to the mean magnitude
# of the two values. The mean of absolute values is used as the denominator so
# the sign always follows (Female - Male) and the result cannot blow up to
# +/-inf when the two values cancel (permutation importances may be negative).
# Rows where either value is zero are left as NaN.
both_non_zero = (df['Female'] != 0) & (df['Male'] != 0)
mean_magnitude = (df['Female'].abs() + df['Male'].abs()) / 2

df['Percent Difference'] = np.nan
df.loc[both_non_zero, 'Percent Difference'] = (
    ((df['Female'] - df['Male']) / mean_magnitude) * 100
)[both_non_zero]

df.to_csv(r'data/04_fct/fct_offer_channel_importance_evaluation_results.csv', index=False)
df

Unnamed: 0,Metric,Female,Male,Percent Difference
0,Accuracy,0.75817,0.75567,0.330296
1,ROC-AUC Score,0.841335,0.829774,1.383557
2,Confusion Matrix TN,573.0,545.0,5.008945
3,Confusion Matrix FP,91.0,92.0,-1.092896
4,Confusion Matrix FN,871.0,953.0,-8.991228
5,Confusion Matrix TP,2443.0,2687.0,-9.512671
6,F1-Score (Macro),0.69,0.67,2.941176
7,F1-Score (Weighted),0.79,0.79,0.0
8,Importance of mobile,0.011689,0.008768,28.561824
9,Importance STD of mobile,0.001938,0.001088,56.200257
