In [30]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.inspection import permutation_importance

In [31]:
# Import data
# NOTE(review): read_pickle can execute arbitrary code while unpickling —
# only load this file if it comes from a trusted source.
# Forward slashes keep the relative path valid on Windows, Linux and macOS;
# the previous backslash-separated path only resolved on Windows.
data_new = pd.read_pickle('data/04_fct/fct_demographic_offers_and_transactions.pkl')

# Filter data for male and female customers
# (rows where the 'gender_M' / 'gender_F' column equals 1 — presumably one-hot flags; confirm upstream)
male_data = data_new[data_new['gender_M'] == 1]
female_data = data_new[data_new['gender_F'] == 1]

In [32]:
def train_random_forest_classifier(data, features, target, test_size=0.3, random_state=42):
    """
    Fit a Random Forest on a train/test split of ``data`` and report how it scores.

    Parameters:
    - data: DataFrame holding both the feature columns and the target column.
    - features: List of column names used as model inputs.
    - target: Name of the column to predict.
    - test_size: Share of the rows held out for evaluation (forwarded to train_test_split).
    - random_state: Seed passed to both the train/test split and the forest.

    Returns:
    - A tuple ``(accuracy, importance_df)``: the accuracy on the held-out rows and a
      DataFrame with columns 'feature' and 'importance', one row per entry in ``features``.
    """
    # Hold out part of the rows for evaluation
    train_inputs, test_inputs, train_labels, test_labels = train_test_split(
        data[features],
        data[target],
        test_size=test_size,
        random_state=random_state,
    )

    # Fit the forest on the training rows only
    forest = RandomForestClassifier(random_state=random_state)
    forest.fit(train_inputs, train_labels)

    # Score the predictions made for the held-out rows
    accuracy = accuracy_score(test_labels, forest.predict(test_inputs))

    # Pair every feature name with the fitted forest's feature_importances_ value
    importance_df = pd.DataFrame({
        'feature': features,
        'importance': forest.feature_importances_,
    })

    return accuracy, importance_df

In [33]:
# Columns fed to the model and the column it should predict
features = ['mobile', 'social', 'web']
target = 'offer_viewed'

# Train one model per gender subset (female first, then male) and unpack
# each (accuracy, importance table) pair into its own names
(rf_accuracy_female, rf_importance_female), (rf_accuracy_male, rf_importance_male) = [
    train_random_forest_classifier(subset, features, target)
    for subset in (female_data, male_data)
]

In [34]:
# Print the accuracy obtained for each gender subset.
# The label previously read "Accurancy" and the f-strings had no placeholders;
# the value is now interpolated directly, giving the same "label: value" layout.
print(f'Random Forest Accuracy, female: {rf_accuracy_female}')
print(f'Random Forest Accuracy, male: {rf_accuracy_male}')

Random Forest Accurancy, female: 0.7986263334794681
Random Forest Accurancy, male: 0.7933431641036444


In [35]:
# Join the female and male importance tables on the feature name
rf_importance_merged = pd.merge(rf_importance_female, rf_importance_male, on='feature', how='inner')
# merge tags the clashing 'importance' columns with _x (left frame) and _y (right frame);
# rename both to matching, identifier-style names (the male column used to be 'importance, male')
cols = {'importance_x': 'importance_female', 'importance_y': 'importance_male'}
rf_importance_merged = rf_importance_merged.rename(columns=cols)
rf_importance_merged

Unnamed: 0,feature,importance_female,"importance, male"
0,mobile,0.233115,0.194094
1,social,0.732477,0.764368
2,web,0.034408,0.041538
