In [2]:
# Imports
import ast
import itertools

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score, 
                             recall_score, f1_score, roc_auc_score, classification_report)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils.class_weight import compute_class_weight

# Prepare Data

In [3]:
# Import the data and build the modelling frame in one re-runnable chain.
# Every step returns a new frame, so nothing is mutated in place and no
# SettingWithCopyWarning can be triggered by writing into a filtered slice.
# NOTE(review): the path uses Windows-style separators, and read_pickle should
# only be used on trusted files -- confirm both suit your environment.
df = (
    pd.read_pickle(r'data\04_fct\fct_demographic_offers_and_transactions.pkl')
    # Drop rows where bogo = 0 and discount = 0 because we're only interested
    # in offers that can be completed
    .loc[lambda d: (d['is_bogo'] != 0) | (d['is_discount'] != 0)]
    # Drop is_discount to avoid collinearity
    .drop(columns=['is_discount'])
    # Target variable: 1 when the offer was completed AND viewed before
    # completion, otherwise 0 (vectorized instead of a row-wise apply)
    .assign(
        offer_completed_viewed=lambda d: (
            (d['offer_completed'] == 1) & (d['viewed_before_completion'] == 1)
        ).astype('int64')
    )
)
df.head()

Unnamed: 0_level_0,age,income,days_as_member,gender_F,gender_M,offer_viewed,offer_completed,viewed_before_completion,difficulty,reward,duration_hrs,mobile,social,web,is_bogo,total_transactions,total_transaction_amount,offer_completed_viewed
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0610b486422d4921ae7d2bf64640c50b,55,112000,376,1,0,0,1,0,5,5,168,1,0,1,1,1,23.22,0
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,1,1,5,5,168,1,0,1,1,1,19.89,1
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,1,1,10,10,168,1,1,0,1,1,21.72,1
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,1,0,5,5,120,1,1,1,1,1,21.72,0
e2127556f4f64592b11af22de27a7932,68,70000,91,0,1,1,0,0,10,2,168,1,0,1,0,0,0.0,0


# Initial Model Training

In [4]:
def train_random_forest(X, y, test_size=0.3, random_state=42, class_weight='balanced', **rf_params):
    """
    Split the data with stratification and fit a Random Forest on the training part.

    Parameters:
    - X: Feature matrix.
    - y: Target variable; also used to stratify the split.
    - test_size: Fraction of the dataset held out as the test set.
    - random_state: Seed passed to both the split and the classifier.
    - class_weight: Class weighting forwarded to the classifier. 'balanced' by default.
    - rf_params: Any further keyword arguments for RandomForestClassifier.

    Returns:
    - model: The fitted RandomForestClassifier.
    - X_train, X_test, y_train, y_test: The train/test partitions.
    """
    # Stratified hold-out split, seeded for reproducibility
    split_options = {'test_size': test_size, 'random_state': random_state, 'stratify': y}
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

    # Build the classifier, then fit it on the training partition only
    forest = RandomForestClassifier(
        random_state=random_state,
        class_weight=class_weight,
        **rf_params,
    )
    forest.fit(X_train, y_train)

    return forest, X_train, X_test, y_train, y_test

In [5]:
# Feature columns and target used for modelling
features = [
    'age', 'income', 'days_as_member', 'gender_F', 'gender_M',
    'is_bogo', 'reward', 'difficulty', 'duration_hrs',
]
X, y = df[features], df['offer_completed_viewed']

# Inspect the weights that class_weight='balanced' assigns to each class
classes, counts = np.unique(y, return_counts=True)
weights = compute_class_weight(y=y, classes=classes, class_weight='balanced')

# Report one line per class
for class_label, weight in zip(classes, weights):
    print(f"Class {class_label}: Weight {weight}")

Class 0: Weight 0.9073937153419593
Class 1: Weight 1.1136569872958257


In [18]:
# Fit a baseline forest: only class_weight is set, everything else is left at its default
default_model, X_train, X_test, y_train, y_test = train_random_forest(
    X=X, y=y, class_weight='balanced'
)

# Tabulate the baseline hyperparameters, one row per parameter
default_dict = default_model.get_params()
default_params_df = pd.DataFrame(
    data=list(default_dict.items()),
    columns=['Parameter', 'initial_value'],
)
default_params_df.head()

Unnamed: 0,Parameter,initial_value
0,bootstrap,True
1,ccp_alpha,0.0
2,class_weight,balanced
3,criterion,gini
4,max_depth,


# Setup Grid Search

In [None]:
def perform_grid_search(X_train, y_train, param_grid, cv=5, scoring='accuracy', class_weight='balanced',
                        random_state=42, verbose=3, n_jobs=None):
    """
    Performs grid search to find the best Random Forest parameters and returns all results.

    Parameters:
    - X_train: Training feature matrix.
    - y_train: Training target variable.
    - param_grid: Grid of parameters to search over.
    - cv: Number of cross-validation folds.
    - scoring: Strategy to evaluate the performance of the cross-validated model on the test set.
    - class_weight: Weights associated with classes.
    - random_state: Seed used by the random number generator.
    - verbose: Verbosity level forwarded to GridSearchCV (3 by default, as before).
    - n_jobs: Number of parallel jobs forwarded to GridSearchCV. None (the default)
      keeps GridSearchCV's own default; pass -1 to use all available cores.

    Returns:
    - cv_results_: Dictionary of all the evaluation metrics from the grid search.
      If the search raises, the error is printed and an EMPTY dictionary is
      returned instead, so callers must check for an empty result before using it.
    """
    try:
        rf = RandomForestClassifier(random_state=random_state, class_weight=class_weight)
        grid_search = GridSearchCV(
            rf,
            param_grid,
            cv=cv,
            scoring=scoring,
            verbose=verbose,
            n_jobs=n_jobs,
        )
        grid_search.fit(X_train, y_train)
        return grid_search.cv_results_
    except ValueError as e:
        # Reported separately from other failures so the message names the error type
        print(f"ValueError during grid search: {e}")
    except Exception as e:
        # Deliberate best-effort: report the problem instead of raising
        print(f"An unexpected error occurred during grid search: {e}")
    return {}  # Empty dictionary signals failure to the caller

In [None]:
# Define the parameter grid to search over
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10]
}

# Perform grid search, scored on F1
f1_parameters = perform_grid_search(X_train, y_train, param_grid, scoring='f1', class_weight='balanced')

# perform_grid_search returns an empty dict when the search fails. Stop here in
# that case instead of overwriting previously saved results with an empty file
# that the later "best parameters" cell cannot read.
if not f1_parameters:
    raise RuntimeError("Grid search produced no results; the parameters file was not written.")

f1_parameters_df = pd.DataFrame(f1_parameters)
f1_parameters_df.to_csv(r'data\04_fct\fct_gs_f1_recommendation_parameters.csv')

In [8]:
def calculate_validation_metrics(model, X_test, y_test, X, y):
    """Evaluate a fitted classifier and return its metrics as a DataFrame.

    Parameters:
    - model: Fitted classifier exposing predict and predict_proba.
    - X_test, y_test: Hold-out features and labels used for the test-set metrics.
    - X, y: Features and labels passed to the 5-fold cross-validation.

    Returns:
    - DataFrame with columns 'Metric' and 'Value', one row per metric:
      ROC-AUC, false positive/negative rate, mean and std of the
      cross-validated F1, and the macro/weighted average F1 on the test set.
    """
    # Hard predictions and the probability of the second class (column 1)
    predicted_labels = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]

    auc_value = roc_auc_score(y_test, positive_scores)

    # The flattened confusion matrix is unpacked as tn, fp, fn, tp
    tn, fp, fn, tp = confusion_matrix(y_test, predicted_labels).ravel()
    false_positive_rate = fp / (fp + tn)
    false_negative_rate = fn / (fn + tp)

    # Cross-validated F1 over the X, y that were passed in
    fold_f1 = cross_val_score(model, X, y, cv=5, scoring='f1')

    # Averaged F1 scores are read straight from the classification report
    report = classification_report(y_test, predicted_labels, output_dict=True)

    named_values = [
        ('ROC-AUC Score', auc_value),
        ('False Positive Rate', false_positive_rate),
        ('False Negative Rate', false_negative_rate),
        ('Mean Cross-Validation F1 Score', fold_f1.mean()),
        ('STD Cross-Validation F1 Score', fold_f1.std()),
        ('F1 Macro Avg', report['macro avg']['f1-score']),
        ('F1 Weighted Avg', report['weighted avg']['f1-score']),
    ]
    return pd.DataFrame(named_values, columns=['Metric', 'Value'])

# Score the default model on the hold-out set and with cross-validation
default_metrics_df = calculate_validation_metrics(
    model=default_model, X_test=X_test, y_test=y_test, X=X, y=y
)
default_metrics_df.head()

Unnamed: 0,Metric,Value
0,ROC-AUC Score,0.720279
1,False Positive Rate,0.270126
2,False Negative Rate,0.40447
3,Mean Cross-Validation F1 Score,0.612965
4,STD Cross-Validation F1 Score,0.006162


# Train the Model with the Optimized Parameters

In [17]:
# Load the saved grid-search results and keep the row with the highest mean test score
best_params_df = (
    pd.read_csv(r'data\04_fct\fct_gs_f1_recommendation_parameters.csv')
    .sort_values(by='mean_test_score', ascending=False)
    .head(1)
)

# The CSV round-trip stores each params dict as its string representation, so
# parse it back into a real dict (literal_eval only accepts Python literals).
raw_best_params = best_params_df['params'].values[0]
best_params_dict = ast.literal_eval(raw_best_params) if isinstance(raw_best_params, str) else raw_best_params

# Display the best parameters
print("Best Parameters:")
print(best_params_dict)


Best Parameters:
{'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 300}


In [24]:
# Train a model with the optimized parameters (the best combination printed above)
optimized_model, X_train, X_test, y_train, y_test = train_random_forest(
    X,
    y,
    class_weight='balanced',
    n_estimators=300,
    max_depth=10,
    min_samples_split=10,
)

# Tabulate the optimized model's hyperparameters, one row per parameter
optimized_dict = optimized_model.get_params()
optimized_param_df = pd.DataFrame(
    data=list(optimized_dict.items()),
    columns=['Parameter', 'optimized_value'],
)
optimized_param_df.head()

Unnamed: 0,Parameter,optimized_value
0,bootstrap,True
1,ccp_alpha,0.0
2,class_weight,balanced
3,criterion,gini
4,max_depth,10


In [33]:
# Parameters to compare between the two models
rows = ['max_depth', 'min_samples_split', 'n_estimators']

# Line up the initial and optimized value of each parameter, keeping only those rows
param_diff_df = (
    default_params_df
    .merge(optimized_param_df, on='Parameter')
    .loc[lambda merged: merged['Parameter'].isin(rows)]
)
param_diff_df

Unnamed: 0,Parameter,initial_value,optimized_value
4,max_depth,,10
10,min_samples_split,2.0,10
13,n_estimators,100.0,300


In [25]:
# Score the optimized model with the same metrics as the default model
optimized_metrics_df = calculate_validation_metrics(
    model=optimized_model, X_test=X_test, y_test=y_test, X=X, y=y
)
optimized_metrics_df.head()

Unnamed: 0,Metric,Value
0,ROC-AUC Score,0.758709
1,False Positive Rate,0.298467
2,False Negative Rate,0.320282
3,Mean Cross-Validation F1 Score,0.664169
4,STD Cross-Validation F1 Score,0.005176


# Compare Models

In [30]:
# Put both models' metrics side by side, one row per metric
all_metrics_df = default_metrics_df.merge(
    optimized_metrics_df,
    on='Metric',
    suffixes=('_default', '_optimized'),
)

# Change of the optimized value relative to the default value, in percent (2 decimals)
all_metrics_df['Percent Difference'] = (
    all_metrics_df['Value_optimized']
    .sub(all_metrics_df['Value_default'])
    .div(all_metrics_df['Value_default'])
    .mul(100)
    .round(2)
)

all_metrics_df

Unnamed: 0,Metric,Value_default,Value_optimized,Percent Difference
0,ROC-AUC Score,0.720279,0.758709,5.34
1,False Positive Rate,0.270126,0.298467,10.49
2,False Negative Rate,0.40447,0.320282,-20.81
3,Mean Cross-Validation F1 Score,0.612965,0.664169,8.35
4,STD Cross-Validation F1 Score,0.006162,0.005176,-15.99
5,F1 Macro Avg,0.663441,0.689683,3.96
6,F1 Weighted Avg,0.668073,0.692261,3.62


# Example Customer Predictions

In [38]:
# Made-up customer profiles for demonstration, one row per customer
new_customer_profile = pd.DataFrame(
    data=[
        [30, 50000, 200, 0, 1],
        [40, 60000, 400, 1, 0],
        [50, 70000, 600, 0, 1],
        [60, 80000, 800, 1, 0],
    ],
    columns=['age', 'income', 'days_as_member', 'gender_F', 'gender_M'],
)

# Made-up offers for demonstration, one row per offer
offers = pd.DataFrame(
    data=[
        [0, 1, 2, 10, 168],
        [1, 0, 5, 5, 120],
        [0, 1, 3, 7, 240],
        [1, 0, 10, 10, 168],
    ],
    columns=['is_bogo', 'is_discount', 'reward', 'difficulty', 'duration_hrs'],
)

# Pair every customer with every offer, then attach each side's attributes
customer_offer_pairs = (
    pd.DataFrame(
        list(itertools.product(new_customer_profile.index, offers.index)),
        columns=['customer_idx', 'offer_idx'],
    )
    .merge(new_customer_profile, left_on='customer_idx', right_index=True)
    .merge(offers, left_on='offer_idx', right_index=True)
)

# Predict the probability of the second class (column 1) with both models
X_new = customer_offer_pairs[features]
for column_name, fitted_model in (
    ('default_response_probability', default_model),
    ('optimized_response_probability', optimized_model),
):
    customer_offer_pairs[column_name] = fitted_model.predict_proba(X_new)[:, 1]

# Per customer, keep the offer the DEFAULT model scores highest
grouped = customer_offer_pairs.groupby('customer_idx')  # not used further in this cell
sorted_pairs = customer_offer_pairs.sort_values(
    by=['customer_idx', 'default_response_probability'],
    ascending=[True, False],
)
top_per_group = sorted_pairs.drop_duplicates(subset=['customer_idx'], keep='first')
top_recommendations = top_per_group.reset_index(drop=True)

# Change of the optimized model's probability relative to the default model's, in percent
top_recommendations['percent_difference'] = (
    top_recommendations['optimized_response_probability']
    .sub(top_recommendations['default_response_probability'])
    .div(top_recommendations['default_response_probability'])
    .mul(100)
    .round(2)
)
top_recommendations

Unnamed: 0,customer_idx,offer_idx,age,income,days_as_member,gender_F,gender_M,is_bogo,is_discount,reward,difficulty,duration_hrs,default_response_probability,optimized_response_probability,percent_difference
0,0,2,30,50000,200,0,1,0,1,3,7,240,0.62,0.533408,-13.97
1,1,3,40,60000,400,1,0,1,0,10,10,168,0.88,0.719961,-18.19
2,2,2,50,70000,600,0,1,0,1,3,7,240,0.89,0.878554,-1.29
3,3,3,60,80000,800,1,0,1,0,10,10,168,0.91,0.666976,-26.71
