In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import joblib
from PIL import Image
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import plot_roc_curve, classification_report

In [2]:
from constants import (
    file_path, eda_images_path,
    category_list_constant, response_constant,
    keep_cols, results_images_path
)

In [3]:
def import_data(file_path):
    '''
    Load a csv file into a pandas DataFrame.

    input:
            file_path: path to the csv file, handed straight to pd.read_csv
    output:
            pandas DataFrame holding the csv contents
    raises:
            FileNotFoundError: propagated from pd.read_csv when file_path
            does not exist
    '''
    return pd.read_csv(file_path)

In [36]:
# Load the raw dataset from the csv path configured in constants.file_path
churn_df = import_data(file_path)

In [38]:
# Demonstrate what import_data does with a bad path. The error is caught and
# printed so the notebook still runs top to bottom, and churn_df is never
# reassigned by this cell.
try:
    import_data("invalid path")
except FileNotFoundError as err:
    print(f"{type(err).__name__}: {err}")

FileNotFoundError: [Errno 2] No such file or directory: 'invalid path'

In [37]:
# Binary target: 0 for "Existing Customer", 1 for every other Attrition_Flag value
churn_df['Churn'] = churn_df['Attrition_Flag'].ne("Existing Customer").astype('int64')

### encoder helper

In [6]:
def encoder_helper(dataframe, category_list, response=response_constant):
    '''
    Helper function to turn each categorical column into a new column with
    the mean of the response (the churn proportion) for each category.

    For every column name in category_list a column named
    "<category>_<response>" is added to dataframe. The frame is modified in
    place and also returned, so pass a copy if the original must stay
    untouched.

    input:
        dataframe: pandas DataFrame containing the category columns and the
        response column
        category_list: list of columns that contain categorical features
        response: string of response name [optional argument that could
        be used for naming variables or index y column]

    output:
        DataFrame with new columns (the same object that was passed in)
    '''
    for category in category_list:
        # Select the response column *before* aggregating: averaging the whole
        # frame first is wasted work and raises TypeError on non-numeric
        # columns in pandas >= 2.0.
        category_means = dataframe.groupby(category)[response].mean()

        # Vectorised lookup of each row's category mean. Rows whose category
        # is missing (NaN) get NaN instead of raising a KeyError.
        dataframe[f"{category}_{response}"] = dataframe[category].map(category_means)

    return dataframe

In [7]:
# Work on a copy: encoder_helper adds its columns to the frame it is given
df = churn_df.copy()

# Add one encoded column per entry in category_list_constant
df = encoder_helper(df, category_list_constant)

df.columns

Index(['Unnamed: 0', 'CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender',
       'Dependent_count', 'Education_Level', 'Marital_Status',
       'Income_Category', 'Card_Category', 'Months_on_book',
       'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
       'Churn', 'Gender_Churn', 'Education_Level_Churn',
       'Marital_Status_Churn', 'Income_Category_Churn', 'Card_Category_Churn'],
      dtype='object')

### perform_feature_engineering

In [8]:
from sklearn.preprocessing import normalize


In [9]:
# Start the feature-engineering walkthrough from a fresh copy of churn_df
df = churn_df.copy()

In [10]:
# Target vector: the Churn column
y = df['Churn']
# Empty frame; a later cell fills it with the keep_cols feature columns
X = pd.DataFrame()

In [11]:
# Add the per-category churn-proportion columns (e.g. Gender_Churn) to df
df = encoder_helper(df, category_list_constant)

In [12]:
# Preview the first rows, including the newly added encoded columns
df.head()

Unnamed: 0.1,Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,...,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Churn,Gender_Churn,Education_Level_Churn,Marital_Status_Churn,Income_Category_Churn,Card_Category_Churn
0,0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,...,1144,42,1.625,0.061,0,0.146152,0.152012,0.151269,0.134807,0.160979
1,1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,...,1291,33,3.714,0.105,0,0.173572,0.155691,0.169414,0.171862,0.160979
2,2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,...,1887,20,2.333,0.0,0,0.146152,0.155691,0.151269,0.157655,0.160979
3,3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,...,1171,20,2.333,0.76,0,0.173572,0.152012,0.17223,0.171862,0.160979
4,4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,...,816,28,2.5,0.0,0,0.146152,0.159381,0.151269,0.134807,0.160979


In [13]:
# Feature matrix: only the columns listed in keep_cols, as an independent copy of df
X = df[keep_cols].copy()

In [14]:
# Preview the first rows of the feature matrix
X.head()

Unnamed: 0,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Gender_Churn,Education_Level_Churn,Marital_Status_Churn,Income_Category_Churn,Card_Category_Churn
0,45,3,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,0.146152,0.152012,0.151269,0.134807,0.160979
1,49,5,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,0.173572,0.155691,0.169414,0.171862,0.160979
2,51,3,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,0.146152,0.155691,0.151269,0.157655,0.160979
3,40,4,34,3,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.173572,0.152012,0.17223,0.171862,0.160979
4,40,3,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0,0.146152,0.159381,0.151269,0.134807,0.160979


In [15]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
features_train, features_test, target_train, target_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [16]:
# Shape of every split, one per line: train/test features, then train/test targets
print(
    *(part.shape for part in (features_train, features_test, target_train, target_test)),
    sep="\n",
)

(7088, 19)
(3039, 19)
(7088,)
(3039,)


### train_models()

In [22]:
%%time
# NOTE: train_models is defined in a later cell of this notebook; on a fresh
# kernel that cell has to be run before this one.
cv_rfc, lrc, target_data = train_models(features_train, features_test, target_train, target_test)

Training rfc...
Training lrc...
Getting predictions...
...........................................
Logistic regression results:
Test results:
              precision    recall  f1-score   support

           0       0.90      0.96      0.93      2543
           1       0.71      0.45      0.55       496

    accuracy                           0.88      3039
   macro avg       0.81      0.71      0.74      3039
weighted avg       0.87      0.88      0.87      3039

Train results:
              precision    recall  f1-score   support

           0       0.91      0.96      0.94      5957
           1       0.72      0.50      0.59      1131

    accuracy                           0.89      7088
   macro avg       0.82      0.73      0.76      7088
weighted avg       0.88      0.89      0.88      7088

Random forest results:
Test results:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      2543
           1       0.93      0.80      0.86

In [27]:
# Sanity check: train_models returns six arrays (two targets, four prediction sets)
len(target_data)

6

In [33]:
%%time
# NOTE: classification_report_image is defined in a later cell of this
# notebook; on a fresh kernel that cell has to be run before this one.
classification_report_image(target_data)

Saving logistic_results.png...
Saving rf_results.png...
CPU times: user 447 ms, sys: 40.1 ms, total: 487 ms
Wall time: 483 ms


In [28]:
%%time
# NOTE: feature_importance_plot is defined in a later cell of this notebook;
# on a fresh kernel that cell has to be run before this one.
feature_importance_plot(cv_rfc, features_test, results_images_path)

Saving feature_importances.png...
Feature importances saved successfully.
CPU times: user 1min 31s, sys: 525 ms, total: 1min 31s
Wall time: 1min 30s


In [34]:
def train_models(features_train, features_test, target_train, target_test):
    '''
    Train a grid-searched random forest and a logistic regression, print
    their classification reports, save the ROC curve image and store both
    fitted models under ./models.

    input:
              features_train: features training data
              features_test: features testing data
              target_train: target training data
              target_test: target testing data
    output:
              cv_rfc: fitted GridSearchCV wrapping the random forest
              lrc: fitted LogisticRegression
              target_data: list of six elements, in this order:
                  target_train, target_test, target_train_preds_lr,
                  target_train_preds_rf, target_test_preds_lr,
                  target_test_preds_rf
    '''
    # train models
    rfc = RandomForestClassifier(random_state=42)
    lrc = LogisticRegression(solver='lbfgs', max_iter=3000)
    # NOTE(review): for classifiers 'auto' is presumably equivalent to 'sqrt'
    # in the scikit-learn versions that still ship plot_roc_curve, which would
    # make half of this grid redundant -- confirm before trimming it.
    param_grid = {
        'n_estimators': [200, 500],
        'max_features': ['auto', 'sqrt'],
        'max_depth': [4, 5, 100],
        'criterion': ['gini', 'entropy']
    }
    cv_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
    print("Training rfc...")
    cv_rfc.fit(features_train, target_train)
    print("Training lrc...")
    lrc.fit(features_train, target_train)

    # Get the predictions
    print("Getting predictions...")
    # Random forest (best estimator found by the grid search)
    best_rfc = cv_rfc.best_estimator_
    target_train_preds_rf = best_rfc.predict(features_train)
    target_test_preds_rf = best_rfc.predict(features_test)
    # Logistic Regression
    target_train_preds_lr = lrc.predict(features_train)
    target_test_preds_lr = lrc.predict(features_test)
    print("...........................................")

    # Scores
    print('Logistic regression results:')
    print('Test results:')
    print(classification_report(target_test, target_test_preds_lr))
    print('Train results:')
    print(classification_report(target_train, target_train_preds_lr))
    print('Random forest results:')
    print('Test results:')
    print(classification_report(target_test, target_test_preds_rf))
    print('Train results:')
    print(classification_report(target_train, target_train_preds_rf))
    print("...........................................")

    # roc_curve_result.png
    print("Saving roc_curve_result.png...")
    # Interactive mode off so the figure is only written to disk
    plt.ioff()
    fig, ax_fig = plt.subplots(figsize=(15, 8))
    # Draw both ROC curves on the same axes
    plot_roc_curve(lrc, features_test, target_test, ax=ax_fig,
                   alpha=0.8, name='Logistic Regression')
    plot_roc_curve(best_rfc, features_test, target_test,
                   ax=ax_fig, alpha=0.8, name='Random Forest')
    ax_fig.set_title("ROC Curve")
    # os.path.join works whether or not results_images_path ends with a
    # separator; plain string concatenation did not.
    save_path = os.path.join(results_images_path, "roc_curve_result.png")
    # Save through the figure object instead of pyplot's implicit current figure
    fig.savefig(save_path)
    plt.close(fig)

    # save best model; make sure the target directory exists so the dump
    # cannot fail after the expensive grid search has finished
    os.makedirs('./models', exist_ok=True)
    joblib.dump(best_rfc, './models/rfc_model.pkl')
    joblib.dump(lrc, './models/logistic_model.pkl')

    # Save the target data in a list
    target_data = [target_train, target_test, target_train_preds_lr,
                   target_train_preds_rf, target_test_preds_lr, target_test_preds_rf]

    print("Train models process completed!")

    return cv_rfc, lrc, target_data

In [32]:
# results images
def _save_report_figure(train_title, train_report, test_title, test_report, save_path):
    '''
    Render a train and a test classification report as monospace text on an
    axis-less figure and save it to save_path.
    '''
    fig, axis = plt.subplots(figsize=(15, 8))
    # (vertical position, text) pairs, laid out top to bottom
    text_blocks = (
        (1.0, train_title),
        (0.7, train_report),
        (0.4, test_title),
        (0.1, test_report),
    )
    for y_pos, content in text_blocks:
        axis.text(0.01, y_pos, content, {'fontsize': 10}, fontproperties='monospace')
    axis.axis('off')
    fig.savefig(save_path)
    plt.close(fig)  # Close the figure to free up the resources


def classification_report_image(target_data):
    '''
    Produces classification report for training and testing results and stores
    the report as an image in the images folder (results_images_path).

    input:
            target_data: A list or tuple containing six elements:
                - target_train: training response values
                - target_test: test response values
                - target_train_preds_lr: training predictions from logistic regression
                - target_train_preds_rf: training predictions from random forest
                - target_test_preds_lr: test predictions from logistic regression
                - target_test_preds_rf: test predictions from random forest

    output:
             None
    '''
    # Split up the target_data in new variables
    (
        target_train, target_test, target_train_preds_lr,
        target_train_preds_rf, target_test_preds_lr, target_test_preds_rf
    ) = target_data

    # logistic_results.png
    print("Saving logistic_results.png...")
    _save_report_figure(
        'Logistic Regression Train',
        str(classification_report(target_train, target_train_preds_lr)),
        'Logistic Regression Test',
        str(classification_report(target_test, target_test_preds_lr)),
        os.path.join(results_images_path, "logistic_results.png"),
    )

    # rf_results.png
    # The train report goes under the "Train" heading and the test report
    # under "Test"; they were previously swapped.
    print("Saving rf_results.png...")
    _save_report_figure(
        'Random Forest Train',
        str(classification_report(target_train, target_train_preds_rf)),
        'Random Forest Test',
        str(classification_report(target_test, target_test_preds_rf)),
        os.path.join(results_images_path, "rf_results.png"),
    )

In [21]:
def feature_importance_plot(model, feature_data, output_path):
    '''
    Creates a SHAP summary bar plot and a feature-importance bar plot, stacks
    them vertically and stores the result as feature_importances.png in
    output_path. The two intermediate images are removed afterwards.

    Inputs:
        model: fitted search object exposing best_estimator_ (for example
               GridSearchCV), or a fitted tree model that itself provides
               feature_importances_
        feature_data: pandas DataFrame of X values
        output_path: directory in which to store the figure
    Output:
        None
    '''
    print("Saving feature_importances.png...")

    # Accept either a search wrapper or the bare estimator
    estimator = getattr(model, "best_estimator_", model)

    shap_plot_path = os.path.join(output_path, "shap_plot.png")
    feature_importance_plot_path = os.path.join(output_path, "feature_importance_plot.png")

    try:
        # SHAP summary plot (drawn on the current pyplot figure by shap)
        explainer = shap.TreeExplainer(estimator)
        shap_values = explainer.shap_values(feature_data)
        plt.figure(figsize=(20, 5))
        shap.summary_plot(shap_values, feature_data, plot_type="bar",
                          show=False, plot_size=(20, 5))
        plt.savefig(shap_plot_path)
        plt.close()

        # Feature importance plot, features sorted by descending importance
        importances = estimator.feature_importances_
        indices = np.argsort(importances)[::-1]
        names = [feature_data.columns[i] for i in indices]
        positions = list(range(feature_data.shape[1]))

        fig, axis = plt.subplots(figsize=(20, 5))
        axis.bar(positions, importances[indices])
        axis.set_xticks(positions)
        # Small, rotated labels so long feature names stay readable
        axis.set_xticklabels(names, rotation=90, fontsize=8)
        axis.set_title("Feature Importance")
        axis.set_ylabel('Importance')
        # Must run BEFORE savefig to affect the saved image; it makes room for
        # the rotated tick labels, so no manual subplots_adjust is needed.
        fig.tight_layout()
        fig.savefig(feature_importance_plot_path)
        plt.close(fig)

        # Stack the two images; the context managers close the file handles,
        # which also lets the temporary files be deleted on Windows.
        with Image.open(shap_plot_path) as shap_img, \
                Image.open(feature_importance_plot_path) as importance_img:
            shap_width, shap_height = shap_img.size
            importance_width, importance_height = importance_img.size

            # White canvas as wide as the wider image and as tall as both
            combined = Image.new(
                "RGB",
                (max(shap_width, importance_width), shap_height + importance_height),
                "white",
            )
            combined.paste(shap_img, (0, 0))
            combined.paste(importance_img, (0, shap_height))
            combined.save(os.path.join(output_path, "feature_importances.png"))
    finally:
        # Delete the temporary plots even if something above failed
        for temp_path in (shap_plot_path, feature_importance_plot_path):
            if os.path.exists(temp_path):
                os.remove(temp_path)

    print("Feature importances saved successfully.")