In [9]:
# Data Processing
import numpy as np
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data Preprocessing
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Evaluation
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, roc_auc_score

In [10]:
churners_df = pd.read_csv('churners_df.csv')

In [11]:
# Name X and y
X = churners_df.drop('Attrition_Flag', axis=1)
y = churners_df['Attrition_Flag']

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

In [12]:
num_cols = X_train.select_dtypes(exclude='object').columns
cat_cols = X_train.select_dtypes(include='object').columns

In [13]:
# df.replace('Unknown', np.nan, inplace=True)

In [14]:
# Define preprocessing steps for numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ],
    remainder='drop',
    n_jobs=-1,
    verbose=True)

In [15]:
# Select the columns with categorical values
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
print("columns_to_encode =", categorical_cols)

# Initialize the encoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_array = encoder.fit_transform(X_train[categorical_cols])

# Convert the encoded array into a DataFrame
encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(categorical_cols))

# Concatenate the encoded DataFrame with the original DataFrame
result_df = pd.concat([X_train, encoded_df], axis=1)

# Drop the original categorical columns if needed
result_df.drop(columns = categorical_cols, axis=1, inplace=True) 

print(result_df.info())
result_df.isna().sum().sort_values(ascending=False)

columns_to_encode = ['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']
<class 'pandas.core.frame.DataFrame'>
Index: 9217 entries, 415 to 7086
Data columns (total 37 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Customer_Age                    7088 non-null   float64
 1   Dependent_count                 7088 non-null   float64
 2   Months_on_book                  7088 non-null   float64
 3   Total_Relationship_Count        7088 non-null   float64
 4   Months_Inactive_12_mon          7088 non-null   float64
 5   Contacts_Count_12_mon           7088 non-null   float64
 6   Credit_Limit                    7088 non-null   float64
 7   Total_Revolving_Bal             7088 non-null   float64
 8   Avg_Open_To_Buy                 7088 non-null   float64
 9   Total_Amt_Chng_Q4_Q1            7088 non-null   float64
 10  Total_Trans_Amt                 7088 non-null   float64

Customer_Age                      2129
Education_Level_High School       2129
Education_Level_Uneducated        2129
Education_Level_Unknown           2129
Marital_Status_Divorced           2129
Marital_Status_Married            2129
Marital_Status_Single             2129
Marital_Status_Unknown            2129
Income_Category_$120K +           2129
Income_Category_$40K - $60K       2129
Income_Category_$60K - $80K       2129
Income_Category_$80K - $120K      2129
Income_Category_Less than $40K    2129
Income_Category_Unknown           2129
Card_Category_Blue                2129
Card_Category_Gold                2129
Card_Category_Platinum            2129
Education_Level_Post-Graduate     2129
Education_Level_Graduate          2129
Dependent_count                   2129
Education_Level_Doctorate         2129
Months_on_book                    2129
Total_Relationship_Count          2129
Months_Inactive_12_mon            2129
Contacts_Count_12_mon             2129
Credit_Limit             

In [16]:
# Fit and transform the preprocesing steps on the training data
preprocessed = preprocessor.fit(X_train)

X_train_processed = preprocessed.transform(X_train)
X_test_processed = preprocessed.transform(X_test)

In [17]:
# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_processed, y_train)

In [18]:
# Make predictions
y_pred = rf_model.predict(X_test_processed)

# Evaluate the model
print(rf_model.score(X_test_processed, y_test))

0.9512997696610728


In [19]:
# Confusion matrix
confusion_matrix(y_test, y_pred)
crosstab = pd.crosstab(y_test, y_pred, rownames = ['Actual'], colnames =['Predicted'], margins = True)
display(crosstab)
print(confusion_matrix(y_test, y_pred))


Predicted,Attrited Customer,Existing Customer,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Attrited Customer,376,120,496
Existing Customer,28,2515,2543
All,404,2635,3039


[[ 376  120]
 [  28 2515]]


In [20]:
# Generate classification report
report = classification_report(y_test, y_pred, zero_division=1)
print(f"Classification Report:\n{classification_report(y_test, y_pred, zero_division=1)}")

Classification Report:
                   precision    recall  f1-score   support

Attrited Customer       0.93      0.76      0.84       496
Existing Customer       0.95      0.99      0.97      2543

         accuracy                           0.95      3039
        macro avg       0.94      0.87      0.90      3039
     weighted avg       0.95      0.95      0.95      3039

