In [60]:
# Data Processing
import numpy as np
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data Preprocessing
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Evaluation
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, roc_auc_score

In [61]:
cleaned_churners = pd.read_csv('churners_clean_df.csv')

In [62]:
# Replace Unknown values by nan in categorical data
cleaned_churners.replace('Unknown', np.nan, inplace=True)

In [63]:
# Name X and y
X = cleaned_churners.drop('Attrition_Flag', axis=1)
y = cleaned_churners['Attrition_Flag']

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

Attrition_Flag
Existing Customer    4844
Attrited Customer     990
Name: count, dtype: int64

In [65]:
#sm = SMOTE(random_state=42)
#X_train, y_train = sm.fit_resample(X_train, y_train)

In [66]:
num_cols = X_train.select_dtypes(exclude='object').columns
cat_cols = X_train.select_dtypes(include='object').columns

In [67]:
# Define preprocessing steps for numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ],
    remainder='drop',
    n_jobs=-1,
    verbose=True)

In [68]:
# Select the columns with categorical values
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
print("columns_to_encode =", categorical_cols)

# Initialize the encoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_array = encoder.fit_transform(X_train[categorical_cols])

# Convert the encoded array into a DataFrame
encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(categorical_cols))

# Concatenate the encoded DataFrame with the original DataFrame
result_df = pd.concat([X_train, encoded_df], axis=1)

# Drop the original categorical columns if needed
result_df.drop(columns = categorical_cols, axis=1, inplace=True) 

print(result_df.info())
result_df.isna().sum().sort_values(ascending=False)

columns_to_encode = ['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']
<class 'pandas.core.frame.DataFrame'>
Index: 7604 entries, 2351 to 5832
Data columns (total 35 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Customer_Age                    5834 non-null   float64
 1   Dependent_count                 5834 non-null   float64
 2   Months_on_book                  5834 non-null   float64
 3   Total_Relationship_Count        5834 non-null   float64
 4   Months_Inactive_12_mon          5834 non-null   float64
 5   Contacts_Count_12_mon           5834 non-null   float64
 6   Credit_Limit                    5834 non-null   float64
 7   Total_Revolving_Bal             5834 non-null   float64
 8   Total_Amt_Chng_Q4_Q1            5834 non-null   float64
 9   Total_Trans_Amt                 5834 non-null   float64
 10  Total_Trans_Ct                  5834 non-null   float6

Customer_Age                      1770
Income_Category_$120K +           1770
Education_Level_Uneducated        1770
Education_Level_nan               1770
Marital_Status_Divorced           1770
Marital_Status_Married            1770
Marital_Status_Single             1770
Marital_Status_nan                1770
Income_Category_$40K - $60K       1770
Education_Level_High School       1770
Income_Category_$60K - $80K       1770
Income_Category_$80K - $120K      1770
Income_Category_Less than $40K    1770
Card_Category_Blue                1770
Card_Category_Gold                1770
Card_Category_Platinum            1770
Education_Level_Post-Graduate     1770
Education_Level_Graduate          1770
Dependent_count                   1770
Total_Amt_Chng_Q4_Q1              1770
Months_on_book                    1770
Total_Relationship_Count          1770
Months_Inactive_12_mon            1770
Contacts_Count_12_mon             1770
Credit_Limit                      1770
Total_Revolving_Bal      

In [69]:
# Fit and transform the preprocesing steps on the training data
preprocessed = preprocessor.fit(X_train)

X_train_processed = preprocessed.transform(X_train)
X_test_processed = preprocessed.transform(X_test)

In [70]:
# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_processed, y_train)

In [71]:
# Make predictions
y_pred = rf_model.predict(X_test_processed)

# Evaluate the model
print(rf_model.score(X_test_processed, y_test))

0.9480207916833266


In [72]:
# Confusion matrix
confusion_matrix(y_test, y_pred)
crosstab = pd.crosstab(y_test, y_pred, rownames = ['Actual'], colnames =['Predicted'], margins = True)
display(crosstab)
print(confusion_matrix(y_test, y_pred))


Predicted,Attrited Customer,Existing Customer,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Attrited Customer,283,92,375
Existing Customer,38,2088,2126
All,321,2180,2501


[[ 283   92]
 [  38 2088]]


In [73]:
# Generate classification report
report = classification_report(y_test, y_pred, zero_division=1)
print(f"Classification Report:\n{classification_report(y_test, y_pred, zero_division=1)}")

Classification Report:
                   precision    recall  f1-score   support

Attrited Customer       0.88      0.75      0.81       375
Existing Customer       0.96      0.98      0.97      2126

         accuracy                           0.95      2501
        macro avg       0.92      0.87      0.89      2501
     weighted avg       0.95      0.95      0.95      2501

