In [None]:
# Data analysis and manipulation
import numpy as np
import pandas as pd

# Data visualization
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns


# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

# Models

from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Metrics

from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.metrics import silhouette_score

import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and data-quality warnings from pandas/seaborn/sklearn.
warnings.filterwarnings("ignore")



In [None]:
# Load the raw dataset from a path relative to the notebook's working directory.
data = pd.read_csv("./data/BankChurners.csv")

# Dataset Exploration and Preprocessing


- Check column info and data types
- Check for duplicates
- Check for null values
- Check Basic statistics of Dataset

In [None]:
pd.set_option('display.max_columns', 23) # Display up to 23 columns before pandas truncates wide frames

# Preview the first rows of the raw data.
data.head()

# Renaming columns to make it more readable

In [None]:
# Map the two very long Naive-Bayes column names to short aliases so they are
# easier to read and to reference in later cells.
column_aliases = {
    "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1": "naive_month_1",
    "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2": "naive_month_2",
}
data.rename(columns=column_aliases, inplace=True)

In [None]:
data.info() # Summarise each column: dtype and non-null count

# Check for null values

In [None]:
data.isnull().sum() # Number of missing values per column

# Check Duplicates

In [None]:
data.duplicated().sum() # Number of rows that exactly repeat an earlier row

# Basic Statistics

In [None]:
pd.options.display.float_format = '{:.2f}'.format  # Show floats with 2 decimal places; this is a global option and stays active for later cells

data.describe().T #  Basic statistics (mean, standard deviation, percentiles, ...), transposed so each column becomes a row

# pd.reset_option('display.float_format') # Reset option back to default

#  Check for Outliers

In [None]:
# Box plots are only meaningful for the numeric (int64 / float64) columns.
columns = [col for col in data.columns if data[col].dtype == "int64" or data[col].dtype == "float64"]

# Draw all box plots on ONE figure, two per row. The previous version opened a new
# 8x30 figure for every column and used a single slot of a 9x2 grid in each, which
# produced many mostly-empty figures and would fail with more than 18 columns.
y_axis = 2
x_axis = max(1, -(-len(columns) // y_axis))  # ceiling division -> number of rows needed

fig, axes = plt.subplots(nrows=x_axis, ncols=y_axis, figsize=(12, 3 * x_axis), squeeze=False)
for ax, col in zip(axes.flat, columns):
    sns.boxplot(data=data[col], ax=ax)
    ax.set_xlabel(col)

# Hide any grid slot left over when the number of columns is odd.
for ax in axes.flat[len(columns):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()


### Apply the IQR (interquartile range) rule to control outliers in the necessary features

In [None]:
def _iqr_bounds(series, k=1.5):
    """Return the (lower, upper) Tukey fences of ``series``.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` with ``IQR = Q3 - Q1``.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1  # the earlier code computed Q3 - Q3 for the first column, i.e. always 0
    return q1 - k * iqr, q3 + k * iqr


# Drop outliers column by column. The columns are processed sequentially, so the
# quantiles of the second column are computed on the rows that survived the first filter.
for _col in ("Total_Amt_Chng_Q4_Q1", "Total_Ct_Chng_Q4_Q1"):
    lower_limit, upper_limit = _iqr_bounds(data[_col])
    # between() is inclusive on both ends, matching the original <= / >= comparison.
    data = data[data[_col].between(lower_limit, upper_limit)]

# Give the filtered frame a fresh, contiguous index so that later positional results
# (arrays returned by scikit-learn) line up with it when assigned back as columns.
data = data.reset_index(drop=True)


### insights


-  The DataFrame has no null values.
- There are no duplicate records in the DataFrame.
- At first glance, the data appears to be within reality, without filling errors in the fields.
- Found outliers in some of the columns,handled for necessary ones.
- The IQR (interquartile range) is a statistical measure of the spread of the data that focuses on the middle 50% of the values (the box in a box plot).

# Data Analysis and Data Visualization

### Customer Demographics Analysis

### Check frequency of different income categories

In [None]:
%matplotlib inline
plt.figure(figsize=(8,7))
sns.countplot(data=data,x="Income_Category")
plt.show()

### Age Distribution

In [None]:
def get_age_type(age):
    """Label an age with a coarse age group to make patterns easier to read.

    Returns "Young Adult" for 18 <= age <= 30, "Middle Aged" for any other
    age up to and including 55, and "Senior" otherwise.

    NOTE(review): ages below 18 also fall into "Middle Aged" because only the
    upper bound is checked for that group — confirm this is acceptable.
    """
    if 18 <= age <= 30:
        return "Young Adult"
    return "Middle Aged" if age <= 55 else "Senior"
# Add a helper column with the age group of every customer (computed row by row from Customer_Age).
data["age_type"] = data.Customer_Age.apply(get_age_type)

### Check frequency of different age groups

In [None]:
# Number of customers in each age group.
fig, ax = plt.subplots(figsize=(8, 7))
sns.countplot(x="age_type", data=data, ax=ax)
plt.show()

### no. of age groups for every income category

In [None]:

# Count customers for every (income category, age group) pair.
income_age_groups = data.groupby(["Income_Category", "age_type"])["CLIENTNUM"].count()

# Flatten the grouped Series so the group keys become ordinary, plottable columns.
income_age = income_age_groups.reset_index()

fig, ax = plt.subplots(figsize=(9, 7))
sns.barplot(data=income_age, x="Income_Category", y="CLIENTNUM", hue="age_type", ax=ax)
ax.set_ylabel("Count")
plt.show()

In [None]:
# Kernel density estimate of customer age.
# `palette` was removed: seaborn ignores it when no `hue` variable is assigned, so the
# fill colour always came from `color`. The column is referenced by name because the
# frame is already passed through `data=`.
sns.kdeplot(data=data, x="Customer_Age", fill=True, color="g", alpha=0.2)
plt.show()

- The age column is approximately normally distributed (bell-shaped curve), i.e. it is roughly symmetric with little skew
- Symmetric distribution: A symmetric distribution is a type of data distribution where the left and right sides of the distribution are equal

### Marital Status Distribution

In [None]:
# Number of customers per marital-status category.
sns.histplot(data=data.Marital_Status)
plt.xlabel("Marital Status",fontsize=18)  # fixed the axis-label typo ("Martial")
plt.show()

- Marital status is a categorical column, so this plot shows the number of customers in each category rather than a bell-shaped (normal) distribution

### Income Distribution

In [None]:
# Number of customers per income category, on a wide figure so the labels do not overlap.
fig, ax = plt.subplots(figsize=(15, 10))
sns.histplot(data=data.Income_Category, ax=ax)
ax.set_xlabel("Income Category", fontsize=18)
plt.show()

- We found distribution of Income Category column as positively skewed distribution
- Positively skewed distribution is a type of distribution which is longer on the right side of its peak than its left side

# Credit Usage Analysis

### Does Customer Age affects Average Utilization Ratio

In [None]:
# Scatter plot with a fitted regression line: utilization ratio against customer age.
fig, ax = plt.subplots()
sns.regplot(x="Customer_Age", y="Avg_Utilization_Ratio", data=data, ax=ax)
plt.show()

#### - Insights
- The average utilization ratio decreases from age 65 onwards

### How Average open to buy changes with customer age

In [None]:
# Average open-to-buy amount as a function of customer age.
fig, ax = plt.subplots()
sns.lineplot(x="Customer_Age", y="Avg_Open_To_Buy", data=data, ax=ax)
plt.show()

#### - Insights
- Avg_open_to_buy first increases with customer age then decreases with increase in age

In [None]:
# Count customers for every (attrition flag, gender) pair; the result is a Series with a two-level index.
attrition_gender = data.groupby(["Attrition_Flag","Gender"]).CLIENTNUM.count()

In [None]:
# Flatten the grouped counts into a new variable instead of rebinding `attrition_gender`.
# Rebinding made this cell non-idempotent: a second run would reset the index of an
# already-flat frame and add a spurious "index" column.
attrition_gender_counts = attrition_gender.reset_index()

sns.barplot(x="Attrition_Flag",hue="Gender",y="CLIENTNUM",data=attrition_gender_counts)
plt.ylabel("Count")
plt.show()

#### Insights

- The plot above shows that more of the attrited customers are women than men

In [None]:
# Summary statistics for the two credit-usage columns analysed below.
data[["Credit_Limit","Total_Revolving_Bal"]].describe()

In [None]:
# Histogram of credit limits (20 bins).
ax = data['Credit_Limit'].hist(bins=20, edgecolor='black')
ax.set_title('Distribution of Credit Limit')
ax.set_xlabel('Credit Limit')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Histogram of total revolving balances (20 bins).
ax = data['Total_Revolving_Bal'].hist(bins=20, edgecolor='black')
ax.set_title('Distribution of Total Revolving Balance')
ax.set_xlabel('Total Revolving Balance')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Number of customers holding each card category, most frequent first.
card_category_counts = data['Card_Category'].value_counts()

fig, ax = plt.subplots()
sns.barplot(x=card_category_counts.index, y=card_category_counts.values, ax=ax)
ax.set_title('Distribution of Customers by Card Category')
ax.set_xlabel('Card Category')
ax.set_ylabel('Count of Customers')
plt.show()

# Check Correlation between different features

In [None]:
numerical_demographics = ['Customer_Age', 'Dependent_count']  # Adjust as needed
credit_columns = ['Credit_Limit', 'Total_Revolving_Bal']

# Pairwise correlations between the demographic and the credit columns.
corr = data[numerical_demographics + credit_columns].corr()

# Heatmap visualization, with the coefficient written in every cell
fig, ax = plt.subplots()
sns.heatmap(corr, annot=True, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:

# Customer age vs credit limit
fig, ax = plt.subplots()
sns.lineplot(x="Customer_Age", y="Credit_Limit", data=data, ax=ax)
ax.set_xlabel("Customer Age")
ax.set_ylabel("Credit Limit")
ax.set_title("Credit Limit VS Customer Age")
plt.show()

# Customer age vs total revolving balance
fig, ax = plt.subplots()
ax.scatter(data['Customer_Age'], data['Total_Revolving_Bal'])
ax.set_xlabel('Customer Age')
ax.set_ylabel('Total Revolving Balance')
ax.set_title('Total Revolving Balance vs. Customer Age')
plt.show()

# Preprocessing

In [None]:
# Remove the identifier, the two Naive-Bayes columns and the helper age-group column
# before modelling. The result is rebound instead of dropped in place, and
# errors="ignore" keeps the cell re-runnable: the in-place version raised KeyError
# on a second run because the columns were already gone.
data = data.drop(columns=["CLIENTNUM","naive_month_2","naive_month_1","age_type"], errors="ignore")

In [None]:
# Split the remaining columns by dtype: float64/int64 columns are treated as
# numerical features, object columns as categorical features.
numeric_dtypes = ("float64", "int64")
numerical_cols = [col for col, dtype in data.dtypes.items() if dtype in numeric_dtypes]

categorical_cols = [col for col, dtype in data.dtypes.items() if dtype == "object"]

In [None]:
# ColumnTransformer applies a separate preprocessing step to each group of columns:
# the categorical columns are one-hot encoded and the numerical columns are standardised.
# NOTE(review): categorical_cols still contains "Attrition_Flag" at this point (it is only
# removed later, before predictive modelling), so the attrition label is part of the
# clustering features — confirm this is intended.
preprocessor = ColumnTransformer(
[
    ("one-hot",OneHotEncoder(handle_unknown="ignore"),categorical_cols),
    ("standard",StandardScaler(),numerical_cols)
])
preprocessor.fit_transform(data)

In [None]:
# Wrap the transformed matrix in a DataFrame with the transformer's feature names.
# NOTE(review): this refits the preprocessor and presumably relies on a dense output — confirm.
scaled_df = pd.DataFrame(preprocessor.fit_transform(data),columns=preprocessor.get_feature_names_out())

# Check optimal no. of Clusters

### Elbow Method 

In [None]:
# Fit KMeans for k = 1..10 and record the inertia of each fit for the elbow plot.
inertia = []
for n_clusters in range(1, 11):
    model = KMeans(n_clusters=n_clusters, init="k-means++", random_state=40, max_iter=300).fit(scaled_df)
    inertia.append(model.inertia_)
    

### Silhouette Score

- Silhouette score (S) tells how well a data point lies within its assigned cluster. it ranges from -1 to 1

In [None]:
# Fit KMeans for k = 2..10 (the silhouette score needs at least two clusters)
# and record the mean silhouette score of each fit.
silhouet_score = []
for n_clusters in range(2, 11):
    model = KMeans(n_clusters=n_clusters, init="k-means++", random_state=40, max_iter=300).fit(scaled_df)
    silhouet_score.append(silhouette_score(scaled_df, model.labels_))

In [None]:
# Plot inertia against the number of clusters it was computed for. The list starts at
# k = 1, so the x values are 1..len(inertia). The earlier plot used the list index
# (0..9) on the x axis and labelled that axis "Inertia".
cluster_range = range(1, len(inertia) + 1)
plt.plot(cluster_range, inertia, marker="o")
plt.xlabel("Clusters")
plt.ylabel("Inertia")
plt.title("Elbow Method")
plt.show()

In [None]:
# Silhouette score for each candidate number of clusters (2..10).
fig, ax = plt.subplots()
ax.plot(range(2, 11), silhouet_score)
ax.set_xlabel("Clusters")
ax.set_title("Silhouette Method")
plt.show()

# Insights

- In the silhouette plot, the silhouette score is high at 4 clusters
- we will take optimal no. of clusters as 4



# Model Building

In [None]:
# Final KMeans model with the chosen number of clusters (4), fitted on the preprocessed features.
model = KMeans(n_clusters=4,init="k-means++",random_state=40,max_iter=300)
model.fit(scaled_df)

In [None]:
# Inertia of the fitted 4-cluster model (sum of squared distances of samples to their closest centre).
model.inertia_

In [None]:
# Cluster index assigned to each row of scaled_df.
model.labels_

In [None]:
# Coordinates of the four cluster centres in the preprocessed feature space.
cluster_centers = model.cluster_centers_

In [None]:
# Keep the per-row cluster assignments of the stand-alone KMeans model.
labels = model.labels_

In [None]:
# Quick look at the frame that is passed to the clustering pipeline.
data.head(4)

In [None]:
# Columns still present after dropping the identifier and helper columns.
data.columns

# PCA


- PCA (principal component analysis) is a technique used in data analysis to simplify complex datasets. It reduces dimensionality by projecting the data onto the directions (components) that capture the most variation.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

In [None]:

# End-to-end clustering pipeline: preprocess -> project onto 2 principal components -> KMeans with 4 clusters.
# NOTE(review): KMeans uses random_state=42 here, while the stand-alone KMeans fits in this
# notebook use random_state=40 — confirm the difference is intentional.
pipeline = Pipeline(
[
    ("preprocessing",preprocessor),
    ("pca",PCA(n_components=2,random_state=40)),
    ("clustering",KMeans(n_clusters=4,init="k-means++",random_state=42,max_iter=300)),
])


In [None]:
# Fit every step of the pipeline (preprocessing, PCA, KMeans) on the full frame.
pipeline.fit(data)

In [None]:
# Work on a copy of the data that carries the cluster assigned by the pipeline's
# KMeans step as an extra "cluster" column; `data` itself is left untouched.
df = data.assign(cluster=pipeline["clustering"].labels_)

In [None]:
# Check that the new "cluster" column has been added.
df.head(4)

In [None]:
# Number of customers assigned to each cluster, largest cluster first

cluster_count = df["cluster"].value_counts()

In [None]:
# Display the cluster sizes.
cluster_count

In [None]:
# Share of customers in each cluster.
fig, ax = plt.subplots(figsize=(5, 5))
ax.pie(
    cluster_count,
    labels=cluster_count.index,
    autopct="%1.1f%%",
    startangle=140,
    labeldistance=1.1,
)
ax.set_title("Cluster Distribution")
plt.show()

- Most of the customers either fall in 0 or 1 cluster

In [None]:
# Project the data onto the two principal components with the ALREADY FITTED
# preprocessing + PCA steps. A pipeline slice shares its step objects with the full
# pipeline, so the previous fit_transform() refitted those shared steps after KMeans
# had been trained on their output; transform() reuses the fitted state instead.
projection_steps = pipeline[:-1]
df_pca = pd.DataFrame(
    projection_steps.transform(data),
    columns=projection_steps.get_feature_names_out(),
)
df_pca.head(5)

In [None]:
# Cluster centres found by the pipeline's KMeans step (expressed in the 2-component PCA space it was fitted on).
centroids = pipeline["clustering"].cluster_centers_

In [None]:
# Display the centre coordinates.
centroids

In [None]:
# Share of the total variance captured by each of the two principal components.
variance_ratios = pipeline["pca"].explained_variance_ratio_[:2]
for component_no, ratio in enumerate(variance_ratios, start=1):
    print(f'Explained Variance Ratio of component {component_no}: {round(ratio,2)}')

In [None]:
# Attach the cluster assignments (a positional array) to the PCA coordinates for plotting.
df_pca["cluster"] = pipeline["clustering"].labels_

In [None]:
# Scatter plot of the customers in PCA space, coloured by assigned cluster.
fig, ax = plt.subplots(figsize=(8, 8))

sp = sns.scatterplot(
    data=df_pca,
    x="pca0",
    y="pca1",
    hue="cluster",
    palette='tab10',
    s=50,
    ax=ax,
)

ax.set_title("Clustering results from credit card use", fontsize=20)
# Place the legend outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)

plt.show()

# Segment Analysis

## Analyze the clusters

#### Attrition Flag

In [None]:
# One horizontal bar chart per cluster with the attrition-flag counts.
# Cluster ids are 0-based in the data but shown 1-based in the panel titles.
fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(20, 14))

for cluster_id, ax in enumerate(axs.flat):
    flag_counts = df.loc[df['cluster'] == cluster_id, 'Attrition_Flag'].value_counts().sort_index()
    flag_counts.plot.barh(ax=ax, title=f'Cluster {cluster_id + 1}')

plt.show()

## Insights

- It looks like clusters 2 and 3 have the highest customer attrition rates, whereas cluster 1 and cluster 4 have low attrition

In [None]:
# Standardise the numerical columns so the cluster means are comparable across features.
scaler = StandardScaler()
# Build the frame on data's own index. Without index=..., the new frame gets a
# 0..n-1 RangeIndex, and assigning df["cluster"] (which carries the index left over
# from the outlier filtering) aligns on labels, silently producing NaN or mismatched
# cluster values.
scaled_df = pd.DataFrame(
    scaler.fit_transform(data[numerical_cols]),
    columns=numerical_cols,
    index=data.index,
)
scaled_df["cluster"] = df["cluster"]

In [None]:
# Mean of every standardised numerical feature, one panel per cluster.
# Cluster ids are 0-based in the data but shown 1-based in the panel titles.
fix, axs = plt.subplots(ncols=2, nrows=2, figsize=(20, 32))

for cluster_id, ax in enumerate(axs.flat):
    cluster_means = scaled_df.loc[scaled_df['cluster'] == cluster_id, numerical_cols].mean()
    cluster_means.plot.barh(ax=ax, xlim=(-1.5, 3), figsize=(20, 20), sharey=True, title=f'Cluster {cluster_id + 1}')

plt.show()

### Insights

#### Group 1

- Looks like total transaction amount and total transaction count is high, while as Total relationship count is low

#### Group 2

- Looks like total transaction amount and total transaction count is high, while as credit limit,Average open to buy,total relationship count and months on book are low


#### Group 3

- Almost everything looks neutral


#### Group 4

- Looks like total transaction amount, total transaction count, months inactive and dependent count are higher, whereas total relationship count, total count change (Q4 over Q1) and total amount change (Q4 over Q1) are low

In [None]:
# Preview the standardised numerical features together with the cluster column.
scaled_df.head(4)

#### Customer Age

In [None]:
# Age histogram per cluster, with identical 5-year bins over 20-70 so the panels are comparable.
fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(12, 14))

for cluster_id, ax in enumerate(axs.flat):
    cluster_ages = df.loc[df['cluster'] == cluster_id, 'Customer_Age']
    sns.histplot(cluster_ages, ax=ax, binrange=(20,70), binwidth=5).set(title=f'Cluster {cluster_id + 1}', ylabel='', xlabel='')
plt.show()

### Insights

- Almost every cluster have high age count between 40 to 55
- Cluster 1 have most of the customers in age group 40 to 50

In [None]:
#### Gender

# Gender split per cluster as pie charts; categories are sorted in descending
# label order so every panel uses the same wedge order.
fix, axs = plt.subplots(ncols=2, nrows=2, figsize=(12, 14))

for cluster_id, ax in enumerate(axs.flat):
    gender_counts = df.loc[df['cluster'] == cluster_id, 'Gender'].value_counts().sort_index(ascending=False)
    gender_counts.plot.pie(ax=ax, ylabel='', title=f'Cluster {cluster_id + 1}', autopct='%.1f%%')

plt.show()

#### Marital Status

In [None]:
# Marital-status split per cluster; the same colour list is used in every panel.
fix, axs = plt.subplots(ncols=2, nrows=2, figsize=(12, 14))
wedge_colors = ['r','b','g','y']

for cluster_id, ax in enumerate(axs.flat):
    status_counts = df.loc[df['cluster'] == cluster_id, 'Marital_Status'].value_counts().sort_index()
    status_counts.plot(kind='pie', ax=ax, ylabel='', title=f'Cluster {cluster_id + 1}', colors=wedge_colors, autopct='%.1f%%')

plt.show()

#### Insights

- Cluster 1 have highest rate  of married % compared with other clusters

#### Income Level

In [None]:
# Income-category split per cluster; the same colour list is used in every panel.
fix, axs = plt.subplots(ncols=2, nrows=2, figsize=(12, 14))
wedge_colors = ['r','b','g','gray','y','pink']

for cluster_id, ax in enumerate(axs.flat):
    income_counts = df.loc[df['cluster'] == cluster_id, 'Income_Category'].value_counts().sort_index()
    income_counts.plot(kind='pie', ax=ax, ylabel='', title=f'Cluster {cluster_id + 1}', colors=wedge_colors, autopct='%.1f%%')
plt.show()

#### Insights

- Cluster 2 has the highest income level, with 27.2% of its customers in the $120K+ category

- Cluster 1 has the lowest income level, because 55.5% of its customers earn less than $40K

#### Card Type

In [None]:
# Card-category counts per cluster as horizontal bar charts.
fix, axs = plt.subplots(ncols=2, nrows=2, figsize=(12, 14))

for cluster_id, ax in enumerate(axs.flat):
    card_counts = df.loc[df['cluster'] == cluster_id, 'Card_Category'].value_counts().sort_index()
    card_counts.plot.barh(ax=ax, ylabel='', title=f'Cluster {cluster_id + 1}')

plt.show()

#### Insights
- In every cluster, almost all users hold the Blue card type
- Cluster 2 has a good number of users with the Silver card type

# Predictive Modeling

### Prepare Data

In [None]:
encoder = LabelEncoder()  # Label encoder turns the target's class labels into integers
# Overwrites the target column in place with its integer codes; the label -> code
# mapping can be read back from encoder.classes_.
data["Attrition_Flag"] = encoder.fit_transform(data.Attrition_Flag)

In [None]:
# Separate the encoded target from the feature columns.
y = data["Attrition_Flag"]
X = data.drop(columns="Attrition_Flag")

### Split Data

In [None]:
# Hold out 30% of the rows for validation; the fixed random_state makes the split reproducible.
# NOTE(review): the split is not stratified on y — consider stratify=y if the classes are imbalanced.
X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size=0.3,random_state=42)

In [None]:
# The target must not be one-hot encoded as a feature. The list is rebuilt instead of
# calling list.remove(), which raised ValueError when the cell was run a second time
# (the entry was already gone); this form can be re-run safely.
categorical_cols = [col for col in categorical_cols if col != "Attrition_Flag"]

In [None]:
# Preprocessing for the classifiers: one-hot encode the categorical features
# (categories unseen during fit are ignored) and standardise the numerical ones.
encode_categoricals = ("onehot", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
scale_numericals = ("standard", StandardScaler(), numerical_cols)

preprocessor = ColumnTransformer([encode_categoricals, scale_numericals])

### Model Evaluation

In [None]:
# Utility to train a model and report its performance on the validation split.
# A Pipeline chains preprocessing and the estimator so both are fitted together.

def train_test_model(model,model_name):
    """Fit ``model`` behind the shared preprocessor and report validation metrics.

    Builds a two-step Pipeline (the notebook-level ``preprocessor`` followed by
    ``model``), fits it on ``X_train``/``y_train``, predicts on ``X_valid`` and
    prints the accuracy (rounded to 2 decimals) and the classification report,
    then shows the confusion matrix as an annotated heatmap.

    Parameters
    ----------
    model : estimator
        Unfitted estimator exposing ``fit`` and ``predict``.
    model_name : str
        Name used in the printed accuracy line.

    Returns
    -------
    None
    """
    steps = [
        ("preprocessor", preprocessor),
        ("model", model),
    ]
    fitted_pipeline = Pipeline(steps=steps)
    fitted_pipeline.fit(X_train, y_train)
    predictions = fitted_pipeline.predict(X_valid)

    accuracy = round(accuracy_score(y_valid, predictions), 2)
    report = classification_report(y_valid, predictions)
    matrix = confusion_matrix(y_valid, predictions)

    print("*"*20)
    print(f"Model {model_name} accuracy is {accuracy}")
    print("Classification Report")
    print(report)
    print("Confusion Matrix")
    sns.heatmap(matrix, annot=True, fmt="d")
    plt.show()

In [None]:
# Evaluate each candidate classifier with the same preprocessing and data split.
candidate_models = [
    (LogisticRegression(), "Logistic Regression"),
    (DecisionTreeClassifier(), "Decision Tree Classifier"),
    (RandomForestClassifier(), "Random Forest Classifier"),
]

for estimator, display_name in candidate_models:
    train_test_model(estimator, display_name)

# Strength and Weakness of Models used

### Logistic Regression

#### Strengths:

-    Understand feature influence through coefficients.
-    Easy to understand and implement
-    training is fast and it's suitable for large datasets.
-    Works well with binary classification.

#### Weaknesses:

-    Limited to binary classification.
-    May struggle with non-linear relationships.
-    Prone to overfitting
-    Limited ability to handle complex relationships between features

### Decision Tree Classifier

#### Strengths:

-    Handles complex, non-linear relationships between features and the target variable effectively, often without requiring explicit feature engineering.
-    Decision trees are insensitive to feature scaling. We don't need to standardize or normalize your data beforehand, simplifying the preprocessing step.
-    Less sensitive to outliers in the data compared to many other models.
-    It can handle both categorical and numerical features directly, without the need for separate encoding of categorical features.

#### Weaknesses:

-   Prone to overfitting the training data if not pruned or regularized.Leads to poor performance on unseen data.
-   Small changes in the training data can lead to significant changes in the tree structure, resulting in high variance and potentially unstable models.
-    Predicting new data points can be computationally expensive compared to simpler models like logistic regression

### Random Forest Classifier


#### Strengths:

-    Makes good predictions by combining many decision trees.
-    Works well with different data types and can handle missing values.
-    Less prone to overfitting: Harder to learn the training data too closely.

#### Weaknesses:

-    Difficult to understand how it arrives at specific predictions.
-    Training large datasets with many trees can be time-consuming.
-    Making predictions can be slower than some simpler models.



# How model performance can be improved

- Feature engineering, e.g. feature selection

- Regularization techniques such as L1 (Lasso) and L2 (Ridge) can be used to reduce overfitting

- Hyperparameter tuning can be done on the models

- Testing the models on different subsets of the dataset, using cross-validation scores and K-fold splitting, to get a more reliable estimate of the actual accuracy