# Import dependencies

In [53]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [57]:
# Apply the 'fivethirtyeight' matplotlib style sheet to every figure drawn below.
plt.style.use("fivethirtyeight")

## Read the data

In [19]:
# Load the BankChurners CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    '/kaggle/input/credit-card-customers/BankChurners.csv'
)

In [15]:
# Preview the first five rows.
df.head(n=5)

## How many values do we have?

In [31]:
# df.shape is (rows, columns); unpack it straight into the message.
print('There are {} rows with {} columns'.format(*df.shape))

## What columns do we have?

In [21]:
# Display the column labels of the loaded frame.
df.columns

## Understanding each column

* CLIENTNUM: Unique ID for each customer.
* Attrition_Flag: If the customer account is open or not.
* Customer_Age: The age of the customer.
* Gender: The gender of the customer (M/F)
* Dependent_count: Number of dependents.
* Education_Level: Qualification of the customer.
* Marital_Status: Married, Single, Divorced, Unknown.
* Income_Category: Range of income.
* Card_Category: Customer card category.
* Months_on_book: Number of months the customer has had an account with the bank.
* Total_Relationship_Count: Number of products by customer.
* Months_Inactive_12_mon: Number of months inactive in the last 12 months.
* Contacts_Count_12_mon: Number of contacts in the last 12 months.
* Credit_Limit: Customer's credit limit.
* Total_Revolving_Bal: Revolving balance of the credit card.
* Avg_Open_To_Buy: Open to buy on credit card.
* Total_Amt_Chng_Q4_Q1: Change in transaction amount, Q4 to Q1.
* Total_Trans_Amt: Total amount of transactions.
* Total_Trans_Ct: Total count of transactions.
* Total_Ct_Chng_Q4_Q1: Change in transaction count, Q4 to Q1.
* Avg_Utilization_Ratio: Ratio-Average utilization of the credit card.


Let's drop the last two columns (the two `Naive_Bayes_Classifier_...` columns), as indicated in the dataset info.

In [20]:
# Drop the two precomputed Naive Bayes classifier columns.
# They are selected by their shared prefix and the frame is rebound instead of
# mutated in place, so re-running this cell is a no-op rather than a KeyError.
naive_bayes_cols = [col for col in df.columns if col.startswith('Naive_Bayes_Classifier_')]
df = df.drop(columns=naive_bayes_cols)

## Do we have nan values?

In [32]:
# Show every row that has a missing value in at least one column.
df[df.isnull().any(axis='columns')]

# Let's explore the statistics of the data

In [40]:
# Summary statistics of the non-object columns, rounded to two decimals,
# transposed so each column of df becomes a row.
df.describe(exclude='object').round(2).T

In [41]:
# Summary of the remaining columns (everything that is not float or int64),
# transposed so each column of df becomes a row.
df.describe(exclude=['float', 'int64']).round(2).T

### The most common values are existing customers, females, graduate education, married, with an income lower than 40k and a blue card category.

## Exploring each column

### Histogram of every column, starting with CLIENTNUM (the unique ID of each customer).

In [72]:
# Draw a histogram of every column and annotate each bar with its count.
your_bins = 6  # same bin count for every column, so set it once outside the loop
for attr in df.columns:
    plt.figure(figsize=(10,5))
    # plt.hist returns (counts, bin_edges, patches); only the first two are needed.
    counts, edges, _ = plt.hist(df[attr], bins=your_bins)
    plt.title(f'{attr}')
    # Place each label at the bar's left edge and at the bar's height.
    # zip stops after the last count, so the final (right-most) edge is unused.
    for left_edge, count in zip(edges, counts):
        plt.text(left_edge, count, str(count))
    plt.show()

## There do not seem to be any outliers in the data.

## Convert target variable to numerical.

In [82]:
# List the distinct labels present in the target column.
df.loc[:, 'Attrition_Flag'].unique()

In [84]:
# Encode the target as 0 (existing) / 1 (attrited).
# The replacement is restricted to the Attrition_Flag column so that a matching
# string in any other column can never be rewritten by accident. Using replace
# (rather than map) leaves already-encoded values untouched on a re-run.
df['Attrition_Flag'] = df['Attrition_Flag'].replace(
    {'Existing Customer': 0, 'Attrited Customer': 1}
)

In [89]:
# Spot-check five randomly chosen target values after encoding.
df['Attrition_Flag'].sample(n=5)

# Let's explore the correlation between columns.

In [78]:
import seaborn as sns

In [91]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
# Numeric columns are selected explicitly: the frame still holds string
# (categorical) columns at this point, and newer pandas versions raise instead
# of silently skipping them.
corrmap = df.select_dtypes(include='number').corr()
cols = corrmap.index

plt.figure(figsize=(20,15))

# Reuse the matrix computed above instead of recomputing df[cols].corr().
g = sns.heatmap(corrmap, annot=True, linewidths=3.5,
    linecolor='white')

# Correlation of each column with the target variable.

In [103]:
# Bar chart of each numeric feature's correlation with the target.
plt.figure(figsize=(20,5))
# Keep only numeric columns: the categorical columns are still strings here and
# cannot be correlated with the 0/1 target.
numeric_features = df.select_dtypes(include='number').drop(columns='Attrition_Flag')
chart_df = numeric_features.corrwith(df['Attrition_Flag']).to_frame(name='corr')
sns.barplot(y=chart_df.index, x=chart_df['corr'], zorder=3, edgecolor='black', linewidth=3)

## The most correlated are the total number of transactions, the change in transaction count between quarters, the utilization ratio, the contacts count and the total revolving balance.

# Let's encode the categorical features.

In [122]:
# One-hot encode the five categorical columns, keeping every level
# (drop_first=False), then report the shape of the widened frame.
df_new = pd.get_dummies(
    df,
    columns=[
        'Gender',
        'Education_Level',
        'Marital_Status',
        'Income_Category',
        'Card_Category',
    ],
    drop_first=False,
)
print(df_new.shape)

## Take a look at the correlations.

In [123]:
# Correlation of every encoded feature with the target, as a horizontal bar chart.
plt.figure(figsize=(20,10))
chart_df = (
    df_new.drop(columns='Attrition_Flag')
    .corrwith(df_new['Attrition_Flag'])
    .to_frame(name='corr')
)
sns.barplot(x=chart_df['corr'], y=chart_df.index, zorder=3, edgecolor='black', linewidth=3)

# Let's start the modeling

## Scale the data (min-max normalization)

In [120]:
from sklearn.preprocessing import MinMaxScaler

In [132]:
# Display the column labels of the one-hot encoded frame.
df_new.columns

In [133]:
# Rebuild df from the encoded frame with the target column moved to the end.
df = df_new.loc[:, [
    # identifier and numeric features
    'CLIENTNUM', 'Customer_Age', 'Dependent_count', 'Months_on_book',
    'Total_Relationship_Count', 'Months_Inactive_12_mon',
    'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
    'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
    'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
    # gender dummies
    'Gender_F', 'Gender_M',
    # education dummies
    'Education_Level_College', 'Education_Level_Doctorate',
    'Education_Level_Graduate', 'Education_Level_High School',
    'Education_Level_Post-Graduate', 'Education_Level_Uneducated',
    'Education_Level_Unknown',
    # marital status dummies
    'Marital_Status_Divorced', 'Marital_Status_Married',
    'Marital_Status_Single', 'Marital_Status_Unknown',
    # income dummies
    'Income_Category_$120K +', 'Income_Category_$40K - $60K',
    'Income_Category_$60K - $80K', 'Income_Category_$80K - $120K',
    'Income_Category_Less than $40K', 'Income_Category_Unknown',
    # card category dummies
    'Card_Category_Blue', 'Card_Category_Gold', 'Card_Category_Platinum',
    'Card_Category_Silver',
    # target, kept last
    'Attrition_Flag',
]]

In [134]:
# Min-max scale every feature column (everything except the target).
# fit_transform already returns the scaled array, so a separate transform
# pass over the same data is not needed.
# NOTE(review): the scaler is fitted on the full data set before the train/test
# split, and CLIENTNUM (an ID) is scaled as a feature -- confirm both are intended.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(df.drop('Attrition_Flag',axis=1))

In [135]:
# Display the scaled feature array produced by the scaler.
scaled_features

In [136]:
# Wrap the scaled array back into a DataFrame.
# Column labels are taken by dropping the target by name (not by assuming it is
# the last column), and df's index is kept so the rows of `scaled` stay aligned
# with df['Attrition_Flag'].
scaled = pd.DataFrame(
    scaled_features,
    columns=df.columns.drop('Attrition_Flag'),
    index=df.index,
)
scaled.head(5)

# We now have our X (features) and y (target).
## Let's split the data into training and test sets.

In [137]:
# Features are the scaled frame; the target is the 0/1 attrition flag.
X, y = scaled, df['Attrition_Flag']

In [139]:
from sklearn.model_selection import train_test_split

In [140]:
# Hold out 30% of the rows for testing.
# stratify=y keeps the attrited/existing ratio the same in both splits,
# consistent with the StratifiedKFold used for cross-validation below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [142]:
# Report how many rows ended up in each split.
print('There are {} training examples and {} test examples.'.format(X_train.shape[0], X_test.shape[0]))

# Let's test different tree algorithms to find the best one.

In [143]:
from sklearn.ensemble import RandomForestClassifier ,AdaBoostClassifier,BaggingClassifier,ExtraTreesClassifier,GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve ,KFold
from sklearn.metrics import roc_curve,accuracy_score,f1_score,auc,confusion_matrix,roc_auc_score,plot_confusion_matrix
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.tree import DecisionTreeClassifier

## Create folds 

In [144]:
# Eight shuffled, stratified folds with a fixed seed.
kfold = StratifiedKFold(
    n_splits=8,
    shuffle=True,
    random_state=42,
)

In [145]:
# Candidate tree-based classifiers, all seeded with the same random state.
rs = 142
clrs = [
    AdaBoostClassifier(random_state=rs),
    GradientBoostingClassifier(random_state=rs),
    RandomForestClassifier(random_state=rs),
    ExtraTreesClassifier(random_state=rs),
    DecisionTreeClassifier(random_state=rs),
]


## Get the accuracy, mean, std of each model.

In [146]:
# Cross-validated accuracy of every candidate on the training split,
# one array of per-fold scores per classifier (same order as clrs).
cv_results = [
    cross_val_score(model, X_train, y_train, scoring='accuracy', cv=kfold, n_jobs=-1)
    for model in clrs
]

# Mean and standard deviation of the per-fold scores.
cv_means = [fold_scores.mean() for fold_scores in cv_results]
cv_std = [fold_scores.std() for fold_scores in cv_results]

In [164]:
# Summary table of the cross-validation results.
# The labels are derived from the classifiers in clrs themselves, so each score
# is guaranteed to sit next to the model that produced it (a hand-written label
# list can silently drift out of order with clrs).
cv_df = pd.DataFrame({
    "CrossVal_Score_Means": cv_means,
    "CrossValerrors": cv_std,
    "Algo": [type(model).__name__ for model in clrs],
})
cv_df

# Let's use AdaBoostClassifier 🌳

In [165]:
 from sklearn.ensemble import AdaBoostClassifier

In [166]:
# Fit a seeded 100-estimator AdaBoost model on the training split.
clf = AdaBoostClassifier(
    n_estimators=100,
    random_state=42,
)
clf.fit(X=X_train, y=y_train)

# Test the trained model.

In [167]:
# Predict the attrition flag for the held-out rows.
y_pred = clf.predict(X=X_test)

## Explore the results

In [176]:
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. Passing them the other way round transposes the
# matrix, swapping false positives with false negatives.
cm = confusion_matrix(y_test, y_pred)

In [178]:
# Draw the confusion matrix as a heatmap.
sns.heatmap(
    cm,
    cmap='Blues',
    annot=True,  # write the count inside each cell
    fmt='g',     # general number format, i.e. no scientific notation
);

In [189]:
# Classification report as a dict, shown as a table with one row per
# class / aggregate entry.
c_r = classification_report(y_test, y_pred, output_dict=True)
cr_df = pd.DataFrame(c_r).T
cr_df

In [195]:
# Test-set accuracy, rounded to four decimals.
print('The accuracy of the model is {}'.format(round(accuracy_score(y_test, y_pred), 4)))

# The result is great! Let's try to make it better with tuning

In [199]:
from sklearn.tree import DecisionTreeClassifier
# Baseline decision tree. A fixed random_state makes the fitted tree (and the
# grid search that clones this estimator) reproducible across runs, in line
# with the seeded models used elsewhere in the notebook.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)
# NOTE: this overwrites the y_pred produced by the AdaBoost model `clf`.
y_pred = dtc.predict(X_test)

In [200]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid for the decision tree.
grid_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': range(2, 10),
    'min_samples_leaf': range(2, 10),
}

# Exhaustive 5-fold search over the grid, using all available cores.
grid_search = GridSearchCV(
    estimator=dtc,
    param_grid=grid_params,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

In [201]:
# Replace dtc with the best tree found by the search and score it on the test split.
dtc = grid_search.best_estimator_
y_pred = dtc.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

In [202]:
# Best parameter combination and its mean cross-validated score, one per line.
print(grid_search.best_params_, grid_search.best_score_, sep='\n')

In [204]:
# AdaBoost on top of the tuned decision tree. A fixed random_state makes the
# search results reproducible, in line with the seeded models used elsewhere.
ada = AdaBoostClassifier(base_estimator = dtc, random_state = 42)

# Hyper-parameter grid for the boosting stage.
parameters = {
    'n_estimators' : [120, 180, 200],
    'learning_rate' : [0.1, 1, 10],
    'algorithm' : ['SAMME', 'SAMME.R']
}

# Exhaustive 10-fold search over the grid, using all available cores.
grid_search = GridSearchCV(ada, parameters, n_jobs = -1, cv = 10, verbose = 1)
grid_search.fit(X_train, y_train)

In [205]:
# Best boosting parameters and their mean cross-validated score, one per line.
print(grid_search.best_params_, grid_search.best_score_, sep='\n')

## Train with these parameters

In [206]:
# Final model: AdaBoost over the tuned tree with hand-picked hyper-parameters.
# A fixed random_state makes the reported metrics and feature importances
# reproducible across runs.
ada = AdaBoostClassifier(
    base_estimator = dtc,
    algorithm = 'SAMME',
    learning_rate = 1,
    n_estimators = 180,
    random_state = 42,
)
ada.fit(X_train, y_train)

In [207]:
# Confusion matrix of the final AdaBoost model on the test split,
# drawn without grid lines.
plot_confusion_matrix(
    ada, X_test, y_test,
    display_labels=['Attritionn_0', 'Attritionn_1'],
    cmap=plt.cm.Blues,
)
plt.grid(False)
plt.show();

In [208]:
# Report on the final AdaBoost model. y_pred was last assigned from the tuned
# decision tree, so predictions are taken from `ada` directly here.
print(classification_report(y_test, ada.predict(X_test)))

In [209]:
# Accuracy of the final AdaBoost model. y_pred was last assigned from the tuned
# decision tree, so predictions are taken from `ada` directly here.
print(accuracy_score(y_test, ada.predict(X_test)))


# How important was each feature?

In [210]:
# Feature importances of the final model, labelled by column name and
# sorted from most to least important.
feature = (
    pd.Series(ada.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
)
print(feature)

In [211]:
# Horizontal bar chart of the sorted feature importances.
plt.figure(figsize=(28, 14))
importance_ax = sns.barplot(x=feature, y=feature.index, color='#0078d7')
importance_ax.set_title("Feature Importance")
importance_ax.set_xlabel('Score')
importance_ax.set_ylabel('Features')
plt.show()