# Imbalanced Classification

**Problem Statement**

Beta Bank customers are leaving: little by little, chipping away every month. The bankers figured out it’s cheaper to save the existing customers rather than to attract new ones.
We need to predict whether a customer will leave the bank soon. You have the data on clients’ past behavior and termination of contracts with the bank.
Build a model with the maximum possible F1 score. To pass the project, you need an F1 score of at least 0.59. Check the F1 for the test set.
Additionally, measure the AUC-ROC metric and compare it with the F1.


In [None]:
import pandas as pd

# Load the customer dataset from its hosted CSV and preview the first rows.
DATA_URL = "https://bit.ly/2XZK7Bo"
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


In [None]:
# Show the dtype of every column.
df.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure             float64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [None]:
# Count the missing values in each column.
df.isnull().sum()

RowNumber            0
CustomerId           0
Surname              0
CreditScore          0
Geography            0
Gender               0
Age                  0
Tenure             909
Balance              0
NumOfProducts        0
HasCrCard            0
IsActiveMember       0
EstimatedSalary      0
Exited               0
dtype: int64

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [None]:
# Re-count missing values per column to confirm none remain.
df.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Columns left out of the feature matrix: the label itself, the row/customer
# identifiers, and the text-valued columns (Surname, Geography, Gender).
NON_FEATURE_COLUMNS = [
    'Exited', 'RowNumber', 'CustomerId', 'Surname', 'Geography', 'Gender',
]

target = df['Exited']
features = df.drop(columns=NON_FEATURE_COLUMNS)

# Hold out a quarter of the rows for validation; the seed fixes the split.
features_train, features_valid, target_train, target_valid = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=12345,
)

# Baseline: a decision tree with default settings and no imbalance handling.
model = DecisionTreeClassifier(random_state=12345)
model.fit(features_train, target_train)
predicted_valid = model.predict(features_valid)

# Report F1 on the validation split.
print(f1_score(target_valid, predicted_valid))

0.45708154506437765


# The baseline F1 score (about 0.46) is below the required 0.59, so this model does not meet the target yet. Next we will try to improve it by addressing the class imbalance and see whether the score changes.

# Fixing Imbalance Using Class Weight Adjustment

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Columns left out of the feature matrix: the label itself, the row/customer
# identifiers, and the text-valued columns (Surname, Geography, Gender).
NON_FEATURE_COLUMNS = [
    'Exited', 'RowNumber', 'CustomerId', 'Surname', 'Geography', 'Gender',
]

target = df['Exited']
features = df.drop(columns=NON_FEATURE_COLUMNS)

# Same 75/25 split and seed as the baseline cell, so scores are comparable.
features_train, features_valid, target_train, target_valid = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=12345,
)

# class_weight='balanced' reweights the classes to counter the imbalance.
model = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    random_state=42,
)
model.fit(features_train, target_train)
predicted_valid = model.predict(features_valid)

# Report F1 on the validation split.
print(f1_score(target_valid, predicted_valid))

0.4606240713224368


Now Using Threshold Adjustment

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

# Columns left out of the feature matrix: the label itself, the row/customer
# identifiers, and the text-valued columns (Surname, Geography, Gender).
NON_FEATURE_COLUMNS = [
    'Exited', 'RowNumber', 'CustomerId', 'Surname', 'Geography', 'Gender',
]
# One report line per threshold.
REPORT_LINE = 'Threshold = {:.2f} | Precision = {:.3f}, Recall = {:.3f}'

target = df['Exited']
features = df.drop(columns=NON_FEATURE_COLUMNS)

# Hold out a quarter of the rows for validation; the seed fixes the split.
features_train, features_valid, target_train, target_valid = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=12345,
)

model = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    random_state=42,
)
model.fit(features_train, target_train)

# Keep the second predict_proba column, used as the score for class 1.
probabilities_valid = model.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]

# Sweep decision thresholds 0.00, 0.02, ..., 0.28 and report precision and
# recall on the validation split for each one.
for threshold in np.arange(0, 0.3, 0.02):
    predicted_valid = probabilities_one_valid > threshold
    precision = precision_score(target_valid, predicted_valid)
    recall = recall_score(target_valid, predicted_valid)
    print(REPORT_LINE.format(threshold, precision, recall))

Threshold = 0.00 | Precision = 0.198, Recall = 1.000
Threshold = 0.02 | Precision = 0.198, Recall = 1.000
Threshold = 0.04 | Precision = 0.198, Recall = 1.000
Threshold = 0.06 | Precision = 0.198, Recall = 1.000
Threshold = 0.08 | Precision = 0.199, Recall = 1.000
Threshold = 0.10 | Precision = 0.200, Recall = 1.000
Threshold = 0.12 | Precision = 0.201, Recall = 0.998
Threshold = 0.14 | Precision = 0.203, Recall = 0.998
Threshold = 0.16 | Precision = 0.206, Recall = 0.993
Threshold = 0.18 | Precision = 0.210, Recall = 0.991
Threshold = 0.20 | Precision = 0.214, Recall = 0.984
Threshold = 0.22 | Precision = 0.221, Recall = 0.980
Threshold = 0.24 | Precision = 0.226, Recall = 0.971
Threshold = 0.26 | Precision = 0.233, Recall = 0.962
Threshold = 0.28 | Precision = 0.240, Recall = 0.942


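We see that as the threshold increases from 0.00, precision rises slowly while recall gradually falls.
Recall stays close to or equal to 1 across this range, which indicates that the model identifies almost all true positives. However, precision stays around 0.20–0.24, so at these low thresholds most customers are flagged as positive and most of those predictions are false positives.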

In [None]:
# Repeat the threshold sweep on the full initial dataset (the larger set)
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

# Columns left out of the feature matrix: the label itself, the row/customer
# identifiers, and the text-valued columns (Surname, Geography, Gender).
NON_FEATURE_COLUMNS = [
    'Exited', 'RowNumber', 'CustomerId', 'Surname', 'Geography', 'Gender',
]
# One report line per threshold.
REPORT_LINE = 'Threshold = {:.2f} | Precision = {:.3f}, Recall = {:.3f}'

target = df['Exited']
features = df.drop(columns=NON_FEATURE_COLUMNS)

# Hold out a quarter of the rows for validation; the seed fixes the split.
features_train, features_valid, target_train, target_valid = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=12345,
)

model = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    random_state=42,
)
model.fit(features_train, target_train)

# Score the full initial set rather than only the validation split.
# NOTE: `features` includes the rows the model was fitted on, so the numbers
# below are not a purely held-out estimate.
probabilities_valid = model.predict_proba(features)
probabilities_one_valid = probabilities_valid[:, 1]

# Sweep decision thresholds 0.00, 0.02, ..., 0.28 and report precision and
# recall against the full target for each one.
for threshold in np.arange(0, 0.3, 0.02):
    predicted_valid = probabilities_one_valid > threshold
    precision = precision_score(target, predicted_valid)
    recall = recall_score(target, predicted_valid)
    print(REPORT_LINE.format(threshold, precision, recall))

Threshold = 0.00 | Precision = 0.204, Recall = 1.000
Threshold = 0.02 | Precision = 0.204, Recall = 1.000
Threshold = 0.04 | Precision = 0.204, Recall = 1.000
Threshold = 0.06 | Precision = 0.204, Recall = 1.000
Threshold = 0.08 | Precision = 0.204, Recall = 0.999
Threshold = 0.10 | Precision = 0.205, Recall = 0.999
Threshold = 0.12 | Precision = 0.206, Recall = 0.998
Threshold = 0.14 | Precision = 0.208, Recall = 0.997
Threshold = 0.16 | Precision = 0.210, Recall = 0.992
Threshold = 0.18 | Precision = 0.214, Recall = 0.990
Threshold = 0.20 | Precision = 0.218, Recall = 0.983
Threshold = 0.22 | Precision = 0.223, Recall = 0.976
Threshold = 0.24 | Precision = 0.229, Recall = 0.964
Threshold = 0.26 | Precision = 0.235, Recall = 0.953
Threshold = 0.28 | Precision = 0.241, Recall = 0.937
