# Lab | Imbalanced data
We will be using the files_for_lab/customer_churn.csv dataset to build a churn predictor.



In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.preprocessing import MinMaxScaler 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample


In [20]:
# I create this function to avoid repetition later on the step "evaluation"
def _print_scores(model, X, y, label, header):
    """Print `header` followed by accuracy, precision, recall and F1 of `model` on (X, y).

    Returns the predictions so the caller can reuse them.
    """
    pred = model.predict(X)
    print(header)
    print('accuracy: %.2f' %model.score(X,y))
    print('precision: %.2f' %precision_score(y,pred,pos_label=label))
    print("recall: %.2f" %recall_score(y,pred,pos_label=label))
    print("f1: %.2f" %f1_score(y,pred,pos_label=label))
    return pred


def evaluate_model(model,train,test,label=None):
    """Report how a fitted classifier scores on its training data and on the test data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``score``.
    train : indexable whose items 0 and 1 are the training features and training target.
    test : indexable whose items 0 and 1 are the test features and test target.
    label : value forwarded as ``pos_label`` to precision, recall and F1.
        NOTE(review): the default ``None`` is forwarded as-is; every call in this
        notebook passes ``'Yes'`` -- confirm ``None`` is acceptable before relying on it.

    Returns
    -------
    pandas.DataFrame
        Confusion matrix of the TEST predictions, which is also displayed.
        The row/column headers are hard-coded to No/Yes, so this assumes a
        two-class target whose label order is No then Yes.
    """
    X_train_norm,y_train,X_test_norm,y_test = train[0], train[1], test[0], test[1]

    _print_scores(model, X_train_norm, y_train, label, 'Evaluating the model with its own data:')
    pred_test = _print_scores(model, X_test_norm, y_test, label, '\nEvaluating the model with test data:')

    conf_mtx = pd.DataFrame(confusion_matrix(y_test, pred_test),columns=['Predicted No','Predicted Yes'],index=['Actual No','Actual Yes'])
    display(conf_mtx)
    return conf_mtx

#### 1. Load the dataset and explore the variables.


In [55]:
data = pd.read_csv("./files_for_lab/customer_churn.csv")

# Headers to snake_case in a single pass: put a space at every lower->UPPER
# boundary, lowercase the result, then swap spaces for underscores.
data.columns = [
    re.sub(r"([a-z]{1})([A-Z]{1})", r"\1 \2", header).lower().replace(' ', '_')
    for header in data.columns
]

# Explore the variables: numeric columns first, then the object (text) columns.
display(data.select_dtypes(include=np.number))
display(data.select_dtypes(include=object))

Unnamed: 0,senior_citizen,tenure,monthly_charges
0,0,1,29.85
1,0,34,56.95
2,0,2,53.85
3,0,45,42.30
4,0,2,70.70
...,...,...,...
7038,0,24,84.80
7039,0,72,103.20
7040,0,11,29.60
7041,1,4,74.40


Unnamed: 0,customer_id,gender,partner,dependents,phone_service,multiple_lines,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract,paperless_billing,payment_method,total_charges,churn
0,7590-VHVEG,Female,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,No
1,5575-GNVDE,Male,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,1889.5,No
2,3668-QPYBK,Male,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,108.15,Yes
3,7795-CFOCW,Male,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),1840.75,No
4,9237-HQITU,Female,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,Yes,Yes,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,1990.5,No
7039,2234-XADUH,Female,Yes,Yes,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),7362.9,No
7040,4801-JZAZL,Female,Yes,Yes,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,346.45,No
7041,8361-LTMKD,Male,Yes,No,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,306.6,Yes


#### 2. We will try to predict the variable Churn using a logistic regression on the variables tenure, SeniorCitizen and MonthlyCharges.
#### 3. Extract the target variable.
#### 4. Extract the independent variables and scale them.


In [4]:
# The three predictors the lab asks for. They happen to be exactly the numeric
# columns, but selecting them by name keeps X stable even if another column is
# later parsed as numeric.
FEATURE_COLUMNS = ['senior_citizen', 'tenure', 'monthly_charges']
X = data[FEATURE_COLUMNS]
y = data['churn']

In [5]:
# Split TRAIN-TEST
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling TRAIN: the scaler is fitted on the training rows only.
# Note that the scaled frame gets a fresh 0..n-1 index, not the index of X_train.
transformer = MinMaxScaler().fit(X_train)
X_train_norm = pd.DataFrame(transformer.transform(X_train),columns=X_train.columns)

# Scaling TEST: reuse the scaler fitted on train.
X_test_norm = pd.DataFrame(transformer.transform(X_test),columns=X_test.columns)


#### 5. Build the logistic regression model.


In [56]:
# Plain logistic regression on the scaled features. `multi_class` is not passed:
# it is deprecated in recent scikit-learn, and the other models in this notebook
# are built without it as well.
classification = LogisticRegression(random_state=0, solver='lbfgs')
classification.fit(X_train_norm, y_train);

#### 6. Evaluate the model.


In [46]:
# Score the baseline model; 'Yes' (churn) is the class precision/recall/F1 are computed for.
log_conf_mtx = evaluate_model(
    classification,
    train=[X_train_norm, y_train],
    test=[X_test_norm, y_test],
    label='Yes',
)

Evaluating the model with its own data:
accuracy: 0.79
precision: 0.65
recall: 0.44
f1: 0.52

Evaluating the model with test data:
accuracy: 0.80
precision: 0.69
recall: 0.47
f1: 0.56


Unnamed: 0,Predicted No,Predicted Yes
Actual No,959,77
Actual Yes,199,174


#### 7. Even a simple model will give us more than 70% accuracy. Why?

In [8]:
# Because of the imbalance in data.
# The fraction of 'No' labels is the accuracy of always answering 'No'.
share_of_no = (y == 'No').mean()
print('A dump model that targets everything as No would have an accuracy of %.2f.' % share_of_no)

A dump model that targets everything as No would have an accuracy of 0.73.


#### 8. SMOTE

In [24]:
# Balance the TRAINING set only with SMOTE, then fit and evaluate against the untouched test set.
sm = SMOTE(random_state=100, k_neighbors=3)
X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train_norm,y_train)
LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train_SMOTE, y_train_SMOTE)
smote_conf_mtx =evaluate_model(LR,train=[X_train_SMOTE,y_train_SMOTE],test=[X_test_norm,y_test],label='Yes')
# Test recall on Yes is 0.77 (87 of 373 churners missed), i.e. about 23% missed.
print('\nWe can see that the precision went down quite a bit, but we got a much higher recall (with this model we only miss 23% of Yes down from 53% in the first model) ')

Evaluating the model with its own data:
accuracy: 0.73
precision: 0.73
recall: 0.74
f1: 0.73

Evaluating the model with test data:
accuracy: 0.75
precision: 0.51
recall: 0.77
f1: 0.61


Unnamed: 0,Predicted No,Predicted Yes
Actual No,764,272
Actual Yes,87,286



We can see that the precision went down quite a bit, but we got a much higher recall (with this model we only miss 33% of Yes down from 53% in the first model) 


##### Pre-sampling: separating data on target category

In [13]:
# X_train_norm carries a fresh 0..n-1 index, so the target must be re-indexed the
# same way before the two are glued together column-wise.
# (Open question from the author: does y_test need the same reset? It is not
# concatenated with anything in this cell, so it is left as is.)
y_train = y_train.reset_index(drop=True)
X_train = X_train.reset_index(drop=True)
train = pd.concat([X_train_norm, y_train], axis=1)

# separate majority/minority classes
no_churn = train.loc[train['churn'].eq('No')]
yes_churn = train.loc[train['churn'].eq('Yes')]

#### OVERSAMPLING

In [25]:
# Oversampling minority: draw 'Yes' rows with replacement until there are as many as 'No' rows
yes_churn_oversample = resample(
    yes_churn,
    replace=True,
    n_samples=no_churn.shape[0],
    random_state=0,
)
train_oversampled = pd.concat([no_churn, yes_churn_oversample], axis=0)
X_train_over = train_oversampled.drop(columns='churn').copy()
y_train_over = train_oversampled['churn'].copy()

# Creating the model
over_model = LogisticRegression(random_state=0, solver='lbfgs')
over_model.fit(X_train_over, y_train_over)

# Evaluating model (test data stays untouched by the resampling)
over_conf_mtx = evaluate_model(
    over_model,
    train=[X_train_over, y_train_over],
    test=[X_test_norm, y_test],
    label='Yes',
)
print('\nWe get a really similar result as in the SMOTE model because both uses oversampling (one with duplicates and other creates "virtual" rows with KNN')

Evaluating the model with its own data:
accuracy: 0.73
precision: 0.73
recall: 0.74
f1: 0.74

Evaluating the model with test data:
accuracy: 0.75
precision: 0.51
recall: 0.77
f1: 0.62


Unnamed: 0,Predicted No,Predicted Yes
Actual No,764,272
Actual Yes,86,287



We get a really similar result as in the SMOTE model because both uses oversampling (one with duplicates and other creates "virtual" rows with KNN


#### UNDERSAMPLING

In [51]:
# Undersampling majority: keep a random subset of 'No' rows (no replacement) as large as the 'Yes' group
no_churn_undersample = resample(
    no_churn,
    replace=False,
    n_samples=yes_churn.shape[0],
    random_state=0,
)
train_undersampled = pd.concat([yes_churn, no_churn_undersample], axis=0)
X_train_under = train_undersampled.drop(columns='churn').copy()
y_train_under = train_undersampled['churn'].copy()

# Creating model
under_model = LogisticRegression(random_state=0, solver='lbfgs')
under_model.fit(X_train_under, y_train_under)

# Evaluating model (test data stays untouched by the resampling)
under_conf_mtx = evaluate_model(
    under_model,
    train=[X_train_under, y_train_under],
    test=[X_test_norm, y_test],
    label='Yes',
)
print('\nWe get a similar answer the undersampling model.')

Evaluating the model with its own data:
accuracy: 0.73
precision: 0.73
recall: 0.74
f1: 0.73

Evaluating the model with test data:
accuracy: 0.74
precision: 0.51
recall: 0.77
f1: 0.62


Unnamed: 0,Predicted No,Predicted Yes
Actual No,761,275
Actual Yes,85,288



We get a similar answer the undersampling model.


#### Comparing all the confusion matrices side by side

In [54]:
# Comparing: put the four test confusion matrices next to each other and
# label each pair of columns with the model it belongs to.
models = ['logistic','smote','oversample','undersample']
conf_matrixes = pd.concat(
    [log_conf_mtx, smote_conf_mtx, over_conf_mtx, under_conf_mtx],
    axis=1,
)
columns = [
    (model, predicted)
    for model in models
    for predicted in ('Predicted No', 'Predicted Yes')
]
conf_matrixes.columns = pd.MultiIndex.from_tuples(columns)

display(conf_matrixes)

Unnamed: 0_level_0,logistic,logistic,smote,smote,oversample,oversample,undersample,undersample
Unnamed: 0_level_1,Predicted No,Predicted Yes,Predicted No,Predicted Yes,Predicted No,Predicted Yes,Predicted No,Predicted Yes
Actual No,959,77,764,272,764,272,761,275
Actual Yes,199,174,87,286,86,287,85,288


So we basically see that the models trained on resampled data (SMOTE, oversampling and undersampling) are much better at identifying a Yes: they catch roughly 1.6 times as many actual churners as the plain logistic model (we can see that in the table, and also in the recall score on Yes, which rises from 0.47 to 0.77). By doing this, we're losing accuracy and recall on "No", because the new models are more prone to predict a No as a Yes.

In the end, depending on the problem, we'll try to focus on one category or the other. In this case, I'm not sure, but I think churn means someone ended their subscription, in which case I guess a company would prefer to have a good recall on Yes so that it can identify as many leaving customers as possible (even if some of those flagged won't actually leave) and maybe offer them a better plan. So I would say the SMOTE, oversampling and undersampling models are the best ones to use.