In [2]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle


import numpy as np

# Step 1: Open the file and study the general information

In [3]:
# Load the raw churn dataset into a DataFrame.
data = pd.read_csv("/datasets/Churn.csv")

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
data.info()
print(data.head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
RowNumber          10000 non-null int64
CustomerId         10000 non-null int64
Surname            10000 non-null object
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             9091 non-null float64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB
None
   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio 

A first glance at our dataset shows that the `Tenure` column has 909 null values (only 9,091 of its 10,000 entries are non-null). We'll make the assumption that null means 0 years of tenure.

Let's try to determine the features that are useful for our machine learning model - by removing any features that aren't useful in predicting our target, we'll reduce the time it takes to train our ML model. In this case, **RowNumber, CustomerId, and Surname** are features that won't particularly help us. 

After removing these features, we'll convert our categorical features into numerical features. These include **Geography and Gender**.

Lastly, we'll split our data into our **training, test, and validation sets**, with a 3:1:1 ratio.

Our target is **Exited**, with 1 specifying that the customer has stopped using our services.

# Step 2: Prepare the data

In [5]:
# Assumption: a missing Tenure means the customer has 0 years of tenure.
# Only the Tenure column is filled; every other column is left untouched.
data = data.fillna({'Tenure': 0})

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
RowNumber          10000 non-null int64
CustomerId         10000 non-null int64
Surname            10000 non-null object
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             10000 non-null float64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


In [6]:
# Row number, customer id and surname identify a customer rather than describe
# one, so they are left out of the modelling frame.
id_columns = ['RowNumber', 'CustomerId', 'Surname']
ml_data = data.drop(columns=id_columns)

ml_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             10000 non-null float64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(3), int64(6), object(2)
memory usage: 859.5+ KB


In [7]:
# One-hot encode the categorical columns (Geography, Gender).
# drop_first=True drops the first level of each category (Geography_France and
# Gender_Female); those levels become the implicit baseline.
data_ohe = pd.get_dummies(ml_data, drop_first=True)

# info() prints directly and returns None, so it must not be wrapped in print()
# (doing so emits a stray "None" line).
data_ohe.info()
print(data_ohe.head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
CreditScore          10000 non-null int64
Age                  10000 non-null int64
Tenure               10000 non-null float64
Balance              10000 non-null float64
NumOfProducts        10000 non-null int64
HasCrCard            10000 non-null int64
IsActiveMember       10000 non-null int64
EstimatedSalary      10000 non-null float64
Exited               10000 non-null int64
Geography_Germany    10000 non-null uint8
Geography_Spain      10000 non-null uint8
Gender_Male          10000 non-null uint8
dtypes: float64(3), int64(6), uint8(3)
memory usage: 732.5 KB
None
   CreditScore  Age  Tenure    Balance  NumOfProducts  HasCrCard  \
0          619   42     2.0       0.00              1          1   
1          608   41     1.0   83807.86              1          0   
2          502   42     8.0  159660.80              3          1   
3          699   39     1.0       0.00     

In [10]:
# 60/20/20 split: carve off 40% of the rows first, then halve that part into
# the validation and test sets.
df_train, df_temp = train_test_split(data_ohe, test_size=0.40, random_state=12345)
df_valid, df_test = train_test_split(df_temp, test_size=0.50, random_state=12345)

print(df_train.shape, df_valid.shape, df_test.shape)

# Separate the feature columns from the 'Exited' target in each split.
features_train, target_train = df_train.drop(columns='Exited'), df_train['Exited']
features_valid, target_valid = df_valid.drop(columns='Exited'), df_valid['Exited']
features_test, target_test = df_test.drop(columns='Exited'), df_test['Exited']

(6000, 12) (2000, 12) (2000, 12)


Now that we've completed preprocessing the data and splitting the data into different sets, we can work with it. We'll first check the balance of classes.

# Step 3: Check the balance of classes

In [8]:
# The target is 0/1, so its mean is the share of rows with Exited == 1.
exited_share = data['Exited'].mean()
print("Portion of customers that have left the bank: ", exited_share)

Portion of customers that have left the bank:  0.2037


Checking the mean shows us a significant imbalance in classes, with an approximate 1:4 ratio of ones to zeros for the 'Exited' column. Let's first create a model without regard for the imbalance, and see our results.

In [27]:
# Baseline: fit each candidate model on the training set as-is (no class
# balancing) and report its F1 score on the validation set.
baseline_models = [
    ("Logistic regression model F1 score:",
     LogisticRegression(random_state=12345, solver='liblinear')),
    ("Decision tree model F1 score:",
     DecisionTreeClassifier(random_state=123)),
    ("Random forest model F1 score:",
     RandomForestClassifier(random_state=123, n_estimators=10)),
]

for label, model in baseline_models:
    model.fit(features_train, target_train)
    predicted_valid = model.predict(features_valid)
    print(label, f1_score(target_valid, predicted_valid))


Logistic regression model F1 score: 0.33389544688026984
Decision tree model F1 score: 0.4737484737484738
Random forest model F1 score: 0.5168195718654434


# Step 4: Improve the quality of the model

Our quick test shows that our models have low F1 scores. Since F1 is the harmonic mean of precision and recall, this means that precision, recall, or both are low. Let's try to improve this.

## Feature Scaling

Let's first make sure the data in our model is scaled - this way, the different columns aren't weighted differently due to varied dispersions.

In [10]:
# Columns to standardize; the remaining 0/1 indicator columns are left as they are.
numeric = ['CreditScore', 'Age', 'Tenure', 'Balance','NumOfProducts','EstimatedSalary']

# Fit the scaler on the training split only, then apply that same
# transformation to every split.
scaler = StandardScaler().fit(features_train[numeric])

for split in (features_train, features_valid, features_test):
    split[numeric] = scaler.transform(split[numeric])

print(features_train.head(5))

      CreditScore       Age    Tenure   Balance  NumOfProducts  HasCrCard  \
7479    -0.886751 -0.373192  1.104696  1.232271      -0.891560          1   
3411     0.608663 -0.183385  1.104696  0.600563      -0.891560          0   
6027     2.052152  0.480939 -0.503694  1.027098       0.830152          0   
1247    -1.457915 -1.417129  0.461340 -1.233163       0.830152          1   
3716     0.130961 -1.132419 -0.825373  1.140475      -0.891560          0   

      IsActiveMember  EstimatedSalary  Geography_Germany  Geography_Spain  \
7479               0        -0.187705                  0                1   
3411               0        -0.333945                  0                0   
6027               1         1.503095                  1                0   
1247               0        -1.071061                  0                0   
3716               0         1.524268                  1                0   

      Gender_Male  
7479            1  
3411            0  
6027          

Done.

## Class weight adjustment

We can improve our F1 score by adjusting the weight of our classes in our model. We'll make it so that class "1" weighs more.

In [11]:
# class_weight='balanced' asks scikit-learn to weight each class inversely to
# its frequency, so the rarer class "1" counts for more during training.
model = LogisticRegression(
    class_weight='balanced',
    solver='liblinear',
    random_state=12345,
)
model.fit(features_train, target_train)

# Score on the validation set.
predicted_valid = model.predict(features_valid)
print("F1:", f1_score(target_valid, predicted_valid))

F1: 0.4888888888888888


By doing so, our F1 score has already significantly improved.

## Downsampling/Upsampling

Let's define a method to downsample our data.

In [12]:
def downsample(features, target, fraction):
    """Shrink the negative class to a random fraction of its rows.

    Rows where ``target == 0`` are randomly subsampled to ``fraction`` of their
    count; every row where ``target == 1`` is kept. Features and target are
    sampled separately but with the same fixed seed, which is what keeps the
    two selections aligned row for row.

    Returns the (features, target) pair after a seeded shuffle.
    """
    seed = 12345
    is_negative = target == 0
    is_positive = target == 1

    # draw the same random subset of negatives from features and from target
    kept_features = features[is_negative].sample(frac=fraction, random_state=seed)
    kept_target = target[is_negative].sample(frac=fraction, random_state=seed)

    # sampled negatives first, then all positives
    features_downsampled = pd.concat([kept_features, features[is_positive]])
    target_downsampled = pd.concat([kept_target, target[is_positive]])

    # mix the classes so they are not grouped together for training
    features_downsampled, target_downsampled = shuffle(
        features_downsampled, target_downsampled, random_state=seed)

    return features_downsampled, target_downsampled


And define a method to upsample our data.

In [13]:
def upsample(features, target, repeat):
    """Repeat the positive class ``repeat`` times.

    Every row where ``target == 0`` is kept once; the rows where
    ``target == 1`` are included ``repeat`` times.

    Returns the (features, target) pair after a seeded shuffle.
    """
    is_negative = target == 0
    is_positive = target == 1

    # negatives once, followed by `repeat` copies of the positives
    feature_parts = [features[is_negative]]
    feature_parts.extend([features[is_positive]] * repeat)
    target_parts = [target[is_negative]]
    target_parts.extend([target[is_positive]] * repeat)

    features_upsampled = pd.concat(feature_parts)
    target_upsampled = pd.concat(target_parts)

    # mix the classes so they are not grouped together for training
    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=12345)

    return features_upsampled, target_upsampled

Now let's test our data, combining different fractions for downsampling and number of repeats for upsampling.

In [14]:
# Keep 10%..90% of the negative class in turn and score each setting on the
# validation set with a plain logistic regression.
for i in np.arange(0.1, 1.0, 0.1):
    features_train_downsampled, target_train_downsampled = downsample(
        features_train, target_train, i)

    model = LogisticRegression(random_state=12345, solver='liblinear')
    model.fit(features_train_downsampled, target_train_downsampled)

    predicted_valid = model.predict(features_valid)
    score = f1_score(target_valid, predicted_valid)

    # round(i, 1) hides the floating-point noise that np.arange accumulates
    print("Downsampling with fraction of", round(i, 1), ":\nF1:", score)

Downsampling with fraction of 0.1 :
F1: 0.42986425339366513
Downsampling with fraction of 0.2 :
F1: 0.4791344667697063
Downsampling with fraction of 0.3 :
F1: 0.4985835694050992
Downsampling with fraction of 0.4 :
F1: 0.5054229934924078
Downsampling with fraction of 0.5 :
F1: 0.4664224664224664
Downsampling with fraction of 0.6 :
F1: 0.42219215155615697
Downsampling with fraction of 0.7 :
F1: 0.3883211678832117
Downsampling with fraction of 0.8 :
F1: 0.365891472868217
Downsampling with fraction of 0.9 :
F1: 0.36038961038961037


In [15]:
# Repeat the positive class 2..5 times in turn and score each setting on the
# validation set with a plain logistic regression.
for i in range(2, 6):
    features_train_upsampled, target_train_upsampled = upsample(
        features_train, target_train, i)

    model = LogisticRegression(random_state=12345, solver='liblinear')
    model.fit(features_train_upsampled, target_train_upsampled)

    predicted_valid = model.predict(features_valid)
    score = f1_score(target_valid, predicted_valid)

    print("Upsampling with", i, "repeats:\nF1:", score)

Upsampling with 2 repeats:
F1: 0.4672435105067985
Upsampling with 3 repeats:
F1: 0.5014985014985014
Upsampling with 4 repeats:
F1: 0.4888888888888888
Upsampling with 5 repeats:
F1: 0.48433919022154315


We've tested our model using logistic regression, along with upsampling and downsampling separately. As we can see, **downsampling with a fraction of .4 and upsampling by multiplying our positive class observations by 3** give the highest F1 scores, **0.5054229934924078 and 0.5014985014985014**, respectively. This is already an improvement over using the balancer (0.4888888888888888)! We'll try using upsampling and downsampling together and observe our results.

Because it's inefficient to call two methods to upsample and downsample, we'll create a function called **create_sample** that combines upsampling and downsampling.

In [16]:
def create_sample(features, target, fraction, repeat):
    """Downsample the negative class and upsample the positive class in one step.

    Rows where ``target == 0`` are randomly subsampled to ``fraction`` of their
    count, and the rows where ``target == 1`` are included ``repeat`` times.
    Features and target are sampled separately but with the same fixed seed,
    which is what keeps the two selections aligned row for row.

    Returns the (features, target) pair after a seeded shuffle.
    """
    seed = 12345
    is_negative = target == 0
    is_positive = target == 1

    # sampled negatives, followed by `repeat` copies of the positives
    feature_parts = [features[is_negative].sample(frac=fraction, random_state=seed)]
    feature_parts.extend([features[is_positive]] * repeat)
    target_parts = [target[is_negative].sample(frac=fraction, random_state=seed)]
    target_parts.extend([target[is_positive]] * repeat)

    features_sampled = pd.concat(feature_parts)
    target_sampled = pd.concat(target_parts)

    # mix the classes so they are not grouped together for training
    features_sampled, target_sampled = shuffle(
        features_sampled, target_sampled, random_state=seed)

    return features_sampled, target_sampled

Let's define a function called **find_best_f1** so we can test different models as well.

In [17]:
def _build_sample(features, target, fraction, repeat):
    """Return the resampled (features, target) for one (fraction, repeat) setting."""
    if fraction == 0:
        # fraction 0 is the "no downsampling" slot: upsample only
        return upsample(features, target, repeat)
    if repeat == 0:
        # repeat 0 is the "no upsampling" slot: downsample only
        return downsample(features, target, fraction)
    return create_sample(features, target, fraction, repeat)


def _fit_and_score_f1(model, features_sampled, target_sampled):
    """Fit ``model`` on the sample and return its F1 score on the validation set."""
    model.fit(features_sampled, target_sampled)
    # features_valid / target_valid are the notebook-level validation split
    predicted_valid = model.predict(features_valid)
    return f1_score(target_valid, predicted_valid)


def find_best_f1(features, target, model, show_all):
    """Search resampling settings and report the best validation F1 for ``model``.

    Settings tried, in order:
      * fraction 0.0 (no downsampling) with 2..5 positive-class repeats;
      * for each fraction 0.1..0.9: downsampling only (reported as 0 repeats),
        then downsampling combined with 2..5 repeats.

    Parameters:
        features, target: training data to resample.
        model: estimator with fit/predict; it is refitted for every setting and
            is left fitted on the last setting tried.
        show_all: when truthy, print the F1 score of every setting.

    Reads the notebook-level ``features_valid`` and ``target_valid`` for scoring.
    Prints a summary of the best setting and returns the best F1 score.
    """
    # Start below any attainable F1 so the first setting is always recorded.
    # With a start of 0 and every score equal to 0, frac_repeat stayed empty
    # and the summary print below raised IndexError.
    maxf1 = -1.0
    frac_repeat = (None, None)

    for fraction in np.arange(0, 1, 0.1):
        # np.arange accumulates floating-point noise; round for display
        frac = round(fraction, 1)

        # the downsample-only run (repeat 0) only makes sense for fraction > 0
        repeats = range(2, 6) if fraction == 0 else [0, *range(2, 6)]

        for repeat in repeats:
            features_sampled, target_sampled = _build_sample(
                features, target, fraction, repeat)
            f1 = _fit_and_score_f1(model, features_sampled, target_sampled)

            if show_all:
                print("sample with fraction of", frac, "and", repeat, "repeats:\nF1:", f1)
            # strict '>' keeps the earliest setting on ties
            if f1 > maxf1:
                maxf1 = f1
                frac_repeat = (frac, repeat)

    print("Max f1 score =", maxf1, "\nDownsample fraction:", frac_repeat[0], "Upsample repetitions:", frac_repeat[1])
    return maxf1

Now let's try to find the best ratios for upsampling and downsampling for our given model.

In [18]:
# Search every sampling setting for a plain logistic regression, printing each score.
model = LogisticRegression(solver='liblinear', random_state=12345)
find_best_f1(features_train, target_train, model, show_all=True)

sample with fraction of 0.0 and 2 repeats:
F1: 0.4672435105067985
sample with fraction of 0.0 and 3 repeats:
F1: 0.5014985014985014
sample with fraction of 0.0 and 4 repeats:
F1: 0.4888888888888888
sample with fraction of 0.0 and 5 repeats:
F1: 0.48433919022154315
sample with fraction of 0.1 and 0 repeats:
F1: 0.42986425339366513
sample with fraction of 0.1 and 2 repeats:
F1: 0.3788300835654596
sample with fraction of 0.1 and 3 repeats:
F1: 0.36055603822762816
sample with fraction of 0.1 and 4 repeats:
F1: 0.3552911177220569
sample with fraction of 0.1 and 5 repeats:
F1: 0.3497907949790795
sample with fraction of 0.2 and 0 repeats:
F1: 0.4791344667697063
sample with fraction of 0.2 and 2 repeats:
F1: 0.42626599888703387
sample with fraction of 0.2 and 3 repeats:
F1: 0.3946850393700787
sample with fraction of 0.2 and 4 repeats:
F1: 0.3768913342503439
sample with fraction of 0.2 and 5 repeats:
F1: 0.3671840354767184
sample with fraction of 0.3 and 0 repeats:
F1: 0.4985835694050992
sample

0.5054229934924078

And let's see what happens when we combine upsampling/downsampling with the balancer.

In [19]:
# Same search, with class weighting switched on; only the best setting is printed.
model = LogisticRegression(
    class_weight='balanced',
    solver='liblinear',
    random_state=12345,
)
find_best_f1(features_train, target_train, model, show_all=False)

Max f1 score = 0.49482758620689654 
Downsample fraction: 0.3 Upsample repetitions: 0


0.49482758620689654

Looking at our results, the max F1 score we can achieve using a logistic regression model comes from excluding the balancer and using a downsample fraction of **0.4** with no upsampling (reported as **0** repetitions). Let's now see if we can get better results with other models.

Decision Tree Classifier

In [20]:
# Sweep the tree depth; for each depth, find_best_f1 searches the sampling
# settings and returns the best validation F1 it found.
best_depth, max_f1 = 0, 0
for depth in range(2, 10):
    dt_model = DecisionTreeClassifier(max_depth=depth, random_state=123)

    print("Decision Tree with depth of {}:".format(depth))
    f1 = find_best_f1(features_train, target_train, dt_model, show_all=False)

    # strict '>' keeps the shallowest depth on ties
    if f1 > max_f1:
        best_depth, max_f1 = depth, f1

print("Best depth = {}, with F1 score of {}".format(best_depth, max_f1))

Decision Tree with depth of 2:
Max f1 score = 0.541015625 
Downsample fraction: 0.0 Upsample repetitions: 3
Decision Tree with depth of 3:
Max f1 score = 0.558282208588957 
Downsample fraction: 0.0 Upsample repetitions: 5
Decision Tree with depth of 4:
Max f1 score = 0.572987721691678 
Downsample fraction: 0.0 Upsample repetitions: 2
Decision Tree with depth of 5:
Max f1 score = 0.5963791267305644 
Downsample fraction: 0.0 Upsample repetitions: 4
Decision Tree with depth of 6:
Max f1 score = 0.5961538461538461 
Downsample fraction: 0.0 Upsample repetitions: 2
Decision Tree with depth of 7:
Max f1 score = 0.5885486018641812 
Downsample fraction: 0.0 Upsample repetitions: 2
Decision Tree with depth of 8:
Max f1 score = 0.5889724310776943 
Downsample fraction: 0.6 Upsample repetitions: 0
Decision Tree with depth of 9:
Max f1 score = 0.5791411042944784 
Downsample fraction: 0.0 Upsample repetitions: 2
Best depth = 5, with F1 score of 0.5963791267305644


In [21]:
# Sweep the forest size (10..90 trees); for each size, find_best_f1 searches
# the sampling settings and returns the best validation F1 it found.
best_trees, max_f1 = 0, 0
for trees in range(10, 100, 10):
    rf_model = RandomForestClassifier(n_estimators=trees, random_state=123)

    print("Random forest with {} trees:".format(trees))
    f1 = find_best_f1(features_train, target_train, rf_model, show_all=False)

    # strict '>' keeps the smallest forest on ties
    if f1 > max_f1:
        best_trees, max_f1 = trees, f1

print("Best number of trees = {}, with F1 score of {}".format(best_trees, max_f1))

Random forest with 10 trees:
Max f1 score = 0.6086956521739131 
Downsample fraction: 0.6 Upsample repetitions: 2
Random forest with 20 trees:
Max f1 score = 0.6093750000000001 
Downsample fraction: 0.6 Upsample repetitions: 0
Random forest with 30 trees:
Max f1 score = 0.6181818181818183 
Downsample fraction: 0.6 Upsample repetitions: 0
Random forest with 40 trees:
Max f1 score = 0.6234817813765183 
Downsample fraction: 0.9 Upsample repetitions: 2
Random forest with 50 trees:
Max f1 score = 0.6269070735090152 
Downsample fraction: 0.7 Upsample repetitions: 0
Random forest with 60 trees:
Max f1 score = 0.6236263736263736 
Downsample fraction: 0.7 Upsample repetitions: 0
Random forest with 70 trees:
Max f1 score = 0.6217616580310881 
Downsample fraction: 0.9 Upsample repetitions: 3
Random forest with 80 trees:
Max f1 score = 0.6242350061199511 
Downsample fraction: 0.5 Upsample repetitions: 0
Random forest with 90 trees:
Max f1 score = 0.6221142162818954 
Downsample fraction: 0.5 Upsampl

The model with the highest F1 score of approximately **0.63** is our **random forest model with 50 trees**, downsampling our negative class observations to 70% of their original count. If we want a faster model that still meets our benchmark of .59, we can use a decision tree with a max depth of 5, multiplying our positive class observations by 4. Creating a model using logistic regression gives us significantly worse results than with a decision tree or random forest.

# Step 5: Final testing

Now that we know which model to use and the parameters necessary for our model, we'll combine our training and validation sets, balance the classes, create our final model, and test it.

In [28]:
# Refit on train + validation so the final model is trained on all non-test data.
features = pd.concat([features_train, features_valid])
target = pd.concat([target_train, target_valid])

# Rebalance: keep 50% of the negative class and repeat the positive class 3x.
# NOTE(review): the random forest sweep reported its best validation F1 with
# 50 trees, a 0.7 downsample fraction and no upsampling; confirm that (.5, 3)
# is the intended setting here.
features_downsampled, target_downsampled = create_sample(features, target, .5, 3)

model = RandomForestClassifier(random_state=123, n_estimators=50)
model.fit(features_downsampled, target_downsampled)

# F1 on the held-out test set.
predicted_test = model.predict(features_test)
print("F1:", f1_score(target_test, predicted_test))

# Predicted probability of class 1 (column 1) for each test row. These come
# from features_test, hence the *_test names.
probabilities_test = model.predict_proba(features_test)
probabilities_one_test = probabilities_test[:, 1]

# AUC-ROC on the test set.
auc_roc = roc_auc_score(target_test, probabilities_one_test)

print("AUC-ROC score:", auc_roc)

F1: 0.5953389830508474
AUC-ROC score: 0.8404547641855216


While our F1 score is at about .60, our AUC-ROC score of approximately .84 shows us that our model has a good measure of separability.