In [1]:
# Importing the packages
using Pkg
using DataFrames
using CSV
using Plots
using GLM
using StatsBase
using Lathe
using MLBase
using ClassImbalance
using ROCAnalysis

# Enable printing of 1000 columns
ENV["COLUMNS"] = 1000

1000

In [4]:
# Load the dataset
df = DataFrame(CSV.File("Churn_Modelling.csv"))
first(df,5)

Unnamed: 0_level_0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
Unnamed: 0_level_1,Int64,Int64,String,Int64,String,String,Int64,Int64,Float64,Int64,Int64,Int64,Float64,Int64
1,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101349.0,1
2,2,15647311,Hill,608,Spain,Female,41,1,83807.9,1,0,1,112543.0,0
3,3,15619304,Onio,502,France,Female,42,8,159661.0,3,1,0,113932.0,1
4,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.6,0
5,5,15737888,Mitchell,850,Spain,Female,43,2,125511.0,1,1,1,79084.1,0


In [7]:
println(size(df))
describe(df)

(10000, 14)


Unnamed: 0_level_0,variable,mean,min,median,max,nunique,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Union…,Nothing,DataType
1,RowNumber,5000.5,1,5000.5,10000,,,Int64
2,CustomerId,15690900.0,15565701,15690700.0,15815690,,,Int64
3,Surname,,Abazu,,Zuyeva,2932.0,,String
4,CreditScore,650.529,350,652.0,850,,,Int64
5,Geography,,France,,Spain,3.0,,String
6,Gender,,Female,,Male,2.0,,String
7,Age,38.9218,18,37.0,92,,,Int64
8,Tenure,5.0128,0,5.0,10,,,Int64
9,Balance,76485.9,0.0,97198.5,2.50898e5,,,Float64
10,NumOfProducts,1.5302,1,1.0,4,,,Int64


In [8]:
names(df)

14-element Vector{Symbol}:
 :RowNumber
 :CustomerId
 :Surname
 :CreditScore
 :Geography
 :Gender
 :Age
 :Tenure
 :Balance
 :NumOfProducts
 :HasCrCard
 :IsActiveMember
 :EstimatedSalary
 :Exited

In [9]:
countmap(df.Exited)

Dict{Int64, Int64} with 2 entries:
  0 => 7963
  1 => 2037

In [19]:
# One Hot Encoding
Lathe.preprocess.OneHotEncode(df,:Geography)
Lathe.preprocess.OneHotEncode(df,:Gender)
select!(df,Not([:RowNumber, :CustomerId, :Surname, :Geography, :Gender, :Male]))
first(df,10)

Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,France,Spain,Germany,Female
Unnamed: 0_level_1,Int64,Int64,Int64,Float64,Int64,Int64,Int64,Float64,Int64,Bool,Bool,Bool,Bool
1,619,42,2,0.0,1,1,1,101349.0,1,1,0,0,1
2,608,41,1,83807.9,1,0,1,112543.0,0,0,1,0,1
3,502,42,8,159661.0,3,1,0,113932.0,1,1,0,0,1
4,699,39,1,0.0,2,0,0,93826.6,0,1,0,0,1
5,850,43,2,125511.0,1,1,1,79084.1,0,0,1,0,1
6,645,44,8,113756.0,2,1,0,149757.0,1,0,1,0,0
7,822,50,7,0.0,2,1,1,10062.8,0,1,0,0,0
8,376,29,4,115047.0,4,1,0,119347.0,1,0,0,1,1
9,501,44,4,142051.0,2,0,1,74940.5,0,1,0,0,0
10,684,27,2,134604.0,1,1,1,71725.7,0,1,0,0,0


In [20]:
# Train Test Splitting
using Lathe.preprocess: TrainTestSplit
train, test = TrainTestSplit(df,.75)

(7419×13 typename(DataFrame)
│ Row  │ CreditScore │ Age   │ Tenure │ Balance   │ NumOfProducts │ HasCrCard │ IsActiveMember │ EstimatedSalary │ Exited │ France │ Spain │ Germany │ Female │
│      │ [90mInt64[39m       │ [90mInt64[39m │ [90mInt64[39m  │ [90mFloat64[39m   │ [90mInt64[39m         │ [90mInt64[39m     │ [90mInt64[39m          │ [90mFloat64[39m         │ [90mInt64[39m  │ [90mBool[39m   │ [90mBool[39m  │ [90mBool[39m    │ [90mBool[39m   │
├──────┼─────────────┼───────┼────────┼───────────┼───────────────┼───────────┼────────────────┼─────────────────┼────────┼────────┼───────┼─────────┼────────┤
│ 1    │ 608         │ 41    │ 1      │ 83807.9   │ 1             │ 0         │ 1              │ 1.12543e5       │ 0      │ 0      │ 1     │ 0       │ 1      │
│ 2    │ 850         │ 43    │ 2      │ 1.25511e5 │ 1             │ 1         │ 1              │ 79084.1         │ 0      │ 0      │ 1     │ 0       │ 1      │
│ 3    │ 645         │ 44    │ 8      │ 1

## Model Building

In [21]:
first(df)

Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,France,Spain,Germany,Female
Unnamed: 0_level_1,Int64,Int64,Int64,Float64,Int64,Int64,Int64,Float64,Int64,Bool,Bool,Bool,Bool
1,619,42,2,0.0,1,1,1,101349.0,1,1,0,0,1


In [22]:
fm = @formula(Exited ~ CreditScore + Age + Tenure + Balance + NumOfProducts + HasCrCard + IsActiveMember + EstimatedSalary + France + Spain + Female)
logistic = glm(fm, train, Binomial(), ProbitLink())

StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Binomial{Float64}, ProbitLink}, GLM.DensePredChol{Float64, LinearAlgebra.Cholesky{Float64, Matrix{Float64}}}}, Matrix{Float64}}

Exited ~ 1 + CreditScore + Age + Tenure + Balance + NumOfProducts + HasCrCard + IsActiveMember + EstimatedSalary + France + Spain + Female

Coefficients:
───────────────────────────────────────────────────────────────────────────────────────
                        Coef.   Std. Error       z  Pr(>|z|)     Lower 95%    Upper 95%
───────────────────────────────────────────────────────────────────────────────────────
(Intercept)      -1.91922      0.167726     -11.44    <1e-29  -2.24795      -1.59048
CreditScore      -0.000392382  0.000186009   -2.11    0.0349  -0.000756952  -2.78116e-5
Age               0.0423363    0.00169851    24.93    <1e-99   0.0390073     0.0456653
Tenure           -0.0039426    0.00614484    -0.64    0.5211  -0.0159863     0.00810107
Balance           1.

## Prediction and Evaluation

In [23]:
prediction = predict(logistic, test)

2581-element Vector{Union{Missing, Float64}}:
 0.1201182458186032
 0.3522898188765328
 0.21897886981226367
 0.09373403688056439
 0.1277204089438332
 0.16536981416150381
 0.06393970529594005
 0.01961151276650637
 0.051612450274931845
 0.24978219674412305
 0.08214045031291889
 0.03158792326629185
 0.16608906028176412
 ⋮
 0.017549439116784946
 0.4023077451172583
 0.04509998177105662
 0.20336254082595934
 0.06974240218653867
 0.5302306075299398
 0.1771230015438225
 0.3620328386819123
 0.5985400783479266
 0.1574813100268641
 0.11029670236776096
 0.15173615491930442

In [26]:
# converting classes i.e. 0 or 1. Probability score less than 0.5 would be treated as 0 and greater than 0.5 as 1.
prediction_class = [if x < 0.5 0 else 1 end for x in prediction];

prediction_df = DataFrame(y_actual = test.Exited, y_predicted = prediction_class, prob_predicted = prediction)
prediction_df.correctly_classified = prediction_df.y_actual .== prediction_df.y_predicted

2581-element BitVector:
 0
 0
 1
 1
 1
 1
 1
 1
 1
 0
 1
 1
 1
 ⋮
 1
 0
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1

In [27]:
# Accuracy Score
accuracy = mean(prediction_df.correctly_classified)

0.8020147229755908

In [28]:
# Confusion Matrix
confusion_matrix = MLBase.roc(prediction_df.y_actual, prediction_df.y_predicted)

ROCNums{Int64}
  p = 530
  n = 2051
  tp = 105
  tn = 1965
  fp = 86
  fn = 425
