# Predicting whether to contact a customer because they are at risk of churning

## Part 1: Load and examine the data

In [4]:
# Pieces of the dataset location; the load cell joins them as
# s3://<data_bucket>/<subfolder>/<dataset>.
data_bucket = "mlforbusiness"  # S3 bucket name
subfolder = "ch03"             # key prefix inside the bucket
dataset = "churn_data.csv"     # CSV file to load

In [5]:
import pandas as pd
from time import sleep  # NOTE(review): not referenced in any cell visible in this notebook
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.linear_model import LogisticRegression
import warnings
# Blanket filter: every warning raised after this cell runs is silenced.
# NOTE(review): this also hides any pandas / scikit-learn warnings raised by
# the later cells (e.g. while fitting or while adding columns to the split
# frames) -- consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [6]:
# Build the S3 location from the configured bucket, folder and file name,
# load the CSV into a DataFrame, and preview the first rows.
s3_uri = f's3://{data_bucket}/{subfolder}/{dataset}'
df = pd.read_csv(s3_uri)
df.head()

Unnamed: 0,churned,id,customer_code,co_name,total_spend,week_minus_4,week_minus_3,week_minus_2,last_week,4-3_delta,3-2_delta,2-1_delta
0,0,1,1826,Hoffman Martinez and Chandler,68567.34,0.81,0.02,0.74,1.45,-0.79,0.72,0.71
1,0,2,772,Lee Martin and Escobar,74335.27,1.87,1.02,1.29,1.19,-0.85,0.27,-0.1
2,0,3,479,Hobbs Mcdaniel and Baker,48746.22,1.21,0.7,1.04,2.12,-0.51,0.34,1.08
3,0,4,1692,Williams-Harris,64416.7,0.75,2.08,2.4,2.02,1.33,0.32,-0.38
4,0,5,2578,Beck-Snyder,71623.2,2.33,0.66,1.97,1.6,-1.67,1.31,-0.37


In [7]:
# Report the dataset size, then how many rows fall in each `churned` class.
row_count = df.shape[0]
print(f'Number of rows in dataset: {row_count}')

churn_counts = df['churned'].value_counts()
print(churn_counts)

Number of rows in dataset: 2999
0    2833
1     166
Name: churned, dtype: int64


## Part 2: Get the data into the right shape

In [8]:
# Snapshot of the original column names.
# NOTE(review): `columns` is not referenced by any later cell visible here.
columns = list(df.columns)

# Remove the identifier / free-text columns, keeping the label (`churned`)
# and the numeric columns for modelling.
identifier_columns = ['id', 'customer_code', 'co_name']
encoded_data = df.drop(columns=identifier_columns)
encoded_data.head()

Unnamed: 0,churned,total_spend,week_minus_4,week_minus_3,week_minus_2,last_week,4-3_delta,3-2_delta,2-1_delta
0,0,68567.34,0.81,0.02,0.74,1.45,-0.79,0.72,0.71
1,0,74335.27,1.87,1.02,1.29,1.19,-0.85,0.27,-0.1
2,0,48746.22,1.21,0.7,1.04,2.12,-0.51,0.34,1.08
3,0,64416.7,0.75,2.08,2.4,2.02,1.33,0.32,-0.38
4,0,71623.2,2.33,0.66,1.97,1.6,-1.67,1.31,-0.37


## Part 3: Create training, validation and test data sets

In [9]:
# Stratified split so each subset keeps the same churn ratio.
# Step 1: hold out 30% of the rows; the remaining 70% is the training set.
y = encoded_data['churned']
train_df, test_and_val_data, _, _ = train_test_split(
    encoded_data, y, test_size=0.3, stratify=y, random_state=0)

# Step 2: divide the held-out rows roughly 2:1 into validation and test
# (test_size=0.333 of the hold-out), i.e. about 20% / 10% of the full dataset.
y = test_and_val_data['churned']
val_df, test_df, _, _ = train_test_split(
    test_and_val_data, y, test_size=0.333, stratify=y, random_state=0)

print(train_df.shape, val_df.shape, test_df.shape)

# Report size and class balance of each subset. The messages are f-strings so
# that the row count is interpolated rather than printed as a literal
# "{...shape[0]}" placeholder.
for split_name, split_df in (('Train', train_df),
                             ('Validate', val_df),
                             ('Test', test_df)):
    print()
    print(f'Number of rows in {split_name} dataset: {split_df.shape[0]}')
    print(split_df['churned'].value_counts())

(2099, 9) (600, 9) (300, 9)

Number of rows in Train dataset: {train_df.shape[0]}
0    1983
1     116
Name: churned, dtype: int64

Number of rows in Validate dataset: {val_df.shape[0]}
0    567
1     33
Name: churned, dtype: int64

Number of rows in Test dataset: {test_df.shape[0]}
0    283
1     17
Name: churned, dtype: int64


In [11]:
# Every column except the label is a model feature. The column list is taken
# from train_df and reused for all three splits, so each feature matrix has
# the same columns in the same order.
feature_columns = train_df.columns[train_df.columns != "churned"]

# Feature matrices as plain arrays.
train_input = train_df[feature_columns].values
val_input = val_df[feature_columns].values
test_input = test_df[feature_columns].values

# Labels stay as pandas Series.
train_output = train_df["churned"]
val_output = val_df["churned"]
test_output = test_df["churned"]

## Part 4: Train the model

In [12]:
# Fit a logistic-regression classifier on the training features / labels.
# fit() returns the estimator itself, so construction and training are chained.
# NOTE(review): the features are passed in unscaled and warnings are globally
# suppressed in the imports cell, so any solver warnings would not be shown --
# worth confirming the fit converges.
classifier = LogisticRegression(solver="lbfgs").fit(train_input, train_output)
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Part 5: Test the model

In [14]:
# Predict the churn label for every row of the held-out test set.
test_data_predictions = classifier.predict(test_input)

# Attach the predictions with assign(), which returns a new DataFrame, instead
# of writing a column in place into a frame that was sliced out of
# encoded_data by train_test_split (the chained-assignment pattern pandas
# warns about). Re-running the cell simply replaces the column.
test_df = test_df.assign(prediction=test_data_predictions)
test_df.head(10)

Unnamed: 0,churned,total_spend,week_minus_4,week_minus_3,week_minus_2,last_week,4-3_delta,3-2_delta,2-1_delta,prediction
147,0,76897.46,0.56,2.29,1.14,2.23,1.73,-1.15,1.09,0
1851,0,19604.63,1.95,2.04,0.82,1.62,0.09,-1.22,0.8,0
1955,0,23369.6,1.11,1.54,1.55,1.14,0.43,0.01,-0.41,0
913,1,40709.47,2.4,1.87,0.07,0.61,-0.53,-1.8,0.54,0
293,0,69953.52,2.01,1.2,1.05,1.41,-0.81,-0.15,0.36,0
559,0,71939.07,0.54,1.17,0.21,2.29,0.63,-0.96,2.08,0
220,0,45930.53,0.08,1.43,0.41,1.34,1.35,-1.02,0.93,0
1500,0,47080.25,1.54,0.68,0.8,0.54,-0.86,0.12,-0.26,0
1199,0,35506.83,1.37,0.93,1.7,0.67,-0.44,0.77,-1.03,0
2278,0,39188.12,0.4,1.86,0.1,0.82,1.46,-1.76,0.72,0


In [23]:
# Class counts for the true labels and then for the predictions on the test set.
for column_name in ('churned', 'prediction'):
    print(test_df[column_name].value_counts())

# Fraction of test rows whose prediction matches the true label.
test_accuracy = metrics.accuracy_score(test_df['churned'], test_df['prediction'])
print(test_accuracy)

0    283
1     17
Name: churned, dtype: int64
0    297
1      3
Name: prediction, dtype: int64
0.9466666666666667


In [24]:
# Confusion matrix for the test set: rows are the true `churned` classes,
# columns are the predicted classes.
test_confusion = metrics.confusion_matrix(
    y_true=test_df['churned'],
    y_pred=test_df['prediction'],
)
print(test_confusion)

[[282   1]
 [ 15   2]]


In [25]:
# Predict the churn label for every row of the validation set.
val_data_predictions = classifier.predict(val_input)

# Attach the predictions with assign(), which returns a new DataFrame, instead
# of writing a column in place into a frame that was sliced out of
# encoded_data by train_test_split (the chained-assignment pattern pandas
# warns about). Re-running the cell simply replaces the column.
val_df = val_df.assign(prediction=val_data_predictions)
val_df.head(10)

Unnamed: 0,churned,total_spend,week_minus_4,week_minus_3,week_minus_2,last_week,4-3_delta,3-2_delta,2-1_delta,prediction
1391,0,25572.55,0.21,1.13,0.75,1.85,0.92,-0.38,1.1,0
1339,0,21262.57,0.59,1.18,2.03,0.57,0.59,0.85,-1.46,0
2111,0,32621.64,2.17,0.48,0.74,2.28,-1.69,0.26,1.54,0
2673,0,12787.66,0.78,0.29,2.46,1.31,-0.49,2.17,-1.15,0
356,1,76798.13,1.2,0.11,2.5,1.04,-1.09,2.39,-1.46,0
176,0,66534.88,0.17,0.41,0.37,2.43,0.24,-0.04,2.06,0
662,0,4955.13,1.8,1.6,1.66,1.14,-0.2,0.06,-0.52,0
1570,0,9865.15,0.98,0.84,1.44,1.05,-0.14,0.6,-0.39,0
2508,0,57514.96,1.81,0.01,1.41,1.97,-1.8,1.4,0.56,0
1379,0,53222.6,0.3,0.2,0.9,0.75,-0.1,0.7,-0.15,0


In [26]:
# Class counts for the true labels and then for the predictions on the
# validation set.
for column_name in ('churned', 'prediction'):
    print(val_df[column_name].value_counts())

# Fraction of validation rows whose prediction matches the true label.
val_accuracy = metrics.accuracy_score(val_df['churned'], val_df['prediction'])
print(val_accuracy)

0    567
1     33
Name: churned, dtype: int64
0    594
1      6
Name: prediction, dtype: int64
0.9383333333333334


In [27]:
# Confusion matrix for the validation set: rows are the true `churned`
# classes, columns are the predicted classes.
val_confusion = metrics.confusion_matrix(
    y_true=val_df['churned'],
    y_pred=val_df['prediction'],
)
print(val_confusion)

[[562   5]
 [ 32   1]]
