In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, plot_confusion_matrix
from sklearn.tree import plot_tree

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("Data/DATA_Customer-Churn.csv")
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


### check datatypes of all columns

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7032 non-null   float64
 15  Churn             7043 non-null   object 
dtypes: float64(2), int64(2), object(12)
memory

### Convert column `TotalCharges` to numeric and replace null values in dataframe

In [10]:
df["TotalCharges"] = np.nanmedian(pd.to_numeric(df["TotalCharges"], errors="coerce"))

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   float64
 15  Churn             7043 non-null   object 
dtypes: float64(2), int64(2), object(12)
memory

### Create dataframe with subset of features

In [13]:
X = df[["SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"]]
X

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,0,1,29.85,1397.475
1,0,34,56.95,1397.475
2,0,2,53.85,1397.475
3,0,45,42.30,1397.475
4,0,2,70.70,1397.475
...,...,...,...,...
7038,0,24,84.80,1397.475
7039,0,72,103.20,1397.475
7040,0,11,29.60,1397.475
7041,1,4,74.40,1397.475


In [14]:
y = df["Churn"]
y

0        No
1        No
2       Yes
3        No
4       Yes
       ... 
7038     No
7039     No
7040     No
7041    Yes
7042     No
Name: Churn, Length: 7043, dtype: object

### split data into training and test set

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20,random_state=42)
X_train.shape, \
X_test.shape, \
y_train.shape, \
y_test.shape

((5634, 4), (1409, 4), (5634,), (1409,))

### scale features

In [24]:
std_scaler = StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train)
X_test_scaled = std_scaler.fit_transform(X_test)
X_train_scaled.shape, X_test_scaled.shape

((5634, 4), (1409, 4))

### KNN Classifier

In [34]:
neigh = KNeighborsClassifier(n_neighbors = 23)
neigh.fit(X_train, y_train)
y_pred_train = neigh.predict(X_train)
y_pred_test = neigh.predict(X_test)

In [42]:
print("Accuracy train:",accuracy_score(y_train, y_pred_train))
print("Accuracy test:",accuracy_score(y_test, y_pred_test))

print()
print("Precision train:",precision_score(y_train, y_pred_train, pos_label="Yes"))
print("Precision test:",precision_score(y_test, y_pred_test, pos_label="Yes"))
print()
print("Recall train:",recall_score(y_train, y_pred_train, pos_label="Yes"))
print("Recall test:",recall_score(y_test, y_pred_test, pos_label="Yes"))

Accuracy train: 0.802626908058218
Accuracy test: 0.7927608232789212

Precision train: 0.686046511627907
Precision test: 0.6483516483516484

Recall train: 0.4732620320855615
Recall test: 0.4745308310991957


### Decision Tree Classifier

In [44]:
dectree = DecisionTreeClassifier(max_depth = 6)
dectree.fit(X_train, y_train)
y_pred_train = dectree.predict(X_train)
y_pred_test = dectree.predict(X_test)

In [45]:
print("Accuracy train:",accuracy_score(y_train, y_pred_train))
print("Accuracy test:",accuracy_score(y_test, y_pred_test))

print()
print("Precision train:",precision_score(y_train, y_pred_train, pos_label="Yes"))
print("Precision test:",precision_score(y_test, y_pred_test, pos_label="Yes"))
print()
print("Recall train:",recall_score(y_train, y_pred_train, pos_label="Yes"))
print("Recall test:",recall_score(y_test, y_pred_test, pos_label="Yes"))

Accuracy train: 0.7969471068512602
Accuracy test: 0.7885024840312278

Precision train: 0.6654135338345865
Precision test: 0.6363636363636364

Recall train: 0.4732620320855615
Recall test: 0.4691689008042895
