In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('files_for_lab/customer_churn.csv')
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [4]:
df.columns = df.columns.str.strip()

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [6]:
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors='coerce')
df = df.dropna()


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 


In [8]:
df.set_index(df["customerID"])
y = df["Churn"]
df = df.drop(["customerID", "Churn"], axis = 1)


In [9]:
df_cat = pd.get_dummies(df.select_dtypes("object"), drop_first = True)

df_num = df.select_dtypes("number")
df_num = df_num.drop("SeniorCitizen", axis = 1)


In [10]:
X = pd.concat([df_num, df_cat, df["SeniorCitizen"]], axis = 1, sort = False)
X.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,SeniorCitizen
0,1,29.85,29.85,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0
1,34,56.95,1889.5,1,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,2,53.85,108.15,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,45,42.3,1840.75,1,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,2,70.7,151.65,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,1,0,0


In [11]:
transformer = StandardScaler().fit(X)
scaled_x = pd.DataFrame(transformer.transform(X))
scaled_x.columns = X.columns

In [12]:
scaled_x.describe()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,SeniorCitizen
count,7032.0,7032.0,7032.0,7032.0,7032.0,7032.0,7032.0,7032.0,7032.0,7032.0,...,7032.0,7032.0,7032.0,7032.0,7032.0,7032.0,7032.0,7032.0,7032.0,7032.0
mean,-1.214741e-16,9.652878000000001e-17,-1.172113e-16,3.104583e-16,2.420798e-16,-5.429862e-16,-1.526936e-15,1.529414e-15,1.171671e-15,1.817532e-16,...,4.478784e-16,-6.211692e-16,-3.857993e-16,5.2574550000000003e-17,-2.813449e-17,-2.508738e-16,4.386581e-16,2.751875e-16,5.347606e-16,1.946443e-16
std,1.000071,1.000071,1.000071,1.000071,1.000071,1.000071,1.000071,1.000071,1.000071,1.000071,...,1.000071,1.000071,1.000071,1.000071,1.000071,1.000071,1.000071,1.000071,1.000071,1.000071
min,-1.280248,-1.547283,-0.9990692,-1.00943,-0.9656081,-0.6523049,-3.056334,-0.3271894,-0.8543356,-0.8868965,...,-0.7901858,-0.5251304,-0.7968492,-0.5145369,-0.5613644,-1.206361,-0.5253508,-0.7118634,-0.5436035,-0.4403271
25%,-0.9542963,-0.9709769,-0.8302488,-1.00943,-0.9656081,-0.6523049,0.3271894,-0.3271894,-0.8543356,-0.8868965,...,-0.7901858,-0.5251304,-0.7968492,-0.5145369,-0.5613644,-1.206361,-0.5253508,-0.7118634,-0.5436035,-0.4403271
50%,-0.1394171,0.184544,-0.3908151,0.990658,-0.9656081,-0.6523049,0.3271894,-0.3271894,-0.8543356,-0.8868965,...,-0.7901858,-0.5251304,-0.7968492,-0.5145369,-0.5613644,0.8289392,-0.5253508,-0.7118634,-0.5436035,-0.4403271
75%,0.9199259,0.8331482,0.6668271,0.990658,1.035617,1.533025,0.3271894,-0.3271894,1.1705,1.127527,...,1.265525,-0.5251304,1.254943,-0.5145369,-0.5613644,0.8289392,-0.5253508,1.404764,-0.5436035,-0.4403271
max,1.612573,1.793381,2.824261,0.990658,1.035617,1.533025,0.3271894,3.056334,1.1705,1.127527,...,1.265525,1.904289,1.254943,1.943495,1.781374,0.8289392,1.90349,1.404764,1.839576,2.271039


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(scaled_x, y, test_size=0.2, random_state=37)

In [14]:
classification = LogisticRegression(random_state=0, multi_class='ovr').fit(X_train, y_train)
y_pred = classification.predict(X_test)
print("The accuracy of the logistic_regression model is: %4.2f "% (classification.score(X_test, y_test)))

The accuracy of the logistic_regression model is: 0.80 


In [15]:
from sklearn.metrics import cohen_kappa_score

print("The kappa of the logistic regression model is: %4.2f " %(cohen_kappa_score(y_pred,y_test)) )

The kappa of the logistic regression model is: 0.47 


In [16]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, labels=['Yes','No']))

              precision    recall  f1-score   support

         Yes       0.66      0.55      0.60       378
          No       0.84      0.89      0.87      1029

    accuracy                           0.80      1407
   macro avg       0.75      0.72      0.73      1407
weighted avg       0.79      0.80      0.80      1407



In [21]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()

X_sm, y_sm = smote.fit_sample(X, y)
y_sm = y_sm.to_numpy()
y_sm = pd.DataFrame(data=y_sm.flatten())
y_sm.value_counts()


Yes    5163
No     5163
dtype: int64

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.2, random_state=58)

classification = LogisticRegression(random_state=0, multi_class='ovr').fit(X_train, y_train)
y_pred = classification.predict(X_test)
print("The accuracy of the logistic_regression model is: %4.2f "% (classification.score(X_test, y_test)))

from sklearn.metrics import cohen_kappa_score
print("The kappa of the logistic regression model is: %4.2f " %(cohen_kappa_score(y_pred,y_test)) )

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, labels=['Yes','No']))

The accuracy of the logistic_regression model is: 0.82 
The kappa of the logistic regression model is: 0.65 
              precision    recall  f1-score   support

         Yes       0.81      0.85      0.83      1029
          No       0.84      0.80      0.82      1037

    accuracy                           0.82      2066
   macro avg       0.82      0.82      0.82      2066
weighted avg       0.82      0.82      0.82      2066



In [23]:
from imblearn.under_sampling import TomekLinks

tl = TomekLinks('majority')
X_tl, y_tl = tl.fit_sample(X, y)
y_tl = y_tl.to_numpy()
y_tl = pd.DataFrame(data=y_tl.flatten())
y_tl.value_counts()


No     4591
Yes    1869
dtype: int64

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_tl, y_tl, test_size=0.2, random_state=58)

classification = LogisticRegression(random_state=0, multi_class='ovr').fit(X_train, y_train)
y_pred = classification.predict(X_test)
print("The accuracy of the logistic_regression model is: %4.2f "% (classification.score(X_test, y_test)))

from sklearn.metrics import cohen_kappa_score
print("The kappa of the logistic regression model is: %4.2f " %(cohen_kappa_score(y_pred,y_test)) )

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, labels=['Yes','No']))

The accuracy of the logistic_regression model is: 0.81 
The kappa of the logistic regression model is: 0.53 
              precision    recall  f1-score   support

         Yes       0.70      0.63      0.67       386
          No       0.85      0.89      0.87       906

    accuracy                           0.81      1292
   macro avg       0.78      0.76      0.77      1292
weighted avg       0.81      0.81      0.81      1292

