In [1]:
import pandas as pd
import numpy as np

In [26]:
churnData = pd.read_csv('Customer-Churn.csv')

In [27]:
churnData.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [28]:
churnData.dtypes

Unnamed: 0,0
gender,object
SeniorCitizen,int64
Partner,object
Dependents,object
tenure,int64
PhoneService,object
OnlineSecurity,object
OnlineBackup,object
DeviceProtection,object
TechSupport,object


In [29]:
#Convert this column into numeric type using pd.to_numeric function.
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')

In [30]:
#Check for null values in the dataframe. Replace the null values.
churnData.isnull().sum()

Unnamed: 0,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
OnlineSecurity,0
OnlineBackup,0
DeviceProtection,0
TechSupport,0


In [31]:
#Replace the null values
churnData['TotalCharges'].fillna(churnData['TotalCharges'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  churnData['TotalCharges'].fillna(churnData['TotalCharges'].mean(), inplace=True)


In [32]:
#Scale the features either by using standard scaler
from sklearn.preprocessing import StandardScaler
features = ['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']

churnData_1 = churnData[features].copy()

scaler = StandardScaler()
churnData_scaled = scaler.fit_transform(churnData_1)
churnData_scaled = pd.DataFrame(churnData_scaled, columns=features)
churnData_scaled.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,-1.277445,-0.439916,-1.160323,-0.994971
1,0.066327,-0.439916,-0.259629,-0.173876
2,-1.236724,-0.439916,-0.36266,-0.960399
3,0.514251,-0.439916,-0.746535,-0.1954
4,-1.236724,-0.439916,0.197365,-0.941193


In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(churnData_scaled, churnData['Churn'], test_size=0.2, random_state=42)

In [36]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)

In [37]:
model.score(X_test, y_test)

0.8076650106458482

In [38]:
#Check for the imbalance.
churnData['Churn'].value_counts()

Unnamed: 0_level_0,count
Churn,Unnamed: 1_level_1
No,5174
Yes,1869


In [62]:
from sklearn.utils import resample

In [63]:
train = pd.concat([X_train, y_train],axis=1)
display(train.shape)
train.head()

(5634, 5)

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges,Churn
2142,-0.463037,-0.439916,0.002935,-0.417911,No
1623,0.880735,-0.439916,1.078118,1.256667,No
6074,-1.277445,-0.439916,-1.373033,-0.997797,Yes
1362,-1.155283,-0.439916,0.180747,-0.903088,Yes
6754,-1.318165,-0.439916,-0.095111,0.0,No


In [72]:
#upsampled
from sklearn.utils import resample

no_churn = train[train['Churn'] == "No"]
yes_churn = train[train['Churn'] == "Yes"]

yes_churn_upsampled = resample(
    yes_churn,
    replace=True,
    n_samples=len(no_churn),
    random_state=42)

churnData_balanced = pd.concat([no_churn, yes_churn_upsampled])

print(churnData_balanced['Churn'].value_counts())

Churn
No     4138
Yes    4138
Name: count, dtype: int64


In [73]:
X_train_balanced = churnData_balanced.drop('Churn', axis=1)
y_train_balanced = churnData_balanced['Churn']

model.fit(X_train_balanced, y_train_balanced)

In [74]:
model.score(X_test, y_test)

0.7444996451383961

In [75]:
#downsampled
no_churn_downsampled = resample(
    no_churn,
    replace=False,
    n_samples=len(yes_churn),
    random_state=42)


churnData_balanced = pd.concat([no_churn_downsampled, yes_churn])


print(churnData_balanced['Churn'].value_counts())

Churn
No     1496
Yes    1496
Name: count, dtype: int64


In [76]:
X_train_balanced_ = churnData_balanced.drop('Churn', axis=1)
y_train_balanced_ = churnData_balanced['Churn']

model.fit(X_train_balanced_, y_train_balanced_)

In [77]:
model.score(X_test, y_test)

0.7430801987224982