In [57]:
import pandas as pd

In [58]:
churn_df = pd.read_csv('Customer-Churn.csv')

In [59]:
churn_df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [60]:
# Checking data types
churn_df.dtypes

Unnamed: 0,0
gender,object
SeniorCitizen,int64
Partner,object
Dependents,object
tenure,int64
PhoneService,object
OnlineSecurity,object
OnlineBackup,object
DeviceProtection,object
TechSupport,object


In [61]:
churn_df['TotalCharges'] = pd.to_numeric(churn_df['TotalCharges'], errors='coerce')

In [62]:
# Checking data types again
churn_df.dtypes

Unnamed: 0,0
gender,object
SeniorCitizen,int64
Partner,object
Dependents,object
tenure,int64
PhoneService,object
OnlineSecurity,object
OnlineBackup,object
DeviceProtection,object
TechSupport,object


In [63]:
# Check for the nulls in the dataframe
churn_df.isnull().sum()

Unnamed: 0,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
OnlineSecurity,0
OnlineBackup,0
DeviceProtection,0
TechSupport,0


In [64]:
# checking rows where TotalCharges are null
churn_df[churn_df['TotalCharges'].isnull()]

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
488,Female,0,Yes,Yes,0,No,Yes,No,Yes,Yes,Yes,No,Two year,52.55,,No
753,Male,0,No,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,20.25,,No
936,Female,0,Yes,Yes,0,Yes,Yes,Yes,Yes,No,Yes,Yes,Two year,80.85,,No
1082,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,25.75,,No
1340,Female,0,Yes,Yes,0,No,Yes,Yes,Yes,Yes,Yes,No,Two year,56.05,,No
3331,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,19.85,,No
3826,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,25.35,,No
4380,Female,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,20.0,,No
5218,Male,0,Yes,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,19.7,,No
6670,Female,0,Yes,Yes,0,Yes,No,Yes,Yes,Yes,Yes,No,Two year,73.35,,No


In [65]:
# replace null values in TotalCharges by zero
churn_df['TotalCharges'] = churn_df['TotalCharges'].fillna(0)

In [66]:
# Checking null values in df again
churn_df.isnull().sum()

Unnamed: 0,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
OnlineSecurity,0
OnlineBackup,0
DeviceProtection,0
TechSupport,0


In [67]:
# Use tenure, SeniorCitizen, MonthlyCharges and TotalCharges as features. Churn as target

X = churn_df[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
y = churn_df['Churn']

In [68]:
# Scale the data using normalizer

from sklearn.preprocessing import Normalizer
scaler = Normalizer()
X = scaler.fit_transform(X)

In [69]:
# Split the data into training set and test set

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [70]:
# Fit a logistic regression model on the training data

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)

In [71]:
# Check the accuracy on the test data

from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.7629524485450674

In [72]:
# Check the imbalance of the dataset

churn_df['Churn'].value_counts()

Unnamed: 0_level_0,count
Churn,Unnamed: 1_level_1
No,5174
Yes,1869


In [73]:
# Use resampling to balance the data by upsampling

from sklearn.utils import resample
df_majority = churn_df[churn_df['Churn'] == 'No']
df_minority = churn_df[churn_df['Churn'] == 'Yes']
df_minority_upsampled = resample(df_minority, replace=True, n_samples=len(df_majority), random_state=42)
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [74]:
# Check if the samples are equal

df_upsampled['Churn'].value_counts()

Unnamed: 0_level_0,count
Churn,Unnamed: 1_level_1
No,5174
Yes,5174


In [75]:
# Check the accuracy with the upsampled model

Xup = df_upsampled[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
yup = df_upsampled['Churn']

Xup_train, Xup_test, yup_train, yup_test = train_test_split(Xup, yup, test_size=0.2, random_state=42)

model_up = LogisticRegression()
model_up.fit(Xup_train, yup_train)

yup_pred = model_up.predict(Xup_test)
accuracy_score(yup_test, yup_pred)

0.7318840579710145

In [42]:
# Use resampling to balance the data by downsampling

df_majority_downsampled = resample(df_majority, replace=False, n_samples=len(df_minority), random_state=42)
df_downsampled = pd.concat([df_majority_downsampled, df_minority])


In [76]:
df_downsampled['Churn'].value_counts()

Unnamed: 0_level_0,count
Churn,Unnamed: 1_level_1
No,1869
Yes,1869


In [77]:
# Check the accuracy with the downsampled model

Xdown = df_downsampled[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
ydown = df_downsampled['Churn']

Xdown_train, Xdown_test, ydown_train, ydown_test = train_test_split(Xdown, ydown, test_size=0.2, random_state=42)

model_down = LogisticRegression()
model_down.fit(Xdown_train, ydown_train)

ydown_pred = model_down.predict(Xdown_test)
accuracy_score(ydown_test, ydown_pred)

0.7553475935828877

In [78]:
# Resampling by using synthetic sample generation

from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
Xsmote, ysmote = smote.fit_resample(X, y)

Xsmote_train, Xsmote_test, ysmote_train, ysmote_test = train_test_split(Xsmote, ysmote, test_size=0.2, random_state=42)

model_smote = LogisticRegression()
model_smote.fit(Xsmote_train, ysmote_train)

ysmote_pred = model_smote.predict(Xsmote_test)
accuracy_score(ysmote_test, ysmote_pred)

0.6782608695652174

In [79]:
# Sampling with ensemble methods - bagging and pasting

import numpy as np

# Define number of bags
n_bags = 5

# Lists to store each Xbag and ybag
Xbags = []
ybags = []

# Generate bags for X and y
for i in range(n_bags):
    # Resample X and y with replacement to create a bag
    X_bag, y_bag = resample(X, y, replace=True, random_state=i)

    # Append the generated Xbag and ybag to lists
    Xbags.append(X_bag)
    ybags.append(y_bag)

# Now that we have our bags, let's use one of them for training a model and evaluating it
# We will use the first bag as an example here

# Split the first bag into training and testing sets
Xbag_train, Xbag_test, ybag_train, ybag_test = train_test_split(Xbags[0], ybags[0], test_size=0.2, random_state=42)

# Train the model on the bagged data
model_bag = LogisticRegression(max_iter=200)
model_bag.fit(Xbag_train, ybag_train)

# Make predictions and evaluate the model
ybag_pred = model_bag.predict(Xbag_test)
accuracy = accuracy_score(ybag_test, ybag_pred)
print(f'Accuracy: {accuracy}')




Accuracy: 0.7374024130589071


In [80]:
# Sampling by clustering the abundant part

from sklearn.cluster import KMeans

# Features for clustering
X_majority = df_majority[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]

# Number of clusters (you might need to tune this)
n_clusters = len(df_minority)

# Apply KMeans clustering to the majority class
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X_majority)

# Get cluster labels for majority class samples
cluster_labels = kmeans.labels_

# Select one sample from each cluster
cluster_centers = kmeans.cluster_centers_
X_majority_clustered = []
y_majority_clustered = []
for i in range(n_clusters):
  cluster_points = X_majority[cluster_labels == i]
  # Choose a random sample or the nearest one to the center (you can also choose another center)
  # random_index = np.random.choice(cluster_points.index)
  random_index = np.argmin(np.linalg.norm(cluster_points - cluster_centers[i], axis=1))
  X_majority_clustered.append(cluster_points.iloc[random_index].values)
  y_majority_clustered.append(df_majority.iloc[random_index]['Churn'])

X_majority_clustered = np.array(X_majority_clustered)
y_majority_clustered = np.array(y_majority_clustered)

# Combine the clustered majority samples with the minority samples
X_clustered = np.concatenate([X_majority_clustered, df_minority[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']].values])
y_clustered = np.concatenate([y_majority_clustered, df_minority['Churn'].values])

In [81]:
# Split the clustered data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_clustered, y_clustered, test_size=0.2, random_state=42)

# Train the model on the clustered data
model_clustered = LogisticRegression()
model_clustered.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = model_clustered.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Output only the accuracy
print(f'Accuracy: {accuracy}')

Accuracy: 0.7245989304812834
