# Customer Churn Prediction using SMOTE and Tomek Links

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from imblearn.under_sampling import TomekLinks

# Location of the churn dataset, relative to the notebook's working directory.
file_path = './files_for_lab/Customer-Churn.csv'

# Read the CSV into the DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=file_path)

In [6]:
# Display the loaded frame as the cell's rich output (the notebook truncates
# it to the first and last rows).
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [7]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN
# (errors='coerce') instead of raising.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Replace the resulting NaNs with the column median. The result is assigned
# back to the column instead of calling fillna(..., inplace=True) on the
# df['TotalCharges'] selection: that in-place form is chained assignment,
# which emits a FutureWarning on pandas 2.x and leaves df unchanged under
# Copy-on-Write (the default from pandas 3.0).
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# Integer-encode every object-dtype column, keeping one fitted encoder per
# column in label_encoders so the mapping can be looked up afterwards.
categorical_columns = df.select_dtypes(include=['object']).columns
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

# Separate the target column from the feature matrix.
target_name = 'Churn'
y = df[target_name]
X = df.drop(columns=[target_name])

# Hold out 30% of the rows for testing; stratify on y so both partitions keep
# the same churn ratio, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Report the (rows, columns) shape of each feature partition.
for partition in (X_train, X_test):
    print(partition.shape)

(4930, 15)
(2113, 15)


## Applying SMOTE for Upsampling

In [8]:
# Oversample the training partition with SMOTE; the test partition is left
# untouched so evaluation happens on the original data.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Train both classifiers on the resampled training data
# (fit() returns the estimator itself, so construction and fitting are chained).
log_reg = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_smote, y_train_smote
)
tree_clf = DecisionTreeClassifier(random_state=42).fit(
    X_train_smote, y_train_smote
)

# Predict on the held-out test set with each model.
y_pred_log_reg = log_reg.predict(X_test)
y_pred_tree = tree_clf.predict(X_test)

# Score each set of predictions against the true test labels.
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
accuracy_tree = accuracy_score(y_test, y_pred_tree)

# Show both accuracies as the cell output: (logistic regression, decision tree).
accuracy_log_reg, accuracy_tree


(0.7463322290582111, 0.7155702792238523)

## Applying Tomek Links for Downsampling

In [9]:

# Undersample the training partition by removing Tomek links; the test
# partition is left untouched so evaluation happens on the original data.
tomek = TomekLinks()
X_train_tomek, y_train_tomek = tomek.fit_resample(X_train, y_train)

# Fitting the logistic regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_tomek, y_train_tomek)

# Predicting and calculating accuracy for logistic regression
y_pred_log_reg = log_reg.predict(X_test)
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)

# Fitting the decision tree classifier
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train_tomek, y_train_tomek)

# Predicting and calculating accuracy for decision tree
y_pred_tree = tree_clf.predict(X_test)
accuracy_tree = accuracy_score(y_test, y_pred_tree)

# Comparing the accuracies. A notebook only displays a bare expression when it
# is the last statement of the cell; more code follows here, so the values are
# printed explicitly instead of being silently discarded.
print(
    f"Tomek Links accuracy: logistic regression={accuracy_log_reg:.4f}, "
    f"decision tree={accuracy_tree:.4f}"
)

# Run the same TomekLinks sampler again on the already-cleaned training data.
X_train_tomek_2, y_train_tomek_2 = tomek.fit_resample(X_train_tomek, y_train_tomek)

# Count the rows per class at each stage: original training labels, after the
# first Tomek pass, and after the second pass.
(
    initial_class_distribution,
    first_tomek_class_distribution,
    second_tomek_class_distribution,
) = (
    labels.value_counts()
    for labels in (y_train, y_train_tomek, y_train_tomek_2)
)

# Show the three distributions side by side as the cell output.
initial_class_distribution, first_tomek_class_distribution, second_tomek_class_distribution


(Churn
 0    3622
 1    1308
 Name: count, dtype: int64,
 Churn
 0    3211
 1    1308
 Name: count, dtype: int64,
 Churn
 0    3087
 1    1308
 Name: count, dtype: int64)