# Building the Model

## 1. Data Preprocessing

### 1.1. Importing the libraries and dataset

#### 1.1.1. Loading the libraries

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

#### 1.1.2. Importing the dataset

In [2]:
dataset_uncleaned = pd.read_csv('/Users/eseoseodion/Documents/Portfolio/Customer Churn Prediction/data/telco_customer_churn.csv')

### 1.2. Cleaning the data

#### 1.2.1. Converting 'TotalCharges' to a numeric column

In [None]:
dataset_uncleaned['TotalCharges'] = pd.to_numeric(dataset_uncleaned['TotalCharges'], errors='coerce')

#### 1.2.2. Dropping rows with NaN (missing) values

In [None]:
dataset = dataset_uncleaned.dropna()

#### 1.2.3. Confirming if there are still any missing values

In [5]:
# Check for missing values
print(dataset.isnull().sum())

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


Dataset is now clean.

In [6]:
dataset.shape

(7032, 21)

In [28]:
numerical = dataset.select_dtypes(include=['number']).columns
categorical = dataset.select_dtypes(include=['object']).columns

print("Numerical columns:\n", numerical)
print("\nCategorical columns:\n", categorical)

Numerical columns:
 Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')

Categorical columns:
 Index(['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService',
       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn'],
      dtype='object')


In [8]:
columns = ['customerID', 'gender', 'SeniorCitizen', 'Partner',
            'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 
            'InternetService', 'OnlineSecurity', 'OnlineBackup', 
            'DeviceProtection', 'TechSupport', 'StreamingTV', 
            'StreamingMovies', 'Contract', 'PaperlessBilling', 
            'PaymentMethod', 'MonthlyCharges', 'TotalCharges']

for col in columns:
    print(f"Unique Values for the '{col}' Column")
    print(dataset[col].unique())
    print('=' * 50)

Unique Values for the 'customerID' Column
['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']
Unique Values for the 'gender' Column
['Female' 'Male']
Unique Values for the 'SeniorCitizen' Column
[0 1]
Unique Values for the 'Partner' Column
['Yes' 'No']
Unique Values for the 'Dependents' Column
['No' 'Yes']
Unique Values for the 'tenure' Column
[ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26 39]
Unique Values for the 'PhoneService' Column
['No' 'Yes']
Unique Values for the 'MultipleLines' Column
['No phone service' 'No' 'Yes']
Unique Values for the 'InternetService' Column
['DSL' 'Fiber optic' 'No']
Unique Values for the 'OnlineSecurity' Column
['No' 'Yes' 'No internet service']
Unique Values for the 'OnlineBackup' Column
['Yes' 'No' 'No internet service']
Unique Values for the 'DeviceProt

### 1.3. Feature Engineering

#### 1.3.1. Reducing Redundancy and Long Labelling for Readability

In [None]:
cols_to_edit = {
    'MultipleLines': 'No phone service',
    ('OnlineSecurity',  'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies'): 'No internet service'
}

for key, value in cols_to_edit.items():
    if isinstance(key, tuple):
        for col in key:
            dataset[col] = dataset[col].replace(value, 'No')
    else:
        dataset[key] = dataset[key].replace(value, 'No')

In [None]:
dataset['PaymentMethod'] = dataset['PaymentMethod'].replace({
    'Bank transfer (automatic)': 'Bank Transfer',
    'Credit card (automatic)': 'Credit Card'
})

In [11]:
for i in columns:
    print(f"Unique Values for the '{i}' Column")
    print(dataset[i].unique())
    print('=' * 50)

Unique Values for the 'customerID' Column
['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']
Unique Values for the 'gender' Column
['Female' 'Male']
Unique Values for the 'SeniorCitizen' Column
[0 1]
Unique Values for the 'Partner' Column
['Yes' 'No']
Unique Values for the 'Dependents' Column
['No' 'Yes']
Unique Values for the 'tenure' Column
[ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26 39]
Unique Values for the 'PhoneService' Column
['No' 'Yes']
Unique Values for the 'MultipleLines' Column
['No' 'Yes']
Unique Values for the 'InternetService' Column
['DSL' 'Fiber optic' 'No']
Unique Values for the 'OnlineSecurity' Column
['No' 'Yes']
Unique Values for the 'OnlineBackup' Column
['Yes' 'No']
Unique Values for the 'DeviceProtection' Column
['No' 'Yes']
Unique Values for the 'TechSupport'

In [12]:
# Defining the matrix of features
X_untransformed = dataset.iloc[:, 1:-1].copy()
print(type(X_untransformed))
print("\n")
print(X_untransformed)

<class 'pandas.core.frame.DataFrame'>


      gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0     Female              0     Yes         No       1           No   
1       Male              0      No         No      34          Yes   
2       Male              0      No         No       2          Yes   
3       Male              0      No         No      45           No   
4     Female              0      No         No       2          Yes   
...      ...            ...     ...        ...     ...          ...   
7038    Male              0     Yes        Yes      24          Yes   
7039  Female              0     Yes        Yes      72          Yes   
7040  Female              0     Yes        Yes      11           No   
7041    Male              1     Yes         No       4          Yes   
7042    Male              0      No         No      66          Yes   

     MultipleLines InternetService OnlineSecurity OnlineBackup  \
0               No             DSL       

In [13]:
X_untransformed.shape

(7032, 19)

### 1.4. Encoding the Categorical Columns

In [None]:
# Identify columns
binary_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 
               'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 
               'DeviceProtection', 'TechSupport', 'StreamingTV', 
               'StreamingMovies','PaperlessBilling']  

multi_cat_cols = ['InternetService', 'Contract', 'PaymentMethod']  # Multi-class

numeric_cols = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'] 

In [15]:
# Make sure these columns exist before encoding
print("Checking columns in X:", X_untransformed.columns.tolist())
missing_cols = [col for col in multi_cat_cols if col not in X_untransformed.columns]
if missing_cols:
    print("These expected columns are missing from X:", missing_cols)
else:
    print("All multi-class categorical columns are present.")

Checking columns in X: ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges']
All multi-class categorical columns are present.


#### 1.4.1. Encoding the Independent Variables with Label Encoding

In [16]:
from sklearn.preprocessing import LabelEncoder

# Label encode binary categorical columns

X_untransformed = dataset.iloc[:, 1:-1].copy()

from sklearn.preprocessing import LabelEncoder

for col in binary_cols:
    le = LabelEncoder()
    X_untransformed[col] = le.fit_transform(X_untransformed[col])

print(X_untransformed.head())  # Show first few rows

   gender  SeniorCitizen  Partner  Dependents  tenure  PhoneService  \
0       0              0        1           0       1             0   
1       1              0        0           0      34             1   
2       1              0        0           0       2             1   
3       1              0        0           0      45             0   
4       0              0        0           0       2             1   

   MultipleLines InternetService  OnlineSecurity  OnlineBackup  \
0              0             DSL               0             1   
1              0             DSL               1             0   
2              0             DSL               1             1   
3              0             DSL               1             0   
4              0     Fiber optic               0             0   

   DeviceProtection  TechSupport  StreamingTV  StreamingMovies  \
0                 0            0            0                0   
1                 1            0            

#### 1.4.2. Encoding the Independent Variables with One-Hot Encoding

In [None]:
# One-hot encode multi-class categorical columns while still a DataFrame
X_untransformed = pd.get_dummies(X_untransformed, columns=multi_cat_cols)

# Now convert to NumPy array before feeding into model
X_untransformed = X_untransformed.applymap(lambda val: int(val) if isinstance(val, bool) else val) #  Convert boolean columns to int (0/1)
X = X_untransformed.values 

  X_untransformed = X_untransformed.applymap(lambda val: int(val) if isinstance(val, bool) else val) #  Convert boolean columns to int (0/1)


In [18]:
print(X)

X_df = pd.DataFrame(X)
print("\n")
X_df.head(50)

[[0. 0. 1. ... 0. 1. 0.]
 [1. 0. 0. ... 0. 0. 1.]
 [1. 0. 0. ... 0. 0. 1.]
 ...
 [0. 0. 1. ... 0. 1. 0.]
 [1. 1. 1. ... 0. 0. 1.]
 [1. 0. 0. ... 0. 0. 0.]]




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,34.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,45.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5,0.0,0.0,0.0,0.0,8.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
6,1.0,0.0,0.0,1.0,22.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
7,0.0,0.0,0.0,0.0,10.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
8,0.0,0.0,1.0,0.0,28.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
9,1.0,0.0,0.0,1.0,62.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [19]:
X.shape

(7032, 26)

#### 1.4.3. Encoding the Dependent Variable Vector

In [20]:
# Defining the dependent variable vector
y = dataset.iloc[:, -1].values
print(y)

['No' 'No' 'Yes' ... 'No' 'Yes' 'No']


In [21]:
y = le.fit_transform(y)

print(y)

[0 0 1 ... 0 1 0]


### 1.5. Splitting the dataset into training and testing

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

### 1.6. Using the SMOTE (Synthetic Minority Oversampling Technique) To handle imbalanced classes

In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

This is a mild to moderate imbalance (not extreme like 95:5), so both oversampling and undersampling are viable options — but each has trade-offs.


Your dataset is not huge → oversampling won’t be too slow
Minority class is still decently sized (1869) → SMOTE can create diverse synthetic samples
Undersampling would discard too much useful data (removing from 5174 to ~1869)

Why I Didn't Use Undersampling (alone)?

The dataset is mild to moderately imbalanced for the target variable, so both oversampling and undersampling would've been viable options. However, there was a high likelihood that a lot of the the majority class would've been thrown away, which would've have created a risk of underfitting and poor generalization.

In [None]:
# Optional: Check class distribution after resampling

from collections import Counter
print(Counter(y_train_resampled))

In [23]:
print(X_train)

[[0. 1. 0. ... 0. 0. 0.]
 [1. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [1. 0. 0. ... 0. 0. 1.]
 [1. 1. 0. ... 0. 1. 0.]]


In [24]:
print(X_test)

[[1. 0. 1. ... 1. 0. 0.]
 [0. 1. 1. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [1. 0. 1. ... 1. 0. 0.]]


In [25]:
print(y_train)

[0 1 1 ... 1 1 1]


In [26]:
print(y_test)

[0 1 0 ... 0 0 0]
