# Data Preprocessing

### Importing Libraries



In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Loading the new Dataset

In this step, we will load the modified dataset that we obtained from the EDA step.

In [12]:
# Read the cleaned churn data produced by the EDA step and print a
# plain-text preview of its first rows.
clean_df = pd.read_csv(filepath_or_buffer='../data/clean_churn.csv')
preview_rows = clean_df.head()
print(preview_rows)

   gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  Female              0     Yes         No       1           No   
1    Male              0      No         No      34          Yes   
2    Male              0      No         No       2          Yes   
3    Male              0      No         No      45           No   
4  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity OnlineBackup  ...  \
0  No phone service             DSL             No          Yes  ...   
1                No             DSL            Yes           No  ...   
2                No             DSL            Yes          Yes  ...   
3  No phone service             DSL            Yes           No  ...   
4                No     Fiber optic             No           No  ...   

  TechSupport StreamingTV StreamingMovies        Contract PaperlessBilling  \
0          No          No              No  Month-to-month              Yes   
1 

### Checking missing or null values

In [13]:
# Count the missing (NaN) entries in every column and print the
# per-column totals.
miss = clean_df.isna().sum(axis=0)
print(miss)


gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
Churn_Encoded        0
dtype: int64


### Filling the null values

Since the 'TotalCharges' column contains null values, we fill them using the forward fill method. 

In [14]:
# Forward-fill: each null is replaced by the last non-null value above it
# in the same column.
# `DataFrame.ffill()` is used instead of `fillna(method='ffill')`, whose
# `method` argument is deprecated in pandas 2.1+ and removed in pandas 3.
# Rebinding the name (rather than `inplace=True`) keeps the cell safe to
# re-run and avoids mutating a frame that an earlier cell displayed.
clean_df = clean_df.ffill()

#### Missing Value Imputation - Forward and Backward fill methods

These two methods fill missing values using neighbouring rows. The forward fill method replaces each null with the most recent preceding non-null value in the same column. The backward fill method replaces each null with the next non-null value that follows it. 

### Encoding categorical Variables 

In this step, we will convert the categorical values into numerical values. This step is essential when building an ML model because most ML algorithms require numerical inputs. Here we use one-hot encoding to convert the categorical values into numerical values. 

In [15]:
# Gather every object/category column, then swap each one for indicator
# columns. `drop_first=True` omits the first level of each variable so the
# indicators are not perfectly collinear.
# Note: the string 'Churn' label is encoded here too (as `Churn_Yes`); it is
# removed from the feature matrix when X is built later in the notebook.
categorical_frame = clean_df.select_dtypes(include=['object', 'category'])
new_catcols = list(categorical_frame.columns)
clean_df = pd.get_dummies(data=clean_df, columns=new_catcols, drop_first=True)
clean_df.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn_Encoded,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_Yes
0,0,1,29.85,29.85,0,0,1,0,0,1,...,0,0,0,0,0,1,0,1,0,0
1,0,34,56.95,1889.5,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
2,0,2,53.85,108.15,1,1,0,0,1,0,...,0,0,0,0,0,1,0,0,1,1
3,0,45,42.3,1840.75,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
4,0,2,70.7,151.65,1,0,0,0,1,0,...,0,0,0,0,0,1,0,1,0,1


### Feature Scaling


In [16]:
# Candidate columns for standardisation: everything stored as int64/float64.
num_cols = clean_df.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Do not scale the target.
# After one-hot encoding the original 'Churn' column no longer exists; the
# numeric target is 'Churn_Encoded' (int64), which the dtype filter above
# picks up. Checking only for 'Churn' therefore let the target be
# standardised, so `y` stopped being a 0/1 class label. Both names are
# excluded here so the cell is correct whichever label column is present.
for target_col in ('Churn', 'Churn_Encoded'):
    if target_col in num_cols:
        num_cols.remove(target_col)

# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set statistics influence the transform.
# Consider fitting on the training split only.
Scaler = StandardScaler()
clean_df[num_cols] = Scaler.fit_transform(clean_df[num_cols])

The StandardScaler rescales each feature so that its mean is 0 and its standard deviation is 1. This step is important when training scale-sensitive models such as KNN, SVM and logistic regression. 

### Split the Features and Target



In [19]:
# Churn_Encoded is the prediction target.
# Both label columns are kept out of the feature matrix: Churn_Yes is the
# one-hot encoding of the original Churn column and would leak the target.
label_columns = ['Churn_Encoded', 'Churn_Yes']
X = clean_df.drop(columns=label_columns)
y = clean_df['Churn_Encoded']


In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (80% remain for training).
# random_state fixes the shuffle so the split is reproducible; stratifying
# on y keeps the churn / non-churn ratio the same in both partitions.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each partition as a sanity check.
for label, features, target in (
    ("Training set:", X_train, y_train),
    ("Test set:", X_test, y_test),
):
    print(label, features.shape, target.shape)


Training set: (5634, 30) (5634,)
Test set: (1409, 30) (1409,)


In [21]:
# Write each partition to its own CSV in the current working directory.
# index=False omits the row index from the files.
split_outputs = {
    "X_train.csv": X_train,
    "X_test.csv": X_test,
    "y_train.csv": y_train,
    "y_test.csv": y_test,
}
for csv_name, partition in split_outputs.items():
    partition.to_csv(csv_name, index=False)
