In [1]:
# Feature Engineering 

import pandas as pd

dataset = pd.read_csv('Cleaned_Dataset.CSV')  # Getting the data from the cleaned dataset csv file

print(dataset.shape)
dataset.head()

(1500, 22)


Unnamed: 0,customer_id,customer_name,gender,senior_citizen,partner,dependents,tenure,contract,paperless_billing,payment_method,...,phone_service,multiple_lines,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,churn
0,CUST0001,John doe,male,0,yes,no,64,month to month,yes,electronic check,...,no,no,dsl,,,,,,,yes
1,CUST0002,Lauren Vaughan,female,1,,,25,month to month,yes,,...,no,,none,,yes,,,,,
2,CUST0003,Mrs. Elizabeth Turner DDS,male,1,,no,61,one year,no,electronic check,...,,no,none,,,no,,,no,
3,CUST0004,Tiffany Johnson,female,1,,yes,49,,yes,,...,no,no,dsl,yes,no,,,,,yes
4,CUST0005,Dr. Janice Berg,female,1,no,yes,47,,no,,...,no,,,,,no,yes,,no,yes


In [2]:
# Checking the null valuses count
dataset.isnull().sum()

customer_id            0
customer_name          0
gender               298
senior_citizen         0
partner              292
dependents           729
tenure                 0
contract             312
paperless_billing    282
payment_method       309
monthly_charges        0
total_charges          0
phone_service        493
multiple_lines       373
internet_service     290
online_security      485
online_backup        469
device_protection    511
tech_support         512
streaming_tv         738
streaming_movies     730
churn                514
dtype: int64

In [3]:
# Droping the unwanted columns

dataset.drop(['customer_id', 'customer_name'], axis=1, inplace=True)

In [4]:
# Identify categorical columns
cat_cols = dataset.select_dtypes(include='object').columns

# Fill NaN with 'unknown'
dataset[cat_cols] = dataset[cat_cols].fillna('unknown')

# Check again
dataset.isnull().sum()

gender               0
senior_citizen       0
partner              0
dependents           0
tenure               0
contract             0
paperless_billing    0
payment_method       0
monthly_charges      0
total_charges        0
phone_service        0
multiple_lines       0
internet_service     0
online_security      0
online_backup        0
device_protection    0
tech_support         0
streaming_tv         0
streaming_movies     0
churn                0
dtype: int64

In [5]:
# one-hot encoding on your categorical variables for converting to 0 and 1 for Machine Learning modeling
dataset = pd.get_dummies(dataset, drop_first=True) # drop_first=True to remove the churn_no 
print(dataset)

      senior_citizen  tenure  monthly_charges  total_charges  gender_male  \
0                  0      64            67.59        1000.50         True   
1                  1      25            67.59        1000.50        False   
2                  1      61            67.59        1000.50         True   
3                  1      49           115.37        2698.62        False   
4                  1      47            67.59        5180.88        False   
...              ...     ...              ...            ...          ...   
1495               0      67            67.59        1000.50         True   
1496               1       8            47.97        1000.50         True   
1497               0       7            67.59        1000.50        False   
1498               1      34            57.74        1000.50        False   
1499               1      13            67.59        1000.50         True   

      gender_unknown  partner_unknown  partner_yes  dependents_unknown  \
0

In [6]:
from sklearn.preprocessing import StandardScaler

# Only scale numerical columns for z score
numeric_cols = ['tenure', 'monthly_charges', 'total_charges']
scaler = StandardScaler()   
dataset[numeric_cols] = scaler.fit_transform(dataset[numeric_cols])

In [7]:
print(dataset.info())
print(dataset.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 42 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   senior_citizen                          1500 non-null   int64  
 1   tenure                                  1500 non-null   float64
 2   monthly_charges                         1500 non-null   float64
 3   total_charges                           1500 non-null   float64
 4   gender_male                             1500 non-null   bool   
 5   gender_unknown                          1500 non-null   bool   
 6   partner_unknown                         1500 non-null   bool   
 7   partner_yes                             1500 non-null   bool   
 8   dependents_unknown                      1500 non-null   bool   
 9   dependents_yes                          1500 non-null   bool   
 10  contract_month-to-month                 1500 non-null   bool

In [8]:
# Exporting a CSV file for the cleaned dataset
dataset.to_csv('Model_Buliding_Dataset.CSV', index=False)