In [657]:
import pandas as pd
import numpy as np

### Step 1: Data Analysis

In [658]:
df = pd.read_csv('data/customer_churn_data.csv')
print(f'Number of rows: %d\nNumber of columns: %d' %df.shape)
df.head()

Number of rows: 1000
Number of columns: 10


Unnamed: 0,CustomerID,Age,Gender,Tenure,MonthlyCharges,ContractType,InternetService,TotalCharges,TechSupport,Churn
0,1,49,Male,4,88.35,Month-to-Month,Fiber Optic,353.4,Yes,Yes
1,2,43,Male,0,36.67,Month-to-Month,Fiber Optic,0.0,Yes,Yes
2,3,51,Female,2,63.79,Month-to-Month,Fiber Optic,127.58,No,Yes
3,4,60,Female,8,102.34,One-Year,DSL,818.72,Yes,Yes
4,5,42,Male,32,69.01,Month-to-Month,,2208.32,No,Yes


In [659]:
print('Missing values per column:')
df.isna().sum()

Missing values per column:


CustomerID           0
Age                  0
Gender               0
Tenure               0
MonthlyCharges       0
ContractType         0
InternetService    297
TotalCharges         0
TechSupport          0
Churn                0
dtype: int64

In [660]:
# Analyze the InternetService row
print(df['InternetService'].value_counts())

column_to_compare = 'TotalCharges'
print()
print(df.groupby(['InternetService'])[column_to_compare].mean())

InternetService
Fiber Optic    395
DSL            308
Name: count, dtype: int64

InternetService
DSL            1390.197013
Fiber Optic    1408.323924
Name: TotalCharges, dtype: float64


### Step 2: Handle missing values
#### Option A: Delete InternetService column

In [661]:
df_no_internet_column = df.dropna(axis=1)
df_no_internet_column.shape

(1000, 9)

#### Option B: Delete InternetService Nan rows

In [662]:
df_no_internet_rows = df.dropna()
df_no_internet_rows.shape

(703, 10)

#### Option C: Fill Nan

In [663]:
df_filled_internet = df.ffill() # Arbitrarily used 'ffill', also 'bfill' could be used
df_filled_internet.head()

Unnamed: 0,CustomerID,Age,Gender,Tenure,MonthlyCharges,ContractType,InternetService,TotalCharges,TechSupport,Churn
0,1,49,Male,4,88.35,Month-to-Month,Fiber Optic,353.4,Yes,Yes
1,2,43,Male,0,36.67,Month-to-Month,Fiber Optic,0.0,Yes,Yes
2,3,51,Female,2,63.79,Month-to-Month,Fiber Optic,127.58,No,Yes
3,4,60,Female,8,102.34,One-Year,DSL,818.72,Yes,Yes
4,5,42,Male,32,69.01,Month-to-Month,DSL,2208.32,No,Yes


### Step 3: Handle categorial features

##### OneHot Encoder to create a new labeled dataframe

In [664]:
from sklearn.preprocessing import OneHotEncoder

oh_enc = OneHotEncoder(sparse_output=False) # Set sparse as False to make sure we get a dense matrix
cat_cols = ['Gender', 'ContractType', 'InternetService', 'TechSupport']
cat_cols_reduced = ['Gender', 'ContractType', 'TechSupport'] # For the DF with no InternetService column

# Create new Dataframes with OH encoded columns
encoded_no_internet_column = pd.DataFrame(oh_enc.fit_transform(df_no_internet_column[cat_cols_reduced]), 
                                          columns=oh_enc.get_feature_names_out(cat_cols_reduced))

encoded_no_internet_rows = pd.DataFrame(oh_enc.fit_transform(df_no_internet_rows[cat_cols]),
                                        columns=oh_enc.get_feature_names_out(cat_cols))

encoded_filled_internet = pd.DataFrame(oh_enc.fit_transform(df_filled_internet[cat_cols]),
                                       columns=oh_enc.get_feature_names_out(cat_cols))

##### Concatenate the OH labeled dataframe with the original one

In [665]:
df_no_internet_column.drop(columns=cat_cols_reduced, axis=1, inplace=True)
df_no_internet_column = pd.concat([df_no_internet_column, encoded_no_internet_column], axis=1)

df_no_internet_rows.drop(columns=cat_cols, axis=1, inplace=True)
encoded_no_internet_rows.index = df_no_internet_rows.index # Need to match encoded indexes with the original DF indexes
df_no_internet_rows = pd.concat([df_no_internet_rows, encoded_no_internet_rows], axis=1)

df_filled_internet.drop(columns=cat_cols, axis=1, inplace=True)
df_filled_internet = pd.concat([df_filled_internet, encoded_filled_internet], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_internet_column.drop(columns=cat_cols_reduced, axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_internet_rows.drop(columns=cat_cols, axis=1, inplace=True)


##### Label Encoder for the target [Yes --> 1; No --> 0]

In [666]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df_no_internet_column['Churn'] = le.fit_transform(df_no_internet_column['Churn'])
df_no_internet_rows['Churn'] = le.fit_transform(df_no_internet_rows['Churn'])
df_filled_internet['Churn'] = le.fit_transform(df_filled_internet['Churn'])

### Step 4: Prepare data for training and testing

##### Separate the target and features

In [667]:
y_col = df_no_internet_column.Churn
y_row = df_no_internet_rows.Churn
y_fill = df_filled_internet.Churn

df_no_internet_column.drop(['Churn', 'CustomerID'], axis=1, inplace=True)
df_no_internet_rows.drop(['Churn', 'CustomerID'], axis=1, inplace=True)
df_filled_internet.drop(['Churn', 'CustomerID'], axis=1, inplace=True)

X_col = df_no_internet_column.copy()
X_row = df_no_internet_rows.copy()
X_fill = df_filled_internet.copy()

##### Standarize the data (except the OneHot encoded)

In [668]:
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
cols_std = ['Age', 'Tenure', 'MonthlyCharges', 'TotalCharges']

X_col[cols_std] = std_scaler.fit_transform(X_col[cols_std])
X_row[cols_std] = std_scaler.fit_transform(X_row[cols_std])
X_fill[cols_std] = std_scaler.fit_transform(X_fill[cols_std])

### Step 5: Select the model, fit, predict and evaluate

In [669]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

rf_model = RandomForestClassifier(n_estimators=100, random_state=2)
skf = StratifiedKFold(n_splits=10, shuffle=True)

# Saves the accuracy score for each split and returns the mean
def get_accuracy(model, X_data, y_data):
    train_acc = np.array([])
    test_acc = np.array([])
    for i, (train_index, test_index) in enumerate(skf.split(X_data, y_data)):
        X_train, X_test = X_data.iloc[train_index], X_data.iloc[test_index]
        y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]

        model.fit(X_train, y_train)
        train_acc = np.append(
            train_acc,
            accuracy_score(model.predict(X_train), y_train)
        )
        test_acc = np.append(
            test_acc,
            accuracy_score(model.predict(X_test), y_test)
        )
    return train_acc.mean(), test_acc.mean()

In [670]:
tr_acc_col, tst_acc_col = get_accuracy(rf_model, X_col, y_col)

print('Accuracy for data without Internet Service column:')
print(f'Train accuracy: {tr_acc_col*100:.2f}%\nTest accuracy: {tst_acc_col*100:.2f}%')

Accuracy for data without Internet Service column:
Train accuracy: 100.00%
Test accuracy: 99.70%


In [671]:
tr_acc_row, tst_acc_row = get_accuracy(rf_model, X_row, y_row)

print('Accuracy for data without Internet Service column:')
print(f'Train accuracy: {tr_acc_row*100:.2f}%\nTest accuracy: {tst_acc_row*100:.2f}%')

Accuracy for data without Internet Service column:
Train accuracy: 100.00%
Test accuracy: 99.72%


In [672]:
tr_acc_fill, tst_acc_fill = get_accuracy(rf_model, X_fill, y_fill)

print('Accuracy for data without Internet Service column:')
print(f'Train accuracy: {tr_acc_fill*100:.2f}%\nTest accuracy: {tst_acc_fill*100:.2f}%')

Accuracy for data without Internet Service column:
Train accuracy: 100.00%
Test accuracy: 99.90%
