## Customer Churn Prediction Model

Using the [Telco customer churn data](https://www.kaggle.com/code/mechatronixs/telco-churn-prediction-feature-engineering-eda/data) from kaggle, train a machine learning model to predict customer churn.

In [1]:
import pandas as pd

In [51]:
# get training data
train = pd.read_csv("./data/training_data.csv")
# drop customer ID: not a feature for training 
train.drop("customerID", axis=1, inplace=True)

# getting validation data
val = pd.read_csv("./data/validation_data.csv")

In [22]:
from sklearn.preprocessing import LabelEncoder

In [52]:
le = LabelEncoder()

le.fit(train['gender'])

#le.transform(train['gender'])

transfromed_column = le.transform(train['gender'])
transfromed_column

array([1, 1, 1, ..., 0, 0, 0])

In [64]:
train

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Male,0,No,No,5,Yes,No,DSL,No,No,Yes,Yes,Yes,Yes,Month-to-month,No,Bank transfer (automatic),75.15,392.65,No
1,Male,0,Yes,No,66,Yes,Yes,DSL,No,No,No,Yes,No,Yes,Month-to-month,No,Electronic check,63.85,4264.6,No
2,Male,0,Yes,Yes,42,Yes,No,DSL,No,No,Yes,Yes,Yes,Yes,One year,No,Electronic check,73.15,3088.25,No
3,Male,0,No,No,19,Yes,Yes,DSL,No,No,No,No,Yes,Yes,Month-to-month,No,Mailed check,69.60,1394.55,No
4,Male,0,No,No,59,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Bank transfer (automatic),20.20,1192.3,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277,Male,0,No,No,1,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Bank transfer (automatic),20.20,20.2,No
5278,Male,0,Yes,No,2,Yes,No,Fiber optic,No,No,Yes,No,No,No,Month-to-month,No,Electronic check,76.40,151.8,Yes
5279,Female,0,Yes,No,58,Yes,Yes,DSL,Yes,No,No,Yes,No,Yes,One year,Yes,Electronic check,68.40,3972.25,No
5280,Female,0,No,No,1,Yes,No,Fiber optic,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,75.70,75.7,Yes


In [67]:
train.MonthlyCharges.describe

<bound method NDFrame.describe of 0       75.15
1       63.85
2       73.15
3       69.60
4       20.20
        ...  
5277    20.20
5278    76.40
5279    68.40
5280    75.70
5281    84.05
Name: MonthlyCharges, Length: 5282, dtype: float64>

In [68]:
train.TotalCharges.describe

<bound method NDFrame.describe of 0        392.65
1        4264.6
2       3088.25
3       1394.55
4        1192.3
         ...   
5277       20.2
5278      151.8
5279    3972.25
5280       75.7
5281     333.55
Name: TotalCharges, Length: 5282, dtype: object>

In [53]:
train.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [56]:
categorical_columns = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']

In [57]:
train[categorical_columns]

Unnamed: 0,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
0,Male,No,No,Yes,No,DSL,No,No,Yes,Yes,Yes,Yes,Month-to-month,No,Bank transfer (automatic),No
1,Male,Yes,No,Yes,Yes,DSL,No,No,No,Yes,No,Yes,Month-to-month,No,Electronic check,No
2,Male,Yes,Yes,Yes,No,DSL,No,No,Yes,Yes,Yes,Yes,One year,No,Electronic check,No
3,Male,No,No,Yes,Yes,DSL,No,No,No,No,Yes,Yes,Month-to-month,No,Mailed check,No
4,Male,No,No,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Bank transfer (automatic),No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277,Male,No,No,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Bank transfer (automatic),No
5278,Male,Yes,No,Yes,No,Fiber optic,No,No,Yes,No,No,No,Month-to-month,No,Electronic check,Yes
5279,Female,Yes,No,Yes,Yes,DSL,Yes,No,No,Yes,No,Yes,One year,Yes,Electronic check,No
5280,Female,No,No,Yes,No,Fiber optic,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,Yes


In [58]:
column_mapper = {}

for column in categorical_columns:
    le = LabelEncoder()
    le.fit(train.loc[:, column])
    column_mapper[column] = le

In [27]:
column_mapper

{'gender': LabelEncoder(),
 'Partner': LabelEncoder(),
 'Dependents': LabelEncoder(),
 'PhoneService': LabelEncoder(),
 'MultipleLines': LabelEncoder(),
 'InternetService': LabelEncoder(),
 'OnlineSecurity': LabelEncoder(),
 'OnlineBackup': LabelEncoder(),
 'DeviceProtection': LabelEncoder(),
 'TechSupport': LabelEncoder(),
 'StreamingTV': LabelEncoder(),
 'StreamingMovies': LabelEncoder(),
 'Contract': LabelEncoder(),
 'PaperlessBilling': LabelEncoder(),
 'PaymentMethod': LabelEncoder(),
 'Churn': LabelEncoder()}

In [None]:
customer_dict = {
    "col1": 1,
    "col2": 2
}

pd.DataFrame

In [28]:
def pre_process_data(df, label_encoder_dict):
    df_out = df.copy()
    df_out.replace(" ", 0, inplace=True)
    df_out.loc[:, 'TotalCharges'] = pd.to_number(df)

    if 'customerID' in df_out.columns:
        df_out.drop('customerID', axis=1, inplace=True)

    for column, le in label_encoder_dict.items():
        df_out.loc[:, column] = le.transform(df_out.loc[:, column])

    return df_out


In [31]:
train_processed = pre_process_data(train, column_mapper)
val_processed = pre_process_data(val, column_mapper)


In [32]:
x_train = train_processed.drop('Churn', axis=1)
y_train = train_processed.loc[:, 'Churn'].astype(int)

y_train

0       0
1       0
2       0
3       0
4       0
       ..
5277    0
5278    1
5279    0
5280    1
5281    1
Name: Churn, Length: 5282, dtype: int64

In [33]:
from sklearn.linear_model import LogisticRegression


In [35]:
model = LogisticRegression(max_iter=1000)

model.fit(x_train, y_train)

In [38]:
pd.DataFrame(model.coef_, columns=x_val.columns)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,-0.031459,0.294731,0.093503,-0.208003,-0.065247,-0.776583,0.091777,0.107447,-0.298611,-0.190368,-0.118745,-0.230823,0.018438,0.025645,-0.718365,0.341511,0.022749,0.020424,0.000366


In [36]:
x_val = val_processed.drop('Churn', axis=1)
y_val = val_processed.loc[:, 'Churn'].astype(int)

predictions = model.predict(x_val)
predictions

array([1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,

In [46]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_val, predictions)
print(f"Classification: {round(accuracy, 3)}")

Classification: 0.834


In [69]:
import pickle

with open("./models/churn_prediction_model.pkl", "wb") as pickeler:
    pickle.dump(model, pickeler)

with open("./models/churn_prediction_label_encoder.pkl", "wb") as pickeler:
    pickle.dump(column_mapper, pickeler)

In [63]:
# train['SeniorCitizen'].value.count()

In [59]:
# getting validation data

val = pd.read_csv("./data/validation_data.csv")

In [None]:
customer_row = {
    'gender': gender,
    'SeniorCitizen': SeniorCitizen
}