## Data preparation

Use the same data set and model as in the previous lesson.

### Libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [15]:
# Location of the Telco churn CSV.
# NOTE(review): this is a machine-specific absolute path; change this single
# constant to point at your local copy of the file before running.
DATA_PATH = "C:\\Users\\Geral\\Desktop\\VSCODE\\ml_zoomcamp\\03_classification\\Telco.csv"
df = pd.read_csv(DATA_PATH)

In [16]:
# Peek at the first five raw rows to check that the file loaded as expected.
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Data pre-processing

- Make the column names consistent (lowercase, spaces replaced by underscores)

In [17]:
# Normalise the header: lowercase every column name, then swap spaces for
# underscores.
lowercase_columns = df.columns.str.lower()
df.columns = lowercase_columns.str.replace(" ", "_")

- Get the categorical features

In [18]:
# Names of every column whose dtype is `object`; these are the columns the
# string clean-up in the next cell is applied to.
categorical = [name for name, dtype in df.dtypes.items() if dtype == object]

- Make the values in the categorical columns consistent (lowercase, spaces replaced by underscores)

In [19]:
# Apply the same normalisation used for the column names to the values of
# each object-typed column: lowercase first, then spaces -> underscores.
for column in categorical:
    lowered = df[column].str.lower()
    df[column] = lowered.str.replace(" ", "_")

- Change the column totalcharges to numeric

In [20]:
# Parse totalcharges as numbers; errors="coerce" turns any value that cannot
# be parsed into NaN instead of raising.
df["totalcharges"] = pd.to_numeric(df["totalcharges"], errors="coerce")

- Check which columns have missing values, then fill them with 0

In [21]:
# Count nulls per column once, then show only the columns that have any.
missing_counts = df.isnull().sum()
missing_counts[missing_counts > 0]

totalcharges    11
dtype: int64

In [22]:
# Replace the NaNs introduced by the numeric coercion with 0.
df["totalcharges"] = df["totalcharges"].fillna(0)

- Convert the `churn` column to integer (1 = "yes", 0 = otherwise)

In [23]:
# Encode the target as 0/1: rows whose churn value is "yes" become 1.
# Caution: this cell is not idempotent. Running it a second time compares the
# already-converted integers with "yes" and sets every row to 0.
is_churned = df["churn"] == "yes"
df["churn"] = is_churned.astype(int)

In [24]:
# Inspect the first five rows again to confirm the clean-up steps above.
df.head(n=5)

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,0
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,0
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,1
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,0
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,1


### Model - Logistic Regression

- Split the data set into train (60%), validation (20%) and test (20%) sets

In [25]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle reproducible.
df_train_full, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

In [26]:
# Carve the validation set out of the remaining 80%: 0.25 * 80% = 20% of the
# full data set, which leaves 60% for training.
df_train, df_val = train_test_split(
    df_train_full,
    test_size=0.25,
    random_state=1,
)

- Reset the indexes

In [27]:
# Give each split a fresh 0..n-1 index; drop=True discards the shuffled
# original index instead of keeping it as a column.
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

- Get the y vectors and delete the target variable from the data sets

In [28]:
# Pull the target column out of each split before it is removed from the
# feature frames.
y_train, y_val, y_test = (
    split["churn"] for split in (df_train, df_val, df_test)
)

In [29]:
# Remove the target from every feature frame (in place) so it cannot be used
# as an input feature by mistake.
for split in (df_train, df_val, df_test):
    del split["churn"]

- Define the lists of numerical and categorical columns, leaving `customerid` out of the categorical list

In [31]:
# Numeric feature columns fed to the model.
numerical = [
    "tenure",
    "monthlycharges",
    "totalcharges",
]

In [32]:
# Categorical feature columns fed to the model. The customer identifier is
# intentionally not listed here.
categorical = [
    "gender",
    "seniorcitizen",
    "partner",
    "dependents",
    "phoneservice",
    "multiplelines",
    "internetservice",
    "onlinesecurity",
    "onlinebackup",
    "deviceprotection",
    "techsupport",
    "streamingtv",
    "streamingmovies",
    "contract",
    "paperlessbilling",
    "paymentmethod",
]

- Convert the training set to a list of dictionaries (one per row)

In [33]:
# Keep only the selected feature columns and turn every training row into a
# {column: value} dictionary.
train_features = df_train[categorical + numerical]
train_dicts = train_features.to_dict(orient="records")

- Get the feature matrix

In [34]:
# Learn the feature mapping from the training dictionaries and build the
# training matrix in one step; sparse=False returns a dense NumPy array.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

- Convert the validation set to a dictionary and get the matrix

In [35]:
# Encode the validation rows with the vectorizer fitted on the training set
# (transform only, no re-fitting) so both matrices share the same columns.
val_features = df_val[categorical + numerical]
val_dicts = val_features.to_dict(orient="records")
X_val = dv.transform(val_dicts)

- Train the model

In [36]:
# Logistic regression with scikit-learn's default settings, trained on the
# training matrix. fit() returns the estimator, so the cell echoes it.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

LogisticRegression()

- Apply the model to the validation set and get the predicted churn probabilities

In [37]:
# predict_proba returns one column per class; keep the second column, i.e.
# the probability of the positive class (churn = 1, given the 0/1 target).
val_probabilities = model.predict_proba(X_val)
y_pred = val_probabilities[:, 1]

- Evaluate the accuracy using a churn decision threshold of 0.5

In [38]:
# Predict churn when the estimated probability exceeds the threshold, then
# report accuracy as the share of validation rows where the decision matches
# the true label.
decision_threshold = 0.5
churn_decision = y_pred > decision_threshold
(y_val == churn_decision).mean()

0.8034066713981547

## Accuracy and Dummy model

**Accuracy**: measures the fraction of correct predictions, i.e. the number of correct predictions divided by the total number of predictions.