# 4. Evaluation Metrics for Classification

In [62]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [63]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [64]:
# Load the raw Telco customer-churn dataset
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Normalize column names: lowercase, spaces -> underscores
lowercased_names = df.columns.str.lower()
df.columns = lowercased_names.str.replace(" ", "_")

# Names of every column currently stored with the object dtype
categorical_columns = [name for name, dtype in df.dtypes.items() if dtype == "object"]

# Apply the same normalization to the values of those columns
for col in categorical_columns:
    lowercased_values = df[col].str.lower()
    df[col] = lowercased_values.str.replace(" ", "_")

# Convert totalcharges to numbers; entries that cannot be parsed are coerced
# to NaN and then filled with 0
total_charges = pd.to_numeric(df["totalcharges"], errors="coerce")
df["totalcharges"] = total_charges.fillna(0)

# Encode the target as an integer: 1 where churn is "yes", 0 otherwise
df["churn"] = (df["churn"] == "yes").astype(int)

In [65]:
# Hold out 20% of the rows as the test set
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

# Split the remaining rows again: a 0.2 / 0.8 share of them becomes validation
df_train, df_val = train_test_split(
    df_full_train, test_size=(0.2 / 0.8), random_state=1
)

# Name of the label column
target = "churn"

# Discard the shuffled indices in favor of fresh default ones
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Move the label out of each frame: pop returns the column and removes it,
# so the frames keep only feature columns and the y vectors are plain arrays
y_train = df_train.pop(target).values
y_val = df_val.pop(target).values
y_test = df_test.pop(target).values

In [66]:
# Numeric feature columns, selected by name. Picking them by dtype and slicing
# off the first/last entries only works while the column order and dtypes stay
# exactly as they are, and fails silently otherwise.
numerical = ["tenure", "monthlycharges", "totalcharges"]
print(numerical)

# Every other column is treated as categorical (seniorcitizen included, as
# before), except the row identifier and the target, which are not features.
non_feature_columns = {"customerid", "churn"}
categorical = [
    col for col in df_full_train.columns
    if col not in numerical and col not in non_feature_columns
]
print(categorical)

['tenure', 'monthlycharges', 'totalcharges']
['gender', 'seniorcitizen', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod']


In [68]:
# Vectorizer that maps record dicts to a dense (non-sparse) feature matrix
dv = DictVectorizer(sparse=False)

# One dict per training row, restricted to the selected feature columns
train_dicts = df_train[[*categorical, *numerical]].to_dict(orient="records")

# Learn the feature mapping from the training rows and encode them
X_train = dv.fit_transform(train_dicts)

# Fit a logistic regression with default settings on the encoded training data
model = LogisticRegression()
model.fit(X_train, y_train)

In [69]:
# Encode the validation rows with the vectorizer that was fitted on the
# training data. Use transform, not fit_transform: refitting here would rebuild
# the feature mapping from the validation rows, so the columns of X_val could
# stop lining up with the columns the model was trained on.
val_dicts = df_val[categorical + numerical].to_dict(orient="records")
X_val = dv.transform(val_dicts)

# predict_proba returns one column per class: the 1st is the probability of
# class 0, the 2nd of class 1 (churn)
y_pred = model.predict_proba(X_val)[:, 1]

# Predict churn when the probability is at least 0.5
churn_decision = (y_pred >= 0.5)

# Accuracy: share of validation rows where the decision matches the label
(y_val == churn_decision).mean()

0.8034066713981547

## 4.2 Accuracy and dummy model