In [17]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score

In [8]:
# Load the raw churn table.
df = pd.read_csv("../../03-classification/data/telco-customer-churn.csv")

# Normalise the header: lower-case names with underscores instead of spaces.
df.columns = df.columns.str.lower().str.replace(" ", "_")

# Apply the same normalisation to the values of every object-dtype column.
cat_cols = df.dtypes[df.dtypes == "object"].index
for col in cat_cols:
    normalised = df[col].str.lower()
    df[col] = normalised.str.replace(" ", "_")

# Entries that cannot be parsed as numbers become NaN and are then set to 0.
df["totalcharges"] = pd.to_numeric(df["totalcharges"], errors="coerce").fillna(0)

# Encode the target as 1 for "yes" and 0 for anything else.
df["churn"] = (df["churn"] == "yes").astype(int)

In [10]:
# Columns listed first in the feature set used by train() / predict().
num_vars = [
    "tenure",
    "monthlycharges",
    "totalcharges",
]

# Columns appended after num_vars when selecting features.
cat_vars = [
    "gender",
    "seniorcitizen",
    "partner",
    "dependents",
    "phoneservice",
    "multiplelines",
    "internetservice",
    "onlinesecurity",
    "onlinebackup",
    "deviceprotection",
    "techsupport",
    "streamingtv",
    "streamingmovies",
    "contract",
    "paperlessbilling",
    "paymentmethod",
]

In [13]:
# Hold out 20% of the rows as the test split; the seed keeps the split fixed.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

# Target arrays for each split, taken from the churn column.
y_full_train = df_full_train["churn"].to_numpy()
y_test = df_test["churn"].to_numpy()

In [21]:
def train(df, y, C=1.0):
    """Fit a DictVectorizer + LogisticRegression pipeline.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns listed in ``num_vars`` and ``cat_vars``;
        only those columns are used.
    y : array-like
        Targets passed to the pipeline's ``fit``.
    C : float, default 1.0
        Forwarded to ``LogisticRegression(C=...)``.

    Returns
    -------
    The fitted pipeline.
    """
    model = make_pipeline(
        DictVectorizer(),
        LogisticRegression(solver="liblinear", C=C),
    )

    # One dict per row, which is the input format DictVectorizer consumes.
    records = df[num_vars + cat_vars].to_dict(orient="records")

    # fit() returns the pipeline itself, so it can be returned directly.
    return model.fit(records, y)

In [29]:
def predict(df, pipeline):
    """Return ``pipeline.predict_proba`` for the feature columns of ``df``.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns listed in ``num_vars`` and ``cat_vars``.
    pipeline : fitted estimator
        Object exposing ``predict_proba`` that accepts a list of row dicts,
        e.g. the value returned by ``train``.

    Returns
    -------
    Whatever ``predict_proba`` returns (callers index column 1 of it).
    """
    feature_columns = num_vars + cat_vars
    records = df[feature_columns].to_dict(orient="records")
    return pipeline.predict_proba(records)

In [30]:
# Regularisation setting evaluated by this cross-validation run. It is passed
# to train() below so that editing it here actually changes the fitted models.
C = 1.0
n_splits = 5
scores = []

# Shuffled K-fold with a fixed seed so the folds are the same on every run.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=1)

for train_idx, val_idx in kf.split(df_full_train):
    # kf.split yields positional indices, hence .iloc.
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]

    y_train = df_train.churn.values
    y_val = df_val.churn.values

    # Pass C explicitly; without it train() always used its own default and
    # the value assigned above was ignored.
    pipeline = train(df_train, y_train, C=C)
    # Column 1 of the predict_proba output is used as the score for churn == 1.
    y_pred = predict(df_val, pipeline)[:, 1]

    score = roc_auc_score(y_val, y_pred)
    scores.append(score)

# Mean and standard deviation of the per-fold AUC.
print(f"Score: {np.mean(scores)} +-{np.std(scores)}")

Score: 0.8411775349981994 +-0.007389213851662656


In [41]:
# Refit on the whole training split using the C defined in the
# cross-validation cell, so the final model cannot drift from the setting
# that was evaluated there if that value is changed.
pipeline = train(df_full_train, y_full_train, C=C)
# Column 1 of the predict_proba output is used as the score for churn == 1.
y_pred = predict(df_test, pipeline)[:, 1]

# Single evaluation on the held-out test split.
score = roc_auc_score(y_test, y_pred)
print(f"Final model's score: {score}")

Final model's score: 0.8579400803839363


Checking the data (numeric ranges and category values) for Pydantic

In [43]:
# Summary statistics for the num_vars columns, then value frequencies for the
# cat_vars columns, printed in that order with a blank line after each.
summaries = [df[name].describe() for name in num_vars]
summaries += [df[name].value_counts() for name in cat_vars]

for summary in summaries:
    print(summary, end="\n\n")

count    7043.000000
mean       32.371149
std        24.559481
min         0.000000
25%         9.000000
50%        29.000000
75%        55.000000
max        72.000000
Name: tenure, dtype: float64

count    7043.000000
mean       64.761692
std        30.090047
min        18.250000
25%        35.500000
50%        70.350000
75%        89.850000
max       118.750000
Name: monthlycharges, dtype: float64

count    7043.000000
mean     2279.734304
std      2266.794470
min         0.000000
25%       398.550000
50%      1394.550000
75%      3786.600000
max      8684.800000
Name: totalcharges, dtype: float64

gender
male      3555
female    3488
Name: count, dtype: int64

seniorcitizen
0    5901
1    1142
Name: count, dtype: int64

partner
no     3641
yes    3402
Name: count, dtype: int64

dependents
no     4933
yes    2110
Name: count, dtype: int64

phoneservice
yes    6361
no      682
Name: count, dtype: int64

multiplelines
no                  3390
yes                 2971
no_phone_service  