In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import os

from dotenv import load_dotenv

# Run python-dotenv's loader before the environment lookups below.
load_dotenv()

# Variables (each one is None when the environment variable is not set):
PROJECT_ID = os.environ.get('PROJECT_ID')
BUCKET_NAME = os.environ.get('BUCKET_NAME')
USER = os.environ.get('USER')
LOCATION = os.environ.get('LOCATION')
SERVICE_ACCOUNT = os.environ.get('SERVICE_ACCOUNT')

# Model Deployment

In this notebook, we test some modifications to the code from the "Churn Prediction with Vertex" notebook, adding the validation and evaluation we implemented in the "Eval Classification" notebook. We also make the edits needed to prepare the model for deployment (based on the [custom prediction lab](https://codelabs.developers.google.com/vertex-cpr-sklearn#4)).

After the tests, we formalize this code in the "src/theory/04_deployment" directory, where we store our operational training code (Vertex pipelines).

In [None]:
# Numerical columns (passed through to the model without encoding).
num_features = ['tenure', 'monthlycharges']

# Categorical columns (one-hot encoded by the training pipeline).
cat_features = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

# Target column.
label = "churn"

# Hyperparameters: C is handed to LogisticRegression, kfold_splits to KFold.
C = 1
kfold_splits = 5

### 1. Getting the split data

In [None]:
import pandas as pd

# Both pickles sit in the same folder, so the shared prefix is written once.
split_data_dir = (
    "gs://dz-d-stg-us-ml-zoomcamp/04_deployment/pipeline_root/532579765435"
    "/pipeline-log-reg-20231014041425/split-data_1606769530544062464"
)
df_train_path = f"{split_data_dir}/out_df_train.pkl"
df_test_path = f"{split_data_dir}/out_df_test.pkl"

# NOTE(review): unpickling can execute arbitrary code; only read pickles from a trusted bucket.
df_train, df_test = (
    pd.read_pickle(pickle_path) for pickle_path in (df_train_path, df_test_path)
)

In [None]:
# Preview the first rows of the training split to sanity-check the load.
df_train.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,5442-pptjy,male,0,yes,yes,12,yes,no,no,no internet service,...,no internet service,no internet service,no internet service,no internet service,two year,no,mailed check,19.7,258.35,0
1,6261-rcvns,female,0,no,no,42,yes,no,dsl,yes,...,yes,yes,no,yes,one year,no,credit card (automatic),73.9,3160.55,1
2,2176-osjuv,male,0,yes,no,71,yes,yes,dsl,yes,...,no,yes,no,no,two year,no,bank transfer (automatic),65.15,4681.75,0
3,6161-erdgd,male,0,yes,yes,71,yes,yes,dsl,yes,...,yes,yes,yes,yes,one year,no,electronic check,85.45,6300.85,0
4,2364-ufrom,male,0,no,no,30,yes,no,dsl,yes,...,no,yes,yes,no,one year,no,electronic check,70.4,2044.75,0


### 2. Training code with preprocessing and K-Folds evaluation

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline

# Separate the target column from the model inputs (categorical + numerical features).
feature_columns = cat_features + num_features
y_train = df_train.loc[:, label].copy()
x_train = df_train.loc[:, feature_columns].copy()

def train(x_train, y_train, cat_features):
    """Fit a one-hot-encoding + logistic-regression pipeline.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Feature frame. Must contain every column named in ``cat_features``;
        all other columns are passed to the model unchanged.
    y_train : array-like
        Target values aligned with the rows of ``x_train``.
    cat_features : list of str
        Names of the columns to one-hot encode.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline (column transformer followed by LogisticRegression).

    Notes
    -----
    The regularisation strength is read from the module-level ``C``.
    """
    encoder_kwargs = dict(
        drop='first',            # drop the first category of every encoded feature
        handle_unknown='error',  # raise when transform meets a category unseen during fit
    )
    try:
        # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
        # the older `sparse` keyword was removed in 1.4.
        ohe = OneHotEncoder(sparse_output=False, **encoder_kwargs)
    except TypeError:
        # Older scikit-learn releases only accept the original `sparse` keyword.
        ohe = OneHotEncoder(sparse=False, **encoder_kwargs)

    column_transform = make_column_transformer(
        (ohe, cat_features),
        remainder='passthrough',  # keep the non-categorical columns as they are
    )
    regr = LogisticRegression(C=C, max_iter=1000)
    my_pipeline = make_pipeline(column_transform, regr)
    my_pipeline.fit(x_train, y_train)

    return my_pipeline

In [None]:
kfold = KFold(n_splits=kfold_splits, shuffle=True, random_state=1)
k_folds_scores = []

# Validate: fit on the training part of each fold and score ROC AUC on the held-out part.
for fold, (train_idx, val_idx) in enumerate(kfold.split(x_train)):
    x = x_train.iloc[train_idx].copy()
    # Take the validation rows from x_train (features only), not from df_train,
    # so the pipeline is never handed the label column or other non-feature columns.
    x_val = x_train.iloc[val_idx].copy()

    y = y_train.iloc[train_idx]
    y_val = y_train.iloc[val_idx]

    my_pipeline = train(x, y, cat_features)

    # The second predict_proba column is the score fed to ROC AUC.
    y_pred = my_pipeline.predict_proba(x_val)[:, 1]

    auc = roc_auc_score(y_val, y_pred)
    k_folds_scores.append(auc)

    print(f'auc on fold {fold} is {auc}')

auc on fold 0 is 0.8430619117520076
auc on fold 1 is 0.8430526585882665
auc on fold 2 is 0.8311097732289785
auc on fold 3 is 0.8322788098740167
auc on fold 4 is 0.8483731291769948


In [None]:
# Train: fit the pipeline on the whole training split (x_train / y_train), not on a single fold.
my_pipeline = train(x_train, y_train, cat_features)

In [None]:
import joblib

# Persist the fitted pipeline (preprocessing + model) to a local file.
joblib.dump(my_pipeline, 'model.joblib')

['model.joblib']

In [None]:
# Hold-out evaluation: same feature/target selection as for training, on the test split.
x_test = df_test.loc[:, cat_features + num_features].copy()
y_test = df_test.loc[:, label].copy()

# Reload the saved artifact so the persisted pipeline itself is what gets scored.
# NOTE(review): joblib.load unpickles its input; only load files you produced or trust.
new_pipeline = joblib.load('model.joblib')

test_proba = new_pipeline.predict_proba(x_test)
y_pred = test_proba[:, 1]

auc = roc_auc_score(y_test, y_pred)
print(f'auc={auc}')

auc=0.8566481956948011


In [None]:
# Look at one test row to see which feature names and values the pipeline receives.
x_test.iloc[0]

gender                                 female
seniorcitizen                               0
partner                                    no
dependents                                 no
phoneservice                              yes
multiplelines                              no
internetservice                           dsl
onlinesecurity                            yes
onlinebackup                               no
deviceprotection                          yes
techsupport                               yes
streamingtv                               yes
streamingmovies                           yes
contract                             one year
paperlessbilling                          yes
paymentmethod       bank transfer (automatic)
tenure                                     41
monthlycharges                          79.85
Name: 0, dtype: object

New prediction:

In [None]:
# Hand-written record for a single customer, keyed by the model's feature names.
customer_record = {
    "gender": "female",
    "seniorcitizen": 0,
    "partner": "yes",
    "dependents": "no",
    "phoneservice": "no",
    "multiplelines": "no phone service",
    "internetservice": "dsl",
    "onlinesecurity": "no",
    "onlinebackup": "yes",
    "deviceprotection": "no",
    "techsupport": "no",
    "streamingtv": "no",
    "streamingmovies": "no",
    "contract": "month-to-month",
    "paperlessbilling": "yes",
    "paymentmethod": "electronic check",
    "tenure": 24,
    "monthlycharges": 29.85,
}

# A list with one record becomes a one-row DataFrame.
customer = pd.json_normalize([customer_record])

In [None]:
# Display the one-row frame to check its column names and values.
customer

Unnamed: 0,gender,seniorcitizen,partner,dependents,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,tenure,monthlycharges
0,female,0,yes,no,no,no phone service,dsl,no,yes,no,no,no,no,month-to-month,yes,electronic check,24,29.85


In [None]:
# Score the hand-built customer with the reloaded pipeline; [:,1] keeps the second probability column.
new_pipeline.predict_proba(customer)[:,1]

array([0.36275167])