In [1]:
import sys
import os
import pandas as pd
import numpy as np
import cdsw
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.compose import ColumnTransformer
from lime.lime_tabular import LimeTabularExplainer
from churnexplainer import ExplainedModel, CategoricalEncoder

In [2]:
df = pd.read_csv('data/data.csv')
df.head()
df.replace({"SeniorCitizen": {1: "Yes", 0: "No"}}, inplace=True)

In [3]:
labelcol = "Churn"

df = (df
      .replace(r"^\s$", np.nan, regex=True).dropna().reset_index()
      # drop unnecessary and personally identifying information
      .drop(columns=['customerID'])
     )
try:
    # when loading from external data source, this column has str dtype
    df.replace({"SeniorCitizen": {"1": "Yes", "0": "No"}}, inplace=True)
except:
    # when loading from local data source, this column has int dtype 
    df.replace({"SeniorCitizen": {1: "Yes", 0: "No"}}, inplace=True)
    
df['TotalCharges'] = df['TotalCharges'].astype('float')
df.index.name='id'


# separate target variable column from feature columns
datadf, labels = df.drop(labelcol, axis=1), df[labelcol]

# recast all columns that are "object" dtypes to Categorical
for colname, dtype in zip(datadf.columns, datadf.dtypes):
  if dtype == "object":
    datadf[colname] = pd.Categorical(datadf[colname])

  
# Prepare data for Sklearn model and create train/test split
ce = CategoricalEncoder()
X = ce.fit_transform(datadf)
y = labels.values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
ct = ColumnTransformer(
    [("ohe", OneHotEncoder(), list(ce.cat_columns_ix_.values()))],
    remainder="passthrough",
)

# Experiments options
# If you are running this as an experiment, pass the cv, solver and max_iter values
# as arguments in that order. e.g. `5 lbfgs 100`.
if len(sys.argv) == 4:
    try:
        cv = int(sys.argv[1])
        solver = str(sys.argv[2])
        max_iter = int(sys.argv[3])
    except:
        sys.exit("Invalid Arguments passed to Experiment")
else:
    cv = 5
    solver = "lbfgs"  # one of newton-cg, lbfgs, liblinear, sag, saga
    max_iter = 100

# Instantiate the model
clf = LogisticRegressionCV(cv=cv, solver=solver, max_iter=max_iter)
pipe = Pipeline([("ct", ct), ("scaler", StandardScaler()), ("clf", clf)])

# Train the model
pipe.fit(X_train, y_train)

# Capture train and test set scores
train_score = pipe.score(X_train, y_train)
test_score = pipe.score(X_test, y_test)
print("train", train_score)
print("test", test_score)
print(classification_report(y_test, pipe.predict(X_test)))
datadf[labels.name + " probability"] = pipe.predict_proba(X)[:, 1]


# Create LIME Explainer
feature_names = list(ce.columns_)
categorical_features = list(ce.cat_columns_ix_.values())
categorical_names = {i: ce.classes_[c] for c, i in ce.cat_columns_ix_.items()}
class_names = ["No " + labels.name, labels.name]
explainer = LimeTabularExplainer(
    ce.transform(datadf),
    feature_names=feature_names,
    class_names=class_names,
    categorical_features=categorical_features,
    categorical_names=categorical_names,
)


# Create and save the combined Logistic Regression and LIME Explained Model.
explainedmodel = ExplainedModel(
    data=datadf,
    labels=labels,
    categoricalencoder=ce,
    pipeline=pipe,
    explainer=explainer,
)
explainedmodel.save(model_name='telco_linear')


# If running as as experiment, this will track the metrics and add the model trained in this
# training run to the experiment history.
cdsw.track_metric("train_score", round(train_score, 2))
cdsw.track_metric("test_score", round(test_score, 2))
#cdsw.track_metric("model_path", explainedmodel.model_path)
#cdsw.track_file(explainedmodel.model_path)

train 0.8058399696624953
test 0.7912400455062572
              precision    recall  f1-score   support

          No       0.84      0.89      0.86      1300
         Yes       0.62      0.51      0.56       458

    accuracy                           0.79      1758
   macro avg       0.73      0.70      0.71      1758
weighted avg       0.78      0.79      0.78      1758

