In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from miceforest import ImputationKernel


from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score

from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

import l0learn



In [2]:
# Load the raw customer table and the zip-code -> population lookup.
df_raw = pd.read_csv('../data/telecom_customer_churn.csv')
df_population = pd.read_csv('../data/telecom_zipcode_population.csv')

# Exclude customers whose status is "Joined"; `.query` returns a new frame, so
# `df_raw` itself is left untouched.
df = df_raw.query('`Customer Status` != "Joined"').reset_index(drop=True)

# join population by zip-code
# `validate='many_to_one'` makes the merge fail loudly if the population table
# ever holds a duplicated zip code (which would silently duplicate customers).
n_rows_before_merge = len(df)
df = pd.merge(left=df, right=df_population, on='Zip Code', how='inner', validate='many_to_one')

# An inner join drops customers whose zip code has no population entry; report
# it instead of losing those rows silently.
n_rows_unmatched = n_rows_before_merge - len(df)
if n_rows_unmatched:
    print(f'WARNING: {n_rows_unmatched} customers dropped (zip code missing from population table)')

# remove unwanted columns
dropped_columns = ['Customer ID', 'Churn Category', 'Churn Reason', 'Latitude', 'Longitude', 'Zip Code']
df = df[[col for col in df.columns if col not in dropped_columns]]

# impute categoricals as NA
# Missing values in these service columns are replaced by the explicit level 'NA'
# rather than being left for the numeric imputation step.
impute_na = ['Internet Type', 'Online Security', 'Online Backup', 'Device Protection Plan', 'Premium Tech Support', 
             'Streaming TV', 'Streaming Movies', 'Streaming Music', 'Unlimited Data', 'Multiple Lines']

df[impute_na] = df[impute_na].fillna('NA')
df['Offer'] = df['Offer'].fillna('No Offer')

# fix data types
categorical_columns = ['Gender', 'Married', 'City', 'Offer', 'Phone Service', 'Multiple Lines', 'Internet Service', 
                       'Internet Type', 'Online Security', 'Online Backup', 'Device Protection Plan', 
                       'Premium Tech Support', 'Streaming TV', 'Streaming Movies', 'Streaming Music', 
                       'Unlimited Data', 'Contract', 'Paperless Billing', 'Payment Method',
                       'Customer Status']
df[categorical_columns] = df[categorical_columns].astype('category')

# Move 'Population' and the target 'Customer Status' to the end, selecting them by
# name so the result does not depend on where the merge happened to place them.
# Downstream code takes the last column as the target.
trailing_columns = ['Population', 'Customer Status']
arranged_columns = [col for col in df.columns if col not in trailing_columns]
arranged_columns.extend(trailing_columns)
df = df[arranged_columns]

# Hold out 20% of the rows for evaluation. Features are every column except the
# last one; the last column of `df` is the target.
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
    df.iloc[:, :-1],
    df.iloc[:, -1],
    test_size=0.2,
    random_state=42,
)

# Take ownership of the split frames: the column assignments below (and in the
# test-preparation cell) must never act on a view/slice of `df`.
df_X_train = df_X_train.copy()
df_X_test = df_X_test.copy()

# Cities with at most this many training rows are pooled into a single level.
RARE_CITY_MAX_COUNT = 5

# 'City' is still a categorical defined on the full dataset here, so value_counts()
# also reports cities with zero training rows; those end up in `rare_cities` too,
# which is what lets test-only cities be mapped to 'Rare City' later.
city_counts = df_X_train['City'].value_counts()
rare_cities = city_counts.index[city_counts <= RARE_CITY_MAX_COUNT]

# Go through object dtype so the new level 'Rare City' can be written, then
# rebuild the categorical from the levels actually present in training.
df_X_train['City'] = df_X_train['City'].astype('object')
df_X_train.loc[df_X_train['City'].isin(rare_cities), 'City'] = 'Rare City'
df_X_train['City'] = df_X_train['City'].astype('category')

# Number of MICE passes to run over the training features.
MICE_ITERATIONS = 2

# Fit the imputation kernel on the training features only; the same kernel is
# reused later to impute the held-out features.
mice_kernel = ImputationKernel(
    data=df_X_train,
    save_all_iterations=True,
    random_state=42,
)
mice_kernel.mice(MICE_ITERATIONS)

# Completed (imputed) training features; `df_X_train_imp` is an independent copy
# used for modelling.
mice_imputation = mice_kernel.complete_data()
df_X_train_imp = mice_imputation.copy()

# Preview as the cell's last expression so it is actually rendered.
df_X_train_imp.head()

  warn(


In [3]:
# Train an Explainable Boosting classifier (constructor defaults) on the
# imputed training features against the training target.
ebm = ExplainableBoostingClassifier()
ebm.fit(X=df_X_train_imp, y=df_y_train)

In [4]:
# Build the model-level (global) explanation of the fitted EBM and render it
# with interpret's `show`.
ebm_global = ebm.explain_global()
show(ebm_global)


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



In [5]:
# Per-row (local) explanations for the imputed training features, paired with
# their true labels, rendered with interpret's `show`.
# NOTE(review): this explains every training row; if rendering is slow, consider
# passing a small slice (e.g. `.iloc[:20]` of both X and y) instead — confirm intent.
ebm_local = ebm.explain_local(X=df_X_train_imp, y=df_y_train)
show(ebm_local)


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



In [6]:
# Work on an owned copy so the column assignments below cannot act on a
# view/slice of the original frame.
df_X_test = df_X_test.copy()

# Apply the same rare-city pooling as on the training split; `rare_cities` was
# derived from training rows only.
df_X_test['City'] = df_X_test['City'].astype('object')
df_X_test.loc[df_X_test['City'].isin(rare_cities), 'City'] = 'Rare City'

# pd.Categorical(..., categories=...) silently turns any value outside the given
# categories into NaN, which the imputer would then fill with a made-up city.
# Fail loudly if that would happen.
train_city_categories = df_X_train['City'].cat.categories
unseen_city = df_X_test['City'].notna() & ~df_X_test['City'].isin(train_city_categories)
assert not unseen_city.any(), (
    f"{int(unseen_city.sum())} test rows have a City missing from the training categories, "
    f"e.g. {sorted(df_X_test.loc[unseen_city, 'City'].unique())[:5]}"
)

# Encode the test cities with exactly the training category set.
df_X_test['City'] = pd.Categorical(df_X_test['City'], categories=train_city_categories)

# Impute the held-out features with the kernel fitted on the training split.
df_X_test_imp = mice_kernel.impute_new_data(new_data=df_X_test).complete_data()

In [7]:
# Predicted class label for every row of the imputed held-out features
# (displayed as the cell output).
ebm.predict(df_X_test_imp)

array(['Churned', 'Stayed', 'Churned', ..., 'Stayed', 'Stayed', 'Stayed'],
      dtype='<U7')

In [31]:
# Predicted labels for the held-out set, wrapped as a pandas Categorical.
preds = pd.Categorical(ebm.predict(df_X_test_imp))

# Confusion matrix: rows are true labels, columns are predicted labels. No
# `labels=` is passed, so scikit-learn orders the classes itself (sorted order).
test_confusion = confusion_matrix(y_true=df_y_test, y_pred=preds)
print(test_confusion)

# F1 score with 'Churned' as the positive class (shown as the cell output).
f1_score(y_true=df_y_test.values, y_pred=preds, pos_label='Churned')

[[282  89]
 [ 53 894]]


0.7988668555240793