In [1]:
# List the `python` executables found on PATH (`where` is the Windows lookup command).
# NOTE(review): this reports PATH matches, which may differ from the interpreter
# the kernel is actually running; `sys.executable` would show the latter — confirm intent.
!where python

C:\Users\daani\Anaconda3\python.exe
C:\Users\daani\AppData\Local\Microsoft\WindowsApps\python.exe


In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `src` package) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [7]:
#Standard imports
import os
import pickle
import sys
sys.path.append('..')

#Third-party imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.inspection import plot_partial_dependence
from sklearn.metrics import roc_curve
from statsmodels.stats.outliers_influence import variance_inflation_factor

#Local imports
from src.localpaths import *
from src.data.make_dataset import load_training_data
from src.data.make_dataset import load_test_data
from src.models.train_model import store_model_and_results, print_model_results, load_model_results, load_pickled_models

In [4]:
# Show up to 100 columns when a DataFrame is rendered, so wide feature
# matrices are not elided in the output.
pd.options.display.max_columns = 100

# Load Featurized Data

In [24]:
# Load the training features and target via the project loader with final=True.
# NOTE(review): the exact meaning of `final` is defined in src.data.make_dataset — confirm there.
X_train, y_train = load_training_data(final=True)

In [26]:
# Preview the first rows of the feature matrix to check the available columns.
X_train.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,0,1,1,0,1,25.3,25.3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,0,0,0,7,1,1,75.15,525.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,0,0,1,4,1,1,20.05,85.5,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,0,0,29,1,1,76.0,2215.25,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1,0,0,0,3,1,1,75.1,270.7,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Inspect Feature Multicollinearity

In [27]:
# Variance inflation factor of every feature column, in column order.
# The array form of the frame is built once up front instead of being
# re-created by `X_train.values` on every iteration of the comprehension.
design_matrix = X_train.values
vifs = [variance_inflation_factor(design_matrix, i) for i in range(design_matrix.shape[1])]

In [28]:
# Pair each column name with its VIF and list them from highest to lowest.
sorted(
    zip(X_train.columns, vifs),
    key=lambda name_and_vif: name_and_vif[1],
    reverse=True,
)

[('InternetService_No', inf),
 ('OnlineSecurity_No internet service', inf),
 ('OnlineBackup_No internet service', inf),
 ('DeviceProtection_No internet service', inf),
 ('TechSupport_No internet service', inf),
 ('StreamingTV_No internet service', inf),
 ('StreamingMovies_No internet service', inf),
 ('PhoneService', 1795.2936129533036),
 ('MonthlyCharges', 879.4071653278096),
 ('InternetService_Fiber optic', 150.413015558894),
 ('MultipleLines_No phone service', 61.269312275148884),
 ('StreamingTV_Yes', 24.573123044650014),
 ('StreamingMovies_Yes', 24.400188765894438),
 ('TotalCharges', 10.871762430587555),
 ('tenure', 7.528269480250208),
 ('MultipleLines_Yes', 7.388728481143913),
 ('DeviceProtection_Yes', 6.990713871204524),
 ('OnlineBackup_Yes', 6.911783532524882),
 ('TechSupport_Yes', 6.498500323508462),
 ('OnlineSecurity_Yes', 6.478979774573932),
 ('Contract_Two year', 2.6312318158444183),
 ('PaymentMethod_Electronic check', 1.9381234819755606),
 ('PaymentMethod_Mailed check', 1.8

## Test out our high-VIF dropping function

In [29]:
def drop_high_vif_features(X_train, threshold=10):
    """Iteratively drop the feature with the highest variance inflation factor.

    On each pass the VIF of every remaining column is recomputed and the
    column with the largest VIF is removed, provided that VIF is at or above
    ``threshold``. The loop ends as soon as the largest VIF is below
    ``threshold`` or no columns are left to inspect.

    The caller's frame is not modified: ``DataFrame.drop`` returns a new
    frame and only the local name is rebound.

    NOTE(review): VIFs are computed on ``X_train.values`` as-is, without an
    added intercept column — confirm this is the intended design matrix.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix whose columns are candidates for removal.
    threshold : float, default 10
        A column is dropped while the largest VIF is ``>= threshold``.

    Returns
    -------
    pandas.DataFrame
        ``X_train`` without the columns that were dropped.
    """
    # Guarding on the column count avoids indexing into an empty ranking when
    # the frame starts with, or is reduced to, zero columns.
    while X_train.shape[1] > 0:
        # Build the array once per pass rather than once per column.
        design = X_train.values
        vifs = [variance_inflation_factor(design, i) for i in range(design.shape[1])]

        # Only the single worst column is needed, so take the max instead of
        # sorting the whole list; ties resolve to the first column in order.
        high_vif_col, high_vif_value = max(
            zip(X_train.columns, vifs), key=lambda pair: pair[1]
        )

        # Written as "not >=" so that a NaN VIF stops the loop, as before.
        if not high_vif_value >= threshold:
            break

        print(f"dropping column {high_vif_col} with vif value of {high_vif_value:.1f}")
        X_train = X_train.drop(columns=[high_vif_col])

    print("finished dropping columns")
    return X_train

In [30]:
# Apply the iterative high-VIF filter to the training features.
# NOTE: this rebinds X_train, so re-running the cell operates on the
# already-reduced frame; reload the data first to start from the full feature set.
X_train=drop_high_vif_features(X_train)

dropping column InternetService_No with vif value of inf
dropping column OnlineSecurity_No internet service with vif value of inf
dropping column OnlineBackup_No internet service with vif value of inf
dropping column DeviceProtection_No internet service with vif value of inf
dropping column TechSupport_No internet service with vif value of inf
dropping column StreamingTV_No internet service with vif value of inf
dropping column PhoneService with vif value of 1795.3
dropping column MonthlyCharges with vif value of 43.2
dropping column TotalCharges with vif value of 20.0
finished dropping columns


In [31]:
# Preview the feature matrix after the high-VIF columns have been dropped.
X_train.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PaperlessBilling,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,OnlineSecurity_Yes,OnlineBackup_Yes,DeviceProtection_Yes,TechSupport_Yes,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,0,1,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,0,0,0,7,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,0,0,1,4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,0,0,29,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1,0,0,0,3,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [32]:
# Recompute the VIFs on the reduced feature set and rank them from highest
# to lowest. The array form of the frame is built once up front instead of
# being re-created by `X_train.values` on every iteration.
design_matrix = X_train.values
vifs = [variance_inflation_factor(design_matrix, i) for i in range(design_matrix.shape[1])]
high_vifs = sorted(zip(X_train.columns, vifs), key=lambda x: x[1], reverse=True)

In [33]:
# Display the (column, VIF) pairs, highest VIF first.
high_vifs

[('tenure', 7.4774441220077295),
 ('Contract_Two year', 3.4313527485856485),
 ('InternetService_Fiber optic', 3.1828069347558534),
 ('Partner', 2.8221516198532606),
 ('StreamingMovies_Yes', 2.695791666181272),
 ('PaperlessBilling', 2.693628715361594),
 ('StreamingTV_Yes', 2.663586240957822),
 ('StreamingMovies_No internet service', 2.6378573344746887),
 ('MultipleLines_Yes', 2.489538491152523),
 ('DeviceProtection_Yes', 2.2453701155471633),
 ('PaymentMethod_Electronic check', 2.2253009456462447),
 ('OnlineBackup_Yes', 2.1123220943419883),
 ('Contract_One year', 2.0715371315045408),
 ('TechSupport_Yes', 2.020772666726921),
 ('Dependents', 1.9453155700993516),
 ('OnlineSecurity_Yes', 1.9039746521866978),
 ('gender', 1.8450275914324004),
 ('PaymentMethod_Mailed check', 1.7828477091189332),
 ('PaymentMethod_Credit card (automatic)', 1.7219223454312147),
 ('MultipleLines_No phone service', 1.373487099456584),
 ('SeniorCitizen', 1.3630154116601583)]

# Test loading final data without high-VIF columns

In [34]:
# Reload the final training data; the preview that follows shows the high-VIF
# columns are no longer present.
# NOTE(review): this is the same call as the earlier load cell yet it returns
# fewer columns — presumably load_training_data was updated in between and
# picked up through autoreload; confirm.
X_train, y_train = load_training_data(final=True)

In [35]:
# Preview the reloaded feature matrix to verify its column set.
X_train.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PaperlessBilling,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,OnlineSecurity_Yes,OnlineBackup_Yes,DeviceProtection_Yes,TechSupport_Yes,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,0,1,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,0,0,0,7,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,0,0,1,4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,0,0,29,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1,0,0,0,3,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
