In [None]:
# basic dataframe and operations
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# manipulation and preprocessing
from sklearn.preprocessing import Normalizer, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer

# models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

# measuring results
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, mean_absolute_error, roc_auc_score, precision_score, recall_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

# warning suppression
import warnings
from sklearn.exceptions import ConvergenceWarning

In [2]:
    # Import the csv training dataset to a pandas dataframe
# The dataset is expected in the same directory as this notebook
#   under a subfolder path datasets/
data_raw_input = pd.read_csv('datasets/train.csv')
# Show the shape of the dataset
data_raw_input.shape


(5634, 21)

In [3]:
# Display a few rows from the training data
data_raw_input.head(20)

Unnamed: 0,id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,online_security,...,device_protection,tech_support,streaming_tv,streaming_movies,contract,paperless_billing,payment_method,monthly_charges,total_charges,label
0,1815,Male,0,Yes,Yes,12,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.7,258.35,0
1,5947,Female,0,No,No,42,Yes,No,DSL,Yes,...,Yes,Yes,No,Yes,One year,No,Credit card (automatic),73.9,3160.55,1
2,3882,Male,0,Yes,No,71,Yes,Yes,DSL,Yes,...,No,Yes,No,No,Two year,No,Bank transfer (automatic),65.15,4681.75,0
3,2390,Male,0,Yes,Yes,71,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,No,Electronic check,85.45,6300.85,0
4,3677,Male,0,No,No,30,Yes,No,DSL,Yes,...,No,Yes,Yes,No,One year,No,Electronic check,70.4,2044.75,0
5,612,Female,0,Yes,Yes,9,Yes,No,DSL,Yes,...,Yes,Yes,No,No,Month-to-month,No,Mailed check,65.0,663.05,1
6,589,Male,0,Yes,No,72,Yes,Yes,Fiber optic,Yes,...,Yes,No,No,Yes,Two year,Yes,Electronic check,99.15,7422.1,0
7,2688,Male,0,No,No,28,Yes,No,Fiber optic,No,...,Yes,Yes,No,Yes,Month-to-month,No,Electronic check,91.0,2626.15,0
8,1551,Male,1,Yes,No,6,Yes,Yes,Fiber optic,No,...,Yes,No,No,No,Month-to-month,Yes,Electronic check,80.8,457.1,0
9,2891,Female,1,Yes,Yes,47,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,No,Bank transfer (automatic),70.55,3309.25,1


In [4]:
# Get the count of nulls per column
# Turns out we don't have any
print("Nulls:")
print(data_raw_input.isnull().sum().sum())
print("Na count:")
print(data_raw_input.isna().sum().sum())
print("Duplicate rows:")
print(data_raw_input.duplicated(keep='first').sum())

# In attempting to change total_charges from string to numeric I received an error that there were some
#   values in there that were just ' ' -- a non-empty string that contains just a space. Now I'll formally detect it.
print(data_raw_input.columns[data_raw_input.isin([' ']).any()])

Nulls:
0
Na count:
0
Duplicate rows:
0
Index(['total_charges'], dtype='object')


### Dealing with total_charges

The values are empty strings with a space in them, ' ', but the column needs to be changed to float. So I'll first run an imputer over them to replace that, then we'll change to numeric data type.

Since we'll need to do this on the prediction to submit I've made a function to do this so we can reuse it at predict time.

In [None]:
class MultiPrep:
    feature_encoders={}
    def destroy_encoders(self):
        self.feature_encoders = {}                   # overwrite the old encoders with an empty collection
        
    def fit_text_encoders(self, df_features):
        non_numeric_columns = df_features.select_dtypes(exclude='number').columns   # get a list of non-numeric cols
        for col in non_numeric_columns:         # loop through those columns to fit an encoder
            encoder = LabelEncoder()            # make a new label encoder for this column
            encoder.fit(df_features[col])       # fit this encoder
            self.feature_encoders[col] = encoder     # add it to the encoders collection for later use

    # fit a normalizer for all numerics
    def fit_normalizers(self, df_features):
        # note I'm not caching the "this model has not been fit" error, like I would for a real app
        numeric_columns = df_features.select_dtypes(include='number').columns   # get a list of numeric cols
        scaler = Normalizer()
        scaler.fit(df_features[numeric_columns])    # apply the transform
        self.feature_encoders['numerics'] = scaler  # preserve the numeric scaler 
     
    # encode all labels
    def transform_encode_all(self, df_features):
        # note I'm not caching the "this model has not been fit" error, like I would for a real app
        non_numeric_columns = df_features.select_dtypes(exclude='number')   # get a list of non-numeric cols
        for col in non_numeric_columns:         # loop through them
            df_features[col] = self.feature_encoders[col].transform(df_features[col])    # apply the transform
        return df_features     
    
    # normalize all numerics
    def transform_normalize_all(self, df_features):
        # note I'm not caching the "this model has not been fit" error, like I would for a real app
        numeric_columns = df_features.select_dtypes(include='number').columns   # get a list of numeric cols
        df_features[numeric_columns] = self.feature_encoders['numerics'].fit_transform(df_features[numeric_columns])    # apply the transform
        return df_features
            
    def decode_all(self, df_features):
        non_numeric_columns = df_features.select_dtypes(exclude='number')   # get the non-numeric cols
        for col in non_numeric_columns:         # loop through them
            df_features[col] = self.feature_encoders[col].inverse_transform(df_features[col])    # decode the data
    
    
# split the x and y data. Doing it outside the transformation as we don't want to transform the validation
#   set yet
def split_x_y(data_raw):
    # remove the result column from the input parameters
    # also remove the ID column. It carries no signal.
    X_no_label = data_raw.drop('label', axis=1).drop('id', axis=1)

    # Assign class labels for the input data
    y_labels = data_raw['label']      # assign the labels we'll encode in the next block
    return X_no_label, y_labels


# method: transform_features
# purpose: to clean and encode the features of a dataset
# parameters: X_without_label - the raw features without label or id columns
#       transform_only - True or False. If it's the training set we fit and transform otherwise transform only
# returns: X_scaled - cleaned features that are label encoded and scaled
# steps:
#  Drop the label and id from the x
#  Put the label into y
#  replace spaces in total_charges with Nan
#  Recast total_charges to float
#  impute the missing values in total_charges
#  encode the string data
#  scale the numeric data
#  return the x data
def transform_features(X_without_label, transform_only):
   
    # --------------- total_charges processing --------------------
    # change spaces in total_charges to Nan then recast
    X_without_label['total_charges'] = X_without_label['total_charges'].replace(' ', np.nan)
    X_without_label['total_charges'] = pd.to_numeric(X_without_label['total_charges']) # recast as floating point
    print(X_without_label['total_charges'])
    # Now we're missing values so let's impute them
    imputer = SimpleImputer(strategy='mean')
    X_without_label['total_charges'] = imputer.fit_transform(X_without_label['total_charges'].values.reshape(-1,1))
    # --------------- end total_charges processing
      
     
    # Encode the string columns and scale the numerics
    if not transform_only:                           # only fit the model if it's the x training set, not validate or predict
        X_without_label = multiprep.normalize_all(X_without_label)        # normalize all of the numieric columns
        multiprep.fit_all(X_without_label)                               # fit all of the text columns
    X_transformed = multiprep.encode_all(X_without_label)             # transform the X   
    
    return X_transformed

label_encoders={}
multiprep = MultiPrep()

In [43]:
# For the label column in the training set
# Show the unique values of the training labels
col_list = data_raw_input.columns.to_list()

for col in data_raw_input.columns:
    if not pd.api.types.is_numeric_dtype(data_raw_input[col]):
        print("{}: {}".format(col, data_raw_input[col].unique()))

gender: ['Male' 'Female']
partner: ['Yes' 'No']
dependents: ['Yes' 'No']
phone_service: ['Yes' 'No']
multiple_lines: ['No' 'Yes' 'No phone service']
internet_service: ['No' 'DSL' 'Fiber optic']
online_security: ['No internet service' 'Yes' 'No']
online_backup: ['No internet service' 'Yes' 'No']
device_protection: ['No internet service' 'Yes' 'No']
tech_support: ['No internet service' 'Yes' 'No']
streaming_tv: ['No internet service' 'No' 'Yes']
streaming_movies: ['No internet service' 'Yes' 'No']
contract: ['Two year' 'One year' 'Month-to-month']
paperless_billing: ['No' 'Yes']
payment_method: ['Mailed check' 'Credit card (automatic)' 'Bank transfer (automatic)'
 'Electronic check']
total_charges: ['258.35' '3160.55' '4681.75' ... '2979.5' '114.1' '1114.85']


I don't see any ordinal features but I do see the column "total_charges" needs to be retyped from string to numeric. We'll need to re-type that before we encode or those will blow out the dimensionality of that field and be useless.

In [77]:
# split the features and the class labels
X_without_label, y_values = split_x_y(data_raw_input)        # split x and y training sets

# Do train test split
# Here I'm doing a 60/40 split as my dataset is fairly large
X_train, X_test, y_train, y_test = train_test_split(X_without_label, y_values,
    test_size=0.4,
    stratify=y_values, 
    random_state=17)

# heavy preprocessing here - encode, scale, etc.
# senior_citizen is a weird column as it's already been encoded as 0/1.
# scaling it will do no harm as the values will be scaled and still represent the classes well enough

X_train = transform_features(X_train, transform_only=False)
X_test = transform_features(X_test, transform_only=True)

3547      69.40
4971      67.10
3918    4541.20
3515    1415.55
1011     162.15
         ...   
881       45.70
3608    1567.55
5633    1114.85
404     1120.30
1065    3001.20
Name: total_charges, Length: 3380, dtype: float64
4346    5084.65
1886      55.45
1397      20.50
2279    3848.80
5490     188.70
         ...   
5037    3343.15
5064     308.25
3549    5567.45
82       116.65
3431    1510.30
Name: total_charges, Length: 2254, dtype: float64


In [78]:


print(X_train[:25])
print(X_test[:25])

      gender  senior_citizen  partner  dependents    tenure  phone_service  \
3547       1       -0.423004        0           0 -1.259845              1   
4971       1       -0.423004        0           0 -1.178579              1   
3918       1        2.364042        1           1  0.568631              1   
3515       1       -0.423004        0           0 -0.731619              1   
1011       0       -0.423004        1           1 -0.975415              1   
2310       1       -0.423004        0           0 -0.040861              1   
3354       0       -0.423004        1           1 -0.650353              1   
4962       1       -0.423004        0           0  0.893693              1   
2620       1       -0.423004        0           0 -1.219212              1   
4858       0       -0.423004        1           0 -0.894150              1   
5275       1        2.364042        0           0 -0.487822              1   
2841       0       -0.423004        0           0 -0.853517     