In [57]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Part II: Model Building

Here you try your hand at model building to predict appointment no shows.

### Preprocessing

Package 'proj2_lib' now includes code to carry out preprocessing steps from part I. Here's how to use it:

In [58]:
import proj2_lib.util as utils

First, it includes a dictionary used for configuring path and file names
used through the project

In [59]:
utils.file_config

{'feature_pipeline_file': 'feature_pipeline.pkl',
 'labels_pipeline_file': 'labels_pipeline.pkl',
 'objstore_path': 'objects',
 'processed_data_path': 'processed_data',
 'raw_data_csv': 'KaggleV2-May-2016.csv',
 'raw_data_path': 'data',
 'test_csv': 'test_set.csv',
 'train_csv': 'train_set.csv'}

`feature_pipeline_file`: file storing the preprocessing pipeline used for preparing the feature matrix

`labels_pipeline_file`: file storing the preprocessing pipeline used for
preparing labels

`objstore_path`: directory to store python objects to disk

`processed_data_path`: directory containing processed data

`raw_data_csv`: name of the csv download from Kaggle

`raw_data_path`: directory containing raw data

`test_csv`: name of csv file containing test set

`train_csv`: name of csv file containing train set

You can change these paths and names to suit your project directory structure if you need so. E.g.,

In [60]:
file_config = utils.file_config
#config['raw_data_path'] = "some_other_directory"

First step is to create train test sets. Code is in file `proj2_lib/util.py` function `make_train_test_sets`. You
can edit that function as needed to include your own part I code if you so desire. The result will be to 
create files `train_set.csv` and `test_set.csv` in your `processed_data` directory (unless you change any of the entries in the configuration directory as above).

In [61]:
# ONLY NEED TO RUN THIS STEP ONCE (switch this to True to run it)
RUN_MAKE_TRAIN_TEST_FILES = False
if RUN_MAKE_TRAIN_TEST_FILES:
    utils.make_train_test_sets(config=file_config)

Next step is to fit the preprocessing pipelines. This is done in file `proj2_lib/preprocess.py`. Again you can edit code as needed in that file to incorporate your part I solution as you wish. The result will be to create files `feature_pipeline.pkl` and `labels_pipeline.pkl` containing the fit preprocessing pipelines we can then use to preprocess data.

In [62]:
import proj2_lib.preprocess as preprocess

# ONLY NEED TO RUN THIS STEP ONCE
RUN_FIT_PREPROCESSING = False
if RUN_FIT_PREPROCESSING:
    preprocess.fit_save_pipelines(config=file_config)

Finally, once we do that, we can get a training matrix and labels:

In [63]:
train_X, train_y = preprocess.load_train_data(config=file_config)

In [64]:
print(train_X.shape)
print(train_y.shape)

(90526, 101)
(90526,)


### Model Building

Using `sklearn` fit:
    - DecisionTree classifier
    - RandomForest classifier
    - Linear SVM classifier
    - SVM with Radial Basis Kernel classifier
    
Use default parameters for now.
Using 10-fold cross validation report both accuracy and AUC for each of the above four models.

QUESTION: Should you use accuracy or AUC for this task as a performance metric?

_ANSWER HERE_

In [None]:
import pandas as pd
import numpy as np


PROCESSED_DATA_DIR = 'processed_data'
#======================   Data PreProcessing ============================

from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, StandardScaler

# github.com/pandas-dev/sklearn-pandas
# install with pip install sklearn-pandas
from sklearn_pandas import DataFrameMapper

class YesNoTransform(TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        no_show = np.ndarray(shape=X.shape, dtype=int)
        for index, rowdata in X.iterrows():
            if any(rowdata == 'Yes'):
                no_show[index] = 1
            if any(rowdata == 'No'):
                no_show[index] = -1
        return no_show


yes_no_mapper = DataFrameMapper([
    (['No-show'], YesNoTransform())
], input_df=True)
    
    
no_show_pipeline = Pipeline([
    ('yes_no_mapper', yes_no_mapper)
])    


class WeekdayTransform(TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X['AppointmentDay'].dt.weekday.values

weekday_mapper = DataFrameMapper([
    (['AppointmentDay'], WeekdayTransform())
], input_df=True)
    

weekday_pipeline = Pipeline([
    ('weekday_adder', weekday_mapper),
    ('weekday_encoder', OneHotEncoder(n_values=7))
])


class DaysAheadTransform(TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        daysahead = (X['AppointmentDay'] - X['ScheduledDay'])\
            .dt.days\
            .values\
            .astype('float64')
        return daysahead
    
daysahead_mapper = DataFrameMapper([
    (['AppointmentDay', 'ScheduledDay'], DaysAheadTransform())
], input_df=True)

daysahead_pipeline = Pipeline([
    ('mapper', daysahead_mapper),
    ('scaler', StandardScaler())
])

date_pipeline = FeatureUnion(transformer_list=[
    ('weekday_pipeline', weekday_pipeline),
    ('daysahead_pipeline', daysahead_pipeline)
])

numeric_attributes = ['Scholarship',
                      'Hypertension',
                      'Diabetes',
                      'Alcoholism',
                      'SMS_received'
                     ]

num_mapper = DataFrameMapper(list(zip(numeric_attributes, [None for x in numeric_attributes])))

df_mapper = DataFrameMapper([
    (['Age'], StandardScaler()),
    ('Gender', LabelBinarizer()),
    ('Neighbourhood', LabelBinarizer()),
    (['Handicap'], OneHotEncoder())
])


full_pipeline = FeatureUnion(transformer_list=[
    ('date_pipeline', date_pipeline),
    ('num_mapper', num_mapper),
    ('df_mapper', df_mapper),
    ('no_show_pipeline', no_show_pipeline)
])


clean_df = pd.read_csv(PROCESSED_DATA_DIR + "/train_set.csv", parse_dates=['ScheduledDay','AppointmentDay'],
                      dtype={'Age': np.float64})

full_pipeline.fit(clean_df)
appt_mat = full_pipeline.transform(clean_df)
print(type(appt_mat))

appt_mat_df = pd.DataFrame(appt_mat.toarray())

appt_mat_df.to_csv(PROCESSED_DATA_DIR + '/appt_mat.csv', index=False)

In [None]:

import pandas as pd
import numpy as np
from sklearn import tree

DATA_10_FOLD_DIR = 'data_10_fold'
DATA_5_FOLD_DIR = 'data_5_fold'

PROCESSED_DATA_DIR = 'processed_data'
CSV_FILE = PROCESSED_DATA_DIR + '/appt_mat.csv'

appt_df = pd.read_csv(CSV_FILE)
print(appt_df.head())

df_feature = appt_df.iloc[:, :101].as_matrix()
df_label = appt_df.iloc[:, 101:].as_matrix()

K = 11

for k in range(1, K):
    
    train_feature = []
    train_label = []
    test_feature = []
    test_label = []
    
    for n in range(1, df_feature.shape[0]):
        if n % K != k - 1:
            train_feature.append(df_feature[n].tolist())
            train_label.append(df_label[n].tolist())

        else:
            test_feature.append(df_feature[n].tolist())
            test_label.append(df_label[n].tolist())
    train_feature_df = pd.DataFrame(train_feature)
    train_label_df = pd.DataFrame(train_label)
    test_feature_df = pd.DataFrame(test_feature)
    test_label_df = pd.DataFrame(test_label)
    
    
    train_feature_df.to_csv(DATA_10_FOLD_DIR + '/train_feature_' + str(k) + '.csv', index=False)
    train_label_df.to_csv(DATA_10_FOLD_DIR + '/train_label_' + str(k) + '.csv', index=False)
    test_feature_df.to_csv(DATA_10_FOLD_DIR + '/test_feature_' + str(k) + '.csv', index=False)
    test_label_df.to_csv(DATA_10_FOLD_DIR + '/test_label_' + str(k) + '.csv', index=False)    

K = 6


for k in range(1, K):
    
    train_feature = []
    train_label = []
    test_feature = []
    test_label = []
    
    for n in range(1, df_feature.shape[0]):
        if n % K != k - 1:
            train_feature.append(df_feature[n].tolist())
            train_label.append(df_label[n].tolist())

        else:
            test_feature.append(df_feature[n].tolist())
            test_label.append(df_label[n].tolist())
            
    train_feature_df = pd.DataFrame(train_feature)
    train_label_df = pd.DataFrame(train_label)
    test_feature_df = pd.DataFrame(test_feature)
    test_label_df = pd.DataFrame(test_label)
    
    
    train_feature_df.to_csv(DATA_5_FOLD_DIR + '/train_feature_' + str(k) + '.csv', index=False)
    train_label_df.to_csv(DATA_5_FOLD_DIR + '/train_label_' + str(k) + '.csv', index=False)
    test_feature_df.to_csv(DATA_5_FOLD_DIR + '/test_feature_' + str(k) + '.csv', index=False)
    test_label_df.to_csv(DATA_5_FOLD_DIR + '/test_label_' + str(k) + '.csv', index=False)   

    
    

     0    1    2    3    4    5    6         7    8    9 ...    92   93   94  \
0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  2.017970  0.0  0.0 ...   0.0  0.0  0.0   
1  1.0  0.0  0.0  0.0  0.0  0.0  0.0 -0.339370  0.0  1.0 ...   0.0  0.0  0.0   
2  0.0  0.0  0.0  0.0  1.0  0.0  0.0 -0.601297  0.0  1.0 ...   0.0  0.0  0.0   
3  0.0  0.0  0.0  0.0  1.0  0.0  0.0 -0.666779  1.0  0.0 ...   0.0  0.0  0.0   
4  0.0  0.0  0.0  0.0  1.0  0.0  0.0 -0.011962  0.0  0.0 ...   1.0  0.0  0.0   

    95   96   97   98   99  100  101  
0  0.0  1.0  0.0  0.0  0.0  0.0  1.0  
1  0.0  1.0  0.0  0.0  0.0  0.0 -1.0  
2  0.0  1.0  0.0  0.0  0.0  0.0 -1.0  
3  0.0  1.0  0.0  0.0  0.0  0.0  1.0  
4  0.0  1.0  0.0  0.0  0.0  0.0 -1.0  

[5 rows x 102 columns]


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import roc_curve, auc

DATA_10_FOLD_DIR = 'data_10_fold'

    
K = 11
accuracy_total = 0
roc_auc_total = 0
for k in range(1, K):
    
    train_feature = pd.read_csv(DATA_10_FOLD_DIR + '/train_feature_' + str(k) + '.csv').as_matrix()
    train_label = pd.read_csv(DATA_10_FOLD_DIR + '/train_label_' + str(k) + '.csv').as_matrix()
    test_feature = pd.read_csv(DATA_10_FOLD_DIR + '/test_feature_' + str(k) + '.csv').as_matrix()
    test_label = pd.read_csv(DATA_10_FOLD_DIR + '/test_label_' + str(k) + '.csv').as_matrix()
    
    
    X = train_feature
    Y = train_label
    clf = RandomForestClassifier(max_depth=2, random_state=0)
    clf = clf.fit(X, Y)
    
    predicted = clf.predict(test_feature)
    actual = test_label.flatten()
    
    accuracy = sum(np.array(predicted)==np.array(actual))/float(len(actual))
    fpr, tpr, thresholds = roc_curve(actual, predicted)
    roc_auc = auc(fpr, tpr)
    print("K={}, accuracy={}, AUC={}".format(k, accuracy, roc_auc))

    accuracy_total = accuracy_total + accuracy
    roc_auc_total = roc_auc_total + roc_auc


accuracy_final = accuracy_total / 10
roc_auc_final = roc_auc_total / 10

print ("for random forrest, accuracy={}, AUC={}".format(accuracy_final, roc_auc_final))




K=1, accuracy=0.800340260056, AUC=0.5
K=2, accuracy=0.802673147023, AUC=0.5
K=3, accuracy=0.788335358445, AUC=0.5
K=4, accuracy=0.79489671932, AUC=0.5


In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.datasets import make_classification
from sklearn.metrics import roc_curve, auc

DATA_10_FOLD_DIR = 'data_10_fold'

    
K = 11
accuracy_total = 0
roc_auc_total = 0
for k in range(1, K):
    
    train_feature = pd.read_csv(DATA_10_FOLD_DIR + '/train_feature_' + str(k) + '.csv').as_matrix()
    train_label = pd.read_csv(DATA_10_FOLD_DIR + '/train_label_' + str(k) + '.csv').as_matrix()
    test_feature = pd.read_csv(DATA_10_FOLD_DIR + '/test_feature_' + str(k) + '.csv').as_matrix()
    test_label = pd.read_csv(DATA_10_FOLD_DIR + '/test_label_' + str(k) + '.csv').as_matrix()
    
    X = train_feature
    Y = train_label
    clf = LinearSVC(random_state=0)
    clf = clf.fit(X, Y)
    
    predicted = clf.predict(test_feature)
    actual = test_label.flatten()
    
    accuracy = sum(np.array(predicted)==np.array(actual))/float(len(actual))
    fpr, tpr, thresholds = roc_curve(actual, predicted)
    roc_auc = auc(fpr, tpr)
    print("K={}, accuracy={}, AUC={}".format(k, accuracy, roc_auc))

    accuracy_total = accuracy_total + accuracy
    roc_auc_total = roc_auc_total + roc_auc


accuracy_final = accuracy_total / 10
roc_auc_final = roc_auc_total / 10

print ("for Linear SVM, accuracy={}, AUC={}".format(accuracy_final, roc_auc_final))


In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.metrics import roc_curve, auc

DATA_10_FOLD_DIR = 'data_10_fold'

    
K = 11
accuracy_total = 0
roc_auc_total = 0
for k in range(1, K):
    
    train_feature = pd.read_csv(DATA_10_FOLD_DIR + '/train_feature_' + str(k) + '.csv').as_matrix()
    train_label = pd.read_csv(DATA_10_FOLD_DIR + '/train_label_' + str(k) + '.csv').as_matrix()
    test_feature = pd.read_csv(DATA_10_FOLD_DIR + '/test_feature_' + str(k) + '.csv').as_matrix()
    test_label = pd.read_csv(DATA_10_FOLD_DIR + '/test_label_' + str(k) + '.csv').as_matrix()
    
    X = train_feature
    Y = train_label
    clf = SVC()
    clf = clf.fit(X, Y)
    
    predicted = clf.predict(test_feature)
    actual = test_label.flatten()
    
    accuracy = sum(np.array(predicted)==np.array(actual))/float(len(actual))
    fpr, tpr, thresholds = roc_curve(actual, predicted)
    roc_auc = auc(fpr, tpr)
    print("K={}, accuracy={}, AUC={}".format(k, accuracy, roc_auc))

    accuracy_total = accuracy_total + accuracy
    roc_auc_total = roc_auc_total + roc_auc


accuracy_final = accuracy_total / 10
roc_auc_final = roc_auc_total / 10

print ("for Linear SVM, accuracy={}, AUC={}".format(accuracy_final, roc_auc_final))


### Model Tuning

Based on the above, choose two methods and fit a tuned model:
    - use 5-fold cross validation for model selection
    - use 10-fold cross validation for model assessment (based on appropriate performance metric)

Report estimated performance for both tuned classifiers

In [None]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.metrics import roc_curve, auc

DATA_5_FOLD_DIR = 'data_5_fold'

    
K = 6
accuracy_total = 0
roc_auc_total = 0
for k in range(1, K):
    
    train_feature = pd.read_csv(DATA_5_FOLD_DIR + '/train_feature_' + str(k) + '.csv').as_matrix()
    train_label = pd.read_csv(DATA_5_FOLD_DIR + '/train_label_' + str(k) + '.csv').as_matrix()
    test_feature = pd.read_csv(DATA_5_FOLD_DIR + '/test_feature_' + str(k) + '.csv').as_matrix()
    test_label = pd.read_csv(DATA_5_FOLD_DIR + '/test_label_' + str(k) + '.csv').as_matrix()
    
    
    X = train_feature
    Y = train_label
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X, Y)
    
    predicted = clf.predict(test_feature)
    actual = test_label.flatten()
    
    accuracy = sum(np.array(predicted)==np.array(actual))/float(len(actual))
    fpr, tpr, thresholds = roc_curve(actual, predicted)
    roc_auc = auc(fpr, tpr)
    print("K={}, accuracy={}, AUC={}".format(k, accuracy, roc_auc))

    accuracy_total = accuracy_total + accuracy
    roc_auc_total = roc_auc_total + roc_auc


accuracy_final = accuracy_total / 5
roc_auc_final = roc_auc_total / 5

print ("for decision tree with 5 fold cross validation, accuracy={}, AUC={}".format(accuracy_final, roc_auc_final))


### Linear SVM with Gradient Descent

In [106]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.metrics import roc_curve, auc
from numpy import linalg as LA


class Gradient_Descent(object):
    
    def fit(self, X, Y):
        self.w, self.b = self.calculate_gradient_descent(X, Y)
        

    def calculate_gradient_descent(self, X, Y, num_iter=500, _lambda=1, learn_rate=0.1):
        b = 0
        w = np.full((X.shape[1], 1), 0.0)
        
        for k in range(0, num_iter):
            print('current iteration -> {}'.format(k))
            
            # Halve learning rate for every iteration
            learn_rate = learn_rate / 2
    
            #calculate gradient to w and b respectively
            gradient_to_w, gradient_to_b = self.calculate_gradient_using_hinge_loss(w, X, Y, _lambda, b)
            
            # Update b & w
            w = w - learn_rate * gradient_to_w
            b = b - learn_rate * gradient_to_b
                
        return w, b



    def calculate_gradient_using_hinge_loss(self, w, X, Y, _lambda, b):

        gradient_to_w = np.full((1, w.shape[0]), 0.0)
        gradient_to_b = 0
        
        for (x_, y_) in zip(X, Y):
            _x = np.full((1, x_.shape[0]), x_)
            
            f = _x.dot(w) + b
            u = y_ * f
            
            # Using hinge loss ->gradient = -yx if yf < 1, or 0 if yf > 1
            if np.any(u < 1):
                gradient_w_curr = -y_ * x_ + (2 * _lambda * LA.norm(w))
                gradient_b_curr = -y_
            else:
                gradient_w_curr = np.full((1, w.shape[0]), 0.0)   
                gradient_b_curr = 0 
                
            gradient_to_w += gradient_w_curr
            gradient_to_b += gradient_b_curr
            
        #print(gradient_to_w)
        return np.transpose(gradient_to_w), gradient_to_b
    
    
    def predict(self, X):
        w = self.w
        b = self.b
        outcome_list = []
        
        print(type(X))
        
        for _x in X:
            x = np.full((1, _x.shape[0]), _x)
            f = x.dot(w) + b
            
            if f >= 0:
                outcome = 1.0
            else:
                outcome = -1.0
            outcome_list.append(outcome)
         
        return outcome_list    
            

In [None]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.metrics import roc_curve, auc
from Gradient_Descent import Gradient_Descent
from collections import Counter

DATA_10_FOLD_DIR = 'data_10_fold'

K = 2
accuracy_total = 0
roc_auc_total = 0
for k in range(1, K):
    
    train_feature = pd.read_csv(DATA_10_FOLD_DIR + '/train_feature_' + str(k) + '.csv').as_matrix()
    train_label = pd.read_csv(DATA_10_FOLD_DIR + '/train_label_' + str(k) + '.csv').as_matrix()
    test_feature = pd.read_csv(DATA_10_FOLD_DIR + '/test_feature_' + str(k) + '.csv').as_matrix()
    test_label = pd.read_csv(DATA_10_FOLD_DIR + '/test_label_' + str(k) + '.csv').as_matrix()
    
    
    X = train_feature
    Y = train_label
    gradient_descent = Gradient_Descent()
    gradient_descent.fit(X, Y)
    
    predicted = gradient_descent.predict(test_feature)
    actual = test_label.flatten()
  
    print('predicted -->', Counter(predicted))
    print('actual -->', Counter(actual))
    print('equals -->', sum(np.array(predicted)==np.array(actual)))
    
    accuracy = sum(np.array(predicted)==np.array(actual))/float(len(actual))
    fpr, tpr, thresholds = roc_curve(actual, predicted)
    roc_auc = auc(fpr, tpr)

    accuracy_total = accuracy_total + accuracy
    roc_auc_total = roc_auc_total + roc_auc
    

accuracy_final = accuracy_total / (K - 1)
roc_auc_final = roc_auc_total / (K - 1)

print ("for gradient descent, accuracy={}, AUC={}".format(accuracy_final, roc_auc_final))




In [107]:
w,b = fit_svm(train_X, train_y, 1.0, n_iter=100)

it: 0, obj 82716.70
it: 10, obj 57686416.24
it: 20, obj 88729263.82
it: 30, obj 18878808.66
it: 40, obj 29077969.32
it: 50, obj 16569926.42
it: 60, obj 2649898.07
it: 70, obj 1512776.62
it: 80, obj 1238395.36
it: 90, obj 989114.39
