In [1]:
%load_ext autoreload
%autoreload 2

## Part II: Model Building

Here you try your hand at model building to predict appointment no shows.

### Preprocessing

Package 'proj2_lib' now includes code to carry out preprocessing steps from part I. Here's how to use it:

In [2]:
import proj2_lib.util as utils

First, it includes a dictionary used for configuring the path and file names
used throughout the project

In [3]:
utils.file_config

{'feature_pipeline_file': 'feature_pipeline.pkl',
 'labels_pipeline_file': 'labels_pipeline.pkl',
 'objstore_path': 'objects',
 'processed_data_path': 'processed_data',
 'raw_data_csv': 'KaggleV2-May-2016.csv',
 'raw_data_path': 'data',
 'test_csv': 'test_set.csv',
 'train_csv': 'train_set.csv'}

`feature_pipeline_file`: file storing the preprocessing pipeline used for preparing the feature matrix

`labels_pipeline_file`: file storing the preprocessing pipeline used for
preparing labels

`objstore_path`: directory to store python objects to disk

`processed_data_path`: directory containing processed data

`raw_data_csv`: name of the CSV file downloaded from Kaggle

`raw_data_path`: directory containing raw data

`test_csv`: name of csv file containing test set

`train_csv`: name of csv file containing train set

You can change these paths and names to suit your project directory structure if you need to. E.g.,

In [4]:
# `file_config` is bound to the same dictionary object as `utils.file_config`,
# so any override made here is also visible through `utils.file_config`.
# Uncomment and edit the line below to override an entry.
file_config = utils.file_config
#file_config['raw_data_path'] = "some_other_directory"

First step is to create train test sets. Code is in file `proj2_lib/util.py` function `make_train_test_sets`. You
can edit that function as needed to include your own part I code if you so desire. The result will be to 
create files `train_set.csv` and `test_set.csv` in your `processed_data` directory (unless you change any of the entries in the configuration dictionary as above).

In [5]:
# ONLY NEED TO RUN THIS STEP ONCE (switch this to True to run it)
# The flag keeps "Run All" from redoing the train/test split on every run;
# the call receives the (possibly customised) `file_config` dictionary.
RUN_MAKE_TRAIN_TEST_FILES = False
if RUN_MAKE_TRAIN_TEST_FILES:
    utils.make_train_test_sets(config=file_config)

Next step is to fit the preprocessing pipelines. This is done in file `proj2_lib/preprocess.py`. Again you can edit code as needed in that file to incorporate your part I solution as you wish. The result will be to create files `feature_pipeline.pkl` and `labels_pipeline.pkl` containing the fit preprocessing pipelines we can then use to preprocess data.

In [6]:
import proj2_lib.preprocess as preprocess

# ONLY NEED TO RUN THIS STEP ONCE (switch this to True to run it)
# The flag keeps "Run All" from refitting and re-saving the preprocessing
# pipelines on every run; the call receives the same `file_config` dictionary
# used for the train/test split.
RUN_FIT_PREPROCESSING = False
if RUN_FIT_PREPROCESSING:
    preprocess.fit_save_pipelines(config=file_config)

Finally, once we do that, we can get a training matrix and labels:

In [7]:
train_X, train_y = preprocess.load_train_data(config=file_config)

In [8]:
print(train_X.shape)
print(train_y.shape)

(90526, 101)
(90526,)


### Model Building

Using `sklearn` fit:
    - DecisionTree classifier
    - RandomForest classifier
    - Linear SVM classifier
    - SVM with Radial Basis Kernel classifier
    
Use default parameters for now.
Using 10-fold cross validation report both accuracy and AUC for each of the above four models.

QUESTION: Should you use accuracy or AUC for this task as a performance metric?

_ANSWER HERE_

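    AUC is more suitable for this task: if the classes are imbalanced (e.g. most patients do show up), accuracy can look good for a model that always predicts the majority class, whereas AUC reflects how well the model ranks no-shows above shows.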

#### DecisionTree classifier

In [9]:
from sklearn.tree import DecisionTreeClassifier

tree_cls = DecisionTreeClassifier()
tree_cls.fit(train_X, train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

#### RandomForest classifier

In [10]:
from sklearn.ensemble import RandomForestClassifier

forest_cls = RandomForestClassifier()
forest_cls.fit(train_X,train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

#### Linear SVM classifier

In [11]:
from sklearn.svm import LinearSVC
lin_svm = LinearSVC()
lin_svm.fit(train_X,train_y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

#### SVM with Radial Basis Kernel classifier

In [12]:
from sklearn.svm import SVC
svm_cls= SVC(kernel="rbf")
svm_cls.fit(train_X,train_y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

#### Using 10-fold cross validation report both accuracy and AUC

In [None]:
def display_scores(model,acc,auc):
    """Print the per-fold scores, their mean and their standard deviation.

    model: label printed in the report header
    acc: array of accuracy scores (must provide .mean() and .std())
    auc: array of AUC scores (must provide .mean() and .std())
    """
    # Each entry is the argument tuple of one print() call, in output order.
    report_rows = [
        ("\nModel is {}".format(model),),
        ("\tAccuracy Scores:", acc),
        ("\tAccuracy Mean:", acc.mean()),
        ("\tAccuracy Sd:", acc.std()),
        ("\tAUC Scores:", auc),
        ("\tAUC Mean:", auc.mean()),
        ("\tAUC Sd:", auc.std()),
    ]
    for row in report_rows:
        print(*row)

In [None]:
from sklearn.model_selection import cross_val_score

# Assess every baseline model with 10-fold cross-validation.
# cross_validate scores accuracy and ROC AUC on the same fitted folds, so each
# model is trained 10 times rather than the 20 fits that two separate
# cross_val_score calls (one per metric) would need.
from sklearn.model_selection import cross_validate


def _cv_accuracy_and_auc(model):
    """Return (accuracy_scores, auc_scores), one entry per CV fold, for `model`."""
    fold_scores = cross_validate(model, train_X, train_y,
                                 scoring=["accuracy", "roc_auc"], cv=10,
                                 return_train_score=False)
    return fold_scores["test_accuracy"], fold_scores["test_roc_auc"]


tree_accuracy_scores, tree_auc_scores = _cv_accuracy_and_auc(tree_cls)
forest_accuracy_scores, forest_auc_scores = _cv_accuracy_and_auc(forest_cls)
lin_svm_accuracy_scores, lin_svm_auc_scores = _cv_accuracy_and_auc(lin_svm)
svm_cls_accuracy_scores, svm_cls_auc_scores = _cv_accuracy_and_auc(svm_cls)

display_scores('DecisionTree',tree_accuracy_scores,tree_auc_scores)
display_scores('RandomForest',forest_accuracy_scores,forest_auc_scores)
display_scores('LinearSVM',lin_svm_accuracy_scores,lin_svm_auc_scores)
display_scores('SVM',svm_cls_accuracy_scores,svm_cls_auc_scores)


### Model Tuning

Based on the above, choose two methods and fit a tuned model:
    - use 5-fold cross validation for model selection
    - use 10-fold cross validation for model assessment (based on appropriate performance metric)

Report estimated performance for both tuned classifiers

In [None]:
# tune your models here

from sklearn.model_selection import GridSearchCV

# Candidate values shared by both halves of the random-forest search space.
n_estimators_options = [3, 10, 30, 50]
max_features_options = [2, 4, 6, 8, 10]

# First grid leaves `bootstrap` at the estimator's default; the second grid
# repeats the same candidates with bootstrapping switched off.
param_grid = [
    {'n_estimators': n_estimators_options,
     'max_features': max_features_options},
    {'bootstrap': [False],
     'n_estimators': n_estimators_options,
     'max_features': max_features_options},
]

# Model selection: 5-fold cross-validation, ranked by ROC AUC.
forest_cls = RandomForestClassifier()
grid_search = GridSearchCV(
    estimator=forest_cls,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=5,
)
grid_search.fit(train_X, train_y)

In [None]:
grid_search.best_estimator_

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_score_

In [None]:

# Tune the RBF-kernel SVM over C only; every other SVC argument keeps its
# default. Model selection uses 5-fold cross-validation ranked by ROC AUC.
svm_cls = SVC(kernel="rbf")
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
svm_grids = GridSearchCV(
    estimator=svm_cls,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=5,
)
svm_grids.fit(train_X, train_y)

In [None]:
svm_grids.best_estimator_

In [None]:
svm_grids.best_params_

In [None]:
svm_grids.best_score_

### Linear SVM with Gradient Descent

In [None]:
import numpy as np

# initialize model parameters w and b
# intializing to 0 is not a good idea
# it should be a random vector see np.random.randn
# YOU NEED TO IMPLEMENT THIS
def _initialize_parameters(nfeatures):
    w = np.full((nfeatures), 0.0)
    b = np.full((1), 0.0)
    return w, b

# this is a vectorized version of positive_part operation
# we can use this for hinge loss as pos_part(1.0 - y*f)
def pos_part(u):
    """Return max(u, 0) element-wise.

    u: scalar or array-like of numbers
    returns: numpy array with the shape of `u`; entries that are not
             strictly positive are replaced by 0.

    Implemented with np.where instead of np.vectorize so it runs as a single
    array operation and also accepts empty input (np.vectorize raises on
    size-0 arrays when no output type is given).
    """
    u = np.asarray(u)
    return np.where(u > 0., u, 0.)

# compute the value of the linear SVM objective function
# given current signed distances, and parameter vector w
def _get_objective(f, y, w, lam):
    loss = np.sum(pos_part(1.0 - y*f))
    penalty = lam * np.dot(w,w)
    return loss + penalty

# compute the signed distances
# based on current model estimates
# w and b
# YOU NEED TO IMPLEMENT THIS
def _get_signed_distances(X, w, b):
    nobs = X.shape[0]
    f = np.full(nobs, 0.0)
    return f

# compute gradients with respect to w and b
# YOU NEEED TO IMPLEMENT THIS
def _get_gradients(f, X, y, w, b, lam):
    nfeatures = X.shape[1]
    gw = np.full((nfeatures), 0.)
    gb = 0.
    return gw, gb

# fit an SVM using gradient descent
# X: matrix of feature values
# y: labels (-1 or 1)
# lam: weight of the penalty term in the objective
# n_iter: number of iterations
# eta: learning rate
# returns the fitted parameters (w, b)
def fit_svm(X, y, lam, n_iter=100, eta=.4):
    nfeatures = X.shape[1]

    w, b = _initialize_parameters(nfeatures)

    for k in range(n_iter):
        f = _get_signed_distances(X, w, b)

        # every 10th iteration (including the very first one) report the
        # objective and halve the learning rate
        if k % 10 == 0:
            obj = _get_objective(f, y, w, lam)
            eta = eta / 2.0
            print("it: %d, obj %.2f" % (k, obj))

        gw, gb = _get_gradients(f, X, y, w, b, lam)
        w = w - eta * gw
        # step along the gradient gb; the previous code subtracted eta * b,
        # which ignored the gradient and merely shrank b towards zero
        b = b - eta * gb
    return w, b

In [None]:
w,b = fit_svm(train_X, train_y, 1.0, n_iter=100)