#### CANB8347 Machine Learning Project
Trying out supervised ML methods on pre-cleaned and imputed dataset
##### 3) Algorithm Validation

In [1]:
from ml_utils import *

# sklearn tools
from sklearn.preprocessing import normalize
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [2]:
# read in data that has been preprocessed to only contain numeric values
vlbw = pd.read_csv('data/vlbw_train_imputed.csv')

pull out labels for data as the column that we want to predict (live births)  
then, drop that column from the training data to ignore it in our algorithms

In [3]:
labels = vlbw['dead']
vlbw.drop('dead', axis=1, inplace=True)

In [4]:
# normalize each column to fractional representation between 0 and 1
vlbw_norm = normalize(vlbw, axis=1, norm='l1')

---
Perform k-fold split on preprocessed data for downstream validation of classifiers

In [6]:
vlbw_splits = kfold_split(vlbw_norm, labels, 5, seed=18, shuffle=True)

Test supervised classifiers using `validator` function

In [7]:
# Logistic regression classifier
clf = LogisticRegressionCV(cv=5, random_state=0, multi_class='multinomial', max_iter=1000)

In [8]:
validator(vlbw_splits, clf)


Split 0: 0.8543689320388349
[[72  1]
 [14 16]]

Split 1: 0.9029126213592233
[[82  1]
 [ 9 11]]

Split 2: 0.8640776699029126
[[78  3]
 [11 11]]

Split 3: 0.9019607843137255
[[82  1]
 [ 9 10]]

Split 4: 0.8725490196078431
[[81  5]
 [ 8  8]]


In [9]:
# k-nearest neighbor classifier
neigh = KNeighborsClassifier(n_neighbors=3)

In [10]:
validator(vlbw_splits, neigh)


Split 0: 0.8155339805825242
[[70  3]
 [16 14]]

Split 1: 0.8932038834951457
[[81  2]
 [ 9 11]]

Split 2: 0.8737864077669902
[[77  4]
 [ 9 13]]

Split 3: 0.9117647058823529
[[81  2]
 [ 7 12]]

Split 4: 0.8921568627450981
[[83  3]
 [ 8  8]]


In [11]:
# random forest classifier
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)

In [12]:
validator(vlbw_splits, rf)


Split 0: 0.8543689320388349
[[72  1]
 [14 16]]

Split 1: 0.9029126213592233
[[80  3]
 [ 7 13]]

Split 2: 0.9223300970873787
[[78  3]
 [ 5 17]]

Split 3: 0.9313725490196079
[[81  2]
 [ 5 14]]

Split 4: 0.9411764705882353
[[83  3]
 [ 3 13]]
