#### CANB8347 Machine Learning Project
Cross-validating supervised ML methods on pre-cleaned dataset

In [1]:
from ml_utils import *

# sklearn tools
from sklearn.preprocessing import normalize
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# read in data that has been preprocessed to only contain numeric values
vlbw = pd.read_csv('data/vlbw_train_numeric.csv')

pull out labels for data as the column that we want to predict (live births)  
then, drop that column from the training data to ignore it in our algorithms

In [3]:
labels = vlbw['dead']
vlbw.drop('dead', axis=1, inplace=True)

In [4]:
# fill NaNs with mode value of each column
vlbw_filled = SimpleImputer(strategy='most_frequent').fit_transform(vlbw)

In [5]:
# normalize each column to fractional representation between 0 and 1
vlbw_filled_norm = normalize(vlbw_filled, axis=1, norm='l1')

---
Perform k-fold split on preprocessed data for downstream validation of classifiers

In [6]:
vlbw_splits = kfold_split(vlbw_filled_norm, labels, 5, seed=18, shuffle=True)

Test supervised classifiers using `validator` function

In [7]:
# Logistic regression classifier
clf = LogisticRegressionCV(cv=5, random_state=0, multi_class='multinomial', max_iter=1000)

In [8]:
validator(vlbw_splits, clf)


Split 0: 0.8518518518518519
[[79  3]
 [13 13]]

Split 1: 0.8888888888888888
[[83  1]
 [11 13]]

Split 2: 0.8504672897196262
[[77  2]
 [14 14]]

Split 3: 0.8785046728971962
[[83  7]
 [ 6 11]]

Split 4: 0.8411214953271028
[[75  2]
 [15 15]]


In [9]:
# k-nearest neighbor classifier
neigh = KNeighborsClassifier(n_neighbors=3)

In [10]:
validator(vlbw_splits, neigh)


Split 0: 0.8796296296296297
[[76  6]
 [ 7 19]]

Split 1: 0.8888888888888888
[[79  5]
 [ 7 17]]

Split 2: 0.8317757009345794
[[76  3]
 [15 13]]

Split 3: 0.9065420560747663
[[86  4]
 [ 6 11]]

Split 4: 0.822429906542056
[[73  4]
 [15 15]]
