#### CANB8347 Machine Learning Project
Trying out supervised ML methods on pre-cleaned dataset

In [2]:
import numpy as np
import pandas as pd
import scipy as sc

# sklearn tools
from sklearn.preprocessing import normalize
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier

# plotting tools
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(style = 'white')
%matplotlib inline

In [6]:
# read in data that has been preprocessed to only contain numeric values
vlbw = pd.read_csv('data/vlbw_train_numeric.csv')

pull out labels for data as the column that we want to predict (live births)  
then, drop that column from the training data to ignore it in our algorithms

In [7]:
labels = vlbw['dead']
vlbw.drop('dead', axis=1, inplace=True)

impute missing values with mode of each feature  
might not be the best strategy for dealing with `NaN`s, but good for first-pass

In [8]:
# fill NaNs with mode value of each column
vlbw_filled = SimpleImputer(strategy='most_frequent').fit_transform(vlbw)

In [9]:
# normalize each column to fractional representation between 0 and 1
vlbw_filled_norm = normalize(vlbw_filled, axis=1, norm='l1')

---
Try some supervised ML algorithms on the training data and labels

In [10]:
lda = LinearDiscriminantAnalysis(n_components=2)
vlbw_lda = lda.fit(X=vlbw_filled_norm, y=labels).transform(vlbw_filled_norm)



In [11]:
# Logistic regression classifier
clf = LogisticRegressionCV(cv=5, random_state=0, multi_class='multinomial', max_iter=1000).fit(vlbw_filled_norm, labels)

In [12]:
# get score of correct predictions in training data
clf.score(vlbw_filled_norm, labels)

0.8901303538175046

In [13]:
# KNN classifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(vlbw_filled_norm, labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [14]:
# get score of correct predictions in training data
neigh.score(vlbw_filled_norm, labels)

0.9217877094972067