# Homework 3: NBC, KNN, and Ensemble Learning
## CIS600
## Evan Smith

#### Data Loading and EDA
First, we import the data from file.

In [127]:
import pandas as pd

test = pd.read_csv("C:/Users/evana/Documents/GitHub/SyracuseMasters/CIS_600_Fund_Data_Mining/HW3/Disease Prediction Testing.csv")
train = pd.read_csv("C:/Users/evana/Documents/GitHub/SyracuseMasters/CIS_600_Fund_Data_Mining/HW3/Disease Prediction Training.csv")

random_seed = 123

We then define and use two functions: one to display the null values per row and one for the unique values.

In [128]:
def DisplayNullRows (df):
    dataCleaningCounts = []
    for col in df.columns:
        dataCleaningCounts.append([col, df[col].nunique(), df[col].isnull().sum()])
    display(pd.DataFrame(dataCleaningCounts, columns = ['Column', 'Unique Values', 'Nulls']))
    return

print("Training Data Report")
DisplayNullRows(train)
print("Testing Data Report")
DisplayNullRows(test)

Training Data Report


Unnamed: 0,Column,Unique Values,Nulls
0,Age,28,0
1,Gender,2,0
2,Height,101,0
3,Weight,248,0
4,High Blood Pressure,143,0
5,Low Blood Pressure,143,0
6,Cholesterol,3,0
7,Glucose,3,0
8,Smoke,2,0
9,Alcohol,2,0


Testing Data Report


Unnamed: 0,Column,Unique Values,Nulls
0,ID,21000,0
1,Age,27,0
2,Gender,2,0
3,Height,81,0
4,Weight,182,0
5,High Blood Pressure,115,0
6,Low Blood Pressure,92,0
7,Cholesterol,3,0
8,Glucose,3,0
9,Smoke,2,0


We see that there are no null rows in either the test or training set, and all columns contain information (they have at least 2 unique values).

Next, we convert all non-numeric fields such that the various classifiers can process the inputs. First we determine the columns that need conversion.

In [38]:
train.select_dtypes(['object']).columns

Index(['Gender', 'Cholesterol', 'Glucose'], dtype='object')

Since many of these columns have an ordering that is important to capture, we manually dictate the encoding and update the dataset.

In [129]:
def NumerizeObjectColumns(df):
    
    df['Gender'] = df['Gender'].apply(lambda x: ['female', 'male'].index(x))
    df['Cholesterol'] = df['Cholesterol'].apply(lambda x: ['normal', 'high', 'too high'].index(x))
    df['Glucose'] = df['Glucose'].apply(lambda x: ['normal', 'high', 'too high'].index(x))
    
    return

NumerizeObjectColumns(train)
NumerizeObjectColumns(test)

Now we look to see if there are any notable outliers, starting by looking at the range of each column.

In [160]:
display(train.agg([min, max]))

Unnamed: 0,Age,Gender,Height,Weight,High Blood Pressure,Low Blood Pressure,Cholesterol,Glucose,Smoke,Alcohol,Exercise,Disease
min,29,0,55,10.0,-150,0,0,0,0,0,0,0
max,64,1,207,200.0,14020,11000,2,2,1,1,1,1


While most of the fields seem reasonable, the blood pressure range is clearly out of scope. Brief research shows that the max survivable everyday blood pressure is around 300, and the minimum is around 50. We drop all rows with values outside that range.

In [None]:
train.drop('High Blood Pressure')

Finally, we normalize all fields to ensure that we are using a scaled form of the data that will work well in our classifiers and then split them into testing and training sets using a fixed random seed.

In [130]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, ShuffleSplit
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X = train.drop('Disease', axis=1)
y = train['Disease']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_seed)
print(f"train data size is {X_train.shape}")

Y_train = train[-1:]

scaler.fit(X_train)

X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

train data size is (34300, 11)


### Naive Bayes Classifier

For this classifier, we select the most critical parameters to tune. In this case, we want to learn from class prior probabilities, so we only need to vary `binarize` and `alpha`. We oscillate these values around the default values and select the best performing model by accuracy.

In [78]:
from sklearn.model_selection import GridSearchCV
import warnings
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

warnings.filterwarnings("ignore")

param_grid = {'binarize': [0, 0.3, 0.8],
              'alpha': [0, 0.5, 1, 1.5]}

clf_nbc = GridSearchCV(BernoulliNB(), param_grid, n_jobs=-1, scoring='accuracy')
t = clf_nbc.fit(X_train_std, y_train)

GridSearchCV(estimator=BernoulliNB(), n_jobs=-1,
             param_grid={'alpha': [0, 0.1, 0.5, 1, 1.5],
                         'binarize': [0, 0.1, 0.3, 0.8]},
             scoring='accuracy')

### K Nearest Neighbors Classifier

For this classifier, we select the most critical parameters to tune. In this case, that is `weights`, `n_neighbors`, and `leaf_size`. We oscillate these values around the default values and select the best performing model by accuracy.

In [55]:
from sklearn.neighbors import KNeighborsClassifier

param_grid = {'weights': ['uniform', 'distance'],
              'n_neighbors': [4, 5, 6],
              'leaf_size': [25, 30, 35]}
clf_knn = GridSearchCV(KNeighborsClassifier(n_jobs=-1), param_grid, n_jobs=-1, scoring='accuracy')
clf_knn.fit(X_train_std, y_train)

### Random Forest Classifier

For the Random Forest Classifier, we chose to vary our parameters over the `n_estimators`, which controls the number of trees that are generated for each fold, and `max_features`, both of which are oscillated around the default. 

This classifier also is the first time we use a custom model selection object, with a 10-fold version overriding the default 5-fold to improve the consistency of accuracy measurements.

In [91]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

param_grid = {'n_estimators': [50, 100, 150],
              'max_features': [3, 5, 7]}
cv = KFold(n_splits=10, shuffle=True, random_state=random_seed)

clf_rfc = GridSearchCV(RandomForestClassifier(n_jobs=-1, random_state=random_seed), param_grid, n_jobs=-1, scoring='accuracy', cv=cv)
t = clf_rfc.fit(X_train_std, y_train)

GridSearchCV(cv=KFold(n_splits=10, random_state=16, shuffle=True),
             estimator=RandomForestClassifier(n_jobs=-1, random_state=123),
             n_jobs=-1,
             param_grid={'max_features': [3, 5, 7],
                         'n_estimators': [50, 100, 150]},
             scoring='accuracy')

### Gradient Boosting Machine

The GBM has very similar parameters to the Random Forest, in that we will vary the `n_estimators` parameter. In this case, that parameter interacts strongly with the `learning_rate`, since they act in a similar space but with opposite effects. The `max_depth` param is recommended for tuning within the documentation, so we vary it +-1 from the original default.

In [132]:
from sklearn.ensemble import GradientBoostingClassifier

param_grid = {'n_estimators': [100, 150],
              'learning_rate': [0.1, 0.3, 0.5],
              'max_depth': [2, 3, 4]}
cv = KFold(n_splits=10, shuffle=True, random_state=random_seed)

clf_gbm = GridSearchCV(GradientBoostingClassifier(random_state=random_seed), param_grid, n_jobs=-1, scoring='accuracy', cv=cv)
t = clf_gbm.fit(X_train_std, y_train)

### Generating Predictions

We can summarize the selected and tuned models and compare them by accuracy alongside their hyperparameters.

In [144]:
def printModelSummary(model):
    print(f"Model Name: {model['Name']}")
    print(f"\tAccuracy: {round(model['Clf'].best_score_*100, 2)}%")
    print(f"\tParams: {model['Clf'].best_estimator_.get_params()}\n")
    return

modelSet = [{'Name':"Naive Bayes", 'Clf':clf_nbc},
            {'Name':"K-Nearest Neighbors", 'Clf':clf_knn},
            {'Name':"Random Forest", 'Clf':clf_rfc},
            {'Name':"Gradient Boosting Machine", 'Clf':clf_gbm}
           ]

for model in modelSet:
    printModelSummary(model)

Model Name: Naive Bayes
	Accuracy: 71.34%
	Params: {'alpha': 0, 'binarize': 0, 'class_prior': None, 'fit_prior': True}

Model Name: K-Nearest Neighbors
	Accuracy: 63.51%
	Params: {'algorithm': 'auto', 'leaf_size': 25, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': -1, 'n_neighbors': 4, 'p': 2, 'weights': 'distance'}

Model Name: Random Forest
	Accuracy: 73.81%
	Params: {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 4, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': 123, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}



We now take those tuned classifiers and use each of them to produce a set of predictions for the output set. We then save those predictions to a csv.

In [147]:
test_data = test.drop('ID', axis=1)

frame = {'ID': test['ID'],
         'NBC' : clf_nbc.predict(test_data),
         'KNN' : clf_knn.predict(test_data),
         'RF' : clf_rfc.predict(test_data),
         'GBM' : clf_gbm.predict(test_data)}

outputData = pd.DataFrame(frame)
outputData.to_csv('HW3_output_predictions.csv', index=False)

NameError: name 'clf_gbm' is not defined