# Supervised Model Cup 2022

Submitted By: Denisse Joyce Alido

---

APPROACH: Use all models with only their default hyperparameters values (except for those that will produce an error/warning message). Get the model with the highest $F_{1}$ value and tune hyperparameters of said model.

Loading Python libraries.

In [10]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

# for splitting of data
from sklearn.model_selection import train_test_split

# to get optimal hyperparameters' values
from sklearn.model_selection import GridSearchCV

Creating a dataframe from the CSV file.

In [11]:
df = pd.read_csv("annthyroid-training.csv", header=None)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,0.750000,1,0,1,1,1,1,1,0,1,...,1,1,1,1,0.001132,0.080780,0.197324,0.300926,0.225000,1
1,0.239583,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0.000472,0.164345,0.235786,0.537037,0.165625,1
2,0.479167,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0.003585,0.130919,0.167224,0.527778,0.118750,1
3,0.656250,0,1,1,1,1,1,1,1,1,...,1,1,1,1,0.001698,0.091922,0.125418,0.337963,0.129688,1
4,0.229167,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0.000472,0.142061,0.229097,0.337963,0.235938,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,0.875000,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0.060377,0.050696,0.088629,0.333333,0.093750,-1
6996,0.218750,0,1,1,1,1,1,1,1,1,...,1,1,1,1,0.004340,0.097493,0.239130,0.347222,0.243750,1
6997,0.229167,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0.005094,0.109192,0.103679,0.291667,0.121875,1
6998,0.531250,0,1,1,1,1,1,1,1,1,...,1,1,1,0,0.002830,0.109192,0.160535,0.328704,0.170313,1


Checking if there are no missing values from the dataset.

In [12]:
if df.isnull().sum().sum()==0:
    print("No missing data")
    
else:
    print("Dataset with missing data")
    print(df.isnull().sum())

No missing data


Defining X and y from the dataframe and splitting these data into the training set and validation set.

In [13]:
X = df.iloc[:,:-1]
y = df[21]

# splitting training and validation data
train_X, valid_X, train_y, valid_y = train_test_split(X , y, train_size = 0.8, test_size = 0.2, random_state = 0)

Getting $F_{1}$ value for models.

In [14]:
def getF1(tn, fp, fn, tp):
    accuracy = (tn+tp)/(tn+fn+fp+tp)
    
    # sensitivity/recall - ratio between number of true positives and
    # the number of all samples whose class is the positive one
    sensitivity = tp/(tp+fn)
    
    # specificity - true negative (correct negative predictions
    # to actual negatives) 
    specificity = tn/(tn+fp)
    
    # precision - ratio between number of true positives and 
    # number of all samples classified as positive
    precision = tp/(tp+fp)
    
    f1 = (2*(precision*sensitivity))/(precision+sensitivity)
    
    return f1

### Random Forest Classifier

In [16]:
model = RandomForestClassifier(random_state=0) # n_estimators=100 default; no max depth
model.fit(train_X, train_y)
predictions = model.predict(valid_X)


tn, fp, fn, tp = confusion_matrix(valid_y.values, predictions).ravel()
print("True Negative: {}".format(tn))
print("False Positive: {}".format(fp))
print("False Negative: {}".format(fn))
print("True Positive: {}".format(tp))

res_RF = getF1(tn, fp, fn, tp)
print(res_RF)

# creating a dictionary for later use (identifying which model has the best F1 value)
dict_results = {'Random Forest Classifier':res_RF}

True Negative: 108
False Positive: 0
False Negative: 4
True Positive: 1288
0.9984496124031007


### K Nearest Neighbors

In [17]:
model = KNeighborsClassifier()
model.fit(train_X, train_y)
predictions = model.predict(valid_X)

tn, fp, fn, tp = confusion_matrix(valid_y.values, predictions).ravel()
print("True Negative: {}".format(tn))
print("False Positive: {}".format(fp))
print("False Negative: {}".format(fn))
print("True Positive: {}".format(tp))

res_KNN = getF1(tn, fp, fn, tp)
print(res_KNN)
dict_results['K Nearest Neighbors'] = res_KNN

True Negative: 34
False Positive: 74
False Negative: 6
True Positive: 1286
0.9698340874811462


### Multilayer Perception

In [18]:
model = MLPClassifier(random_state=0, max_iter=10000)
# also tried a higher max iter value but f1 value did not increase
# max iter fixed warning message

model.fit(train_X, train_y)
predictions = model.predict(valid_X)

tn, fp, fn, tp = confusion_matrix(valid_y.values, predictions).ravel()
print("True Negative: {}".format(tn))
print("False Positive: {}".format(fp))
print("False Negative: {}".format(fn))
print("True Positive: {}".format(tp))

res_MLP = getF1(tn, fp, fn, tp)
print(res_MLP)
dict_results['Multilayer Perception'] = res_MLP

True Negative: 102
False Positive: 6
False Negative: 10
True Positive: 1282
0.9937984496124032


### Logistic Regression

In [19]:
model = LogisticRegression(max_iter=10000)
model.fit(train_X, train_y)
predictions = model.predict(valid_X)

tn, fp, fn, tp = confusion_matrix(valid_y.values, predictions).ravel()
print("True Negative: {}".format(tn))
print("False Positive: {}".format(fp))
print("False Negative: {}".format(fn))
print("True Positive: {}".format(tp))

res_LR = getF1(tn, fp, fn, tp)
print(res_LR)
dict_results['Logistic Regressor'] = res_LR

True Negative: 22
False Positive: 86
False Negative: 0
True Positive: 1292
0.9677902621722847


### Naive Bayes

In [20]:
model = GaussianNB()
model.fit(train_X, train_y)
predictions = model.predict(valid_X)

tn, fp, fn, tp = confusion_matrix(valid_y.values, predictions).ravel()
print("True Negative: {}".format(tn))
print("False Positive: {}".format(fp))
print("False Negative: {}".format(fn))
print("True Positive: {}".format(tp))

res_NB = getF1(tn, fp, fn, tp)
print(res_NB)
dict_results['Naive Bayes'] = res_NB

True Negative: 107
False Positive: 1
False Negative: 1216
True Positive: 76
0.1110299488677867


Determine which model produced the highest $F_{1}$ value.

In [22]:
model_fin = max(dict_results, key=dict_results.get)
print("Model w/ highest F1: {}\nF1 Value: {}".format(model_fin, dict_results[model_fin]))

Model w/ highest F1: Random Forest Classifier
F1 Value: 0.9984496124031007


### Tuning of hyperparameters: Apply GridSearchCV to check best hyperparameter values

Applying GridSearchCV to avoid overfitting when getting the best hyperparameter values.

Hyperparameters:

In [23]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [2,100]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]

Param grid:

In [24]:
param_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

param_grid

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
 'max_features': ['auto', 'sqrt', 'log2'],
 'max_depth': [2, 100],
 'min_samples_split': [2, 5],
 'min_samples_leaf': [1, 2],
 'bootstrap': [True, False]}

In [25]:
model_final = RandomForestClassifier(random_state=0)

In [26]:
model_grid = GridSearchCV(estimator=model_final, param_grid=param_grid, cv=3, verbose=3, n_jobs=4)

In [27]:
model_grid.fit(train_X, train_y)

Fitting 3 folds for each of 480 candidates, totalling 1440 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=0), n_jobs=4,
             param_grid={'bootstrap': [True, False], 'max_depth': [2, 100],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5],
                         'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90,
                                          100]},
             verbose=3)

In [28]:
model_grid.best_params_

{'bootstrap': True,
 'max_depth': 100,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 100}

In [29]:
model_final = RandomForestClassifier(random_state=0, bootstrap=False, max_depth=4, max_features='auto',
                                     min_samples_leaf=1, min_samples_split=2, n_estimators=100)
model_final.fit(train_X, train_y)
predictions = model_final.predict(valid_X)


tn, fp, fn, tp = confusion_matrix(valid_y.values, predictions).ravel()
print("Random Forest Classifier (with hyperparameter tuning):")
print("True Negative: {}".format(tn))
print("False Positive: {}".format(fp))
print("False Negative: {}".format(fn))
print("True Positive: {}".format(tp))

result = getF1(tn, fp, fn, tp)
print("Final F1 Value (validation set): ",result)

Random Forest Classifier (with hyperparameter tuning):
True Negative: 62
False Positive: 46
False Negative: 3
True Positive: 1289
Final F1 Value (validation set):  0.9813475447278264


---