In [1]:
# Author: Elsha Siochi
# Description: Build a neural network for thyroid function classification

import sys
stdout = sys.stdout
sys.stdout = stdout
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))




/kaggle/input/thyroid-dataset-donated-by-peter-turney/whole-ann-dataset.data


# **Load dataset**

In [2]:
# header names
headers = ["age", "sex", "on thyroxine", "query on thyroxine", "on antithyroid medication", 
"sick", "pregnant", "thyroid surgery", "I131 treatment", "query hypothyroid", "query hyperthyroid", 
"lithium", "goitre", "tumor", "hypopituitary", "psych", "TSH", "T3", "TT4", "T4U", "FTI", "thyroid function"]

warnings.filterwarnings("ignore") #suppress warnings from not match length of data and header
thyroid_df = pd.read_csv('../input/thyroid-dataset-donated-by-peter-turney/whole-ann-dataset.data',
                        sep=' ',
                        names=headers,
                        index_col=False)

# Only select features used
x = thyroid_df.iloc[:,[0,1, 6, 16,17,18,19,20]]
y = thyroid_df.iloc[:,-1]
print(x)
print(f'Feature count: {len(x.columns)}')



       age  sex  pregnant      TSH      T3    TT4    T4U     FTI
0     0.73    0         0  0.00060  0.0150  0.120  0.082  0.1460
1     0.24    0         0  0.00025  0.0300  0.143  0.133  0.1080
2     0.47    0         0  0.00190  0.0240  0.102  0.131  0.0780
3     0.64    1         0  0.00090  0.0170  0.077  0.090  0.0850
4     0.23    0         0  0.00025  0.0260  0.139  0.090  0.1530
...    ...  ...       ...      ...     ...    ...    ...     ...
7195  0.59    0         0  0.00250  0.0208  0.079  0.099  0.0800
7196  0.51    0         0  0.10600  0.0060  0.005  0.089  0.0055
7197  0.51    0         0  0.00076  0.0201  0.090  0.067  0.1340
7198  0.35    1         0  0.00280  0.0201  0.090  0.089  0.1010
7199  0.73    0         0  0.00056  0.0201  0.081  0.090  0.0900

[7200 rows x 8 columns]
Feature count: 8


In [3]:
print(x)
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Preliminary train (cross-validation) and test (hold out) set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.10, random_state=1, stratify=y)

# Normalize/Standardize training data first
scaler = MinMaxScaler()   
x_train_scale = x_train.iloc[:,[0, 3, 4, 5, 6, 7]]
x_train_noscale = x_train.iloc[:, [1, 2]]  #bianry vals, sex and pregnant

scaler.fit(x_train_scale)
dump(scaler, 'standard_scaler.joblib') #store to file for future use of scaler w/ same params just in case
x_train_scale = scaler.transform(x_train_scale)
train_scaled = np.concatenate((x_train_scale, x_train_noscale), axis=1) #append sex and pregnant
x_train = pd.DataFrame(train_scaled, columns=["age", "TSH", "T3", "TT4", "T4U", "FTI", "sex", "pregnant"])
x_train.to_csv('minmaxscaled_train_features.csv')


# Repeat for testing, except use the same fitted scaler for transformation
x_test_scale = x_test.iloc[:,[0, 3, 4, 5, 6, 7]]
x_test_noscale = x_test.iloc[:, [1, 2]]

x_test_scale = scaler.transform(x_test_scale)
test_scaled = np.concatenate((x_test_scale, x_test_noscale), axis=1) #append sex and pregnant
x_test = pd.DataFrame(test_scaled, columns=["age", "TSH", "T3", "TT4", "T4U", "FTI", "sex", "pregnant"])

x_test.to_csv('minmaxscaled_test_features.csv')

       age  sex  pregnant      TSH      T3    TT4    T4U     FTI
0     0.73    0         0  0.00060  0.0150  0.120  0.082  0.1460
1     0.24    0         0  0.00025  0.0300  0.143  0.133  0.1080
2     0.47    0         0  0.00190  0.0240  0.102  0.131  0.0780
3     0.64    1         0  0.00090  0.0170  0.077  0.090  0.0850
4     0.23    0         0  0.00025  0.0260  0.139  0.090  0.1530
...    ...  ...       ...      ...     ...    ...    ...     ...
7195  0.59    0         0  0.00250  0.0208  0.079  0.099  0.0800
7196  0.51    0         0  0.10600  0.0060  0.005  0.089  0.0055
7197  0.51    0         0  0.00076  0.0201  0.090  0.067  0.1340
7198  0.35    1         0  0.00280  0.0201  0.090  0.089  0.1010
7199  0.73    0         0  0.00056  0.0201  0.081  0.090  0.0900

[7200 rows x 8 columns]


# Using built-in MLPClassifier

In [4]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, cohen_kappa_score, precision_recall_fscore_support, classification_report

mlp = MLPClassifier(learning_rate_init=0.001, random_state=1)

# Split, Train, and Grid Search

In [5]:
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import make_scorer
from imblearn.metrics import geometric_mean_score

# Create StratifiedKFold object for parameter-tuning
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

hyperparameter_space = {
  'activation': ['logistic', 'relu', 'tanh'],
  'solver': ['sgd', 'adam'],
  'max_iter': [200, 300, 400, 500],
  'hidden_layer_sizes': [(12, 8,), (12,)],
}

gm_scorer = make_scorer(geometric_mean_score, greater_is_better=True, average='macro')
scoring = {'gmean': gm_scorer,
            'f1_macro': 'f1_macro',
           'accuracy': 'accuracy'}
gs = GridSearchCV(mlp, param_grid=hyperparameter_space , 
                  scoring=scoring,
                  refit='gmean',
                  cv=skf, return_train_score=True)

gs.fit(x_train, y_train)
print("Optimal hyperparameter combination: ", gs.best_params_)
print("Mean cross-validated gmean of the best_estimator: ", gs.best_score_)
gs.best_estimator_.fit(x_train, y_train)
y_pred = gs.best_estimator_.predict(x_test)

acc_test = accuracy_score(y_test, y_pred)
print("Accuracy test score: ", np.round(acc_test, 4))

gmean_test = geometric_mean_score(y_test, y_pred)
print("GMean test score: ", np.round(gmean_test, 4))

print("GS:")
print(gs)

pd.DataFrame(gs.cv_results_)
# res_df = pd.DataFrame(gs.cv_results_)


Optimal hyperparameter combination:  {'activation': 'relu', 'hidden_layer_sizes': (12, 8), 'max_iter': 500, 'solver': 'adam'}
Mean cross-validated gmean of the best_estimator:  0.9339772560027312
Accuracy test score:  0.9792
GMean test score:  0.955
GS:
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=1, shuffle=True),
             estimator=MLPClassifier(random_state=1),
             param_grid={'activation': ['logistic', 'relu', 'tanh'],
                         'hidden_layer_sizes': [(12, 8), (12,)],
                         'max_iter': [200, 300, 400, 500],
                         'solver': ['sgd', 'adam']},
             refit='gmean', return_train_score=True,
             scoring={'accuracy': 'accuracy', 'f1_macro': 'f1_macro',
                      'gmean': make_scorer(geometric_mean_score, average=macro)})


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_activation,param_hidden_layer_sizes,param_max_iter,param_solver,params,split0_test_gmean,...,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_train_accuracy,split1_train_accuracy,split2_train_accuracy,split3_train_accuracy,split4_train_accuracy,mean_train_accuracy,std_train_accuracy
0,0.684895,0.011901,0.004717,0.000168,logistic,"(12, 8)",200,sgd,"{'activation': 'logistic', 'hidden_layer_sizes...",0.471405,...,0.925772,0.000309,27,0.925733,0.925733,0.925733,0.925733,0.925926,0.925772,7.7e-05
1,3.867448,0.02846,0.004978,0.000581,logistic,"(12, 8)",200,adam,"{'activation': 'logistic', 'hidden_layer_sizes...",0.693427,...,0.94213,0.002289,21,0.941551,0.942323,0.941937,0.94213,0.94348,0.942284,0.00065
2,0.694692,0.008421,0.004609,4.5e-05,logistic,"(12, 8)",300,sgd,"{'activation': 'logistic', 'hidden_layer_sizes...",0.471405,...,0.925772,0.000309,27,0.925733,0.925733,0.925733,0.925733,0.925926,0.925772,7.7e-05
3,5.768951,0.067199,0.00464,2.6e-05,logistic,"(12, 8)",300,adam,"{'activation': 'logistic', 'hidden_layer_sizes...",0.701885,...,0.94213,0.00234,21,0.941165,0.942515,0.942901,0.941937,0.943673,0.942438,0.000851
4,0.694631,0.001857,0.004608,2e-05,logistic,"(12, 8)",400,sgd,"{'activation': 'logistic', 'hidden_layer_sizes...",0.471405,...,0.925772,0.000309,27,0.925733,0.925733,0.925733,0.925733,0.925926,0.925772,7.7e-05
5,7.775856,0.079701,0.004821,0.000177,logistic,"(12, 8)",400,adam,"{'activation': 'logistic', 'hidden_layer_sizes...",0.726843,...,0.942593,0.004515,19,0.94213,0.943866,0.946181,0.94348,0.945602,0.944252,0.001469
6,0.684252,0.009474,0.004661,0.000198,logistic,"(12, 8)",500,sgd,"{'activation': 'logistic', 'hidden_layer_sizes...",0.471405,...,0.925772,0.000309,27,0.925733,0.925733,0.925733,0.925733,0.925926,0.925772,7.7e-05
7,9.723712,0.426768,0.004915,0.000501,logistic,"(12, 8)",500,adam,"{'activation': 'logistic', 'hidden_layer_sizes...",0.74488,...,0.944599,0.005864,14,0.943866,0.945023,0.948881,0.94348,0.945409,0.945332,0.001912
8,0.514289,0.012671,0.004519,0.000182,logistic,"(12,)",200,sgd,"{'activation': 'logistic', 'hidden_layer_sizes...",0.471405,...,0.925772,0.000309,27,0.925733,0.925733,0.925733,0.925733,0.925926,0.925772,7.7e-05
9,3.011511,0.033815,0.004388,7.4e-05,logistic,"(12,)",200,adam,"{'activation': 'logistic', 'hidden_layer_sizes...",0.63157,...,0.938426,0.001786,24,0.937886,0.938079,0.93885,0.938079,0.939236,0.938426,0.000523


In [6]:
gmean_test = geometric_mean_score(y_test, y_pred)
print("GMean test score: ", np.round(gmean_test, 4))

summary = classification_report(y_test, y_pred, digits=4)
print("Summary report: \n", summary)

res_df = pd.DataFrame(gs.cv_results_)
res_df.to_csv('results[minmaxscaled-gmean-best].csv')

GMean test score:  0.955
Summary report: 
               precision    recall  f1-score   support

           1     0.8333    0.9375    0.8824        16
           2     0.7955    0.9459    0.8642        37
           3     0.9954    0.9820    0.9887       667

    accuracy                         0.9792       720
   macro avg     0.8747    0.9552    0.9117       720
weighted avg     0.9816    0.9792    0.9799       720



In [7]:
gs.best_estimator_.coefs_


[array([[-3.20455021e-002,  4.17108075e-001,  1.60609943e-179,
         -9.06909888e-002, -1.73153021e+000, -1.06499085e-001,
         -9.04820730e-002,  1.59790654e-001,  1.39064805e-001,
          1.72975696e-001,  1.72052055e-001,  3.84144779e-001],
        [-5.47156655e+000, -1.46737304e+000, -2.43064990e-182,
          5.68968473e+000,  3.41923927e-001, -3.57429751e+000,
         -4.67527745e+000, -2.03226679e+000, -4.70207449e+000,
          5.48120623e+000, -4.29828922e+000,  1.08682931e+000],
        [ 1.62513385e-001, -3.65773962e-001, -1.05342297e-191,
         -1.78743452e-001, -2.06910606e-001,  6.31597272e-001,
         -8.21485486e-001,  6.17666201e-001, -2.12695968e-002,
          3.12150148e-001,  4.67547961e-001, -4.54499810e-002],
        [ 5.95744905e-002, -9.57551216e-001,  3.26255066e-181,
          5.43740241e-001,  8.08486942e-001,  8.79256440e-001,
         -7.51381670e-001,  1.80853733e+000, -1.06452836e+000,
          1.00856937e-001,  9.52229381e-001, -3.5993

In [8]:
#store model
dump(gs.best_estimator_, 'best_estimator_minmax.joblib')

['best_estimator_minmax.joblib']