In [2]:
import pandas as pd
import numpy as np

import csv
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedShuffleSplit

In [13]:
# Model-specific imports:
from sklearn.tree  import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from xgboost.sklearn import XGBClassifier

In [4]:
inputData = pd.read_csv('../meryck/CTG1.csv').sample(frac = 1)
inputData.head()

Unnamed: 0,LB,AC,FM,UC,ASTV,MSTV,ALTV,MLTV,DL,DS,DP,NSP
1193,133,5,0,7,41,1.1,9,13.0,0,0,0,1
1170,126,0,0,3,21,1.5,0,10.4,0,0,0,1
588,120,0,5,2,38,1.3,0,15.8,0,0,0,1
242,125,0,2,1,66,0.4,20,6.7,0,0,0,2
1689,137,17,0,8,56,2.4,0,0.4,5,0,0,1


In [5]:
# select the input brain volume vector (X) and the output age vector (y)
X = inputData.drop(columns=['NSP'])
y = inputData['NSP']

# split the X and y vectors into training and testing dataset
# we're using 80% for training and 20% for testing, but this can change
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

In [6]:
# instantiate a decision tree model using sklearn:
dt_model = DecisionTreeClassifier(criterion='gini',max_depth=7, min_samples_leaf=2, min_samples_split=7)


# fit the model to the training data
dt_model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=7, min_samples_leaf=2, min_samples_split=7)

In [7]:
# check the model's performance on unseen, test data
dt_predictions = dt_model.predict(X_test)

# compare the predictions to the known values (y_test)
print('f1 score', metrics.f1_score(y_test, dt_predictions, average='macro'))
print('precision:', metrics.precision_score(y_test, dt_predictions, average='macro'))
print('recall:', metrics.recall_score(y_test, dt_predictions, average='macro'))
print(metrics.confusion_matrix(y_test, dt_predictions))

f1 score 0.8721132006644883
precision: 0.9270767943084327
recall: 0.8322859657030026
[[329   5   0]
 [ 21  40   1]
 [  4   0  26]]


In [19]:
param_dict={
    #"criterion" : ['gini', 'entropy'],
            "max_depth" : range(1,10),
            "min_samples_split" : range(2,10), 
            "min_samples_leaf" : range(2,5)
           }
grid = GridSearchCV(dt_model, param_grid=param_dict, verbose=1, n_jobs=1)
grid.fit(X_train, y_train)
print(grid.best_params_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
{'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 7}


### Gaussian Process

In [10]:
# instantiate a Gaussian Process model using sklearn:
gp_model = GaussianProcessClassifier(copy_X_train = True, multi_class = 'one_vs_one')


# fit the model to the training data
gp_model.fit(X_train, y_train)

GaussianProcessClassifier(multi_class='one_vs_one')

In [11]:
# check the model's performance on unseen, test data
gp_predictions = gp_model.predict(X_test)

# compare the predictions to the known values (y_test)
print('f1 score', metrics.f1_score(y_test, gp_predictions, average='macro'))
print('precision:', metrics.precision_score(y_test, gp_predictions, average='macro'))
print('recall:', metrics.recall_score(y_test, gp_predictions, average='macro'))
print(metrics.confusion_matrix(y_test, gp_predictions))


f1 score 0.8462855239047142
precision: 0.8447733447733449
recall: 0.8478312192818663
[[318  15   1]
 [ 11  47   4]
 [  4   1  25]]


In [25]:
param_dict={
    #"criterion" : ['gini', 'entropy'],
            "copy_X_train" : [True, False],
            "multi_class" : ['one_vs_rest', 'one_vs_one']
#             "n_restarts_optimizer" : range(2,5)
           }
grid = GridSearchCV(gp_model, param_grid=param_dict, verbose=2, n_jobs=1)
grid.fit(X_train, y_train)
print(grid.best_params_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END .........copy_X_train=True, multi_class=one_vs_rest; total time=   9.4s
[CV] END .........copy_X_train=True, multi_class=one_vs_rest; total time=  10.0s
[CV] END .........copy_X_train=True, multi_class=one_vs_rest; total time=  10.1s
[CV] END .........copy_X_train=True, multi_class=one_vs_rest; total time=  10.0s
[CV] END .........copy_X_train=True, multi_class=one_vs_rest; total time=  11.2s
[CV] END ..........copy_X_train=True, multi_class=one_vs_one; total time=   4.9s
[CV] END ..........copy_X_train=True, multi_class=one_vs_one; total time=   5.0s
[CV] END ..........copy_X_train=True, multi_class=one_vs_one; total time=   4.8s
[CV] END ..........copy_X_train=True, multi_class=one_vs_one; total time=   5.0s
[CV] END ..........copy_X_train=True, multi_class=one_vs_one; total time=   5.6s
[CV] END ........copy_X_train=False, multi_class=one_vs_rest; total time=   9.8s
[CV] END ........copy_X_train=False, multi_class=

### XGBOOST

In [14]:
# instantiate a Gaussian Process model using sklearn:
xgb_model = XGBClassifier()


# fit the model to the training data
xgb_model.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [15]:
# check the model's performance on unseen, test data
xgb_predictions = xgb_model.predict(X_test)

# compare the predictions to the known values (y_test)
print('f1 score', metrics.f1_score(y_test, xgb_predictions, average='macro'))
print('precision:', metrics.precision_score(y_test, xgb_predictions, average='macro'))
print('recall:', metrics.recall_score(y_test, xgb_predictions, average='macro'))
print(metrics.confusion_matrix(y_test, xgb_predictions))


f1 score 0.9157270793940925
precision: 0.9425202342265008
recall: 0.900231794475565
[[331   3   0]
 [ 16  44   2]
 [  0   0  30]]


In [34]:
param_dict={
    #"criterion" : ['gini', 'entropy'],
            "eta" : [0.001, 0.05, 0.01],
#             "n_restarts_optimizer" : range(2,5)
           }
grid = GridSearchCV(xgb_model, param_grid=param_dict, verbose=1, n_jobs=1)
grid.fit(X_train, y_train)
print(grid.best_params_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits




























































{'eta': 0.001}
