In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [81]:
# Import necessary libraries
import pandas as pd
import numpy as np


from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn import metrics


In [72]:
%cd /content/drive/MyDrive/Github/TM10007_ML/worcgist

# Data loading functions. Uncomment the one you want to use
from worcgist.load_data import load_data
#from worclipo.load_data import load_data
#from worcliver.load_data import load_data
#from ecg.load_data import load_data


#load data
data = load_data()
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')
data.info()


/content/drive/MyDrive/Github/TM10007_ML/worcgist
/content/drive/MyDrive/Github/TM10007_ML/worcgist
The number of samples: 246
The number of columns: 494
<class 'pandas.core.frame.DataFrame'>
Index: 246 entries, GIST-001_0 to GIST-246_0
Columns: 494 entries, label to PREDICT_original_phasef_phasesym_entropy_WL3_N5
dtypes: float64(468), int64(25), object(1)
memory usage: 951.3+ KB


In [82]:
##PREPROCESSING

# Replace label values from string to binary
data['label'] = data['label'].replace({'GIST': 1, 'non-GIST': 0})

# Separate the features and labels
X = data.drop(['label'], axis=1)
y = data['label']

# Data scaling
X = StandardScaler().fit_transform(X)


[[ 0.13710764  0.15772696  2.01348899 ...  1.25989044 -0.36182743
   1.77275927]
 [ 0.89501907 -0.85685132 -0.03789684 ... -0.22413129 -0.36182743
   0.42862155]
 [ 0.86645156 -0.77484861 -0.84901145 ... -0.39563212 -0.36182743
  -0.56401068]
 ...
 [-0.26486217  1.29973227  0.83648559 ... -0.11989483 -0.36182743
   0.69088603]
 [ 0.06730983 -1.31328502 -1.32936117 ... -0.46174238 -0.3471394
  -2.63760002]
 [ 0.54100394 -0.74502675 -0.21465726 ... -0.39958471 -0.36182743
  -0.1850884 ]]


In [83]:
##FEATURE SELECTION

#feature selection
X_new = SelectKBest(f_classif, k=10).fit_transform(X, y) #removes all but k-highest scoring features
X_new.shape

  f = msb / msw


(246, 10)

In [84]:
##TRAIN-TEST SET

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [85]:
##CLASSIFIERS

# Prepare pipeline
model_pipeline = []
model_pipeline.append(LogisticRegression(solver='liblinear'))
model_pipeline.append(SVC())
model_pipeline.append(KNeighborsClassifier())
model_pipeline.append(DecisionTreeClassifier())
model_pipeline.append(RandomForestClassifier())
model_pipeline.append(GaussianNB())

model_list = ['LogisticRegression', 'SVM', 'KNN', 'Decision Tree', 'Random Forest', 'Naive Bayes']
acc_list = []
auc_list = []
cm_list = []

# Execute pipeline
for model in model_pipeline:
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  acc_list.append(metrics.accuracy_score(y_test, y_pred))
  fpr, tpr, _thresholds = metrics.roc_curve(y_test, y_pred)
  auc_list.append(round(metrics.auc(fpr, tpr),2))
  cm_list.append(confusion_matrix(y_test, y_pred))



In [86]:
##VALIDATION
# Plot Model validation
result_df = pd.DataFrame({'Model':model_list, 'Accuracy': acc_list, 'AUC': auc_list})
result_df

Unnamed: 0,Model,Accuracy,AUC
0,LogisticRegression,0.54,0.54
1,SVM,0.58,0.61
2,KNN,0.56,0.57
3,Decision Tree,0.44,0.45
4,Random Forest,0.66,0.67
5,Naive Bayes,0.64,0.64
