In [1]:
# import basic python libraries
import numpy as np
import pandas as pd

# read the complete sequence table from disk
sequence_data = pd.read_csv('Waltz_and_AAIndex1_Data_Filtered_Waltz_Features')

# the 'Classification' column is the prediction target; every other column is a model input
label_column = 'Classification'
target_data = sequence_data[label_column]
training_data = sequence_data.drop(columns = label_column)

In [2]:
# import data pre-processing package
from sklearn.model_selection import train_test_split

# Split the dataset into training and test sets: use an 80:20 split.
# The seed is fixed so that the same rows land in the same split on every run
# (without it each kernel restart yields a different hold-out set and different scores).
# Stratifying on the target keeps the class proportions the same in both splits.
split_seed = 42
X_train, X_test, y_train, y_test = train_test_split(training_data, target_data,
                                                    test_size = 0.2,
                                                    random_state = split_seed,
                                                    stratify = target_data)

In [3]:
# per-column summary statistics (count, mean, std, min, quartiles, max) of the feature table
feature_summary = training_data.describe()
feature_summary

Unnamed: 0,pos0_orth_0,pos0_orth_1,pos0_orth_2,pos0_orth_3,pos0_orth_4,pos0_orth_5,pos0_orth_6,pos0_orth_7,pos0_orth_8,pos0_orth_9,...,pos5_MAXF760104,pos0_ZIMJ680103,pos5_ZIMJ680103,pos1_QIAN880123,pos5_AURR980106,pos0_FINA910102,pos1_FINA910102,pos2_FINA910102,pos3_FINA910102,pos4_FINA910102
count,652.0,652.0,652.0,652.0,652.0,652.0,652.0,652.0,652.0,652.0,...,652.0,652.0,652.0,652.0,652.0,652.0,652.0,652.0,652.0,652.0
mean,0.078221,0.003067,0.039877,0.046012,0.044479,0.102761,0.026074,0.050613,0.058282,0.052147,...,1.063834,10.769755,9.686258,-0.093528,1.00592,1.397853,1.382975,1.396166,1.416411,1.322853
std,0.268725,0.055342,0.195821,0.209673,0.206314,0.303879,0.159477,0.219375,0.234456,0.222495,...,1.388189,19.363446,18.369969,0.278196,0.387613,1.970223,1.915649,1.968485,2.072382,1.804683
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.58,0.26,0.7,0.7,0.7,0.7,0.7
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.23,0.13,0.13,-0.28,0.77,1.0,1.0,1.0,1.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.57,1.66,1.66,-0.11,0.9,1.0,1.0,1.0,1.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.08,3.53,3.53,0.065,1.1,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,5.02,52.0,52.0,0.47,2.3,13.0,13.0,13.0,13.0,13.0


In [4]:
# keys of the four machine learning methods to be used; the same keys index
# the `parameters` and `classifiers` dictionaries
methods = [
    'svm',       # linear support vector classifier
    'forest',    # random forest classifier
    'logistic',  # logistic regression
    'mlp',       # multi-layer perceptron classifier
]

In [5]:
# define a dictionary of parameters for each machine learning method
# (outer keys match the entries of `methods`; each inner dictionary is the grid
# handed to GridSearchCV for that method)
parameters = {'svm' :      {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}, # error term penalty parameter

              'forest' :   {'n_estimators': [10, 100, 1000], # number of decision trees in forest
                            'max_depth': [1, 10, 100, 1000]}, # maximum tree depth

              'logistic' : {'C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000]}, # inverse of regularization strength

              'mlp' :      {'activation' : ['identity', 'logistic', 'tanh'], # mlp activation function
                            # size of single hidden layer: each tuple element is one hidden layer,
                            # so a one-layer network must be a 1-tuple. The previous (n, 1) form
                            # built a second hidden layer consisting of a single neuron.
                            'hidden_layer_sizes' : [(10,), (20,), (30,), (50,)],
                            'learning_rate_init' : [0.1, 0.01, 0.001, 0.0001, 0.00001]} # training rate
             }

In [None]:
# import grid search package and machine learning libraries
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

num_folds = 10 # we are doing 10-fold cross validation
perf_metric = 'roc_auc' # use mean area under ROC curve for performance evaluation
model_seed = 0 # fixed seed so the randomized estimators give the same result on every run


def _make_grid_search(estimator, param_grid):
    """Wrap `estimator` in a cross-validated grid search over `param_grid`.

    Every search shares the notebook-wide fold count (`num_folds`) and scoring
    metric (`perf_metric`). `n_jobs = -1` spreads the independent
    (parameter combination, fold) fits over all available CPU cores.
    """
    return GridSearchCV(estimator, param_grid, cv = num_folds, scoring = perf_metric, n_jobs = -1)


# define dictionary of classifier objects for each machine learning method
# (keys match `methods` and `parameters`); estimators that draw random numbers
# during fitting are seeded so repeated runs select the same best parameters
classifiers = {'svm' : _make_grid_search(LinearSVC(random_state = model_seed), parameters['svm']),
               'forest' : _make_grid_search(RandomForestClassifier(random_state = model_seed), parameters['forest']),
               'logistic' : _make_grid_search(LogisticRegression(), parameters['logistic']),
               'mlp' : _make_grid_search(MLPClassifier(max_iter = 1000, random_state = model_seed), parameters['mlp'])
              }

In [None]:
# run the grid search of every method, in the order given by `methods`, on the
# training split and report each one as soon as it has finished
for method in methods:
    search = classifiers[method]
    search.fit(X_train, y_train)
    print(method, 'complete')

svm complete
forest complete
logistic complete


