# Baseline

In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [44]:
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from scipy import stats
from sklearn.neural_network import MLPClassifier

#### Read the data

In [45]:
df = pd.read_csv('../experiments/prop.csv', index_col=0)

In [46]:
df.shape

(69653, 21)

In [47]:
# delete unnamed column
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
# convert all columns of DataFrame
df = df.dropna()
# find the objects columns
cols = df.columns[df.dtypes.eq('object')]
# convert to numeric these columns
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

#### Cross-Validation

In [48]:
seed = 7
array = df.values
X = array[:,0:20]
Y = array[:,20]
validation_size = 0.2
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

In [49]:
scoring = 'f1'
#scoring = 'roc_auc'

In [50]:
models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
models.append(('RF', RandomForestClassifier()))

In [51]:
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.088095 (0.012258)
KNN: 0.216311 (0.012836)
CART: 0.298772 (0.017782)
NB: 0.254699 (0.015874)
SVM: 0.053648 (0.009776)
RF: 0.286726 (0.020117)


#### Neural Network

In [52]:
mlp = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)

In [53]:
mlp.fit(X_train,Y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(13, 13, 13), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=500,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [54]:
predictions = mlp.predict(X_validation)

In [55]:
from sklearn.metrics import f1_score
f1_score(Y_validation, predictions, average="macro")

0.5205521970808967

In [56]:
#from sklearn.metrics import roc_curve, auc
#fpr2, tpr2, threshold = roc_curve(Y_validation, mlp.predict_proba(X_validation)[:,1])
#roc_auc2 = auc(fpr2, tpr2)
#print(roc_auc2)

##### That's all folks!!!