In [2]:
parameters_names_file = 'res/parameters_names.txt'
test_parameters_label_file = "res/test_parameters_label.txt"
test_parameters_features_file = "res/test_parameters_features.txt"

train_parameters_label_file = "res/train_parameters_label.txt"
train_parameters_features_file = "res/train_parameters_features.txt"

In [3]:
import numpy as np
import pandas as pd

In [4]:
#Reading data
train_features = pd.read_csv(train_parameters_features_file, header=None)
train_labels = pd.read_csv(train_parameters_label_file, header=None)
test_features = pd.read_csv(test_parameters_features_file, header=None)
test_labels = pd.read_csv(test_parameters_label_file, header=None)

In [5]:

from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score

In [6]:
#libraries for tree

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [7]:
#Tree with split-function

data_train, data_test, label_train, label_test = train_test_split(train_features.values, train_labels.values, test_size=0.1,random_state=17)
clf = DecisionTreeClassifier(max_depth=5, random_state=17)
clf.fit(data_train, label_train)
clf_pred = clf.predict(data_test)
acc_tree = accuracy_score(label_test, clf_pred) 
print('\n########## TREE #########')
print('Score: ', acc_tree)

print(clf.predict_proba([[10, 0.5, 0.5, 0.64, 0.6575342465753424, 1, 0, 4.0, 0, 0, 2681854, 0, 0, 2]]))



############################
#Parameter tuning for tree #
############################

tree_params = {'max_depth': range(2,20), 'max_features': range(1,14)}
tree_grid = GridSearchCV(clf, tree_params, cv=10, n_jobs=-1, verbose=True)


tree_grid.fit(data_train, label_train)

tr_bst_prm = tree_grid.best_params_ 
tr_bst_scr = tree_grid.best_score_
tr_acc_grd = accuracy_score(label_test, tree_grid.predict(data_test))


print ('Best parameters:', tr_bst_prm)
print('Best score: ', tr_bst_scr)





########## TREE #########
Score:  0.875
[[ 0.  0.  1.]]
Fitting 10 folds for each of 234 candidates, totalling 2340 fits




Best parameters: {'max_depth': 2, 'max_features': 12}
Best score:  0.859375


[Parallel(n_jobs=-1)]: Done 1884 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 2340 out of 2340 | elapsed:    2.3s finished


In [8]:
#preprocessing
from sklearn import preprocessing

train_features_scaled = preprocessing.scale(train_features)
test_features_scaled = preprocessing.scale(test_features)



In [9]:
#no execution
#Tree with preprocessed data

data_train, data_test, label_train, label_test = train_features_scaled, test_features_scaled, train_labels, test_labels
clf = DecisionTreeClassifier(max_depth=5, random_state=17)
clf.fit(data_train, label_train)
clf_pred = clf.predict(data_test)
acc_tree = accuracy_score(label_test, clf_pred) 
print('\n########## TREE #########')
print('Score: ', acc_tree)

print(clf.predict_proba([[7, 0.5, 0.5, 0.5757575757575758, 0.59375, 1, 0, 2.5, 1, 0, 90931315, -17, 135128817935782, 2]]))


############################
#Parameter tuning for tree #
############################

tree_params = {'max_depth': range(2,20), 'max_features': range(1,14)}
tree_grid = GridSearchCV(clf, tree_params, cv=10, n_jobs=-1, verbose=True)


tree_grid.fit(data_train, label_train)

tr_bst_prm = tree_grid.best_params_ 
tr_bst_scr = tree_grid.best_score_
tr_acc_grd = accuracy_score(label_test, tree_grid.predict(data_test))


print ('Best parameters:', tr_bst_prm)
print('Best score: ', tr_bst_scr)


########## TREE #########
Score:  0.75
[[ 0.  0.  1.]]
Fitting 10 folds for each of 234 candidates, totalling 2340 fits


[Parallel(n_jobs=-1)]: Done 648 tasks      | elapsed:    1.3s


Best parameters: {'max_depth': 3, 'max_features': 4}
Best score:  0.847222222222


[Parallel(n_jobs=-1)]: Done 2340 out of 2340 | elapsed:    3.4s finished


In [10]:
#Tree with our test


data_train, data_test, label_train, label_test = train_features.values, test_features.values, train_labels.values, test_labels.values
clf = DecisionTreeClassifier(max_depth=5, random_state=17)
clf.fit(data_train, label_train)
clf_pred = clf.predict(data_test)
acc_tree = accuracy_score(label_test, clf_pred) 
print('\n########## TREE #########')
print('Score: ', acc_tree)

print(clf.predict_proba([[7, 0.5, 0.5, 0.5757575757575758, 0.59375, 1, 0, 2.5, 1, 0, 90931315, -17, 135128817935782, 2]]))


############################
#Parameter tuning for tree #
############################

tree_params = {'max_depth': range(2,20), 'max_features': range(1,14)}
tree_grid = GridSearchCV(clf, tree_params, cv=10, n_jobs=-1, verbose=True)


tree_grid.fit(data_train, label_train)

tr_bst_prm = tree_grid.best_params_ 
tr_bst_scr = tree_grid.best_score_
tr_acc_grd = accuracy_score(label_test, tree_grid.predict(data_test))


print ('Best parameters:', tr_bst_prm)
print('Best score: ', tr_bst_scr)


########## TREE #########
Score:  0.833333333333
[[ 0.  0.  1.]]
Fitting 10 folds for each of 234 candidates, totalling 2340 fits


[Parallel(n_jobs=-1)]: Done 1504 tasks      | elapsed:    1.8s


Best parameters: {'max_depth': 4, 'max_features': 13}
Best score:  0.875


[Parallel(n_jobs=-1)]: Done 2340 out of 2340 | elapsed:    2.5s finished


In [11]:
import graphviz 

dot_data = export_graphviz(clf, out_file=None) 
graph = graphviz.Source(dot_data) 
graph.render("Parameters_defining_tree") 

'Parameters_defining_tree.pdf'

In [12]:
#libraries for KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [13]:
data_train, data_test, label_train, label_test = train_features.values, test_features.values, train_labels.values, test_labels.values
label_train = np.ravel(label_train)

###############
#     KNN     #
###############

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(data_train, label_train)
knn_pred = knn.predict(data_test)
acc_knn = accuracy_score(label_test, knn_pred) 

print('\n########## KNN #########')
print('Score: ', acc_knn)


############################
#Parameter tuning for KNN  #
############################

knn_pipe = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier(n_jobs=-1))])
knn_params = {'knn__n_neighbors': range(1, 10)}

knn_grid = GridSearchCV(knn_pipe, knn_params, cv=5, n_jobs=-1, verbose=True)
knn_grid.fit(data_train, label_train)

knn_bst_prm = knn_grid.best_params_
knn_bst_scr = knn_grid.best_score_
knn_acc_grd = accuracy_score(label_train, knn_grid.predict(data_train))

print ('Best parameters:', knn_bst_prm)
print('Best score: ', knn_bst_scr)



########## KNN #########
Score:  0.416666666667
Fitting 5 folds for each of 9 candidates, totalling 45 fits




Best parameters: {'knn__n_neighbors': 1}
Best score:  0.666666666667


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    2.6s finished


In [14]:

#KNN with preprocessing
data_train, data_test, label_train, label_test = train_features_scaled, test_features_scaled, train_labels, test_labels
label_train = np.ravel(label_train)

###############
#     KNN     #
###############

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(data_train, label_train)
knn_pred = knn.predict(data_test)
acc_knn = accuracy_score(label_test, knn_pred) 

print('\n########## KNN #########')
print('Score: ', acc_knn)


############################
#Parameter tuning for KNN  #
############################

knn_pipe = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier(n_jobs=-1))])
knn_params = {'knn__n_neighbors': range(1, 14)}

knn_grid = GridSearchCV(knn_pipe, knn_params, cv=5, n_jobs=-1, verbose=True)
knn_grid.fit(data_train, label_train)

knn_bst_prm = knn_grid.best_params_
knn_bst_scr = knn_grid.best_score_
knn_acc_grd = accuracy_score(label_train, knn_grid.predict(data_train))

print ('Best parameters:', knn_bst_prm)
print('Best score: ', knn_bst_scr)


########## KNN #########
Score:  0.916666666667
Fitting 5 folds for each of 13 candidates, totalling 65 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.4s


Best parameters: {'knn__n_neighbors': 1}
Best score:  0.666666666667


[Parallel(n_jobs=-1)]: Done  65 out of  65 | elapsed:    3.6s finished


In [15]:
from sklearn.svm import LinearSVC

In [16]:
data_train, data_test, label_train, label_test = train_features.values, test_features.values, train_labels.values, test_labels.values


################
#     SVM      #
################

clf = LinearSVC(random_state=0)
clf.fit(data_train, label_train)

clf_pred = clf.predict(data_test)
acc_clf = accuracy_score(label_test, clf_pred) 

print('\n########## SVM #########')
print('Score: ', acc_clf)
print('SVM-coefficient:')
print(clf.coef_)


########## SVM #########
Score:  0.333333333333
SVM-coefficient:
[[ -2.73636094e-01  -2.33897700e-01  -1.76939817e-01  -2.02131042e-01
   -1.60254192e-01  -6.82564671e-02   9.15109430e-01  -1.75295867e-01
   -1.36512934e-01  -2.73025869e-01   1.48431503e-10  -5.79963235e-13
    6.22001047e-26   8.67131267e-02]
 [ -1.42442552e-01  -4.50915283e-02  -4.24035407e-02  -4.44980868e-02
   -4.29341790e-02   4.78949057e-13  -5.22594953e-02  -6.65954292e-02
   -4.30078018e-02  -2.86718678e-02   3.47803015e-09  -1.50713462e-11
    1.57148180e-14  -1.04518991e-01]
 [  2.73758608e-01   2.33866464e-01   1.76941674e-01   2.02106845e-01
    1.60255526e-01   6.82685723e-02  -9.14694731e-01   1.75499974e-01
    1.36537145e-01   2.73074289e-01  -2.95845523e-08   1.50479366e-11
    3.99098477e-15  -8.58074546e-02]]


  y = column_or_1d(y, warn=True)


In [17]:
#SVM  with preprocessing
data_train, data_test, label_train, label_test = train_features_scaled, test_features_scaled, train_labels, test_labels


###################################
#     SVM  with preprocessing     #
###################################

clf = LinearSVC(random_state=0)
clf.fit(data_train, label_train)

clf_pred = clf.predict(data_test)
acc_clf = accuracy_score(label_test, clf_pred) 

print('\n########## SVM #########')
print('Score: ', acc_clf)
print('SVM-coefficient:')
print(clf.coef_)


########## SVM #########
Score:  0.833333333333
SVM-coefficient:
[[ -8.19678222e-01  -7.93742994e-01  -5.54823203e-01   6.19179019e-01
    4.06404027e-01  -2.09385425e-01   3.62511927e-01  -2.53765349e-04
   -2.28161004e-01  -1.34068068e-01   1.24766423e-02  -2.89846190e-02
   -1.31010850e-01   3.37229842e-01]
 [ -1.03273119e-01   6.26345931e-01   1.88398644e+00  -4.51677756e-01
   -1.00888605e+00   4.88434953e-01  -1.70937378e-01  -1.29962809e-01
    9.49237221e-01   1.83945781e-01   4.72914249e-03  -1.27240922e+00
   -1.69888705e-01  -7.63005505e-02]
 [  1.43026774e-01   2.75313465e-01  -1.37700541e+00  -5.91323722e-01
    1.16808374e+00  -1.92363437e-01  -1.04907156e-01   1.65733225e-02
   -4.08649715e-01   8.12473874e-02   7.32715981e-02   5.99030230e-01
    2.62258190e-01  -3.67858470e-01]]


  y = column_or_1d(y, warn=True)


In [18]:
#libraries for ANN
from sklearn.neural_network import MLPClassifier

In [19]:
data_train, data_test, label_train, label_test = train_features, test_features, train_labels, test_labels


###############
#    ANN      #
###############


print('\n########## ANN #########')
max_sc_ann = 0

for ls2 in range (10,40):
    for ls1 in range(8, 20, 2):
        ann_clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(ls1, ls2), random_state=1)
        ann_clf.fit(data_train, label_train)      
        ann_clf_pred = ann_clf.predict(data_test)

        acc_ann = accuracy_score(label_test, ann_clf_pred) 
        if (acc_ann > max_sc_ann):
            max_sc_ann = acc_ann
            max_ls1 = ls1
            max_ls2 = ls2
                                
print('Score (', max_ls1, max_ls2, 'layers): ', max_sc_ann)



########## ANN #########


  y = column_or_1d(y, warn=True)


Score ( 8 14 layers):  0.666666666667


In [27]:
data_train, data_test, label_train, label_test = train_features_scaled, test_features_scaled, train_labels, test_labels

data_train = data_train[0:25]
label_train = label_train[0:25]
print(len(data_train))

################################
#    ANN with preprocessing    #
################################


print('\n########## ANN #########')
max_sc_ann = 0

for ls2 in range (10,40):
    for ls1 in range(8, 20, 2):
        ann_clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(ls1, ls2), random_state=1)
        ann_clf.fit(data_train, label_train)      
        ann_clf_pred = ann_clf.predict(data_test)

        acc_ann = accuracy_score(label_test, ann_clf_pred) 
        if (acc_ann > max_sc_ann):
            max_sc_ann = acc_ann
            max_ls1 = ls1
            max_ls2 = ls2
                                
print('Score (', max_ls1, max_ls2, 'layers): ', max_sc_ann)


25

########## ANN #########


  y = column_or_1d(y, warn=True)


Score ( 16 35 layers):  1.0
