## Feature Extraction

#### Imports

In [1]:
# general-purpose imports
import pandas as pd
import numpy as np
# imports for our classes
from configLoader import ConfigLoader
from recordReader import RecordReader
from featuresExtractor import FeaturesExtractor

In [2]:
# Class instantiation: load the configuration once and hand the same object
# to both the record reader and the features extractor.
config_path = '../data/config.ini'
config = ConfigLoader(config_path)
recordReader = RecordReader(config)
featuresExtractor = FeaturesExtractor(config)

In [3]:
# Load the CSV whose rows are converted into pairs by recordReader further down.
df = pd.read_csv('../data/db-bruno/result-preproc.csv')

In [16]:
# Build the model inputs: one feature list and one label per usable row of df.
feature_rows = []
targets = []
for _, record in df.iterrows():
    pair = recordReader.fromPandasRowToPair(record)
    extracted = featuresExtractor.extractFeatures(pair.getAtributesAsZip())
    # Keep only pairs whose feature list contains no None
    # (presumably a feature that could not be computed -- confirm in FeaturesExtractor).
    if None not in extracted:
        feature_rows.append(extracted)
        targets.append(pair.getY())
data = {'features': feature_rows, 'y': targets}

In [17]:
# Inspect the collected feature lists and labels.
# NOTE(review): this renders the whole dict; a slice such as the first few
# entries would keep the notebook output short.
data

{'features': [[0.5258821638131983, 0.6333333333333333, 0.9, 0.0, 1.0],
  [0.6410438571563891, 0.6746031746031745, 0.9, 0.0, 1.0],
  [0.6862953138815208, 0.5428028761362095, 1.0, 1.0, 1.0],
  [0.6800118040527248, 0.6529513021267145, 0.9, 1.0, 1.0],
  [0.7069877759532931, 0.5644501278772379, 0.9, 1.0, 1.0],
  [0.635, 0.5757575757575758, 0.9, 1.0, 1.0],
  [0.5898050974512744, 0.6575757575757576, 1.0, 0.0, 1.0],
  [0.5395021645021645, 0.5994152046783626, 0.9, 1.0, 1.0],
  [0.5826236679231235, 0.6038509316770186, 0.9, 1.0, 1.0],
  [0.6268337870538415, 0.6141724146839236, 0.9, 1.0, 0.0],
  [0.6291187739463601, 0.6319212188777406, 0.9, 1.0, 1.0],
  [0.6733932733932734, 0.6978466104638312, 1.0, 0.0, 1.0],
  [0.6241078807001728, 0.6521464646464646, 0.9, 0.0, 1.0],
  [0.6725068639769184, 0.6613220000643936, 0.9, 0.4, 1.0],
  [0.5921717171717171, 0.6487586487586489, 0.9, 0.0, 1.0],
  [0.7875058275058274, 0.7413580246913579, 0.9, 1.0, 1.0],
  [0.5715873015873016, 0.5878191856452727, 1.0, 0.4, 1.0]

## Machine Learning

#### Imports

In [18]:
# general
import numpy as np
import pandas as pd
# classifiers
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import svm
from sklearn import tree
# cross_val
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

#### Model testing

In [20]:
# SVM
# Linear-kernel support vector classifier evaluated with 10-fold cross-validation.
clf = svm.SVC(kernel='linear', C=1)
# cross_validate scores every metric on the same fitted model of each fold,
# so the classifier is trained 10 times instead of 40 (one full CV run per metric).
cv_results = cross_validate(clf, data['features'], data['y'], cv=10,
                            scoring=['accuracy', 'precision', 'recall', 'f1'])
accuracy = cv_results['test_accuracy']
precision = cv_results['test_precision']
recall = cv_results['test_recall']
f1 = cv_results['test_f1']

# Report the mean over the folds and twice the standard deviation as the spread.
print("accuracy: %0.3f (+/- %0.3f)" % (accuracy.mean(), accuracy.std()*2 ))
print("precision: %0.3f (+/- %0.3f)" % (precision.mean(), precision.std()*2 ))
print("recall: %0.3f (+/- %0.3f)" % (recall.mean(), recall.std()*2 ))
print("f1: %0.3f (+/- %0.3f)" % (f1.mean(), f1.std()*2 ))

accuracy: 0.998 (+/- 0.005)
precision: 0.962 (+/- 0.082)
recall: 0.998 (+/- 0.013)
f1: 0.979 (+/- 0.044)


In [23]:
# LOGISTIC REGRESSION
# C is the inverse regularization strength, so C=1e5 means very weak regularization.
clf = linear_model.LogisticRegression(C=1e5)
# cross_validate scores every metric on the same fitted model of each fold,
# so the classifier is trained 10 times instead of 40 (one full CV run per metric).
cv_results = cross_validate(clf, data['features'], data['y'], cv=10,
                            scoring=['accuracy', 'precision', 'recall', 'f1'])
accuracy = cv_results['test_accuracy']
precision = cv_results['test_precision']
recall = cv_results['test_recall']
f1 = cv_results['test_f1']

# Report the mean over the folds and twice the standard deviation as the spread.
print("accuracy: %0.3f (+/- %0.3f)" % (accuracy.mean(), accuracy.std()*2 ))
print("precision: %0.3f (+/- %0.3f)" % (precision.mean(), precision.std()*2 ))
print("recall: %0.3f (+/- %0.3f)" % (recall.mean(), recall.std()*2 ))
print("f1: %0.3f (+/- %0.3f)" % (f1.mean(), f1.std()*2 ))

accuracy: 0.997 (+/- 0.008)
precision: 0.970 (+/- 0.069)
recall: 0.975 (+/- 0.150)
f1: 0.971 (+/- 0.090)


In [21]:
# DECISION TREE
# Tree construction breaks ties between equally good splits at random; fixing
# random_state makes the scores reproducible across re-runs of the notebook.
clf = tree.DecisionTreeClassifier(random_state=0)
# cross_validate scores every metric on the same fitted tree of each fold.
# Separate cross_val_score calls would refit (and re-randomize) the tree for
# every metric, so the four numbers would describe different models.
cv_results = cross_validate(clf, data['features'], data['y'], cv=10,
                            scoring=['accuracy', 'precision', 'recall', 'f1'])
accuracy = cv_results['test_accuracy']
precision = cv_results['test_precision']
recall = cv_results['test_recall']
f1 = cv_results['test_f1']

# Report the mean over the folds and twice the standard deviation as the spread.
print("accuracy: %0.3f (+/- %0.3f)" % (accuracy.mean(), accuracy.std()*2 ))
print("precision: %0.3f (+/- %0.3f)" % (precision.mean(), precision.std()*2 ))
print("recall: %0.3f (+/- %0.3f)" % (recall.mean(), recall.std()*2 ))
print("f1: %0.3f (+/- %0.3f)" % (f1.mean(), f1.std()*2 ))

accuracy: 0.997 (+/- 0.009)
precision: 0.966 (+/- 0.092)
recall: 0.969 (+/- 0.188)
f1: 0.964 (+/- 0.110)


In [22]:
# NAIVE BAYES
# Gaussian naive Bayes classifier evaluated with 10-fold cross-validation.
clf = naive_bayes.GaussianNB()
# cross_validate scores every metric on the same fitted model of each fold,
# so the classifier is trained 10 times instead of 40 (one full CV run per metric).
cv_results = cross_validate(clf, data['features'], data['y'], cv=10,
                            scoring=['accuracy', 'precision', 'recall', 'f1'])
accuracy = cv_results['test_accuracy']
precision = cv_results['test_precision']
recall = cv_results['test_recall']
f1 = cv_results['test_f1']

# Report the mean over the folds and twice the standard deviation as the spread.
print("accuracy: %0.3f (+/- %0.3f)" % (accuracy.mean(), accuracy.std()*2 ))
print("precision: %0.3f (+/- %0.3f)" % (precision.mean(), precision.std()*2 ))
print("recall: %0.3f (+/- %0.3f)" % (recall.mean(), recall.std()*2 ))
print("f1: %0.3f (+/- %0.3f)" % (f1.mean(), f1.std()*2 ))

accuracy: 0.997 (+/- 0.009)
precision: 0.975 (+/- 0.069)
recall: 0.969 (+/- 0.188)
f1: 0.969 (+/- 0.108)
