In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
import numpy as np
import warnings
from sklearn.model_selection import cross_val_score
warnings.filterwarnings('ignore')

path = "/home/daniel/Data/arrays_256/X.txt"
X = np.loadtxt(path)
print(X.shape)

(35264, 27)


In [2]:
# separando datos en datos de entrenamiento y datos de testeo (0.25)
X_train, X_test, y_train, y_test = train_test_split(X[:,:-1],X[:,-1], random_state=0)

In [7]:
def train_ml_model(X, y, model):
    """Entrena el modelo de las estrategias de ML."""

    model.fit(X,y)

    return model

def test_ml_model(X, y, model):
    """Predice utilizando el modelo model sobre los datos de testeo test.
    :param model: Modelo de machine learning previamente entrenado
    :type model: Modelo de machine learning de sklearn o xgboost
    :param y: Datos de testeo
    :type y: 1D numpy array
    """
    # prediciendo con el modelo
    pred = model.predict(X)
    
    print("\nprecision: %s"%precision_score(y, pred)), "cuantas que dijo P realmente lo eran"
    print("recall: %s"%recall_score(y, pred)), "Qué porcentaje de todas las P acertó"
    #print("accuracy: %s"%accuracy_score(y, pred))
    print("f1_score: %s"%f1_score(y, pred))

    return pred

In [57]:
clf1 = RandomForestClassifier(n_estimators=80, max_features="sqrt", max_depth=29)
clf2 = DecisionTreeClassifier(max_depth=15)
clf3 = XGBClassifier(n_estimators=4700, max_depth=10, learning_rate=0.6) #
clf5 = LogisticRegression(C=2)
clf7 = KNeighborsClassifier(n_neighbors=3)
clf8 = MLPClassifier(hidden_layer_sizes=(18,))

classifiers = [clf1, clf2, clf3, clf5, clf7, clf8]

In [46]:
import timeit
clfs = [clf3, clf8]
for clf in clfs:
    print("\n",str(clf).split("(")[0])
    t = timeit.timemodel = train_ml_model(X_train, y_train, clf)
    %time y_pred = test_ml_model(X_test, y_test, model)


 RandomForestClassifier
CPU times: user 19 s, sys: 8 ms, total: 19 s
Wall time: 19 s

precision: 0.9573732718894009
recall: 0.9481971702418986
f1_score: 0.9527631277229992
CPU times: user 120 ms, sys: 16 ms, total: 136 ms
Wall time: 133 ms

 DecisionTreeClassifier
CPU times: user 2.97 s, sys: 0 ns, total: 2.97 s
Wall time: 2.97 s

precision: 0.8770373921380633
recall: 0.8350068461889548
f1_score: 0.8555061959317278
CPU times: user 20 ms, sys: 0 ns, total: 20 ms
Wall time: 18 ms

 XGBClassifier
CPU times: user 5min 32s, sys: 0 ns, total: 5min 32s
Wall time: 5min 32s

precision: 0.9768348623853211
recall: 0.9719306252852579
f1_score: 0.9743765728666209
CPU times: user 1.22 s, sys: 0 ns, total: 1.22 s
Wall time: 1.22 s

 LogisticRegression
CPU times: user 4.46 s, sys: 0 ns, total: 4.46 s
Wall time: 4.46 s

precision: 0.4642157929084185
recall: 0.4869922409858512
f1_score: 0.47533132865575234
CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 10.5 ms

 KNeighborsClassifier
CPU time

# Cross validation for feature extraction

## Geometry of filterbank --> 512cf

In [34]:
path1 = "/home/daniel/Data/arrays_256/X.txt"
path2 = "/home/daniel/Data/arrays_256cf/X.txt"
path3 = "/home/daniel/Data/arrays_512/X.txt"
path4 = "/home/daniel/Data/arrays_512cf/X.txt"
paths = [path1, path2, path3, path4]
for path in paths:
    X = np.loadtxt(path)
    print(path.split("/")[4],X.shape)
    X_train, X_test, y_train, y_test = train_test_split(X[:,:-1],X[:,-1])
    scores = cross_val_score(clf3, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

arrays_256 (35264, 66)
Accuracy: 0.87 (+/- 0.00)
arrays_256cf (35264, 66)
Accuracy: 0.89 (+/- 0.01)
arrays_512 (35264, 66)
Accuracy: 0.91 (+/- 0.01)
arrays_512cf (35264, 66)
Accuracy: 0.92 (+/- 0.01)


## Noise ratio --> 512cf 50-50

In [37]:
path1 = "/home/daniel/Data/arrays_512cf/X.txt"
path2 = "/home/daniel/Data/arrays_512_0.5/X.txt"
paths = [path1, path2]
for path in paths:
    X = np.loadtxt(path)
    print(path.split("/")[4],X.shape)
    X_train, X_test, y_train, y_test = train_test_split(X[:,:-1],X[:,-1])
    scores = cross_val_score(clf3, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

arrays_512cf (35264, 66)
Accuracy: 0.92 (+/- 0.01)
arrays_512_0.5 (35264, 66)
Accuracy: 0.89 (+/- 0.01)


## Preemphasis --> True, 512cf

In [39]:
path1 = "/home/daniel/Data/arrays_512cf/X.txt"
path2 = "/home/daniel/Data/arrays_512_noPre/X.txt"
paths = [path1, path2]
for path in paths:
    X = np.loadtxt(path)
    print(path.split("/")[4],X.shape)
    X_train, X_test, y_train, y_test = train_test_split(X[:,:-1],X[:,-1])
    scores = cross_val_score(clf3, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

arrays_512cf (35264, 66)
Accuracy: 0.92 (+/- 0.00)
arrays_512_noPre (35264, 66)
Accuracy: 0.90 (+/- 0.00)


## CMN --> True, 512cf/X_CMN

In [59]:
path1 = "/home/daniel/Data/arrays_512/X_CMN.txt"
path2 = "/home/daniel/Data/arrays_512cf/X_CMN.txt"
path3 = "/home/daniel/Data/arrays_512cf_nor/X_CMN.txt"
paths = [path1, path2, path3]
for path in paths:
    X = np.loadtxt(path)
    print(path.split("/")[5], X.shape)
    X_train, X_test, y_train, y_test = train_test_split(X[:,:-1],X[:,-1])
    scores = cross_val_score(clf3, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

X_CMN.txt (35264, 66)
Accuracy: 0.97 (+/- 0.00)
X_CMN.txt (35264, 66)
Accuracy: 0.97 (+/- 0.00)
X_CMN.txt (35264, 66)
Accuracy: 0.97 (+/- 0.00)


## cepfilter True and energy True

In [43]:
path1 = "/home/daniel/Data/arrays_512cf_noEn/X_CMN.txt"
path2 = "/home/daniel/Data/arrays_512cf_noCepf/X_CMN.txt"
path3 = "/home/daniel/Data/arrays_512cf_noEn/X.txt"
path4 = "/home/daniel/Data/arrays_512cf_noCepf/X.txt"
paths = [path1, path2, path3, path4]
for path in paths:
    X = np.loadtxt(path)
    print(path.split("/")[4:],X.shape)
    X_train, X_test, y_train, y_test = train_test_split(X[:,:-1],X[:,-1])
    scores = cross_val_score(clf3, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

['arrays_512cf_noEn', 'X_CMN.txt'] (35264, 66)
Accuracy: 0.93 (+/- 0.01)
['arrays_512cf_noCepf', 'X_CMN.txt'] (35264, 66)
Accuracy: 0.92 (+/- 0.01)
['arrays_512cf_noEn', 'X.txt'] (35264, 66)
Accuracy: 0.91 (+/- 0.01)
['arrays_512cf_noCepf', 'X.txt'] (35264, 66)
Accuracy: 0.90 (+/- 0.01)


# Classifier

In [60]:
X = np.loadtxt("/home/daniel/Data/arrays_512cf_nor/X_CMN.txt")
X_train, X_test, y_train, y_test = train_test_split(X[:,:-1],X[:,-1],random_state=0)
for clf in classifiers:
    print("\n",str(clf).split("(")[0])
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
    print("f1 score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


 RandomForestClassifier
f1 score: 0.95 (+/- 0.01)

 DecisionTreeClassifier
f1 score: 0.89 (+/- 0.00)

 XGBClassifier
f1 score: 0.97 (+/- 0.00)

 LogisticRegression
f1 score: 0.44 (+/- 0.02)

 KNeighborsClassifier
f1 score: 0.88 (+/- 0.01)

 MLPClassifier
f1 score: 0.97 (+/- 0.00)
