In [99]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
import numpy as np
import warnings
from sklearn.model_selection import cross_val_score
from time import time
from joblib import dump, load

warnings.filterwarnings('ignore')

path = "/home/daniel/Data/arrays_512cf_nor/X_CMN.txt"
X = np.loadtxt(path)
print(X.shape)

(35264, 66)


In [86]:
# separando datos en datos de entrenamiento y datos de testeo (0.25)
X_train, X_test, y_train, y_test = train_test_split(X[:,:-1],X[:,-1], random_state=0)

In [63]:
def train_ml_model(X, y, model):
    """Entrena el modelo de las estrategias de ML."""

    model.fit(X,y)

    return model

def test_ml_model(X, y, model):
    """Predice utilizando el modelo model sobre los datos de testeo test.
    :param model: Modelo de machine learning previamente entrenado
    :type model: Modelo de machine learning de sklearn o xgboost
    :param y: Datos de testeo
    :type y: 1D numpy array
    """
    # prediciendo con el modelo
    pred = model.predict(X)
    
    print("\nprecision: %s"%precision_score(y, pred)), "cuantas que dijo P realmente lo eran"
    print("recall: %s"%recall_score(y, pred)), "Qué porcentaje de todas las P acertó"
    #print("accuracy: %s"%accuracy_score(y, pred))
    print("f1_score: %s"%f1_score(y, pred))

    return pred

In [119]:
clf1 = RandomForestClassifier(n_estimators=80, max_features="sqrt", max_depth=29, n_jobs=-1)
clf2 = DecisionTreeClassifier(max_depth=15)
clf3 = XGBClassifier(n_estimators=4700, max_depth=10, learning_rate=0.6, n_jobs=32) #
clf5 = LogisticRegression(C=2)
clf7 = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf8 = MLPClassifier(hidden_layer_sizes=(18,))

classifiers = [clf2, clf1, clf5, clf7, clf8, clf3]

In [120]:
clfs = classifiers
for clf in clfs:
    model_name = str(clf).split("(")[0]
    print("\n",model_name)
    t0=time()
    train_ml_model(X_train, y_train, clf)
    dump(clf, model_name+"_RUS.joblib")
    print("Tiempo de entrenamiento:", round(time()-t0, 2))
    t1=time()
    y_pred = test_ml_model(X_test, y_test, clf)
    print("Tiempo de testeo:", round(time()-t1, 2))


 DecisionTreeClassifier
Tiempo de entrenamiento: 2.6

precision: 0.9148491879350348
recall: 0.8801339285714286
f1_score: 0.8971558589306029
Tiempo de testeo: 0.02

 RandomForestClassifier
Tiempo de entrenamiento: 1.44

precision: 0.9634034766697164
recall: 0.9401785714285714
f1_score: 0.9516493447808405
Tiempo de testeo: 0.13

 LogisticRegression
Tiempo de entrenamiento: 1.43

precision: 0.47424460431654675
recall: 0.3678571428571429
f1_score: 0.4143306096794469
Tiempo de testeo: 0.03

 KNeighborsClassifier
Tiempo de entrenamiento: 0.18

precision: 0.9703703703703703
recall: 0.81875
f1_score: 0.8881355932203389
Tiempo de testeo: 11.17

 MLPClassifier
Tiempo de entrenamiento: 8.14

precision: 0.9745592501673733
recall: 0.9747767857142857
f1_score: 0.9746680058029238
Tiempo de testeo: 0.02

 XGBClassifier
Tiempo de entrenamiento: 28.9

precision: 0.9841413683733575
recall: 0.9696428571428571
f1_score: 0.9768383179671688
Tiempo de testeo: 0.07


# Cross validation for feature extraction

## Geometry of filterbank --> 512cf

In [66]:
path1 = "/home/daniel/Data/arrays_256/X.txt"
path2 = "/home/daniel/Data/arrays_256cf/X.txt"
path3 = "/home/daniel/Data/arrays_512/X.txt"
path4 = "/home/daniel/Data/arrays_512cf/X.txt"
paths = [path1, path2, path3, path4]
for path in paths:
    X = np.loadtxt(path)
    print(path.split("/")[4],X.shape)
    X_train, X_test, y_train, y_test = train_test_split(X[:,:-1],X[:,-1],random_state=0)
    scores = cross_val_score(clf3, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

arrays_256 (35264, 66)
Accuracy: 0.88 (+/- 0.01)
arrays_256cf (35264, 66)
Accuracy: 0.89 (+/- 0.01)
arrays_512 (35264, 66)
Accuracy: 0.91 (+/- 0.01)
arrays_512cf (35264, 66)
Accuracy: 0.92 (+/- 0.00)


## Noise ratio --> 512cf 50-50

In [67]:
path1 = "/home/daniel/Data/arrays_512cf/X.txt"
path2 = "/home/daniel/Data/arrays_512_0.5/X.txt"
paths = [path1, path2]
for path in paths:
    X = np.loadtxt(path)
    print(path.split("/")[4],X.shape)
    X_train, X_test, y_train, y_test = train_test_split(X[:,:-1],X[:,-1],random_state=0)
    scores = cross_val_score(clf3, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

arrays_512cf (35264, 66)
Accuracy: 0.92 (+/- 0.00)
arrays_512_0.5 (35264, 66)
Accuracy: 0.89 (+/- 0.01)


## Preemphasis --> True, 512cf

In [78]:
path1 = "/home/daniel/Data/arrays_512cf_nor/X_CMN.txt"
path2 = "/home/daniel/Data/arrays_512_noPre/X_CMN.txt"
paths = [path1, path2]
for path in paths:
    X = np.loadtxt(path)
    print(path.split("/")[4],X.shape)
    X_train, X_test, y_train, y_test = train_test_split(X[:,:-1],X[:,-1],random_state=0)
    scores = cross_val_score(clf3, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

arrays_512cf (35264, 66)
Accuracy: 0.98 (+/- 0.00)
arrays_512_noPre (35264, 66)
Accuracy: 0.97 (+/- 0.00)


## CMN --> True, 512cf/X_CMN

In [69]:
path1 = "/home/daniel/Data/arrays_512/X_CMN.txt"
path2 = "/home/daniel/Data/arrays_512cf/X_CMN.txt"
path3 = "/home/daniel/Data/arrays_512cf_nor/X_CMN.txt"
paths = [path1, path2, path3]
for path in paths:
    X = np.loadtxt(path)
    print(path.split("/")[5], X.shape)
    X_train, X_test, y_train, y_test = train_test_split(X[:,:-1],X[:,-1],random_state=0)
    scores = cross_val_score(clf3, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

X_CMN.txt (35264, 66)
Accuracy: 0.97 (+/- 0.00)
X_CMN.txt (35264, 66)
Accuracy: 0.98 (+/- 0.00)
X_CMN.txt (35264, 66)
Accuracy: 0.97 (+/- 0.00)


## cepfilter True and energy True

In [70]:
path1 = "/home/daniel/Data/arrays_512cf_noEn/X_CMN.txt"
path2 = "/home/daniel/Data/arrays_512cf_noCepf/X_CMN.txt"
path3 = "/home/daniel/Data/arrays_512cf_noEn/X.txt"
path4 = "/home/daniel/Data/arrays_512cf_noCepf/X.txt"
paths = [path1, path2, path3, path4]
for path in paths:
    X = np.loadtxt(path)
    print(path.split("/")[4:],X.shape)
    X_train, X_test, y_train, y_test = train_test_split(X[:,:-1],X[:,-1],random_state=0)
    scores = cross_val_score(clf3, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

['arrays_512cf_noEn', 'X_CMN.txt'] (35264, 66)
Accuracy: 0.96 (+/- 0.00)
['arrays_512cf_noCepf', 'X_CMN.txt'] (35264, 66)
Accuracy: 0.96 (+/- 0.01)
['arrays_512cf_noEn', 'X.txt'] (35264, 66)
Accuracy: 0.91 (+/- 0.00)
['arrays_512cf_noCepf', 'X.txt'] (35264, 66)
Accuracy: 0.91 (+/- 0.01)


# Classifier

In [96]:
X = np.loadtxt("/home/daniel/Data/arrays_512cf_nor/X_CMN.txt")
X_train, X_test, y_train, y_test = train_test_split(X[:,:-1],X[:,-1],random_state=0)
for clf in classifiers:
    print("\n",str(clf).split("(")[0])
    t0=time()
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='recall', n_jobs=-1)
    print("precision: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2), "Ex. time:",round(time()-t0, 2))


 RandomForestClassifier
precision: 0.943 (+/- 0.009) Ex. time: 5.8

 DecisionTreeClassifier
precision: 0.873 (+/- 0.008) Ex. time: 3.64

 XGBClassifier
precision: 0.969 (+/- 0.002) Ex. time: 7355.22

 LogisticRegression
precision: 0.407 (+/- 0.018) Ex. time: 2.45

 KNeighborsClassifier
precision: 0.803 (+/- 0.008) Ex. time: 13.02

 MLPClassifier
precision: 0.966 (+/- 0.010) Ex. time: 1154.91


In [80]:
print(X.shape,X_train.shape)

(35264, 66) (26448, 65)


In [81]:
26448/35264

0.75

sklearn.linear_model.logistic.LogisticRegression

"<class 'sklearn.linear_model.logistic.LogisticRegression'>"