classificar em 2 a 8 hz

In [1]:
import pyedflib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.fftpack import rfft, irfft, fftfreq
import scipy as sp
from scipy import signal

import os
import itertools
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.decomposition import PCA

%matplotlib inline

In [2]:
HOME = "/home/jonnatas/git/mlModels/sistema/modelo/DB_REPO/edf/"
def dadosPK(tipo):
    dir_ct = HOME+tipo+"/"
    return os.listdir(HOME+tipo+"/")
def carregarDataFrame(tipo, caminho):
    caminho = HOME+tipo+"/"+caminho
    
    edf = pyedflib.EdfReader(caminho)
    n = edf.signals_in_file
    sigbufs = np.zeros((n, edf.getNSamples()[0]))
    for i in np.arange(n):
         sigbufs[i, :] = edf.readSignal(i)
    edf._close()
    del edf
    data = sigbufs.T
    
    return pd.DataFrame(data=data, columns=['ch1', 'ch2', 'ch3', 'ch4'])

In [3]:
def plot_confusion_matrix(cm, classes, normalize=True, title='Matriz de confusão, sem normalização', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.figure(figsize=(12,6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('Esperado')
    plt.xlabel('Obtido')

In [4]:
def fft(df, canal):
    amostras_validas = df[canal][2000:11000]
    amostras = int(amostras_validas.shape[0]*500/2000)
    sinal = np.abs(np.fft.fft(amostras_validas))[:amostras]
    freq = np.linspace(0,500,amostras)

    return (sinal,freq)

def frequecia(df, canal):
    sinalFFT = [fft(data, canal)[0] for data in df]
    return sinalFFT

In [5]:
def pcaCLF(clf, trainData, rotulos, random_state=30, test_size=0.3,n_components=30, kfold=30):
    
    
    X_train, X_test, y_train, y_test = train_test_split(trainData, rotulos, test_size=test_size, random_state=random_state)
    #Raw score
    clf.fit(X_train, y_train)
    print('Raw score', clf.score(X_test, y_test))
    
    #cross-validation
    scores = cross_val_score(clf, trainData, rotulos, cv=kfold)      
    print("score cross validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    
    #PCA
    pca = PCA(n_components=n_components)
    pca.fit(trainData)
    X_t_train = pca.transform(X_train)
    X_t_test = pca.transform(X_test)
    
    clf.fit(X_t_train, y_train)
    print('score PCA ', clf.score(X_t_test, y_test))
   
    #PCA + cross-validation
    pcaCV = PCA(n_components=n_components)
    pcaCV.fit(trainData)
    t_trainData = pcaCV.transform(trainData)
    #cross-validation
    scores = cross_val_score(clf, t_trainData, rotulos, cv=kfold)   
    print("score PCA cross validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


In [6]:
nomes_parkinson = dadosPK('parkinson')
nomes_controle = dadosPK('controle')
df_ct = [ carregarDataFrame('controle', nome) for nome in nomes_controle]
df_pk = [ carregarDataFrame('parkinson', nome) for nome in nomes_parkinson]

# Filtrando os dados com a fft

In [24]:
sinalCT = frequecia(df_ct, 'ch1')
sinalPK = frequecia(df_pk, 'ch1')

rotulosCT = [0 for _ in sinalCT]
rotulosPK = [1 for _ in sinalPK]

trainData = sinalCT + sinalPK
rotulos = rotulosCT + rotulosPK
print(X_train)

[array([26958.83684211, 49988.79699492, 23700.45003159, ...,
         772.06936506,    86.38641717,   178.37312249]), array([5390.9       ,  260.27476548,  328.93893794, ...,  609.58229936,
        436.73989925,  719.79730995]), array([5283.23214286,  446.00421988,  512.91382417, ...,  532.83815736,
        380.6286292 ,  650.26143758]), array([15819.50694444, 10846.09004275,  4491.27983909, ...,
         204.61200697,   200.59625108,   483.59521403]), array([20845.04417178,  1675.59618358,  1539.54510771, ...,
         272.23172187,   365.47119823,   466.11321746]), array([15996.47619048, 41383.95090445, 22551.16180308, ...,
         500.75406406,   537.08217618,   201.00445625]), array([ 9575.98732394, 49442.97030921, 29319.41988874, ...,
         467.16935647,   260.83279142,   650.75462726]), array([3163.26976744,   35.21009018, 2499.18175935, ...,  633.30963496,
        518.5572047 ,  254.36506494]), array([ 4595.64285714,  7897.51280363, 10012.31576879, ...,
         462.56439219

## Separando os dados (Treino, teste e validação)

In [8]:
random_state=0
n_components = min(len(trainData), len(trainData[0]))
kfold = 30
nomes = ['controle','parkinson']
X_train, X_test, y_train, y_test = train_test_split(trainData, rotulos, test_size=0.3, random_state=random_state)


In [23]:
rfc = RandomForestClassifier(random_state=random_state, n_estimators=50, max_features='log2', max_depth=None)
pcaCLF(rfc, trainData, rotulos)
print(rfc.predict(trainData[0].T))

Raw score 0.7307692307692307
score cross validation: 0.75 (+/- 0.56)
score PCA  0.7692307692307693
score PCA cross validation: 0.77 (+/- 0.50)


ValueError: Expected 2D array, got 1D array instead:
array=[7815.4224   726.16016  985.2265  ...  714.7276   182.35501  388.6333 ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [10]:
svm = SVC(C=1, gamma=0.1, kernel='poly', degree=3)  
pcaCLF(svm, trainData, rotulos)

Raw score 0.8076923076923077
score cross validation: 0.68 (+/- 0.53)
score PCA  0.6538461538461539
score PCA cross validation: 0.74 (+/- 0.49)


# Persistence

In [11]:
from sklearn.externals import joblib

In [14]:
# save the model to disk
joblib.dump(rfc, 'rfc.sav')
joblib.dump(svm, 'svm.sav')

# Save to file in the current working directory
joblib_file = "joblib_model.pkl"  
joblib.dump(rfc, joblib_file)

['joblib_model.pkl']

In [15]:
# load the model from disk
filename = 'rfc.sav'
loaded_model = joblib.load(joblib_file)
pcaCLF(loaded_model, trainData, rotulos)


Raw score 0.7307692307692307
score cross validation: 0.75 (+/- 0.56)
score PCA  0.7692307692307693
score PCA cross validation: 0.78 (+/- 0.50)
