classificar em 2 a 8 hz

In [1]:
import pyedflib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.fftpack import rfft, irfft, fftfreq
import scipy as sp
from scipy import signal

import os
import itertools
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.decomposition import PCA

%matplotlib inline

In [2]:
HOME = "/home/jonnatas/Downloads/DB_REPO/edf/"
def dadosPK(tipo):
    dir_ct = HOME+tipo+"/"
    return os.listdir(HOME+tipo+"/")
def carregarDataFrame(tipo, caminho):
    caminho = HOME+tipo+"/"+caminho
    
    edf = pyedflib.EdfReader(caminho)
    n = edf.signals_in_file
    sigbufs = np.zeros((n, edf.getNSamples()[0]))
    for i in np.arange(n):
         sigbufs[i, :] = edf.readSignal(i)
    edf._close()
    del edf
    data = sigbufs.T
    
    return pd.DataFrame(data=data, columns=['ch1', 'ch2', 'ch3', 'ch4'])

In [3]:
def plot_confusion_matrix(cm, classes, normalize=True, title='Matriz de confusão, sem normalização', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.figure(figsize=(12,6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('Esperado')
    plt.xlabel('Obtido')

In [18]:
def fft(df, canal):
    amostras_validas = df[canal][2000:11000]
    amostras = int(amostras_validas.shape[0]*500/2000)
    sinal = np.abs(np.fft.fft(amostras_validas))[:amostras]
    freq = np.linspace(0,500,amostras)

    return (sinal,freq)

def frequecia(df, canal):
    sinalFFT = [fft(data, canal)[0] for data in df]
    return sinalFFT

In [5]:
def pcaCLF(clf, trainData, rotulos, random_state=30, test_size=0.3,n_components=30, kfold=30):
    
    
    X_train, X_test, y_train, y_test = train_test_split(trainData, rotulos, test_size=test_size, random_state=random_state)
    #Raw score
    clf.fit(X_train, y_train)
    print('Raw score', clf.score(X_test, y_test))
    
    #cross-validation
    scores = cross_val_score(clf, trainData, rotulos, cv=kfold)      
    print("score cross validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    
    #PCA
    pca = PCA(n_components=n_components)
    pca.fit(trainData)
    X_t_train = pca.transform(X_train)
    X_t_test = pca.transform(X_test)
    
    clf.fit(X_t_train, y_train)
    print('score PCA ', clf.score(X_t_test, y_test))
   
    #PCA + cross-validation
    pcaCV = PCA(n_components=n_components)
    pcaCV.fit(trainData)
    t_trainData = pcaCV.transform(trainData)
    #cross-validation
    scores = cross_val_score(clf, t_trainData, rotulos, cv=kfold)   
    print("score PCA cross validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


In [6]:
nomes_parkinson = dadosPK('parkinson')
nomes_controle = dadosPK('controle')
df_ct = [ carregarDataFrame('controle', nome) for nome in nomes_controle]
df_pk = [ carregarDataFrame('parkinson', nome) for nome in nomes_parkinson]

# Filtrando os dados com a fft

In [7]:
sinalCT = frequecia(df_ct, 'ch1')
sinalPK = frequecia(df_pk, 'ch1')

rotulosCT = [0 for _ in sinalCT]
rotulosPK = [1 for _ in sinalPK]

trainData = sinalCT + sinalPK
rotulos = rotulosCT + rotulosPK

## Separando os dados (Treino, teste e validação)

In [8]:
random_state=0
n_components = min(len(trainData), len(trainData[0]))
kfold = 30
nomes = ['controle','parkinson']
X_train, X_test, y_train, y_test = train_test_split(trainData, rotulos, test_size=0.3, random_state=random_state)


In [9]:
rfc = RandomForestClassifier(random_state=random_state, n_estimators=50, max_features='log2', max_depth=None)
pcaCLF(rfc, trainData, rotulos)

Raw score 0.7692307692307693
score cross validation: 0.81 (+/- 0.54)
score PCA  0.8076923076923077
score PCA cross validation: 0.82 (+/- 0.47)


In [10]:
svm = SVC(C=1, gamma=0.1, kernel='poly', degree=3)  
pcaCLF(svm, trainData, rotulos)

Raw score 0.6923076923076923
score cross validation: 0.74 (+/- 0.56)
score PCA  0.6538461538461539
score PCA cross validation: 0.78 (+/- 0.62)


# Persistence

import pickle
from sklearn.externals import joblib

# save the model to disk
pickle.dump(rfc, open('rfc.sav', 'wb'))
pickle.dump(svm, open('svm.sav', 'wb'))
 

joblib.dump(rfc, 'rfc.sav')
joblib.dump(svm, 'svm.sav')

# load the model from disk
filename = 'rfc.sav'
loaded_model = joblib.load(filename)
pcaCLF(loaded_model, trainData, rotulos)
