In [3]:
import pandas as pd
import scipy.io.wavfile as wf
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
from scipy import signal
import scipy.io.wavfile as wav
import neurokit as nk
import os
from numpy import zeros, floor, log10, log, mean, array, sqrt, vstack, cumsum,ones, log2, std
import glob
import scipy.io
import librosa
import numpy as np
import re
import pywt
import sklearn 
import gc
#from voice_detection import VoiceActivityDetector as VAD
from sklearn.preprocessing import MinMaxScaler
import keras
from keras.layers import Input,Dense
from keras.models import Model
#from tsfresh import extract_features
from numpy.fft import fft
from numpy import zeros, floor, log10, log, mean, array, sqrt, vstack, cumsum, ones, log2, std
from numpy.linalg import svd, lstsq


In [154]:
def loading_mat(file_path,channels):
    """
    Purpose : loading and preprocess .mat EEG files
    Arg:
        file_path : file directory
        channels : number of eeg channels processed in .mat file
    Output:
        df : dataframe for the EEG signals
    """
    mat = scipy.io.loadmat(file_path)
    mat = {k:v for k, v in mat.items() if k[0]!='_'}
    names = list(mat.keys())
    df = pd.DataFrame()
    for key in names[:channels]:
        value = mat[key]
        value = pd.DataFrame(value)
        df = pd.merge(df,value,how='right',right_index=True,left_index=True)
    df.columns = names[:channels]
    return df
def get_mfcc(file_path,sr_):
    """
    Purpose : compute mfcc features from a audio file
    Arg:
        file_path : audio file
    Output:
        mf : dataframe for mfcc features
    """
#     (rate,sig) = wav.read(file_path)
#     mfcc_feat = mfcc(sig,rate,winstep=0.01)
    df = pd.DataFrame()
    for file in range(30):
        path = file_path + str(file) + ".wav"
        y, sr = librosa.load(path,sr = sr_)
        mfcc_feat = librosa.feature.mfcc(y=y, sr=sr_,hop_length = 1, n_mfcc=13)
        mf = pd.DataFrame(mfcc_feat)
        mf = np.transpose(mf)
        df = pd.concat([df,mf])
    return df
def get_articulatory(file_path):
    """
    Purpose: load and preprocess the articulatory .csv for 30 sentences
    Arg:
        file_path: folder of 30 sentences for one subject
    Output: 
        articulatory df
    """
    df = pd.DataFrame()
    for file in range(30):
        path = file_path + str(file) + ".csv"
        sent_df = pd.read_csv(path)
        sent_df.columns = ["a1", "a2", "a3", "a4", "a5", "a6"]
        sent_df.loc[:,"Sentence"] = file
        df = pd.concat([df, sent_df])
    return df.reset_index(drop=True)

def get_audio_signal(file_path,sr):
    """
    Purpose : compute audio signal from a audio file
    Arg:
        file_path : audio file
    Output:
        y : dataframe for audio signal features
    """
    y, sr = librosa.load(file_path,sr=sr)
    y = pd.DataFrame(y)
    return y
def trimm_mfcc(file_path,label):
    """
    Trimming the mfcc by the label mapping
    Arg:
        file_path : the mfcc path
        label : label_mapping.csv
    Output:
        mf_df : trimmed mfcc file
    """
    mf_df = get_mfcc(file_path)
    pre_len = 0
    mf_df['code'] = None
    mf_df['word'] = None
    for row in range(len(label)):
        dif,code,word = label.iloc[row][['dif','code','word']]
        dif = round(dif,1)
        if dif==0:
            dif = 0.1
        if pre_len ==0:
            start = int(pre_len)*100
            end = int(pre_len*100)+int(dif*100)-1
        else:
            start = pre_len
            end = pre_len + int(dif*100)
        mf_df.at[start:end,['code','word']] = code,word
        pre_len = end
    mf_df = mf_df[mf_df['code'].notnull()]
    return mf_df


In [158]:
import librosa
y,sr = librosa.core.load("audio/jay1/1.wav")

In [159]:
len(y)//sr

3

In [157]:
# EEG_data = loading_mat("EEG_data/jay1_Filters.mat", 31)
# label = pd.read_csv("marker_data/dataframes/jay1_df.csv")
mfcc = get_mfcc("audio/jay1/", 100)
artic = get_articulatory("Articulatory_data/jay1/")
print(mfcc.shape)
print(artic.shape)
print(artic)

(10312, 13)
(10222, 7)
             a1        a2        a3        a4        a5        a6  Sentence
0      0.086838 -0.121088 -1.244090 -1.067202 -0.316126 -0.763496         0
1      0.177641 -0.142149 -1.293557 -1.054068 -0.368612 -0.721482         0
2      0.267745 -0.168539 -1.340196 -1.035811 -0.415383 -0.674794         0
3      0.343848 -0.194861 -1.371688 -1.022014 -0.441479 -0.627153         0
4      0.397213 -0.214886 -1.381066 -1.021195 -0.440705 -0.581629         0
...         ...       ...       ...       ...       ...       ...       ...
10217 -0.467356 -0.712224  0.102272  0.412311  0.333031 -0.191934        29
10218 -0.525288 -0.657675  0.132488  0.338855  0.420778 -0.172223        29
10219 -0.564131 -0.598585  0.184833  0.289414  0.482768 -0.124834        29
10220 -0.592099 -0.534039  0.249298  0.256945  0.531249 -0.062952        29
10221 -0.621053 -0.463667  0.316935  0.232585  0.580827 -0.000242        29

[10222 rows x 7 columns]


In [None]:
label.iloc[0]

In [135]:

def trim_EEG(EEG_data, marker_data):
    output = []
    df = pd.DataFrame()
    for row in range(len(marker_data)):
        start, end = marker_data.iloc[row]
        EEG_start = int(start * 1000)
        EEG_end = int(end * 1000)
        window = EEG_data.iloc[EEG_start+1:EEG_end-1,]
        window.loc[:,"Sentence"] = row
        #output.append(window)
        df = pd.concat([df, window])
    return df.reset_index(drop=True)
trim = trim_EEG(EEG_data, label)
trim

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Unnamed: 0,Fp1,Fz,F3,F7,FT9,FC5,FC1,C3,T7,TP9,...,CP2,C4,T8,FT10,FC6,FC2,F4,F8,Fp2,Sentence
0,30.273312,29.359789,35.896530,46.829239,47.339523,32.360237,28.087755,37.642586,24.123953,33.641674,...,5.103919,30.150871,33.061317,44.900940,30.919868,26.091274,23.977362,23.272577,13.109600,0
1,30.380035,29.846155,36.348316,46.720852,48.055157,33.151192,28.164606,37.502579,24.842512,37.457886,...,5.422882,30.509935,32.228752,44.955849,30.558807,26.383636,24.464115,22.853870,14.160234,0
2,29.958183,30.112015,36.628563,46.263279,48.433105,33.455353,28.154905,37.264664,25.448910,38.727966,...,5.580903,30.940941,31.286001,45.114067,30.062483,26.632214,24.785841,22.421146,14.666398,0
3,28.958513,30.049828,36.601723,45.405296,48.501381,33.260632,28.015406,36.958488,25.871763,37.241631,...,5.557716,31.375088,30.286585,45.248978,29.405685,26.715206,24.805420,21.981754,14.375419,0
4,27.493750,29.616899,36.217869,44.195816,48.346493,32.680389,27.724592,36.632881,26.112572,33.760189,...,5.386588,31.724718,29.252106,45.208443,28.569763,26.563412,24.424519,21.505655,13.236904,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102753,-18.416998,12.270480,5.748578,-17.894714,1.981005,4.008342,11.487144,7.120096,3.779440,14.997946,...,10.846295,10.992861,19.594849,28.552872,12.464072,14.388573,3.185489,16.343039,7.583976,29
102754,-18.784754,11.971295,5.628751,-17.874226,1.316536,4.321395,11.582186,7.912390,3.899096,17.698071,...,10.860739,10.252621,17.136614,26.796537,11.108159,14.446994,2.209593,15.283985,7.404121,29
102755,-19.456444,11.546941,5.198599,-18.143768,0.194912,4.337282,11.643970,8.430668,3.633068,21.086306,...,10.753442,9.457969,14.500813,24.532343,9.511013,14.429818,1.154181,13.838909,6.852047,29
102756,-20.326025,11.024794,4.550863,-18.678337,-1.174308,4.084603,11.602949,8.500950,3.064466,24.233812,...,10.473352,8.794523,12.387587,22.322336,7.977871,14.339578,0.243596,12.269120,6.037446,29


In [136]:
features = feature_compute(trim.iloc[:,:-1], 10, trim.iloc[:,-1])
jay1_feats = features.features_computing()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102758 entries, 0 to 102757
Data columns (total 31 columns):
Fp1     102758 non-null float64
Fz      102758 non-null float64
F3      102758 non-null float64
F7      102758 non-null float64
FT9     102758 non-null float64
FC5     102758 non-null float64
FC1     102758 non-null float64
C3      102758 non-null float64
T7      102758 non-null float64
TP9     102758 non-null float64
CP5     102758 non-null float64
CP1     102758 non-null float64
Pz      102758 non-null float64
P3      102758 non-null float64
P7      102758 non-null float64
O1      102758 non-null float64
Oz      102758 non-null float64
O2      102758 non-null float64
P4      102758 non-null float64
P8      102758 non-null float64
TP10    102758 non-null float64
CP6     102758 non-null float64
CP2     102758 non-null float64
C4      102758 non-null float64
T8      102758 non-null float64
FT10    102758 non-null float64
FC6     102758 non-null float64
FC2     102758 non-null f

In [137]:
jay1_feats.shape

(10276, 155)

In [69]:

class feature_compute():
    def __init__(self, data, window_size,label):
        self.data = data
        self.window_size = window_size
        self.label = label
        #self.map_file = map_file

    def spectral_entropy(self, col_data):
        """
        Purpose : compute spectral entropy accross the data
        Arg:
            data : input EEG signals data
            window_sze : moving window size
        Output:
            df : results after computing moving spectral entropy
        """
        data = col_data
        window_size = self.window_size
        df = []
        for i in range(int(len(data)/window_size)):
            complexity = nk.complexity(data[i*window_size:(i+1)*window_size],spectral=True,shannon=False,sampen=False,multiscale=False,svd = False,correlation=False,
                              higushi = False,petrosian=False,fisher = False,hurst = False,dfa= False,lyap_r=False,lyap_e =False)
            df.append(complexity['Entropy_Spectral'])
        df = pd.DataFrame(df)
        return df.reset_index(drop=True)
    def kurtosis(self, col_data):
        """
        Purpose : compute kurtosis accross the data
        Arg:
            data : input EEG signals data
            window_sze : moving window size
        Output:
            df : results after computing moving spectral entropy
        """    
        data = col_data
        window_size = self.window_size
        df=[]
        for i in range(int(len(data)/window_size)):
            df.append(data[i*window_size:(i+1)*window_size].kurtosis())
        df = pd.DataFrame(df)
        return df.reset_index(drop=True)
    def average(self, col_data):
        """
        Purpose : compute moving average accross the data
        Arg:
            data : input EEG signals data
            window_sze : moving window size
        Output:
            df : results after computing moving spectral entropy
        """
        data = col_data
        window_size = self.window_size
        df = pd.DataFrame()
        for i in range(int(len(data)/window_size)):
            dt = np.array(data[i*window_size:(i+1)*window_size])
            window = np.ones((window_size,))/float(window_size)
            dt = pd.DataFrame(np.convolve(dt, window,mode='valid'))
            df = pd.concat([df,dt])
        return df.reset_index(drop=True)

    def root_mean_square(self, col_data):
        """
        Purpose : compute root mean square accross the data
        Arg:
            data : input EEG signals data
            window_sze : moving window size
        Output:
            df : results after computing moving spectral entropy
        """
        data = col_data
        window_size = self.window_size
        df = pd.DataFrame()
        for i in range(int(len(data)/window_size)):
            dt = np.power(data[i*window_size:(i+1)*window_size],2)
            window = np.ones((window_size,))/float(window_size)
            dt = pd.DataFrame(np.sqrt(np.convolve(dt, window,mode='valid')))
            df = pd.concat([df,dt])
        return df.reset_index(drop=True)
    def zero_crossing_rate(self, col_data):
        """
        Purpose : compute root mean square accross the data
        Arg:
            data : input EEG signals data
            window_sze : moving window size
        Output:
            df : results after computing moving spectral entropy
        """
        data = col_data.values
        window_size = self.window_size
        df = pd.DataFrame()
        df = librosa.feature.zero_crossing_rate(data,frame_length=100,hop_length=10)
        df = pd.DataFrame(np.transpose(df))
        return df.reset_index(drop=True)
    def features_computing(self, trimmed = False, type_tf = 0):
        """
        Purpose : take the EEG signals data and computing moving features
        Arg:
            self:
        Output:
            df: the dataframe with 5 statistical features for each channels
        """
        if trimmed == True:
            data = self.trimmed_data(rate = 1000)
            labeling = data[['code','word']]
            data = data.drop(columns=['code','word'])
        else:
            data = self.data
        window_size = self.window_size
        df = pd.DataFrame()
        df_step = pd.DataFrame()
        print(data.info())
        if type_tf == 0:
            for column in data.columns:
                col_data = data[column]
                zero = self.zero_crossing_rate(col_data)
                rms = self.root_mean_square(col_data)
                avg = self.average(col_data)
                kurt = self.kurtosis(col_data)
                entropy = self.spectral_entropy(col_data)
                df_step = pd.concat([zero,rms,avg,kurt,entropy],axis=1)
                df_step.columns = [column+'_'+ feature for feature in ['zero','rms','avg','kurt','entropy']]
                df = pd.concat([df,df_step], axis=1)
                print('column %s is completed'%(column))
        df = df.reset_index(drop=True)
        return df


In [None]:
def encode(df,encoding_dim):
    """
        Encoding 
    """
    X = df.values
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)
    ncol = X.shape[1]

    input_dim = Input(shape = (ncol, ))
    # DEFINE THE DIMENSION OF ENCODER ASSUMED 3
    encoding_dim = encoding_dim
    # DEFINE THE ENCODER LAYERS

    encoded1 = Dense(155, activation = 'relu')(input_dim)
    encoded2 = Dense(100, activation = 'relu')(encoded1)
    encoded3 = Dense(55, activation = 'relu')(encoded2)
    encoded4 = Dense(30, activation = 'relu')(encoded3)
    encoded5 = Dense(15, activation = 'relu')(encoded4)
    encoded6 = Dense(encoding_dim, activation = 'relu')(encoded5)
    # DEFINE THE DECODER LAYERS

    decoded1 = Dense(15, activation = 'relu')(encoded5)
    decoded2 = Dense(30, activation = 'relu')(decoded1)
    decoded3 = Dense(55, activation = 'relu')(decoded2)
    decoded4 = Dense(100, activation = 'relu')(decoded3)
    decoded5 = Dense(155, activation = 'sigmoid')(decoded4)
    decoded6 = Dense(ncol, activation = 'relu')(encoded5)
    # COMBINE ENCODER AND DECODER INTO AN AUTOENCODER MODEL
    autoencoder = Model(input = input_dim, output = decoded5)
    # CONFIGURE AND TRAIN THE AUTOENCODER
    autoencoder.compile(optimizer = 'adam', loss = 'binary_crossentropy')
    autoencoder.summary()
    autoencoder.fit(X, X, nb_epoch = 500, batch_size = 100, shuffle = True, validation_data = (X, X))
    # THE ENCODER TO EXTRACT THE REDUCED DIMENSION FROM THE ABOVE AUTOENCODER
    encoder = Model(input = input_dim, output = encoded5)
    encoded_input = Input(shape = (encoding_dim, ))
    encoded_out = encoder.predict(X)
    encoded_out = pd.DataFrame(encoded_out)
#     encoded_out = pd.concat([encoded_out,y],axis=1)
    return encoded_out
def pad_sequences(sequences, maxlen=None, dtype=np.float32,
                  padding='post', truncating='post', value=0.):
    '''Pads each sequence to the same length: the length of the longest
    sequence.
        If maxlen is provided, any sequence longer than maxlen is truncated to
        maxlen. Truncation happens off either the beginning or the end
        (default) of the sequence. Supports post-padding (default) and
        pre-padding.
        Args:
            sequences: list of lists where each element is a sequence
            maxlen: int, maximum length
            dtype: type to cast the resulting sequence.
            padding: 'pre' or 'post', pad either before or after each sequence.
            truncating: 'pre' or 'post', remove values from sequences larger
            than maxlen either in the beginning or in the end of the sequence
            value: float, value to pad the sequences to the desired value.
        Returns
            x: numpy array with dimensions (number_of_sequences, maxlen)
            lengths: numpy array with the original sequence lengths
    '''
    lengths = np.asarray([len(s) for s in sequences], dtype=np.int64)

    nb_samples = len(sequences)
    if maxlen is None:
        maxlen = np.max(lengths)

    # take the sample shape from the first non empty sequence
    # checking for consistency in the main loop below.
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break

    x = (np.ones((nb_samples, maxlen) + sample_shape) * value).astype(dtype)
    for idx, s in enumerate(sequences):
        if len(s) == 0:
            continue  # empty list was found
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError('Truncating type "%s" not understood' % truncating)

        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' %
                             (trunc.shape[1:], idx, sample_shape))

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return x, lengths