In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline

import os
import glob

from scipy.signal import find_peaks,lfilter, butter, welch
from tqdm import tqdm

from IPython import display

In [2]:
"""
Function performs bandpass filter for the digital signal 

Parameters: 
-----------
data: array_like
    An N-dimensional input array
lowcut: float
    Low cutoff frequency
highcut: float
    High cutoff frequency
fs: float
    Sampling rate of the signal
order: int
    The order of the filter.

Returns: 
-------
filtered_signal: array
    The output of the digital filter
"""

def butter_bandpass(data, lowcut, highcut, fs, order=2):
    nyq = 0.5 * fs
    
    high = highcut / nyq
    low = lowcut / nyq
    
    b, a = butter(order, [low, high], btype='bandpass', analog=False)
    filtered_signal = lfilter(b, a, data)
    
    return filtered_signal


"""
Fill NaN with zeros

Parameters: 
-----------
df: DataFrame
    DataFrame with NaNs
    
Returns: 
-------
df: Dataframe
    All NaNs filled by zeros
"""

def del_nul_and_nan(df):
    
    df.fillna(value=0, axis=1, inplace=True)    
    return df


"""


Parameters: 
-----------
df: pd.DataFrame
    An N-dimensional input DataFrame
init_target: pd.DataFrame
    targets of df
lowcut: float
    Low cutoff frequency
highcut: float
    High cutoff frequency
fs: float
    Sampling rate of the signal
flag_number: float
    special value which is unique for each recording (it should be the same for all windows from one recording)

Returns: 
-------
fft_out: pd.DataFrame
    An output Dataframe with spectrum power and frequencies
final_targets: pd.DataFrame
    An output DataFrame with all targets for the file
multiple_flags: pd.DataFrame
    An output DataFrame with all flags for the file
"""

def spec_and_freq_for_single_df(df, init_target, fs, flag_number, lowcut, highcut):
    
    fft_out = pd.DataFrame()
    final_targets = pd.DataFrame()
    final_target = []
    multiple_flag=[]
    target_raw = init_target.iloc[0]
    duration=5 #time duration of window in sec
    step = 500 #step of window beginning in msec
    t0=0 #start time
    end_point = int(t0+duration*fs)
    i=1 #counter
    while end_point < df.shape[0]:
        flag= flag_number
        for col in range(df.shape[1]):
            target = target_raw.iloc[col]
            end_point = int(t0+duration*fs)
            N = int(duration*fs)
            interesting_df = df.iloc[t0:end_point,col]
            filtered_yf = butter_bandpass(interesting_df, lowcut, highcut, fs, order=2)
            #print(col,t0,i)

            fft_yf = np.fft.fft(filtered_yf) #spectrum
            fft_xf = np.fft.fftfreq(N, 1/fs) #frequencies

            fft_20_index = np.argwhere((fft_xf<20) & (fft_xf>0))        
            fft_yf_20 = fft_yf[fft_20_index] #cutting on 20Hz
            fft_xf_20 = fft_xf[fft_20_index] #cutting on 20Hz

            fft_yf_20 = pd.DataFrame(np.abs(fft_yf_20) / N, columns=[df.keys()[col] + '_' + str(i) + '_yf'])
            fft_xf_20 = pd.DataFrame(fft_xf_20, columns=[df.keys()[col] + '_' + str(i) + '_xf']) 

            fft_out = pd.concat([fft_out, fft_yf_20, fft_xf_20], axis=1)
            final_target.append(target)
            multiple_flag.append(flag)
            flag += 1
        t0 = t0 + step
        i=i+1
    final_targets = pd.DataFrame(final_target,columns=['target'])
    multiple_flags = pd.DataFrame(multiple_flag,columns=['flag'])    
    return(fft_out, final_targets,multiple_flags)


"""
Creating DataFrame of target-flag combination 

Parameters: 
-----------
init_target: DataFrame
    Targets of initial dataset
flag_number: int
    Old number of flag from previous iteration
Returns: 
-------
target_flag: DataFrame
    combination of target-flag couples
flag_number: int
    the last number of flag after passing current file
"""

def target_flag_combination(init_target, flag_number):
    final_flag=[]
    final_target=[]
    target_raw = init_target.iloc[0]
    for col in range(init_target.shape[1]):
        target = target_raw.iloc[col]
        final_target.append(target)
        final_flag.append(flag_number)
        flag_number += 1
    final_targets = pd.DataFrame(final_target,columns=['target'])
    final_flags = pd.DataFrame(final_flag,columns=['flag'])
    target_flag = pd.concat([final_targets, final_flags], axis=1)  
    return(target_flag, flag_number)


"""
Creating the Fourier spectra of list of files

Parameters: 
-----------
df: list
    List with DataFrames of simultaneous OM (Optical) and EM (Electrode) Mapping
flag_number: float
    special value which is unique for each recording (it should be the same for all windows from one recording)   
Returns: 
-------
all_fft_el, all_fft_om: DataFrames
    Lists with spectrum and frequencies DataFrames
all_targets_el, all_targets_om: DataFrames
    Lists with targets for EM and OM DataFrames
all_flags_el, all_flags_om: DataFrames
    Lists with targets for EM and OM DataFrames
final_target_flag: DataFrame
    DataFrame with combined targets and flags
flag_number: int
    the last number of flag after passing all files
"""

def full_spec_and_freq(df, flag_number):
    
    all_fft_el = pd.DataFrame()
    all_fft_om = pd.DataFrame()
    all_targets_el = pd.DataFrame()
    all_targets_om = pd.DataFrame()
    all_flags_el = pd.DataFrame()
    all_flags_om = pd.DataFrame()
    
    electrode_signal = df[df.columns[::3]]
    optical_signal = df[df.columns[2::3]]
    init_target = df[df.columns[1::3]]

    electrode_signal = del_nul_and_nan(electrode_signal)
    optical_signal = del_nul_and_nan(optical_signal)        

    Fs_el = 1017.25 # sampling rate
    Fs_om = 1000.0    
    fft_el, target_el, flags_el = spec_and_freq_for_single_df(electrode_signal, init_target, Fs_el, flag_number, lowcut=2, highcut=20.0)
    fft_om, target_om, flags_om = spec_and_freq_for_single_df(optical_signal, init_target, Fs_om, flag_number, lowcut=2, highcut=20.0)
    final_target_flag, flag_number = target_flag_combination(init_target, flag_number)
                    
    all_fft_el = pd.concat([all_fft_el, fft_el], axis=1)
    all_fft_om = pd.concat([all_fft_om, fft_om], axis=1)
    all_targets_el = pd.concat([all_targets_el, target_el], axis=1)
    all_targets_om = pd.concat([all_targets_om, target_om], axis=1) 
    all_flags_el = pd.concat([all_flags_el, flags_el], axis=1)
    all_flags_om = pd.concat([all_flags_om, flags_om], axis=1)   
    return(all_fft_el, all_fft_om, all_targets_el, all_targets_om, all_flags_el, all_flags_om,final_target_flag, flag_number)

In [3]:
"""
Function that generates pd.DataFrame with different properties for n highest peaks. Properties are values of frequency, 
height, width, prominence. 

For more details about properties - https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.find_peaks.html

Parameters: 

full_df: Dataframe
Dataframe with fourier spectrum

n: int
number of peaks

lf_th: float
threshold for low frequency noise (1Hz by default)

Returns: 

features: DataFrame, shape=(full_df.shape[1]/2, n*4)
properties of n hightest peaks
"""
def properties_of_peaks(list_df, n, lf_th=1): 
    index = []
    freq = []
    height = []
    width = []
    prominence = []
    prom_ratio=[]
    height_ratio=[]
    snr = []
    ratio_columns = ['1 and 2', '1 and 3', '1 and 4', '2 and 3', '2 and 4', '3 and 4']
    
    features = pd.DataFrame()
    lengh = 0
    for k in range(len(list_df)):
        full_df = list_df[k]
        lengh = lengh + full_df.shape[1]
        df = full_df[full_df.columns[::2]][:-1]
        xf = full_df[full_df.columns[1::2]][:-1]
        all_peaks = []
        all_props = []
        for col in df:
            peaks, properties = find_peaks(df[col], height=0, width=0, prominence=0, rel_height=0.5)
            properties['peak_index'] = peaks
            all_props.append(properties)
            all_peaks.append(peaks)

        for i in range(len(all_props)):
            z = np.argsort(all_props[i]['peak_heights'])
            z = z[:-(n+1):-1]
            if len(z) < 5:
                zeros_raw_n=np.zeros(n)
                height.append(zeros_raw_n)
                width.append(zeros_raw_n)
                prominence.append(zeros_raw_n)
                ratio_height = []
                ratio_prominence = []
                ratio_height = np.zeros(n+1)
                ratio_prominence = np.zeros(n+1)
                height_ratio.append(ratio_height)
                prom_ratio.append(ratio_prominence)
                for j in range(n):
                    fr = np.zeros(1)
                    freq.append(fr)                
            else:       
                idx = all_props[i]['peak_index'][z] #index of max peaks
                for j in range(n):
                    fr = xf.iloc[:,i][all_peaks[i][z][j]] #freqs of max peaks
                    freq.append(fr)

                h = all_props[i]['peak_heights'][z] #heights of max peaks
                height.append(h)
                ratio_height = []
                ratio_height = np.append(ratio_height, h[0]/h[1]) #ratio between height of 1st highest peak and 2nd
                ratio_height = np.append(ratio_height, h[0]/h[2]) #ratio between height of 1st highest peak and 3rd
                ratio_height = np.append(ratio_height, h[0]/h[3]) #ratio between height of 1st highest peak and 4th
                ratio_height = np.append(ratio_height, h[1]/h[2]) #ratio between height of 2nd highest peak and 3rd
                ratio_height = np.append(ratio_height, h[1]/h[3]) #ratio between height of 2nd highest peak and 4th
                ratio_height = np.append(ratio_height, h[2]/h[3]) #ratio between height of 3rd highest peak and 4th
                height_ratio.append(ratio_height)
                w = all_props[i]['widths'][z] #width of max peaks
                width.append(w)
                p = all_props[i]['prominences'][z]
                prominence.append(p)
                ratio_prominence = []
                ratio_prominence = np.append(ratio_prominence,p[0]/p[1]) #ratio between prominence of 1st highest peak and 2nd
                ratio_prominence = np.append(ratio_prominence,p[0]/p[2]) #ratio between prominence of 1st highest peak and 3rd
                ratio_prominence = np.append(ratio_prominence,p[0]/p[3]) #ratio between prominence of 1st highest peak and 4th
                ratio_prominence = np.append(ratio_prominence,p[1]/p[2]) #ratio between prominence of 2nd highest peak and 3rd
                ratio_prominence = np.append(ratio_prominence,p[1]/p[3]) #ratio between prominence of 2nd highest peak and 4th
                ratio_prominence = np.append(ratio_prominence,p[2]/p[3]) #ratio between prominence of 3rd highest peak and 4th
                prom_ratio.append(ratio_prominence)
           

        y_col = [col for col in df.columns]
        y_df = df[y_col]

        for i in range(y_df.shape[1]):          
            s = y_df[y_col[i]][y_df[y_col[i]] != 0]

            _, properties = find_peaks(s, height=0)
            mean_max = np.mean(np.sort(properties['peak_heights'])[-2:])#np.max(properties['peak_heights'])
            sd = s.std(axis=0)
            if np.isnan(mean_max)==True:
                ratio = 0
            else:
                ratio = np.round(mean_max / sd, 2)
                ratio = np.where(sd == 0, 0, ratio)
            snr.append(ratio)
        
    snr = pd.DataFrame(snr, columns=['SNR'])        
    freq = np.reshape(freq, ((int(lengh/2)), n)) 
    freq = pd.DataFrame(freq, columns=['freq ' + str(i) for i in range(n)])
    height = pd.DataFrame(height, columns=['height ' + str(i) for i in range(n)])
    width = pd.DataFrame(width, columns=['width ' + str(i) for i in range(n)])
    prominence = pd.DataFrame(prominence, columns=['prominence ' + str(i) for i in range(n)])
    height_ratio = pd.DataFrame(height_ratio, columns=['height ratio between peaks '+ str(i) for i in ratio_columns])
    prom_ratio = pd.DataFrame(prom_ratio, columns=['prominence ratio between peaks '+ str(i) for i in ratio_columns])
    
    features = pd.concat([features, freq, height, width, prominence, height_ratio, prom_ratio,snr], axis=1)
    
    return(features)

"""
Function that generates pd.DataFrame with amount of peaks for (th*100)% threshhold. 

Parameters: 

full_df: Dataframe
Dataframe with fourier spectrum

th: float, from 0 to 1
threshhold

Returns: 

features: DataFrame, shape=(full_df[1]/2, 1)
number of peaks
"""

def number_of_peaks(list_df, th):
    num_of_peaks = []
    for k in range(len(list_df)):
        full_df = list_df[k]
        all_props = []
        df = full_df[full_df.columns[::2]][:-1]
        for col in df:
            _, properties = find_peaks(df[col], height=0)
            all_props.append(properties)

        for i in range(len(all_props)):
            if len(all_props[i]['peak_heights'])< 5:
                num =0
            else:
                max_height = np.max(all_props[i]['peak_heights'])
                peaks, _ = find_peaks(df.iloc[:,i], threshold=th*max_height)
                num = peaks.shape[0]
            num_of_peaks.append(num)
    num_of_peaks = pd.DataFrame(num_of_peaks, columns=['#peaks_' + str(th)])
    return(num_of_peaks)


"""
Function that generates list with calculated neighbor features with kernel 3 by 3

Parameters: 

previous_grid: list of shape (8,8)
Grid of one feature

Returns: 

final_grid: list of shape (8,8)
Calculated neighbor grid of one feature
"""
def calculation_neighbour_features(previous_grid):
    final_grid=[]
    for i in range(8):
        for j in range(8):
            if i ==0:
                if j==0:
                    final_grid.append(float((previous_grid[i+1,j]+previous_grid[i,j+1]+previous_grid[i+1,j+1])/3))
                elif j==7:
                    final_grid.append(float((previous_grid[i+1,j]+previous_grid[i,j-1]+previous_grid[i+1,j-1])/3))
                else:
                    final_grid.append(float((previous_grid[i+1,j]+previous_grid[i,j+1]+previous_grid[i+1,j+1]+previous_grid[i,j-1]+previous_grid[i+1,j-1])/5))
            if i != 0 and i!=7:
                if j==0:
                    final_grid.append(float((previous_grid[i+1,j]+previous_grid[i-1,j]+previous_grid[i+1,j+1]+previous_grid[i,j+1]+previous_grid[i-1,j+1])/5))
                elif j==7:
                    final_grid.append(float((previous_grid[i+1,j]+previous_grid[i-1,j]+previous_grid[i+1,j-1]+previous_grid[i,j-1]+previous_grid[i-1,j-1])/5))
                else:
                    final_grid.append(float((previous_grid[i,j-1]+previous_grid[i,j+1]+previous_grid[i+1,j]+previous_grid[i+1,j-1]+previous_grid[i+1,j+1]+previous_grid[i-1,j]+previous_grid[i-1,j-1]+previous_grid[i-1,j+1])/8))
            if i ==7:
                if j==0:
                    final_grid.append(float((previous_grid[i-1,j]+previous_grid[i,j+1]+previous_grid[i-1,j+1])/3))
                elif j==7:
                    final_grid.append(float((previous_grid[i-1,j]+previous_grid[i,j-1]+previous_grid[i-1,j-1])/3))
                else:
                    final_grid.append(float((previous_grid[i,j-1]+previous_grid[i,j+1]+previous_grid[i-1,j+1]+previous_grid[i-1,j-1]+previous_grid[i-1,j])/5))
    return(final_grid)

"""
Function that drops the raws with "hight 0" equal to 0
"""

def drop_empty_rows(features):
    drop_matrix=[]
    for i in range(features.shape[0]):
        if features.iloc[i]['height 0'] ==0:
            drop_matrix.append(i)
    features.drop(drop_matrix,0,inplace=True)
    return(features)

"""
Function that generates final pd.DataFrame with all features

Parameters: 

full_df: Dataframe
Dataframe with fourier spectrum

n: int
number of peaks

th1: float, from 0 to 1
threshhold

th2: float, from 0 to 1
threshhold

lf_th: float
low frequency threshhold

path: str
path to save the matrix

download: bool
download or not download feature matrix

Returns: 

features: DataFrame
features for full_df dataframe
"""

def create_feature_df(list_df, target, flag, n, th1, th2, lf_th, path, name, download=False):
    
    features = pd.DataFrame()
    
    properties = properties_of_peaks(list_df, n=n, lf_th=lf_th) 
    num_peak_1 = number_of_peaks(list_df, th=th1)
    num_peak_2 = number_of_peaks(list_df, th=th2)
    target.fillna(value=-1, axis=1, inplace=True)
    features = pd.concat([features, num_peak_1, num_peak_2, properties], axis=1)
    
    neighbour_features = pd.DataFrame()
    for raw in range(int(np.size(features,0)/64)): #number of basket
        features_basket = pd.DataFrame() 
        for col in range(int(np.size(features,1))): #features
            index_set=[] #index matrix for last column indexes
            for i in range(raw*64,(raw+1)*64):
                index_set.append(i)
            resized_set = np.resize(features.iloc[raw*64:(raw+1)*64,col],(8,8)) #8x8 grid for one feature
            if resized_set.sum() ==0:
                last_column = np.zeros(64)
            else:
                last_column = calculation_neighbour_features(resized_set)
            last_column = pd.DataFrame(last_column, columns = ['neighbour ' + features.columns[col]], index = [index_set])
            features_basket = pd.concat([features_basket,last_column], axis=1)
        neighbour_features = pd.concat([neighbour_features,features_basket], axis=0)
        
    features_with_neighbour = pd.concat([s.reset_index(drop=True) for s in [features,neighbour_features,target,flag]], axis=1)
    features = pd.concat([s.reset_index(drop=True) for s in [features,target]], axis=1)
    
    features_with_neighbour = drop_empty_rows(features_with_neighbour)
    features = drop_empty_rows(features)
    
    if download == True: 
        features_with_neighbour.to_csv(path + name)
        return(features_with_neighbour)
    else:
        return(features_with_neighbour)

In [4]:
path = r'D:\Data\reannotation\All annotations with semidrivers'

In [5]:
MEM_spectra = []
NIOM_spectra=[]
all_targets_el=pd.DataFrame()
all_targets_om=pd.DataFrame()
all_flags_el=pd.DataFrame()
all_flags_om=pd.DataFrame()
final_target_flags=pd.DataFrame()
flag_number = 0
for filename in tqdm(glob.glob(os.path.join(path, '*.csv'))):
    data = pd.read_csv(filename, sep=';',header=0)
    print(filename, data.shape)
    path, file = os.path.split(filename)
    MEM_spectrum,NIOM_spectrum, target_el,target_om, flag_el,flag_om,final_target_flag, flag_number = full_spec_and_freq(data, flag_number)
    print(MEM_spectrum.shape,NIOM_spectrum.shape)
    MEM_spectra.append(MEM_spectrum)
    NIOM_spectra.append(NIOM_spectrum)
    all_targets_el = pd.concat([all_targets_el, target_el], axis=0)
    all_targets_om = pd.concat([all_targets_om, target_om], axis=0)
    all_flags_el = pd.concat([all_flags_el, flag_el], axis=0)
    all_flags_om = pd.concat([all_flags_om, flag_om], axis=0)
    final_target_flags = pd.concat([final_target_flags, final_target_flag], axis=0)

  0%|                                                                                            | 0/7 [00:00<?, ?it/s]

D:\Data\reannotation\Annotation with semidrivers HD\F26.csv (16667, 192)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


flag 1 0
flag 2 0
flag 3 0
(99, 3200) (99, 3200)


 14%|████████████                                                                        | 1/7 [00:32<03:17, 32.96s/it]

D:\Data\reannotation\Annotation with semidrivers HD\F27.csv (16383, 192)
flag 1 64
flag 2 64
flag 3 64
(99, 3072) (99, 3072)


 29%|████████████████████████                                                            | 2/7 [01:06<02:44, 32.99s/it]

D:\Data\reannotation\Annotation with semidrivers HD\F28.csv (18311, 192)
flag 1 128
flag 2 128
flag 3 128
(99, 3584) (99, 3584)


 43%|████████████████████████████████████                                                | 3/7 [01:48<02:23, 35.87s/it]

D:\Data\reannotation\Annotation with semidrivers HD\F29.csv (18311, 192)
flag 1 192
flag 2 192
flag 3 192
(99, 3584) (99, 3584)


 57%|████████████████████████████████████████████████                                    | 4/7 [02:30<01:52, 37.60s/it]

D:\Data\reannotation\Annotation with semidrivers HD\F30.csv (10173, 192)
flag 1 256
flag 2 256
flag 3 256
(99, 1536) (99, 1536)


 71%|████████████████████████████████████████████████████████████                        | 5/7 [02:40<00:58, 29.44s/it]

D:\Data\reannotation\Annotation with semidrivers HD\F31.csv (10173, 192)
flag 1 320
flag 2 320
flag 3 320
(99, 1536) (99, 1536)


 86%|████████████████████████████████████████████████████████████████████████            | 6/7 [02:51<00:23, 23.97s/it]

D:\Data\reannotation\Annotation with semidrivers HD\F32.csv (10173, 192)
flag 1 384
flag 2 384
flag 3 384
(99, 1536) (99, 1536)


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [03:02<00:00, 26.08s/it]


In [17]:
features_el = create_feature_df(MEM_spectra, all_targets_el,all_flags_el, n=5, th1=0.05, th2=0.1, lf_th=1.5, path=path, name = '\Feature matrix electrode signal AF semidrivers with flags.csv', download=False)

In [11]:
features_el['target'].value_counts()

 0.0    20747
 0.5     3356
 1.0     2935
-1.0     1634
Name: target, dtype: int64