In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import os
import glob

from scipy.signal import find_peaks
from sklearn.preprocessing import StandardScaler
from skimage.restoration import denoise_tv_chambolle

from tqdm import tqdm_notebook
from IPython import display

**Summary**

This notebook generates feature matrices from the Fourier spectra of the electrode and optical mapping recordings.

Features description:

-  *freq i* - frequency of the i$^{th}$ highest peak
-  *height i* - height of the i$^{th}$ highest peak
-  *width i* - width of the i$^{th}$ highest peak
-  *prominence i* - prominence of the i$^{th}$ highest peak
-  *#peaks_th* - number of peaks for the given threshold (th)
-  *low_freq_noise* - presence of low-frequency noise (the frequency of one of the n highest peaks lies in the interval from 0 to lf_th Hz)


### Upload data

In [2]:
# Root folder holding the raw spectrum tables.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
path = r'C:\Users\ecath\Desktop\Research\Raw Data'

# Raw strings keep the Windows separators literal; the previous plain literals
# relied on the invalid escape sequences '\L' and '\S' being passed through,
# which Python reports as a Deprecation/SyntaxWarning.
mem_spectrum = pd.read_csv(path + r'\LD dataset spectrum\Spectrum of electrode LD.csv', index_col=0)
niom_spectrum = pd.read_csv(path + r'\LD dataset spectrum\Spectrum of optical LD.csv', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


### Scaling

In [3]:
def scaling(df):
    """
    Standardise every '_yf' column of `df` and return the same frame.

    The row labelled 'target' is set aside, exact zeros in the remaining rows
    are treated as missing values, each column is passed through
    `StandardScaler`, and the missing entries are written back as 0. The
    'target' row is then appended unchanged.

    Parameters:
    -----------
    df: pd.DataFrame
        Frame whose '_yf' columns are scaled; it must contain a row labelled
        'target'.

    Returns:
    -------
    df: pd.DataFrame
        The input frame itself - its '_yf' columns are overwritten in place.
    """
    yf_names = [name for name in df.columns if '_yf' in name]
    spectra = df[yf_names]

    # Keep the 'target' row as a one-row frame so it can be re-attached below.
    target = spectra.T['target'].to_frame().T

    # Zeros become NaN so that the scaler ignores them when fitting.
    spectra = spectra.drop(['target']).replace(0, np.nan)

    standardised = pd.DataFrame(
        StandardScaler().fit_transform(spectra.values),
        columns=spectra.columns,
        index=spectra.index,
    )
    standardised = standardised.fillna(value=0, axis=1)

    df[yf_names] = pd.concat([standardised, target], axis=0)
    return df

In [4]:
# Standardise both spectrum tables.
# NOTE: scaling() writes into the frame it receives and returns that same frame,
# so mem_spec / niom_spec are the same objects as mem_spectrum / niom_spectrum.
mem_spec = scaling(mem_spectrum)
niom_spec = scaling(niom_spectrum)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  new_unnormalized_variance = np.nanvar(X, axis=0) * new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  new_unnormalized_variance = np.nanvar(X, axis=0) * new_sample_count


### Feature generation

In [5]:
def number_of_peaks(full_df, th):
    """
    Count, for every spectrum column, the peaks that pass the threshold `th`.

    Every second column of `full_df` (positions 0, 2, 4, ...) is used and the
    last row is dropped before the search.

    Parameters:
    -----------
    full_df: pd.DataFrame
        Dataframe with Fourier spectrum.

    th: float
        Passed to `scipy.signal.find_peaks` as `threshold`, i.e. the required
        vertical distance between a peak and its direct neighbours. It is an
        absolute value in the units of the data, not a fraction of the
        highest peak.

    Returns:
    -------
    num_of_peaks: pd.DataFrame, shape=(number of spectrum columns, 1)
        Column '#peaks_<th>' holding the peak count, indexed by the spectrum
        column names. The count is 0 for a column whose non-zero samples
        contain no peak of non-negative height.
    """
    df = full_df[full_df.columns[::2]][:-1]
    num_of_peaks = []

    for col in df:
        column = df[col]
        # Columns without any non-negative peak among their non-zero samples
        # are reported as 0 (previously signalled through np.max raising
        # ValueError on an empty array).
        _, properties = find_peaks(column[column != 0], height=0)
        if len(properties['peak_heights']) == 0:
            num_of_peaks.append(0)
            continue

        # The threshold search runs on the full column, zeros included.
        peaks, _ = find_peaks(column, threshold=th)
        num_of_peaks.append(peaks.shape[0])

    num_of_peaks = pd.DataFrame(num_of_peaks, columns=['#peaks_' + str(th)], index=full_df.iloc[:, ::2].columns)
    return num_of_peaks

In [6]:
# Quick check: peak counts for the optical spectra with threshold 2.5.
number_of_peaks(niom_spec, 2.5)

Unnamed: 0,#peaks_2.5
1_Bsk_1_F7_opt_yf,2
1_Bsk_1_F8_opt_yf,1
1_Bsk_1_G6_opt_yf,0
1_Bsk_1_G7_opt_yf,2
1_Bsk_1_G8_opt_yf,2
1_Bsk_1_H6_opt_yf,1
1_Bsk_1_H7_opt_yf,0
1_Bsk_1_H8_opt_yf,0
10_Epi_Bsk_1_D4_opt_yf,1
10_Epi_Bsk_1_E4_opt_yf,1


In [7]:
def get_props(full_df):
    """
    Run the peak search on every spectrum column of `full_df`.

    Columns at even positions are taken as amplitude columns and columns at
    odd positions as the matching frequency columns; the last row of the
    frame is dropped from both.

    Parameters:
    -----------
    full_df: pd.DataFrame
        Dataframe with Fourier spectrum.

    Returns:
    -------
    df: pd.DataFrame
        Amplitude columns without the last row.
    xf: pd.DataFrame
        Frequency columns without the last row.
    all_peaks: list of np.ndarray
        Per column, the row positions of the detected peaks. Positions refer
        to the rows of `df` / `xf`, so they can index `xf` directly.
    all_props: list of dict
        Per column, the properties dict returned by `find_peaks`. Index-like
        entries in it (e.g. bases) still refer to the zero-filtered signal.
    """
    all_peaks = []
    all_props = []

    df = full_df[full_df.columns[::2]][:-1]
    xf = full_df[full_df.columns[1::2]][:-1]

    for col in df:
        column = df[col]
        # Exact zeros are left out of the peak search.
        keep = (column != 0).values
        peaks, properties = find_peaks(column[keep], height=0, width=0, prominence=0, rel_height=0.5)
        all_props.append(properties)
        # find_peaks numbers the samples of the filtered signal; translate
        # back to positions in the unfiltered column so that the indices stay
        # valid for `xf` even when zeros were removed before a peak.
        all_peaks.append(np.flatnonzero(keep)[peaks])
    return df, xf, all_peaks, all_props
 

        
def height_width_prominence(full_df, n, prop):
    """
    Collect one peak property for the `n` highest peaks of every spectrum.

    Parameters:
    -----------
    full_df: pd.DataFrame
        Dataframe with Fourier spectrum.

    n: int
        Number of highest peaks to report.

    prop: str
        Key of the `find_peaks` properties dict to extract, e.g.
        'peak_heights', 'widths' or 'prominences'.

    Returns:
    -------
    all_max_prop: pd.DataFrame
        One row per spectrum column and the columns '<prop> 0' ... '<prop> n-1',
        ordered from the highest peak downwards. A spectrum without any peak
        yields a row of zeros; when fewer than `n` peaks exist the missing
        entries are NaN.
    """
    df, xf, all_peaks, all_props = get_props(full_df)
    all_max_prop = []

    for props in all_props:
        heights = props['peak_heights']

        # No peak at all: zero row. The former `except IndexError` fallback
        # was meant for this case but could never trigger, because indexing
        # with an empty index array does not raise.
        if len(heights) == 0:
            all_max_prop.append(np.zeros((n)))
            continue

        # Indices of the n highest peaks, highest first.
        top = np.argsort(heights)[:-(n+1):-1]

        # Pad to length n so the frame can be built even when every spectrum
        # has fewer than n peaks.
        row = np.full(n, np.nan)
        row[:len(top)] = props[prop][top]
        all_max_prop.append(row)

    all_max_prop = pd.DataFrame(all_max_prop, columns=[prop + ' ' + str(i) for i in range(n)],  index=full_df.iloc[:, ::2].columns)
    return all_max_prop


def freq_and_label(full_df, n):
    """
    Frequencies of the `n` highest peaks and a "has peaks" label per spectrum.

    Parameters:
    -----------
    full_df: pd.DataFrame
        Dataframe with Fourier spectrum.

    n: int
        Number of highest peaks to report.

    Returns:
    -------
    freq: pd.DataFrame
        One row per spectrum column and the columns 'freq 0' ... 'freq n-1',
        ordered from the highest peak downwards. A spectrum without any peak
        yields a row of zeros; when fewer than `n` peaks exist the missing
        entries are NaN.

    el_om_labels: pd.DataFrame
        Column 'Label for OM and EL': 1 when at least one peak was found in
        the spectrum, 0 otherwise.
    """
    df, xf, all_peaks, all_props = get_props(full_df)

    freq = []
    el_om_labels = []

    for i in range(len(all_props)):
        heights = all_props[i]['peak_heights']

        # Spectrum without peaks: zero frequencies and label 0. The former
        # `except IndexError` fallback never fired (indexing with an empty
        # index array does not raise), so every spectrum was labelled 1.
        if len(heights) == 0:
            freq.append(np.zeros((n)))
            el_om_labels.append(0)
            continue

        # Indices of the n highest peaks, highest first.
        z = np.argsort(heights)[:-(n+1):-1]

        # Look the frequencies up by row position, independent of the frame's
        # index labels.
        fr = np.full(n, np.nan)
        fr[:len(z)] = xf.iloc[:, i].values[all_peaks[i][z]]
        freq.append(fr)
        el_om_labels.append(1)

    freq = pd.DataFrame(freq, columns=['freq ' + str(i) for i in range(n)], index=full_df.iloc[:, ::2].columns)
    el_om_labels = pd.DataFrame(el_om_labels, columns=['Label for OM and EL'],  index=full_df.iloc[:, ::2].columns)
    return freq, el_om_labels

def second_harm(full_df, n):
    """
    Flag spectra whose highest peaks contain a second-harmonic pair.

    Parameters:
    -----------
    full_df: pd.DataFrame
        Dataframe with Fourier spectrum.

    n: int
        Number of highest peaks to compare with each other.

    Returns:
    -------
    freq: pd.DataFrame
        Column 'second_harmonics': 1 when, among the frequencies of the `n`
        highest peaks, some pair has a ratio strictly between 1.9 and 2.1
        (i.e. 2 plus/minus 5%), otherwise 0.
    """
    df, xf, all_peaks, all_props = get_props(full_df)

    freq = []

    for i in range(len(all_props)):
        # Indices of the n highest peaks (fewer when the spectrum has fewer).
        z = np.argsort(all_props[i]['peak_heights'])[:-(n+1):-1]
        fr_forharm = np.asarray(xf.iloc[:, i].values[all_peaks[i][z]], dtype=float)

        # All pairwise ratios at once. Division by a zero frequency gives
        # inf/nan, which never falls inside the window, so the warnings are
        # silenced instead of relying on ZeroDivisionError (numpy does not
        # raise it).
        with np.errstate(divide='ignore', invalid='ignore'):
            ratios = fr_forharm[:, None] / fr_forharm[None, :]

        index_forharm = int(np.any((ratios > 1.9) & (ratios < 2.1)))
        freq.append(index_forharm)

    freq = pd.DataFrame(freq, columns=['second_harmonics'],  index=full_df.iloc[:, ::2].columns)

    return freq


def concat(full_df, n):
    """
    Assemble the per-peak feature table for `full_df`.

    Joins, column-wise and in this order: peak frequencies, heights, widths,
    prominences, the second-harmonic flag and the 'Label for OM and EL'
    column. Missing values in the joined table are replaced by 0.0.

    Parameters:
    -----------
    full_df: pd.DataFrame
        Dataframe with Fourier spectrum.

    n: int
        Number of highest peaks described per spectrum.

    Returns:
    -------
    features: pd.DataFrame
        One row per spectrum column.
    """
    height, width, prominence = (
        height_width_prominence(full_df, n, prop_name)
        for prop_name in ('peak_heights', 'widths', 'prominences')
    )
    freq, label = freq_and_label(full_df, n)
    freq_forharm = second_harm(full_df, n)

    parts = [freq, height, width, prominence, freq_forharm, label]
    return pd.concat(parts, axis=1).fillna(0.0)

def SNR(df, num_of_avg_peak=2):
    """
    Calculate the SNR value for MEM and NIOM spectrum

    For every '_yf' column the SNR is the mean height of the
    `num_of_avg_peak` highest peaks divided by the standard deviation of the
    column, rounded to two decimals. Exact zeros are left out of both the
    peak search and the standard deviation.

    Parameters:
    -----------
    df: pd.DataFrame
        Dataframe with spectrums. A row labelled 'target', if present, is
        excluded so the label value does not enter the calculation.

    num_of_avg_peak: int, default 2
        Number of highest peaks to average.

    Returns:
    -------
    snr: pd.DataFrame, shape=(number of '_yf' columns, 1)
        Column 'SNR' indexed by the '_yf' column names. The value is 0 when
        the standard deviation is 0 and NaN when no peak was found.
    """
    y_col = [col for col in df.columns if '_yf' in col]
    # The 'target' row carries the label, not a spectrum sample.
    y_df = df[y_col].drop(index='target', errors='ignore')
    snr = []

    for col in y_col:
        s = y_df[col][y_df[col] != 0]
        _, properties = find_peaks(s, height=0)

        top_heights = np.sort(properties['peak_heights'])[-num_of_avg_peak:]
        sd = s.std(axis=0)

        if sd == 0:
            ratio = 0.0
        elif top_heights.size == 0:
            # No peak to average: undefined ratio.
            ratio = np.nan
        else:
            ratio = np.round(top_heights.mean() / sd, 2)
        snr.append(ratio)

    snr = pd.DataFrame(snr, columns=['SNR'], index=y_df.columns)
#     snr['SNR'] = (snr['SNR'] - snr['SNR'].min()) / (snr['SNR'].max() - snr['SNR'].min()) #line to rescale SNR value from 0 to 1
    return snr

In [8]:
def create_feature_df(full_df, n, th1, th2, path, name, download=False):
    """
    Function that generates final pd.DataFrame with all features

    Parameters:
    -----------
    full_df: pd.DataFrame
        Dataframe with Fourier spectrum; must contain a row labelled 'target'.

    n: int
        Number of highest peaks described per spectrum.

    th1: float
        Threshold for the first peak-count feature (forwarded to
        `number_of_peaks`).

    th2: float
        Threshold for the second peak-count feature. When th1 == th2 the
        result contains two identically named '#peaks_<th>' columns.

    path: str
        Folder the matrix is saved to.

    name: str
        File name without extension; it is appended to `path` as-is, so it
        has to start with the path separator.

    download: bool
        When true, the feature matrix is also written to
        path + name + '.csv'.

    Returns:
    -------
    features: pd.DataFrame
        Features for full_df dataframe, one row per spectrum column.
    """
    properties = concat(full_df=full_df, n=n)
    num_peak_1 = number_of_peaks(full_df=full_df, th=th1)
    num_peak_2 = number_of_peaks(full_df=full_df, th=th2)
    snr = SNR(df=full_df)
    # The 'target' row of the spectrum columns becomes the 'target' column.
    target = pd.DataFrame(full_df[full_df.columns[::2]].loc['target'])

    features = pd.concat([num_peak_1, num_peak_2, properties, snr, target], axis=1)

    if download:
        features.to_csv(path + name + '.csv')
    return features

In [21]:
def electode_optical_matrix(electrode_df, optical_df, path, name, download=False):
    """
    Join electrode and optical features side by side into one matrix.

    The 'target' and 'Label for OM and EL' columns of the electrode features
    are left out, the optical rows take over the electrode index, and only
    rows whose optical 'Label for OM and EL' equals 1 are kept. The label
    column itself is removed from the result.

    Neither input frame is modified, so the call can be repeated with the
    same arguments.

    Parameters:
    -----------
    electrode_df: pd.DataFrame
        Electrode feature matrix with the columns 'target' and
        'Label for OM and EL'.

    optical_df: pd.DataFrame
        Optical feature matrix with the same number of rows, in matching
        order, and the column 'Label for OM and EL'.

    path: str
        Folder the matrix is saved to.

    name: str
        File name without extension; appended to `path` as-is.

    download: bool
        When true, the matrix is also written to path + name + '.csv'.

    Returns:
    -------
    el_om_features: pd.DataFrame
        Combined feature matrix.
    """
    label_col = 'Label for OM and EL'

    # Work on fresh frames instead of editing the caller's objects in place.
    electrode_part = electrode_df.drop(columns=['target', label_col])
    optical_part = optical_df.copy()
    optical_part.index = electrode_part.index

    el_om_features = pd.concat([electrode_part, optical_part], axis=1)
    el_om_features = el_om_features[el_om_features[label_col] == 1]
    el_om_features = el_om_features.drop([label_col], axis=1)
#     el_om_features = el_om_features.reset_index().drop('index',axis=1)

    if download:
        el_om_features.to_csv(path + name + '.csv', index=True)
    return el_om_features

In [22]:
def short_electode_matrix(electrode_df, path, name, download=False):
    """
    Keep only the electrode rows whose 'Label for OM and EL' equals 1.

    The label column is removed from the result; the input frame is not
    modified.

    Parameters:
    -----------
    electrode_df: pd.DataFrame
        Electrode feature matrix with the column 'Label for OM and EL'.

    path: str
        Folder the matrix is saved to.

    name: str
        File name without extension; appended to `path` as-is.

    download: bool
        When true, the matrix is also written to path + name + '.csv'.

    Returns:
    -------
    electrode_df: pd.DataFrame
        Filtered feature matrix without the label column.
    """
    label_col = 'Label for OM and EL'
    # Non-inplace drop: dropping in place on the filtered slice triggers
    # pandas' SettingWithCopyWarning.
    electrode_df = electrode_df[electrode_df[label_col] == 1].drop([label_col], axis=1)
#     electrode_df = electrode_df.reset_index().drop('index',axis=1)

    if download:
        electrode_df.to_csv(path + name + '.csv', index=True)
    return electrode_df

In [23]:
# Feature matrices for the two highest peaks; nothing is written to disk here.
# Raw strings keep the leading '\' literal (plain '\F' is an invalid escape
# sequence). th1 == th2, so both peak-count columns are named '#peaks_2'.
electrode_df = create_feature_df(mem_spec, n=2, th1=2, th2=2, path=path,
                                 name=r'\Feature matrix EL peaks', download=False)
optical_df = create_feature_df(niom_spec, n=2, th1=2, th2=2, path=path,
                               name=r'\Feature matrix OM peaks', download=False)


In [24]:
# Preview of the combined electrode + optical matrix (not saved).
# Raw string: plain '\F' is an invalid escape sequence.
electode_optical_matrix(electrode_df, optical_df, path=path,
                        name=r'\Feature matrix EL+OM peaks',
                        download=False)

Unnamed: 0,#peaks_2,#peaks_2.1,freq 0,freq 1,peak_heights 0,peak_heights 1,widths 0,widths 1,prominences 0,prominences 1,...,freq 1.1,peak_heights 0.1,peak_heights 1.1,widths 0.1,widths 1.1,prominences 0.1,prominences 1.1,second_harmonics,SNR,target
1_Bsk_1_F7_yf,4,4,8.999558,8.799567,3.893244,3.522688,2.888182,1.294096,4.911664,3.887853,...,8.257151,5.062922,4.832748,1.626484,1.761873,5.957064,3.582865,0,4.94,1.0
1_Bsk_1_F8_yf,2,2,7.299641,4.899759,5.491934,3.790265,1.581591,1.888874,6.443411,4.573044,...,13.467021,4.064414,3.602948,1.383065,2.479826,5.000694,3.862832,0,3.83,1.0
1_Bsk_1_G6_yf,6,6,5.299740,7.899612,5.548745,4.277543,1.459269,1.870464,6.467239,4.897076,...,5.603067,4.341092,3.377838,2.655227,4.359042,5.194518,3.698110,0,3.85,1.0
1_Bsk_1_G7_yf,3,3,7.099651,6.299690,4.943747,4.921319,1.680300,1.556153,5.861012,4.762427,...,6.389462,3.885832,3.741523,1.364850,2.150607,4.769680,3.823150,0,3.81,1.0
1_Bsk_1_G8_yf,1,1,7.299641,7.499631,4.442124,4.123401,4.511283,0.729211,5.425885,0.767075,...,11.402733,4.038118,3.551671,1.696303,1.910184,4.919546,4.194323,0,3.79,1.0
1_Bsk_1_H6_yf,5,5,7.399636,5.499730,3.637014,3.274829,1.781566,1.418596,4.698296,3.539036,...,5.799666,3.649687,3.463044,5.133254,2.111495,4.569908,4.253122,0,3.55,1.0
1_Bsk_1_H7_yf,2,2,8.699572,7.699622,3.829786,3.260977,1.409700,1.332191,4.900273,4.052400,...,13.467021,4.004485,3.229425,3.037411,1.886833,4.865794,3.787183,1,3.61,1.0
1_Bsk_1_H8_yf,1,1,8.699572,4.999754,3.914799,3.202179,6.692523,3.011932,4.936779,4.138191,...,12.877224,4.535443,4.330432,3.558546,1.214872,5.348135,3.149143,0,4.42,1.0
10_Epi_Bsk_1_D4_yf,2,2,8.569192,17.138384,11.824479,5.980077,1.162452,1.428535,12.100921,6.215653,...,17.214015,11.753353,2.498864,1.650022,1.880814,12.083241,2.785584,1,7.11,1.0
10_Epi_Bsk_1_E4_yf,6,6,8.569192,17.138384,6.319053,4.475598,1.220481,1.436916,7.149437,5.298922,...,17.214015,11.548079,3.152471,1.671326,1.780167,11.867346,3.365724,1,7.33,1.0


In [25]:
def download_feature_matrices(path, th1, th2, dn_type, peak_counts=range(2, 4)):
    """
    Build and save the feature matrices for several peak counts.

    For every peak count the electrode matrix, its "short" variant and the
    combined electrode + optical matrix are written to `path` as CSV files,
    and the shapes of the electrode, optical and combined matrices are
    printed. Reads the notebook-level frames `mem_spec` and `niom_spec`.

    Parameters:
    -----------
    path: str
        Folder the matrices are saved to.

    th1: float
        Threshold for the first peak-count feature.

    th2: float
        Threshold for the second peak-count feature.

    dn_type: str
        Suffix appended to every file name.

    peak_counts: iterable of int, default range(2, 4)
        Values of `n` (number of highest peaks) to generate matrices for.
    """
    for i in tqdm_notebook(peak_counts):
        # Raw strings: plain '\F' is an invalid escape sequence.
        suffix = ' ' + str(i) + ' peaks' + dn_type

        electrode_df = create_feature_df(mem_spec, n=i, th1=th1, th2=th2, path=path,
                                         name=r'\Feature matrix EL' + suffix, download=True)
        # The short matrix now carries dn_type as well, so runs with different
        # suffixes no longer overwrite each other's file.
        short_electode_matrix(electrode_df, path=path, name=r'\Feature matrix EL short' + suffix, download=True)
        print(electrode_df.shape)

        # NOTE(review): the optical matrix is built with download=False, so no
        # 'Feature matrix OM' file is written - confirm this is intended.
        optical_df = create_feature_df(niom_spec, n=i, th1=th1, th2=th2, path=path,
                                       name=r'\Feature matrix OM' + suffix, download=False)
        print(optical_df.shape)

        el_om_features = electode_optical_matrix(electrode_df, optical_df,
                                                 path=path,
                                                 name=r'\Feature matrix EL+OM' + suffix,
                                                 download=True)
        print(el_om_features.shape)

In [26]:
# Output folder for the generated feature matrices.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
path = r'C:\Users\ecath\Desktop\Research\Raw Data\LD dataset features'
# makedirs also creates missing parent folders and does not fail when the
# folder already exists.
os.makedirs(path, exist_ok=True)
download_feature_matrices(path, 2, 3, '')

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

(1728, 14)
(1728, 14)
(1728, 25)
(1728, 18)
(1728, 18)
(1728, 33)


-------------------------------------------------------