# Use esig package

In [11]:
import os
import numpy as np
import pandas as pd
from enum import Enum
from typing import Union
import esig


In [12]:
esig.is_library_loaded()

True

In [13]:
esig.get_version()

'0.9.8.3'

In [14]:
esig.logsigdim(5, 3)

55

In [15]:
stream = np.array([
    [1.0, 1.0],
    [3.0, 4.0],
    [5.0, 2.0],
    [8.0, 6.0]
])
depth = 2

sig = esig.stream2sig(stream, depth) # compute the signature
print(sig) # prints "[1.0, 7.0, 5.0, 24.5, 19.0, 16.0, 12.5]"

[ 1.   7.   5.  24.5 19.  16.  12.5]


## Preamble

In [16]:
KAGGLE = False

if KAGGLE:
    None

else: # local
    base_dir = os.path.join(os.getcwd(), "..")
    
    RANDOM_STATE = 1

    class Dir(str, Enum):
        root = base_dir
        eeg_train = os.path.join(base_dir, "train_eegs")
        eeg_test = os.path.join(base_dir, "test_eegs")
        spc_train = os.path.join(base_dir, "train_spectrograms")
        spc_test = os.path.join(base_dir, "test_spectrograms")
        out = os.path.join(base_dir, "submissions")

    class Const(int, Enum):
        eeg_len = 50
        fq_eeg = 200

    class Grade(float, Enum):
        certain = 1.
        # TO COMPLETE

train_eeg_names = os.listdir(Dir.eeg_train)
train_spc_names = os.listdir(Dir.spc_train)
len(train_eeg_names), len(train_spc_names)

(17300, 11138)

## Functions definition

In [17]:
def open_train_metadata(folder:str) -> pd.DataFrame:
    """
    open and process train.csv file
    """
    train = pd.read_csv(os.path.join(Dir.root, "train.csv"))
    vote_cols = [c for c in train.columns if c.endswith("vote")]
    train["n_votes"] = train[vote_cols].sum(axis=1)
    for c in vote_cols:
        train[c] = train[c] / train["n_votes"]
    train["eeg_length"] = train["eeg_label_offset_seconds"].diff().shift(-1).fillna(-1).astype(int)
    return train

def print_summary_metadata(data:pd.DataFrame) -> None:
    print("="*50)
    print("Metadata summary :")
    print("Len : ", len(data))
    summary_count = data.groupby("expert_consensus")[["eeg_id"]].count().rename(columns={"eeg_id": "n_sample"})
    tot = summary_count["n_sample"].sum()
    summary_count["percent"] = (summary_count["n_sample"] / tot * 100 ).astype(int)
    display(summary_count)
    print("="*50)

def open_file_id(folder:str, id:Union[str, int], prefix:str="", extension:str=".parquet") -> pd.DataFrame:
    return pd.read_parquet(os.path.join(folder, prefix + str(id) + extension))

def get_eeg_sample(id:int, subid:int, offset:int, length:int) -> pd.DataFrame:
    eeg_samp = open_file_id(Dir.eeg_train, id)
    start, end = offset * Const.fq_eeg, (offset + length) * Const.fq_eeg
    return eeg_samp.iloc[start:end]

def get_eeg_sample(id:int, subid:int, offset:int, length:int) -> pd.DataFrame:
    eeg_samp = open_file_id(Dir.eeg_train, id)
    start, end = offset * Const.fq_eeg, (offset + length) * Const.fq_eeg
    return eeg_samp.iloc[start:end]

def get_eeg_subsample(full_eeg:pd.DataFrame, start:int, end:int) -> pd.DataFrame:
    return full_eeg.iloc[start: end]



## Process metadata

In [18]:
meta_all = open_train_metadata(base_dir)
print_summary_metadata(meta_all)
VOTE_COLS = [c for c in meta_all.columns if c.endswith("vote")]
meta_all.iloc[:10]

Metadata summary :
Len :  106800


Unnamed: 0_level_0,n_sample,percent
expert_consensus,Unnamed: 1_level_1,Unnamed: 2_level_1
GPD,16702,15
GRDA,18861,17
LPD,14856,13
LRDA,16640,15
Other,18808,17
Seizure,20933,19




Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,n_votes,eeg_length
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,6
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,2
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,10
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,6
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,2
5,1628180742,5,26.0,353733,5,26.0,2413091605,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,4
6,1628180742,6,30.0,353733,6,30.0,364593930,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,6
7,1628180742,7,36.0,353733,7,36.0,3811483573,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,4
8,1628180742,8,40.0,353733,8,40.0,3388718494,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,-40
9,2277392603,0,0.0,924234,0,0.0,1978807404,30539,GPD,0.0,0.0,0.454545,0.0,0.090909,0.454545,11,2


In [19]:
eeg = pd.read_parquet(os.path.join(Dir.eeg_train, train_eeg_names[0]))
eeg = eeg.reset_index()

In [20]:
eeg

Unnamed: 0,index,Fp1,F3,C3,P3,F7,T3,T5,O1,Fz,...,Pz,Fp2,F4,C4,P4,F8,T4,T6,O2,EKG
0,0,-105.849998,-89.230003,-79.459999,-49.230000,-99.730003,-87.769997,-53.330002,-50.740002,-32.250000,...,-43.270000,-88.730003,-74.410004,-92.459999,-58.930000,-75.739998,-59.470001,8.210000,66.489998,1404.930054
1,1,-85.470001,-75.070000,-60.259998,-38.919998,-73.080002,-87.510002,-39.680000,-35.630001,-76.839996,...,-43.040001,-68.629997,-61.689999,-69.320000,-35.790001,-58.900002,-41.660000,196.190002,230.669998,3402.669922
2,2,8.840000,34.849998,56.430000,67.970001,48.099998,25.350000,80.250000,48.060001,6.720000,...,61.000000,16.580000,55.060001,45.020000,70.529999,47.820000,72.029999,-67.180000,-171.309998,-3565.800049
3,3,-56.320000,-37.279999,-28.100000,-2.820000,-43.430000,-35.049999,3.910000,-12.660000,8.650000,...,4.180000,-51.900002,-21.889999,-41.330002,-11.580000,-27.040001,-11.730000,-91.000000,-81.190002,-1280.930054
4,4,-110.139999,-104.519997,-96.879997,-70.250000,-111.660004,-114.430000,-71.830002,-61.919998,-76.150002,...,-67.480003,-99.029999,-93.610001,-104.410004,-70.070000,-89.250000,-77.260002,155.729996,264.850006,4325.370117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,-45.540001,-26.459999,-23.209999,-25.250000,-21.559999,-36.549999,10.730000,-16.290001,-55.919998,...,-29.770000,-22.000000,3.710000,8.470000,0.480000,9.950000,33.959999,110.510002,58.599998,301.239990
9996,9996,-26.860001,4.350000,7.410000,7.830000,5.260000,7.750000,50.130001,4.150000,1.720000,...,7.150000,-6.820000,38.070000,32.880001,21.990000,32.990002,60.209999,-156.949997,-275.929993,-4634.799805
9997,9997,-133.759995,-111.190002,-119.180000,-105.760002,-130.039993,-104.059998,-68.290001,-86.480003,-57.130001,...,-95.839996,-107.540001,-86.449997,-94.099998,-97.050003,-86.339996,-68.040001,-14.880000,66.440002,1667.800049
9998,9998,-78.889999,-59.660000,-60.770000,-59.810001,-63.020000,-60.020000,-20.690001,-42.820000,-68.669998,...,-62.810001,-52.869999,-34.099998,-31.500000,-37.810001,-32.259998,-10.870000,137.559998,193.839996,2743.379883


In [27]:
pd.set_option('display.max_columns', None)
display((100 * eeg.corr()).astype(int))

Unnamed: 0,index,Fp1,F3,C3,P3,F7,T3,T5,O1,Fz,Cz,Pz,Fp2,F4,C4,P4,F8,T4,T6,O2,EKG
index,100,0,0,0,-1,-2,0,3,5,-3,0,1,1,0,0,2,2,-2,1,0,0
Fp1,0,100,96,90,85,94,87,84,75,74,83,82,96,94,88,80,94,87,-58,-73,-77
F3,0,96,100,95,91,95,92,89,81,76,90,88,92,95,92,83,93,91,-66,-80,-84
C3,0,90,95,100,97,95,95,94,87,66,89,93,88,94,95,89,91,92,-63,-79,-85
P3,-1,85,91,97,100,91,93,94,92,66,90,97,83,91,94,92,87,90,-67,-80,-86
F7,-2,94,95,95,91,100,94,92,85,64,82,88,90,94,93,86,94,93,-61,-78,-83
T3,0,87,92,95,93,94,100,94,85,67,85,89,83,90,89,84,88,90,-74,-86,-90
T5,3,84,89,94,94,92,94,100,92,58,82,91,81,90,92,89,87,91,-69,-85,-90
O1,5,75,81,87,92,85,85,92,100,52,78,93,73,82,87,92,78,82,-59,-73,-82
Fz,-3,74,76,66,66,64,67,58,52,100,87,70,73,75,64,55,70,61,-78,-76,-74


In [23]:
sig = esig.stream2sig(eeg, depth=1)

In [24]:
len(sig)

22

In [25]:
eeg.shape

(10000, 21)