# Gradient Boost classifier

In [1]:
import os
from typing import Optional, Union, Tuple, List
import numpy as np
from enum import Enum
import pandas as pd 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

## Preamble

In [2]:
from source.preamble import *

train_eeg_names = os.listdir(Dir.eeg_train)
train_spc_names = os.listdir(Dir.spc_train)
len(train_eeg_names), len(train_spc_names)

(17300, 11138)

## Functions definition

In [30]:
from source.scoring import compute_wasserstein
from source.train_algos import train_GBC


def open_train_metadata(folder:str) -> pd.DataFrame:
    """
    open and process train.csv file
    """
    train = pd.read_csv(os.path.join(Dir.root, "train.csv"))
    vote_cols = [c for c in train.columns if c.endswith("vote")]
    train["n_votes"] = train[vote_cols].sum(axis=1)
    for c in vote_cols:
        train[c] = train[c] / train["n_votes"]
    train["eeg_length"] = train["eeg_label_offset_seconds"].diff().shift(-1).fillna(-1).astype(int)
    return train

def print_summary_metadata(data:pd.DataFrame) -> None:
    print("="*50)
    print("Metadata summary :")
    print("Len : ", len(data))
    summary_count = data.groupby("expert_consensus")[["eeg_id"]].count().rename(columns={"eeg_id": "n_sample"})
    tot = summary_count["n_sample"].sum()
    summary_count["percent"] = (summary_count["n_sample"] / tot * 100 ).astype(int)
    display(summary_count)
    print("="*50)

def open_file_id(folder:str, id:Union[str, int], prefix:str="", extension:str=".parquet") -> pd.DataFrame:
    return pd.read_parquet(os.path.join(folder, prefix + str(id) + extension))

def get_eeg_sample(id:int, subid:int, offset:int, length:int) -> pd.DataFrame:
    eeg_samp = open_file_id(Dir.eeg_train, id)
    start, end = offset * Const.fq_eeg, (offset + length) * Const.fq_eeg
    return eeg_samp.iloc[start:end]

def get_eeg_subsample(full_eeg:pd.DataFrame, start:int, end:int) -> pd.DataFrame:
    return full_eeg.iloc[start: end]


def pre_process_meta(meta:pd.DataFrame, y_cols:str, grade:Optional[Grade]=None) -> pd.DataFrame:
    """
    - make sure metadata can be sampled randomly or linearly without fear of class imbalance
    - subselection on the "quality" of the target variable : how much are experts agreeing on the subsamle
    """
    if KAGGLE:
        meta = shuffle(meta)
    else: # deterministic output in local
        meta = shuffle(meta, random_state=RANDOM_STATE)

    if grade:
        meta = meta.loc[(meta[y_cols] >= grade).any(axis=1)]

    return meta

def process_extracted_features_to_design(X_:List[pd.DataFrame]) -> pd.DataFrame:
    """
    Take list of identically indexed dataframes,
    Returns dataframe with :
        - each feature(signal) as column
        - sample number - ie index in X_ - as index
    (design matrix without intercept)
    """
    X = pd.concat(X_, axis=1)
    return X.T

def test_model(model:GradientBoostingClassifier, y_cols:str, test_meta:pd.DataFrame):
    X, Y = process_data_from_meta(test_meta, y_cols)
    return model.predict_proba(X)

def extract_validation_set(all:pd.DataFrame, ratio:float=.1, seed:int=1) -> Tuple[pd.DataFrame, pd.DataFrame]:
    return train_test_split(all, test_size=ratio, shuffle=True, random_state=seed)

def split_train_test(all:pd.DataFrame, ratio:float=.1, seed:int=1) -> Tuple[pd.DataFrame, pd.DataFrame]:
    return train_test_split(all, test_size=ratio, shuffle=True, random_state=seed)

def predict_probas_test_set(model, meta_test:pd.DataFrame) -> pd.DataFrame:
    global VOTE_COLS
    X_ = []
    for i in range(len(meta_test)):
        eeg_id = meta_test.loc[i, "eeg_id"]
        eeg_test = pd.read_parquet(os.path.join(Dir.eeg_test, f"{eeg_id}.parquet"))
        X_.append(extract_features_eeg(eeg_test))
    predicted_probas = model.predict_proba(process_extracted_features_to_design(X_))
    sub = pd.DataFrame(predicted_probas, columns=VOTE_COLS, index=meta_test["eeg_id"].values)
    sub.index.name = "eeg_id"
    return sub


## Process metadata

In [22]:
meta_all = open_train_metadata(base_dir)
print_summary_metadata(meta_all)
VOTE_COLS = [c for c in meta_all.columns if c.endswith("vote")]
meta_all.iloc[:10]

Metadata summary :
Len :  106800


Unnamed: 0_level_0,n_sample,percent
expert_consensus,Unnamed: 1_level_1,Unnamed: 2_level_1
GPD,16702,15
GRDA,18861,17
LPD,14856,13
LRDA,16640,15
Other,18808,17
Seizure,20933,19




Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,n_votes,eeg_length
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,6
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,2
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,10
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,6
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,2
5,1628180742,5,26.0,353733,5,26.0,2413091605,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,4
6,1628180742,6,30.0,353733,6,30.0,364593930,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,6
7,1628180742,7,36.0,353733,7,36.0,3811483573,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,4
8,1628180742,8,40.0,353733,8,40.0,3388718494,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,-40
9,2277392603,0,0.0,924234,0,0.0,1978807404,30539,GPD,0.0,0.0,0.454545,0.0,0.090909,0.454545,11,2


## Train

In [23]:
# split validation
rest_meta, validation_meta = extract_validation_set(meta_all, ratio=.05)
print_summary_metadata(rest_meta)
print_summary_metadata(validation_meta)

Metadata summary :
Len :  101460


Unnamed: 0_level_0,n_sample,percent
expert_consensus,Unnamed: 1_level_1,Unnamed: 2_level_1
GPD,15830,15
GRDA,17942,17
LPD,14104,13
LRDA,15815,15
Other,17895,17
Seizure,19874,19


Metadata summary :
Len :  5340


Unnamed: 0_level_0,n_sample,percent
expert_consensus,Unnamed: 1_level_1,Unnamed: 2_level_1
GPD,872,16
GRDA,919,17
LPD,752,14
LRDA,825,15
Other,913,17
Seizure,1059,19




In [24]:
max_it = 100 if KAGGLE else 100
model = train_GBC(rest_meta, VOTE_COLS, max_it=max_it, grade=Grade.certain)

Number of samples without missing values selected :  85


In [25]:
if not KAGGLE or True:
    predicted_probas = test_model(model, VOTE_COLS, validation_meta)

Number of samples without missing values selected :  4489


In [26]:
if KAGGLE or True:
    target_probas = validation_meta[VOTE_COLS]
    target_probas

In [27]:
ws = compute_wasserstein(predicted_probas, target_probas)
ws

0.8242902619055457

## Test

In [28]:
meta_test = pd.read_csv(os.path.join(Dir.root, "test.csv"))

## Submission

In [31]:
sub = predict_probas_test_set(model, meta_test)
sub.to_csv(os.path.join(Dir.out, "submission.csv"))
sub

Unnamed: 0_level_0,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
eeg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3911565283,0.000767,0.681,0.006176,0.09378,0.01321,0.205068
