# Logistic regression model 

Apply logistic regression model on EEG data and first features (moment order one and two)

Next steps :
- Cross validation training 
- Regularisation

In [57]:
import os
from typing import Optional, Union
import numpy as np
from enum import Enum
import pandas as pd 
from sklearn.linear_model import LogisticRegression

## Preamble

In [149]:
base_dir = os.path.join(os.getcwd(), "..")

class Dir(str, Enum):
    root = base_dir
    eeg_train = os.path.join(base_dir, "train_eegs")
    eeg_test = os.path.join(base_dir, "test_eegs")
    spc_train = os.path.join(base_dir, "train_spectrograms")
    spc_test = os.path.join(base_dir, "test_spectrograms")
    out = os.path.join(base_dir, "submissions")

class Const(int, Enum):
    eeg_len = 50
    fq_eeg = 200

train_eeg_names = os.listdir(Dir.eeg_train)
train_spc_names = os.listdir(Dir.spc_train)
len(train_eeg_names), len(train_spc_names)

(17300, 11138)

## Functions definition

In [136]:
def open_train_metadata(folder:str) -> pd.DataFrame:
    """
    open and process train.csv file
    """
    train = pd.read_csv(os.path.join(Dir.root, "train.csv"))
    vote_cols = [c for c in train.columns if c.endswith("vote")]
    train["n_votes"] = train[vote_cols].sum(axis=1)
    for c in vote_cols:
        train[c] = train[c] / train["n_votes"]
    train["eeg_length"] = train["eeg_label_offset_seconds"].diff().shift(-1).fillna(-1).astype(int)
    return train

def open_file_id(folder:str, id:Union[str, int], prefix:str="", extension:str=".parquet") -> pd.DataFrame:
    return pd.read_parquet(os.path.join(folder, prefix + str(id) + extension))

def get_eeg_sample(id:int, subid:int, offset:int, length:int) -> pd.DataFrame:
    eeg_samp = open_file_id(Dir.eeg_train, id)
    start, end = offset * Const.fq_eeg, (offset + length) * Const.fq_eeg
    return eeg_samp.iloc[start:end]

def open_files(folder:str, max_files:Optional[int]=100) -> pd.DataFrame:
    file_names = os.listdir(folder)
    if max_files:
        file_names = file_names[:max_files]
    files = []
    for fn in file_names:
        files.append(pd.read_parquet(os.path.join(folder, fn)))
    NotImplementedError
    # return pd.concat

def extract_features_eeg(eeg:pd.DataFrame) -> pd.DataFrame:
    return pd.DataFrame(eeg.mean(axis=0)).T

def process_target(Y:pd.DataFrame) -> pd.DataFrame:
    classes = Y.columns
    Y = pd.DataFrame(Y.idxmax(axis=1))
    return Y

def train_logistic_regression(train:pd.DataFrame, y_cols:str, max_it:Optional[int]=1000) -> LogisticRegression:
    Y_all = train[y_cols]
    Y_all = process_target(Y_all)
    n = len(Y_all)
    if max_it:
        n = np.min([n, max_it])
    X_ = []
    Y_ = []
    for j in range(n):
        sample = train.iloc[j]
        if sample["eeg_length"] > 0:
            eeg = get_eeg_sample(sample["eeg_id"], sample["eeg_sub_id"], int(sample["eeg_label_offset_seconds"]), sample["eeg_length"])
            X_.append(extract_features_eeg(eeg))
            Y_.append(Y_all.iloc[j])
    X = pd.concat(X_, axis=0)
    Y = pd.concat(Y_, axis=0)
    # return X, Y
    model = LogisticRegression()
    model.fit(X, Y)
    return model


## Train

In [137]:
train = open_train_metadata(base_dir)
VOTE_COLS = [c for c in train.columns if c.endswith("vote")]
train.iloc[:10]


Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,n_votes,eeg_length
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,6
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,2
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,10
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,6
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,2
5,1628180742,5,26.0,353733,5,26.0,2413091605,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,4
6,1628180742,6,30.0,353733,6,30.0,364593930,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,6
7,1628180742,7,36.0,353733,7,36.0,3811483573,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,4
8,1628180742,8,40.0,353733,8,40.0,3388718494,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3,-40
9,2277392603,0,0.0,924234,0,0.0,1978807404,30539,GPD,0.0,0.0,0.454545,0.0,0.090909,0.454545,11,2


In [138]:
logreg = train_logistic_regression(train, VOTE_COLS)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Test

In [142]:
eeg_test = pd.read_parquet(os.path.join(Dir.eeg_test, os.listdir(Dir.eeg_test)[0]))
predicted_probas = logreg.predict_proba(extract_features_eeg(eeg_test))
predicted_probas[0]


array([0.01752642, 0.17826888, 0.22644989, 0.14480482, 0.24166371,
       0.19128627])

## Submission

In [145]:
sub = pd.read_csv(os.path.join(base_dir, "sample_submission.csv"))
for i in range(1, len(sub.columns)):
    sub[sub.columns[i]][0]= predicted_probas[0][i-1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub[sub.columns[i]][0]= predicted_probas[0][i-1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub[sub.columns[i]][0]= predicted_probas[0][i-1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub[sub.columns[i]][0]= predicted_probas[0][i-1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub[sub.columns[i]][0

In [150]:
sub.set_index("eeg_id").to_csv(os.path.join(Dir.out, "submission.csv"))