# Train/Val/Test

Creation of training, validation and test sets, keeping the distributions.

Output:

"../data/03_v3_idxs_train.npy"  
"../data/03_v3_idxs_val.npy"  
"../data/03_v3_idxs_test.npy"  

<!-- Indices in df_traincsv, after removing high number of NaNs and keeping constant votes in eegs. -->

TODO:
- For EEGs with mixed votes, include more sub-EEGs so that all votes are represented (keeping proportions). Run some experiments with toy data first.

**Table of contents**<a id='toc0_'></a>    
- [Version 1 (v1)](#toc1_)    
- [Version 2 (v2)](#toc2_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=2
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global RNG so that code relying on it is reproducible.
np.random.seed(536)

# base_dir = "../../kaggle_data/hms"
base_dir = "../../data/hms"
# base_dir = "/kaggle/input/hms-harmful-brain-activity-classification"

df_traincsv = pd.read_csv(f'{base_dir}/train.csv')

# Removing observations where eeg has more than 2 NaN rows.
# The loaded values are used as index labels of df_traincsv (hence .loc).
idxs = np.load("../data/02_eegs_sub_50_idxs_less_3_nan.npy")
df = df_traincsv.loc[idxs]

# Unique eeg ids, in order of first appearance in df.
eeg_ids = df['eeg_id'].unique()

print('observations:',len(df))

cats = df['expert_consensus'].unique()
print(cats)

# Unique patient ids, in order of first appearance in df.
patient_ids = df['patient_id'].unique()
print('patient ids:', len(patient_ids))

observations: 106556
['Seizure' 'GPD' 'LRDA' 'Other' 'GRDA' 'LPD']
patient ids: 1945


In [3]:
#
# Saving all idxs with distance offset_sep to reduce overlapping.
#

# For each eeg_id pick several eeg_sub_id rows such that consecutive picks are
# more than offset_sep seconds apart, to reduce overlapping.

# Possible tuning for reducing overlapping: set the value of offset_sep.

offset_sep = 15.0


def _select_spaced_idxs(frame, min_sep):
    """Return index labels of rows spaced more than `min_sep` seconds apart per eeg.

    For every eeg_id (in order of first appearance in `frame`) the first row
    of that eeg is kept. Then, repeatedly, the row with the smallest
    eeg_label_offset_seconds strictly greater than the last kept offset plus
    `min_sep` is kept, until no such row exists.

    Args:
        frame: DataFrame with 'eeg_id' and 'eeg_label_offset_seconds' columns.
        min_sep: minimum gap, in seconds, between consecutive kept offsets.

    Returns:
        List of index labels of `frame`, grouped by eeg in selection order.
    """
    selected = []
    # One pass over the groups instead of re-filtering the whole frame for
    # every eeg and every pick.
    for _, group in frame.groupby('eeg_id', sort=False):
        labels = group.index.to_numpy()
        offsets = group['eeg_label_offset_seconds'].to_numpy()
        # Row positions ordered by offset; stable so ties resolve to frame order.
        by_offset = np.argsort(offsets, kind='stable')
        pos = 0
        # Terminate on an explicit "nothing found" marker, never on the
        # truthiness of an index label: label 0 is a valid row but is falsy,
        # which would silently drop it (and the rest of its eeg).
        while pos is not None:
            selected.append(labels[pos])
            later = by_offset[offsets[by_offset] > offsets[pos] + min_sep]
            pos = later[0] if later.size else None
    return selected


idxs = _select_spaced_idxs(df, offset_sep)

np.save("../data/03_same_distribution_idxs_all.npy", idxs)

In [5]:
# Total number of selected row labels across all eegs.
len(idxs)

38519

In [19]:
# Reload the spaced-out row labels saved earlier and select those rows.
# The saved values are index labels (they were taken from df.index), so select
# them with .loc, as was done when df was first built. .copy() lets new columns
# be added without touching df_traincsv.
idxs = np.load("../data/03_same_distribution_idxs_all.npy")
df = df_traincsv.loc[idxs].copy()


## Two classes: Other and the rest.

In [20]:
# Binary target: 1 for 'Other', 0 for each of the remaining consensus classes.
# A class missing from the mapping yields NaN and makes the int cast fail.
target_by_consensus = {
    'Seizure': 0,
    'LPD': 0,
    'GPD': 0,
    'LRDA': 0,
    'GRDA': 0,
    'Other': 1,
}
df['target'] = df['expert_consensus'].map(target_by_consensus).astype(int)

In [21]:
# Display 10 randomly sampled rows, e.g. to eyeball the new target column.
df.sample(10)

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,target
56328,3664010139,7,18.0,1124691539,7,18.0,2796144221,3838,GRDA,0,0,3,2,6,2,0
91003,2944106527,0,0.0,1856500288,0,0.0,4082364928,10599,Seizure,3,0,0,0,0,0,0
95044,3383891046,0,0.0,1924518057,14,832.0,223611160,60145,Other,0,0,0,0,0,3,1
4471,1058522598,0,0.0,70636080,83,1350.0,1627139768,6935,Other,0,0,0,0,0,2,1
31420,4041885663,0,0.0,610465195,0,0.0,90572826,195,Other,0,0,0,1,4,10,1
14174,2830622523,1,16.0,275859291,18,872.0,910246033,32316,LPD,1,17,0,0,0,0,0
44433,942973929,1,20.0,862174834,1,20.0,1462152351,19638,Other,0,2,1,6,0,8,1
38658,1140647100,10,38.0,757048309,10,38.0,1913303710,51636,Seizure,3,0,0,0,0,0,0
53376,2814281366,11,36.0,1057040168,11,36.0,427740232,2944,GRDA,0,0,0,0,3,0,0
25210,1653126185,9,76.0,495433108,9,76.0,174482876,32143,Seizure,3,0,0,0,0,0,0


In [23]:
# Sorted unique patient ids among the selected rows, and how many there are.
patient_ids = np.unique(df['patient_id'].to_numpy())
patient_ids.size

1945

In [24]:
# Shuffle the sorted unique patient ids with a dedicated, seeded Generator so
# the resulting order is reproducible.
rng = np.random.default_rng(seed=3233)
unique_patient_ids = np.unique(df['patient_id'])
patient_ids = rng.permutation(unique_patient_ids)


In [26]:
# One fifth of the 1945 unique patients counted above; presumably the number of
# patients per split/fold — TODO confirm. The count is hard-coded: update it if
# the set of patients changes.
1945/5

389.0