# Train/Val/Test

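Creates the training, validation and test sets. Patients are shuffled and split 65% / 20% / 15%, so no patient appears in more than one set, and each set is then balanced to have the same number of samples per `expert_consensus` class.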

Output:

"../data/03_stratified_v1_idxs_train.npy"  
"../data/03_stratified_v1_idxs_val.npy"  
"../data/03_stratified_v1_idxs_test.npy"  

Each file holds row indices into `df_traincsv` (train.csv), restricted to sub-EEGs with at most 2 NaN rows in the 50 s sample and to EEGs whose votes do not change across their sub-EEGs.

TODO:
- For mixed votes, include more sub eegs to incorporate all votes (keeping proportions). Do some experiments with toy data.

In [179]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global random state. The `.sample()` calls later in this
# notebook are made without a `random_state` argument.
np.random.seed(536)

base_dir = "../../kaggle_data/hms"
# base_dir = "../../data/hms"
# base_dir = "/kaggle/input/hms-harmful-brain-activity-classification"

df_traincsv = pd.read_csv(f'{base_dir}/train.csv')

# Indices of sub eegs where:
# 2 or less NaN rows in 50 s sample (to interpolate).
# No change of votes in the different sub eegs for the same eeg_id.
#
idxs = np.load("../data/02_idx_constant_votes.npy")

# Keep only the selected rows. The original row labels of train.csv are
# preserved, so index values of `df` still refer to rows of `df_traincsv`.
df = df_traincsv.iloc[idxs]
print(len(df))

83893


In [180]:
# Sorted array of the distinct patients present in the filtered frame,
# followed by how many there are.
patient_ids = np.unique(df['patient_id'].to_numpy())
len(patient_ids)

1847

In [181]:
# First collapse the sub-EEG rows to one row per (patient, eeg, label), then
# count how many EEGs each patient has per label and show the 100 largest counts.
_df = df.groupby(by=['patient_id', 'eeg_id', 'expert_consensus'], as_index=False).size()
(
    _df
    .groupby(by=['patient_id', 'expert_consensus'], as_index=False)
    .size()
    .sort_values(by='size')
    .tail(100)
)

Unnamed: 0,patient_id,expert_consensus,size
117,2641,LPD,30
118,2641,Other,30
3211,64791,Other,30
957,19311,LPD,30
3021,61174,Seizure,31
...,...,...,...
2803,56450,LPD,126
1794,35627,LPD,132
1411,28330,LPD,137
1955,38549,GPD,139


In [182]:
#
# Train/Val/Test split (v1)
#
# Patients (not individual rows) are shuffled and cut into three disjoint
# groups, so every patient contributes to exactly one of the sets.

ptrain = 0.65
pval = 0.2  # the patients left after train and val form the test set


# Set the seed for reproducibility.
rng = np.random.default_rng(113) # v1
# rng = np.random.default_rng(45163)
# rng = np.random.default_rng(3233)

ids = rng.permutation(np.unique(df['patient_id']))

cut1 = int(ptrain * len(ids))
cut2 = int((ptrain + pval) * len(ids))
ids_train = ids[0:cut1]
ids_val = ids[cut1:cut2]
ids_test = ids[cut2:]

# 1) Pick one sample of each patient_id/eeg_id combination for each set.

def _one_sample_per_eeg(patient_subset):
    """Return one randomly chosen row label of `df` for every EEG of the given patients.

    Patients are visited in the order given and their eeg_ids in sorted order.
    `.sample()` is called without `random_state`, so the draw does not come
    from `rng`.
    """
    picked = []
    for patient_id in patient_subset:
        for eeg_id in np.unique(df.loc[df['patient_id'] == patient_id]['eeg_id']):
            picked.append(df.loc[df['eeg_id'] == eeg_id].sample().index.values[0])
    return picked

# Keep the train -> val -> test order: it fixes the order of the random draws.
idxs_train = _one_sample_per_eeg(ids_train)
idxs_val = _one_sample_per_eeg(ids_val)
idxs_test = _one_sample_per_eeg(ids_test)

# 2) Extract the number for the 3rd largest group in test set
#    (class sizes are sorted ascending, so with six classes that is row 3).

_df = df.loc[idxs_test].groupby(by='expert_consensus', as_index=False).size().sort_values(by='size')
qty = _df.iloc[3]['size']


In [183]:
# 3) Balance test set reducing and populating above and below.
#
# The target size is that of the 3rd largest class. Larger classes are randomly
# trimmed down to it; smaller classes are topped up with further rows of the
# test patients only, never re-using a row that is already in the set.

_df = df.loc[idxs_test]
__df = _df.groupby(by='expert_consensus', as_index=False).size().sort_values(by='size')
# Sizes are sorted ascending, so the 3rd largest class is three rows from the
# end (row 3 when there are six classes).
qty = __df.iloc[max(len(__df) - 3, 0)]['size']

# All rows belonging to test patients: the only rows allowed to top up this set.
_pool = df.loc[df['patient_id'].isin(ids_test)]

_extra = []
_by_size_desc = __df.iloc[::-1]  # largest class first
for item_ec, item_qty in zip(_by_size_desc['expert_consensus'], _by_size_desc['size']):
    if item_qty > qty:
        # Reduce the ones on top.
        _idxs = _df.loc[_df['expert_consensus'] == item_ec].sample(item_qty - qty).index
        _df = _df.loc[_df.index.difference(_idxs)]
    elif item_qty < qty:
        # Populate the ones below, drawing only from rows not selected yet so
        # the set never contains the same row twice.
        _candidates = _pool.loc[_pool['expert_consensus'] == item_ec].index.difference(_df.index)
        _n = min(qty - item_qty, len(_candidates))
        if _n < qty - item_qty:
            print(f"Only {_n} unused '{item_ec}' rows available to add; wanted {qty - item_qty}.")
        if _n > 0:
            _extra.append(df.loc[_candidates].sample(_n).index.values)

idxs_test = np.concatenate([_df.index.values] + _extra)
assert len(np.unique(idxs_test)) == len(idxs_test), "duplicate rows in test set"

_df = df.loc[idxs_test]
_df.groupby(by='expert_consensus', as_index=False).size().sort_values(by='size')


Unnamed: 0,expert_consensus,size
0,GPD,395
1,GRDA,395
2,LPD,395
3,LRDA,395
4,Other,395
5,Seizure,395


In [184]:
# 4) Balance val set reducing and populating above and below.
#
# The target size is that of the 3rd largest class. Larger classes are randomly
# trimmed down to it; smaller classes are topped up with further rows of the
# validation patients only, never re-using a row that is already in the set.

_df = df.loc[idxs_val]
__df = _df.groupby(by='expert_consensus', as_index=False).size().sort_values(by='size')
# Sizes are sorted ascending, so the 3rd largest class is three rows from the
# end (row 3 when there are six classes).
qty = __df.iloc[max(len(__df) - 3, 0)]['size']

# All rows belonging to validation patients: the only rows allowed to top up this set.
_pool = df.loc[df['patient_id'].isin(ids_val)]

_extra = []
_by_size_desc = __df.iloc[::-1]  # largest class first
for item_ec, item_qty in zip(_by_size_desc['expert_consensus'], _by_size_desc['size']):
    if item_qty > qty:
        # Reduce the ones on top.
        _idxs = _df.loc[_df['expert_consensus'] == item_ec].sample(item_qty - qty).index
        _df = _df.loc[_df.index.difference(_idxs)]
    elif item_qty < qty:
        # Populate the ones below, drawing only from rows not selected yet so
        # the set never contains the same row twice.
        _candidates = _pool.loc[_pool['expert_consensus'] == item_ec].index.difference(_df.index)
        _n = min(qty - item_qty, len(_candidates))
        if _n < qty - item_qty:
            print(f"Only {_n} unused '{item_ec}' rows available to add; wanted {qty - item_qty}.")
        if _n > 0:
            _extra.append(df.loc[_candidates].sample(_n).index.values)

idxs_val = np.concatenate([_df.index.values] + _extra)
assert len(np.unique(idxs_val)) == len(idxs_val), "duplicate rows in val set"

_df = df.loc[idxs_val]
_df.groupby(by='expert_consensus', as_index=False).size().sort_values(by='size')


Unnamed: 0,expert_consensus,size
0,GPD,368
1,GRDA,368
2,LPD,368
3,LRDA,368
4,Other,368
5,Seizure,368


In [185]:
# 5) Balance train set reducing and populating above and below.
#
# The target size is that of the 3rd largest class. Larger classes are randomly
# trimmed down to it; smaller classes are topped up with further rows of the
# training patients only, never re-using a row that is already in the set.

_df = df.loc[idxs_train]
__df = _df.groupby(by='expert_consensus', as_index=False).size().sort_values(by='size')
# Sizes are sorted ascending, so the 3rd largest class is three rows from the
# end (row 3 when there are six classes).
qty = __df.iloc[max(len(__df) - 3, 0)]['size']

# All rows belonging to training patients: the only rows allowed to top up this set.
_pool = df.loc[df['patient_id'].isin(ids_train)]

_extra = []
_by_size_desc = __df.iloc[::-1]  # largest class first
for item_ec, item_qty in zip(_by_size_desc['expert_consensus'], _by_size_desc['size']):
    if item_qty > qty:
        # Reduce the ones on top.
        _idxs = _df.loc[_df['expert_consensus'] == item_ec].sample(item_qty - qty).index
        _df = _df.loc[_df.index.difference(_idxs)]
    elif item_qty < qty:
        # Populate the ones below, drawing only from rows not selected yet so
        # the set never contains the same row twice.
        _candidates = _pool.loc[_pool['expert_consensus'] == item_ec].index.difference(_df.index)
        _n = min(qty - item_qty, len(_candidates))
        if _n < qty - item_qty:
            print(f"Only {_n} unused '{item_ec}' rows available to add; wanted {qty - item_qty}.")
        if _n > 0:
            _extra.append(df.loc[_candidates].sample(_n).index.values)

idxs_train = np.concatenate([_df.index.values] + _extra)
assert len(np.unique(idxs_train)) == len(idxs_train), "duplicate rows in train set"

_df = df.loc[idxs_train]
_df.groupby(by='expert_consensus', as_index=False).size().sort_values(by='size')


Unnamed: 0,expert_consensus,size
0,GPD,1388
1,GRDA,1388
2,LPD,1388
3,LRDA,1388
4,Other,1388
5,Seizure,1388


In [186]:
# Persist the three index arrays (row labels of df_traincsv), one file per set.
for out_path, idxs_split in (
    ("../data/03_stratified_v1_idxs_train.npy", idxs_train),
    ("../data/03_stratified_v1_idxs_val.npy", idxs_val),
    ("../data/03_stratified_v1_idxs_test.npy", idxs_test),
):
    np.save(out_path, idxs_split)