# train 데이터 전처리하기

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw training label table; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/raw/train.csv')

In [3]:
# Preview the first rows to see which columns were loaded.
df.head()

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0


In [4]:
# The last six columns of the raw frame are the per-class vote counts;
# keep their names so later cells can aggregate and normalise them.
TARGETS = df.columns[-6:]
print(TARGETS)

Index(['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote',
       'other_vote'],
      dtype='object')


In [5]:
# Collapse the raw label rows into one row per (eeg_id, expert_consensus):
# the earliest EEG / spectrogram offsets plus the first spectrogram_id and
# label_id seen in each group. Output columns are named directly by the
# aggregation instead of being renamed afterwards.
train = df.groupby(['eeg_id', 'expert_consensus']).agg(
    eeg_min=('eeg_label_offset_seconds', 'min'),
    spec_id=('spectrogram_id', 'first'),
    label_id=('label_id', 'first'),
    spec_min=('spectrogram_label_offset_seconds', 'min'),
)
train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,eeg_min,spec_id,label_id,spec_min
eeg_id,expert_consensus,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
568657,Other,0.0,789577333,1825637311,0.0
582999,LPD,0.0,1552638400,1722186807,0.0
642382,Other,0.0,14960202,3254468733,1008.0
751790,GPD,0.0,618728447,2898467035,908.0
778705,Other,0.0,52296320,3255875127,0.0


In [6]:
# Latest EEG / spectrogram offsets per (eeg_id, expert_consensus) group.
tmp = df.groupby(['eeg_id', 'expert_consensus']).agg(
    eeg_max=('eeg_label_offset_seconds', 'max'),
    spec_max=('spectrogram_label_offset_seconds', 'max'),
)
# Column assignment aligns on the shared (eeg_id, expert_consensus) index.
train['eeg_max'] = tmp['eeg_max']
train['spec_max'] = tmp['spec_max']
train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,eeg_min,spec_id,label_id,spec_min,eeg_max,spec_max
eeg_id,expert_consensus,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
568657,Other,0.0,789577333,1825637311,0.0,16.0,16.0
582999,LPD,0.0,1552638400,1722186807,0.0,38.0,38.0
642382,Other,0.0,14960202,3254468733,1008.0,24.0,1032.0
751790,GPD,0.0,618728447,2898467035,908.0,0.0,908.0
778705,Other,0.0,52296320,3255875127,0.0,0.0,0.0


In [7]:
# First patient_id seen in each (eeg_id, expert_consensus) group; the
# assignment aligns on the group index shared with `train`.
tmp = df.groupby(['eeg_id', 'expert_consensus'])['patient_id'].first()
train['patient_id'] = tmp
train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,eeg_min,spec_id,label_id,spec_min,eeg_max,spec_max,patient_id
eeg_id,expert_consensus,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
568657,Other,0.0,789577333,1825637311,0.0,16.0,16.0,20654
582999,LPD,0.0,1552638400,1722186807,0.0,38.0,38.0,20230
642382,Other,0.0,14960202,3254468733,1008.0,24.0,1032.0,5955
751790,GPD,0.0,618728447,2898467035,908.0,0.0,908.0,38549
778705,Other,0.0,52296320,3255875127,0.0,0.0,0.0,40955


In [8]:
# Total votes per class within each (eeg_id, expert_consensus) group.
tmp = df.groupby(['eeg_id', 'expert_consensus'])[TARGETS].sum()
# Attach the sums positionally (raw arrays); `tmp` comes from the same
# groupby keys as `train`, so the row order is identical.
train = train.assign(**{vote_col: tmp[vote_col].to_numpy() for vote_col in TARGETS})

tmp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
eeg_id,expert_consensus,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
568657,Other,0,0,12,0,8,28
582999,LPD,0,132,0,11,0,11
642382,Other,0,0,0,0,0,2
751790,GPD,0,0,1,0,0,0
778705,Other,0,0,0,0,0,2


In [9]:
# Convert the summed vote counts into per-row proportions that add up to 1.
vote_counts = train[TARGETS].to_numpy()
row_totals = vote_counts.sum(axis=1, keepdims=True)
y_data = vote_counts / row_totals
train[TARGETS] = y_data

train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,eeg_min,spec_id,label_id,spec_min,eeg_max,spec_max,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
eeg_id,expert_consensus,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
568657,Other,0.0,789577333,1825637311,0.0,16.0,16.0,20654,0.0,0.0,0.25,0.0,0.166667,0.583333
582999,LPD,0.0,1552638400,1722186807,0.0,38.0,38.0,20230,0.0,0.857143,0.0,0.071429,0.0,0.071429
642382,Other,0.0,14960202,3254468733,1008.0,24.0,1032.0,5955,0.0,0.0,0.0,0.0,0.0,1.0
751790,GPD,0.0,618728447,2898467035,908.0,0.0,908.0,38549,0.0,0.0,1.0,0.0,0.0,0.0
778705,Other,0.0,52296320,3255875127,0.0,0.0,0.0,40955,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
# Turn the group keys back into columns, order the rows by eeg_id and start
# offset with a fresh 0..n-1 index, and expose the consensus label as `target`.
train = (
    train
    .reset_index()
    .sort_values(by=['eeg_id', 'eeg_min'], ignore_index=True)
    .rename(columns={'expert_consensus': 'target'})
)
print('Train non-overlapp eeg_id shape:', train.shape )
train.head()

Train non-overlapp eeg_id shape: (18013, 15)


Unnamed: 0,eeg_id,target,eeg_min,spec_id,label_id,spec_min,eeg_max,spec_max,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,568657,Other,0.0,789577333,1825637311,0.0,16.0,16.0,20654,0.0,0.0,0.25,0.0,0.166667,0.583333
1,582999,LPD,0.0,1552638400,1722186807,0.0,38.0,38.0,20230,0.0,0.857143,0.0,0.071429,0.0,0.071429
2,642382,Other,0.0,14960202,3254468733,1008.0,24.0,1032.0,5955,0.0,0.0,0.0,0.0,0.0,1.0
3,751790,GPD,0.0,618728447,2898467035,908.0,0.0,908.0,38549,0.0,0.0,1.0,0.0,0.0,0.0
4,778705,Other,0.0,52296320,3255875127,0.0,0.0,0.0,40955,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
# Inspect rows 75-84, which include an eeg_id (21379701) that has several target rows.
train[75:85]

Unnamed: 0,eeg_id,target,eeg_min,spec_id,label_id,spec_min,eeg_max,spec_max,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
75,19780434,Other,0.0,1326122787,858348352,0.0,24.0,24.0,20406,0.0,0.0,0.0,0.0,0.066667,0.933333
76,20299905,Seizure,0.0,149028695,1387208195,0.0,274.0,274.0,26342,0.941176,0.029412,0.0,0.0,0.0,0.029412
77,20697410,Other,0.0,1456975380,2065691910,0.0,0.0,0.0,47284,0.0,0.0,0.0,0.0,0.0,1.0
78,20915334,Other,0.0,874919861,2327648838,676.0,0.0,676.0,44623,0.0,0.0,0.0,0.0,0.0,1.0
79,21054661,LRDA,0.0,1067342787,4174823112,118.0,366.0,484.0,37979,0.0,0.0,0.0,0.8,0.0,0.2
80,21379701,Other,0.0,1841034439,3556945898,2204.0,158.0,2362.0,32481,0.214286,0.0,0.0,0.214286,0.0,0.571429
81,21379701,Seizure,30.0,1841034439,1565893564,2234.0,48.0,2252.0,32481,0.6,0.0,0.0,0.1,0.0,0.3
82,21379701,LPD,62.0,1841034439,1468675763,2266.0,110.0,2314.0,32481,0.0,0.75,0.0,0.25,0.0,0.0
83,21498048,Other,0.0,1686467936,1227827692,0.0,0.0,0.0,16947,0.0,0.0,0.0,0.0,0.0,1.0
84,21557190,Other,0.0,1538429652,2191835398,0.0,8.0,8.0,23337,0.0,0.25,0.0,0.25,0.0,0.5


In [12]:
# Check the last rows of the aggregated table.
train.tail()

Unnamed: 0,eeg_id,target,eeg_min,spec_id,label_id,spec_min,eeg_max,spec_max,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
18008,4293354003,GRDA,0.0,1188113564,447244163,0.0,0.0,0.0,16610,0.0,0.0,0.0,0.0,0.5,0.5
18009,4293843368,GRDA,0.0,1549502620,1618953053,0.0,0.0,0.0,15065,0.0,0.0,0.0,0.0,0.5,0.5
18010,4294455489,Other,0.0,2105480289,469526364,0.0,0.0,0.0,56,0.0,0.0,0.0,0.0,0.0,1.0
18011,4294858825,Other,0.0,657299228,561576493,0.0,12.0,12.0,4312,0.0,0.0,0.0,0.0,0.066667,0.933333
18012,4294958358,Other,0.0,260520016,2788887007,2508.0,0.0,2508.0,25986,0.0,0.0,0.0,0.0,0.0,1.0


In [13]:
# Put the label first, then the EEG / spectrogram windows, the ids, and
# finally the normalised vote columns.
column_order = [
    'target',
    'eeg_id', 'eeg_min', 'eeg_max',
    'spec_id', 'spec_min', 'spec_max',
    'label_id', 'patient_id',
    'seizure_vote', 'lpd_vote', 'gpd_vote',
    'lrda_vote', 'grda_vote', 'other_vote',
]
train = train[column_order]
train.head()

Unnamed: 0,target,eeg_id,eeg_min,eeg_max,spec_id,spec_min,spec_max,label_id,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,Other,568657,0.0,16.0,789577333,0.0,16.0,1825637311,20654,0.0,0.0,0.25,0.0,0.166667,0.583333
1,LPD,582999,0.0,38.0,1552638400,0.0,38.0,1722186807,20230,0.0,0.857143,0.0,0.071429,0.0,0.071429
2,Other,642382,0.0,24.0,14960202,1008.0,1032.0,3254468733,5955,0.0,0.0,0.0,0.0,0.0,1.0
3,GPD,751790,0.0,0.0,618728447,908.0,908.0,2898467035,38549,0.0,0.0,1.0,0.0,0.0,0.0
4,Other,778705,0.0,0.0,52296320,0.0,0.0,3255875127,40955,0.0,0.0,0.0,0.0,0.0,1.0


eeg_id 당 병명이 여러 개 나온 경우의 index 추출

In [16]:
# Count rows per eeg_id and keep only the ids that appear more than once,
# i.e. EEGs that ended up with more than one target row.
condition = train['eeg_id'].value_counts()
idx_condition = condition.loc[condition > 1].index
has_multiple_targets = train['eeg_id'].isin(idx_condition)
# Work on an independent copy so the new columns added later do not touch `train`.
df_multi = train.loc[has_multiple_targets].copy()
df_multi

Unnamed: 0,target,eeg_id,eeg_min,eeg_max,spec_id,spec_min,spec_max,label_id,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
80,Other,21379701,0.0,158.0,1841034439,2204.0,2362.0,3556945898,32481,0.214286,0.000000,0.0,0.214286,0.000000,0.571429
81,Seizure,21379701,30.0,48.0,1841034439,2234.0,2252.0,1565893564,32481,0.600000,0.000000,0.0,0.100000,0.000000,0.300000
82,LPD,21379701,62.0,110.0,1841034439,2266.0,2314.0,1468675763,32481,0.000000,0.750000,0.0,0.250000,0.000000,0.000000
85,Other,21746311,0.0,0.0,1343094925,0.0,0.0,3249053595,6489,0.400000,0.000000,0.0,0.000000,0.000000,0.600000
86,Seizure,21746311,2.0,54.0,1343094925,2.0,54.0,2917720091,6489,0.720430,0.000000,0.0,0.000000,0.000000,0.279570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17969,Seizure,4283246598,4.0,6.0,329782959,4.0,6.0,4220128961,27986,1.000000,0.000000,0.0,0.000000,0.000000,0.000000
17978,Other,4284659653,0.0,46.0,1841034439,2696.0,2742.0,2080594355,32481,0.176471,0.000000,0.0,0.176471,0.039216,0.607843
17979,Seizure,4284659653,54.0,58.0,1841034439,2750.0,2754.0,4284231588,32481,0.333333,0.333333,0.0,0.000000,0.000000,0.333333
17980,LPD,4284659653,70.0,138.0,1841034439,2766.0,2834.0,2653269667,32481,0.000000,0.909091,0.0,0.000000,0.000000,0.090909


문제가 되는 경우의 예: `eeg_id == 21379701`  
이 경우 `Other` 라벨이 뒤쪽에서 한 번 더 나오기 때문에, 첫 번째 `Other` 구간의 eeg_max와 spec_max가 그 사이에 있는 다른 라벨 구간까지 전부 덮어 버린다.

In [15]:
# Show the raw label rows of the example eeg_id to see how its labels change over time.
df[df['eeg_id'] == 21379701]

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
90269,21379701,0,0.0,1841034439,158,2204.0,3556945898,32481,Other,1,0,0,1,0,2
90270,21379701,1,10.0,1841034439,159,2214.0,3225189518,32481,Other,1,0,0,1,0,2
90271,21379701,2,12.0,1841034439,160,2216.0,3059518038,32481,Other,1,0,0,1,0,2
90272,21379701,3,30.0,1841034439,161,2234.0,1565893564,32481,Seizure,2,0,0,1,0,1
90273,21379701,4,44.0,1841034439,162,2248.0,2941979767,32481,Seizure,2,0,0,0,0,1
90274,21379701,5,48.0,1841034439,163,2252.0,4290242864,32481,Seizure,2,0,0,0,0,1
90275,21379701,6,62.0,1841034439,164,2266.0,1468675763,32481,LPD,0,3,0,1,0,0
90276,21379701,7,76.0,1841034439,165,2280.0,4133046008,32481,LPD,0,3,0,1,0,0
90277,21379701,8,78.0,1841034439,166,2282.0,330235701,32481,LPD,0,3,0,1,0,0
90278,21379701,9,80.0,1841034439,167,2284.0,2090915320,32481,LPD,0,3,0,1,0,0


'shift' 칼럼을 만들어서 같은 eeg_id 안의 다음 행 시작 시점(eeg_min)과 비교하기

In [17]:
# For every row, look up the eeg_min of the next row within the same eeg_id.
# The last row of each eeg_id has no successor and gets +inf, so it can never
# satisfy a later `eeg_max > shift` comparison.
next_group_start = (
    df_multi
    .groupby('eeg_id')['eeg_min']
    .shift(periods=-1, fill_value=np.inf)
)
df_multi['shift'] = next_group_start

In [18]:
# Preview the window columns next to the new `shift` column.
df_multi[['target', 'eeg_id', 'eeg_min', 'eeg_max', 'shift', 
          'spec_id', 'spec_min', 'spec_max', 'label_id']].head()

Unnamed: 0,target,eeg_id,eeg_min,eeg_max,shift,spec_id,spec_min,spec_max,label_id
80,Other,21379701,0.0,158.0,30.0,1841034439,2204.0,2362.0,3556945898
81,Seizure,21379701,30.0,48.0,62.0,1841034439,2234.0,2252.0,1565893564
82,LPD,21379701,62.0,110.0,inf,1841034439,2266.0,2314.0,1468675763
85,Other,21746311,0.0,0.0,2.0,1343094925,0.0,0.0,3249053595
86,Seizure,21746311,2.0,54.0,inf,1343094925,2.0,54.0,2917720091


하나의 eeg 안에서 전문가 판단(expert_consensus)이 바뀌는 경우 찾기

In [19]:
# Narrow working table holding only the columns needed to detect and repair
# overlapping windows. It is written to with `.loc` in a later cell, so take
# an explicit copy: that keeps the write from being a chained assignment on a
# slice of df_multi (SettingWithCopyWarning) and leaves df_multi unchanged.
df_view = df_multi[['target', 'eeg_id', 'eeg_min', 'eeg_max', 'shift', 
                    'spec_id', 'spec_min', 'spec_max', 'label_id']].copy()

df_view

Unnamed: 0,target,eeg_id,eeg_min,eeg_max,shift,spec_id,spec_min,spec_max,label_id
80,Other,21379701,0.0,158.0,30.0,1841034439,2204.0,2362.0,3556945898
81,Seizure,21379701,30.0,48.0,62.0,1841034439,2234.0,2252.0,1565893564
82,LPD,21379701,62.0,110.0,inf,1841034439,2266.0,2314.0,1468675763
85,Other,21746311,0.0,0.0,2.0,1343094925,0.0,0.0,3249053595
86,Seizure,21746311,2.0,54.0,inf,1343094925,2.0,54.0,2917720091
...,...,...,...,...,...,...,...,...,...
17969,Seizure,4283246598,4.0,6.0,inf,329782959,4.0,6.0,4220128961
17978,Other,4284659653,0.0,46.0,54.0,1841034439,2696.0,2742.0,2080594355
17979,Seizure,4284659653,54.0,58.0,70.0,1841034439,2750.0,2754.0,4284231588
17980,LPD,4284659653,70.0,138.0,160.0,1841034439,2766.0,2834.0,2653269667


In [20]:
# Rows whose window ends after the next row of the same eeg_id starts, i.e. overlapping windows.
df_view[df_view['eeg_max'] > df_view['shift']]

Unnamed: 0,target,eeg_id,eeg_min,eeg_max,shift,spec_id,spec_min,spec_max,label_id
80,Other,21379701,0.0,158.0,30.0,1841034439,2204.0,2362.0,3556945898
128,Other,32067830,0.0,80.0,38.0,2060670605,9422.0,9502.0,2148393111
187,LPD,46287152,0.0,98.0,46.0,1516469502,1232.0,1330.0,3673462713
212,LRDA,51565620,0.0,54.0,18.0,2077600099,2670.0,2724.0,2916140792
234,Other,56878715,0.0,40.0,6.0,1026469553,0.0,40.0,3375752082
...,...,...,...,...,...,...,...,...,...
17796,GPD,4239311496,0.0,290.0,86.0,58683771,2490.0,2780.0,223336409
17834,Other,4247287858,0.0,64.0,20.0,764146759,7652.0,7716.0,3890605516
17836,LRDA,4247287858,94.0,222.0,120.0,764146759,7746.0,7874.0,2315515890
17848,Seizure,4249475128,0.0,256.0,28.0,617579510,0.0,256.0,3097261777


In [22]:
# Number of distinct eeg_ids that have at least one overlapping window.
df_view[df_view['eeg_max'] > df_view['shift']].eeg_id.nunique()

314

In [23]:
# Spot-check the example eeg_id 21379701: which of its rows overlap the next one.
df_view[(df_view['eeg_max'] > df_view['shift']) & (df_view['eeg_id'] == 21379701)]

Unnamed: 0,target,eeg_id,eeg_min,eeg_max,shift,spec_id,spec_min,spec_max,label_id
80,Other,21379701,0.0,158.0,30.0,1841034439,2204.0,2362.0,3556945898


In [24]:
# Index labels of the rows that come right after each overlapping row.
# NOTE(review): `+ 1` only lands on the next target row of the same eeg_id if
# the index is a consecutive integer range in (eeg_id, eeg_min) order —
# confirm the sort/reset cell ran before this one.
idx_shift = df_view.loc[df_view['eeg_max'] > df_view['shift']].index + 1
idx_shift

Index([   81,   129,   188,   213,   235,   294,   327,   552,   697,   699,
       ...
       17467, 17524, 17550, 17758, 17783, 17797, 17835, 17837, 17849, 17867],
      dtype='int64', length=352)

In [25]:
# label_id stored on each of those following rows, used as a key back into the raw frame.
array_shift = train.loc[idx_shift, 'label_id'].values

In [27]:
# Raw label rows whose label_id matches one of the following rows found above.
df[df['label_id'].isin(array_shift)]

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
967,628369060,11,76.0,13143748,13,270.0,2050707482,34998,GRDA,0,1,2,1,7,4
991,3088095149,1,6.0,13143748,37,1560.0,2169014734,34998,GRDA,0,0,4,0,5,4
994,3088095149,4,24.0,13143748,40,1578.0,17509994,34998,GPD,1,0,2,0,0,1
1015,3088095149,25,128.0,13143748,61,1682.0,2166878269,34998,Seizure,4,4,2,1,0,2
1710,2536169515,9,60.0,19384736,64,1454.0,309376330,56450,Seizure,3,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104111,876555825,2,10.0,2079265651,2,10.0,3451801944,2053,GRDA,1,0,0,0,10,5
104966,1293845790,1,4.0,2099328860,1,4.0,1863137113,35437,GPD,4,4,8,0,0,1
105025,4179620200,1,22.0,2099486123,21,910.0,3026690772,41114,Seizure,3,0,0,0,0,0
105775,4021513849,2,32.0,2119832042,2,32.0,182875776,7573,Other,4,0,0,0,0,16


In [28]:
# Step back one raw row from each matched label and show the offsets found there;
# these become the corrected window ends.
# NOTE(review): `- 1` assumes df still has its default integer index and that the
# previous raw row belongs to the same eeg_id in offset order — confirm.
idx_new = df[df['label_id'].isin(array_shift)].index -1
df.loc[idx_new, ['eeg_label_offset_seconds', 'spectrogram_label_offset_seconds']]

Unnamed: 0,eeg_label_offset_seconds,spectrogram_label_offset_seconds
966,64.0,258.0
990,0.0,1554.0
993,10.0,1564.0
1014,118.0,1672.0
1709,34.0,1428.0
...,...,...
104110,6.0,6.0
104965,0.0,0.0
105024,0.0,888.0
105774,16.0,16.0


In [29]:
# Number of distinct eeg_ids among the replacement rows, for comparison with
# the count of eeg_ids that have overlapping windows.
df.loc[idx_new, 'eeg_id'].nunique()

314

In [30]:
# Spot-check eeg_id 21379701.
# The result did not match at first; the cause appears to have been that the
# data was not sorted. After re-running the sorting step above it comes out right.
condition = df.index.isin(idx_new) & (df['eeg_id'] == 21379701)
df.loc[condition, ['eeg_label_offset_seconds', 'spectrogram_label_offset_seconds']]

Unnamed: 0,eeg_label_offset_seconds,spectrogram_label_offset_seconds
90271,12.0,2216.0


이제 조건에 맞게 칼럼에 값 대치하기

In [31]:
# Replacement offsets for the overlapping rows, ordered by eeg_id and EEG
# offset so that they can be lined up row-for-row with the sorted `train`.
df_temp = (
    df
    .loc[idx_new, ['eeg_id', 'eeg_label_offset_seconds', 'spectrogram_label_offset_seconds']]
    .sort_values(by=['eeg_id', 'eeg_label_offset_seconds'])
)

In [32]:
# Inspect the sorted replacement offsets.
df_temp

Unnamed: 0,eeg_id,eeg_label_offset_seconds,spectrogram_label_offset_seconds
90271,21379701,12.0,2216.0
102842,32067830,24.0,9446.0
75506,46287152,0.0,1232.0
104049,51565620,0.0,2670.0
52048,56878715,0.0,0.0
...,...,...,...
3616,4239311496,62.0,2552.0
39346,4247287858,0.0,7652.0
39365,4247287858,94.0,7746.0
31887,4249475128,20.0,20.0


In [33]:
# The same offsets as a bare array (no index), which is the form used for the assignment below.
df_temp[['eeg_label_offset_seconds', 'spectrogram_label_offset_seconds']].values

array([[1.2000e+01, 2.2160e+03],
       [2.4000e+01, 9.4460e+03],
       [0.0000e+00, 1.2320e+03],
       [0.0000e+00, 2.6700e+03],
       [0.0000e+00, 0.0000e+00],
       [4.4000e+01, 4.4000e+01],
       [1.4000e+01, 1.4000e+01],
       [4.0000e+00, 4.0000e+00],
       [1.2000e+01, 1.0700e+03],
       [0.0000e+00, 1.1420e+03],
       [1.0000e+02, 1.2126e+04],
       [0.0000e+00, 2.2600e+03],
       [4.0000e+00, 4.0000e+00],
       [1.6000e+01, 6.0200e+02],
       [5.2000e+01, 6.3800e+02],
       [8.0000e+00, 4.3800e+02],
       [5.8000e+01, 4.8800e+02],
       [1.2320e+03, 1.6620e+03],
       [8.8000e+01, 8.8000e+01],
       [1.0400e+02, 4.7940e+03],
       [1.1600e+02, 4.8060e+03],
       [1.5400e+02, 4.4200e+02],
       [1.0800e+02, 1.0800e+02],
       [0.0000e+00, 3.2200e+02],
       [0.0000e+00, 0.0000e+00],
       [1.9200e+02, 1.9200e+02],
       [0.0000e+00, 1.2120e+03],
       [0.0000e+00, 2.9360e+03],
       [3.0000e+01, 3.0000e+01],
       [0.0000e+00, 6.7520e+03],
       [2.

In [34]:
# Current (uncorrected) window ends of the overlapping rows, for comparison with df_temp.
df_view.loc[df_view['eeg_max'] > df_view['shift'], ['eeg_id', 'eeg_max', 'spec_max']]

Unnamed: 0,eeg_id,eeg_max,spec_max
80,21379701,158.0,2362.0
128,32067830,80.0,9502.0
187,46287152,98.0,1330.0
212,51565620,54.0,2724.0
234,56878715,40.0,40.0
...,...,...,...
17796,4239311496,290.0,2780.0
17834,4247287858,64.0,7716.0
17836,4247287858,222.0,7874.0
17848,4249475128,256.0,256.0


In [38]:
# Index labels of the overlapping rows; captured before df_view is modified so
# the same rows can later be addressed in `train`.
needs_fix = df_view['eeg_max'].gt(df_view['shift'])
idx_fix = df_view.index[needs_fix.to_numpy()]

In [39]:
# The rows of `train` that are about to be corrected, shown before the fix.
train.loc[idx_fix]

Unnamed: 0,target,eeg_id,eeg_min,eeg_max,spec_id,spec_min,spec_max,label_id,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
80,Other,21379701,0.0,158.0,1841034439,2204.0,2362.0,3556945898,32481,0.214286,0.000000,0.000000,0.214286,0.000000,0.571429
128,Other,32067830,0.0,80.0,2060670605,9422.0,9502.0,2148393111,55705,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000
187,LPD,46287152,0.0,98.0,1516469502,1232.0,1330.0,3673462713,56450,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000
212,LRDA,51565620,0.0,54.0,2077600099,2670.0,2724.0,2916140792,41114,0.000000,0.000000,0.000000,0.500000,0.000000,0.500000
234,Other,56878715,0.0,40.0,1026469553,0.0,40.0,3375752082,48272,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17796,GPD,4239311496,0.0,290.0,58683771,2490.0,2780.0,223336409,3838,0.000000,0.123779,0.302932,0.185668,0.185668,0.201954
17834,Other,4247287858,0.0,64.0,764146759,7652.0,7716.0,3890605516,30631,0.148649,0.054054,0.000000,0.094595,0.000000,0.702703
17836,LRDA,4247287858,94.0,222.0,764146759,7746.0,7874.0,2315515890,30631,0.000000,0.000000,0.000000,0.562500,0.062500,0.375000
17848,Seizure,4249475128,0.0,256.0,617579510,0.0,256.0,3097261777,36301,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [40]:
# Overwrite eeg_max / spec_max of the overlapping rows with the offsets from df_temp.
# The right-hand side is a bare array, so rows are matched by position, not by index.
# NOTE(review): df_view was sliced from df_multi, so this write may emit a
# SettingWithCopyWarning depending on the pandas version — confirm.
df_view.loc[df_view['eeg_max'] > df_view['shift'], ['eeg_max', 'spec_max']] = (
    df_temp[['eeg_label_offset_seconds', 'spectrogram_label_offset_seconds']].values
)
# Re-evaluate the overlap condition after the write to see which rows still overlap.
df_view.loc[df_view['eeg_max'] > df_view['shift'], ['eeg_id', 'eeg_max', 'spec_max']]

Unnamed: 0,eeg_id,eeg_max,spec_max


In [41]:
# Show the same rows of `train` again after the df_view edit, to compare with the earlier display.
train.loc[idx_fix]

Unnamed: 0,target,eeg_id,eeg_min,eeg_max,spec_id,spec_min,spec_max,label_id,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
80,Other,21379701,0.0,158.0,1841034439,2204.0,2362.0,3556945898,32481,0.214286,0.000000,0.000000,0.214286,0.000000,0.571429
128,Other,32067830,0.0,80.0,2060670605,9422.0,9502.0,2148393111,55705,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000
187,LPD,46287152,0.0,98.0,1516469502,1232.0,1330.0,3673462713,56450,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000
212,LRDA,51565620,0.0,54.0,2077600099,2670.0,2724.0,2916140792,41114,0.000000,0.000000,0.000000,0.500000,0.000000,0.500000
234,Other,56878715,0.0,40.0,1026469553,0.0,40.0,3375752082,48272,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17796,GPD,4239311496,0.0,290.0,58683771,2490.0,2780.0,223336409,3838,0.000000,0.123779,0.302932,0.185668,0.185668,0.201954
17834,Other,4247287858,0.0,64.0,764146759,7652.0,7716.0,3890605516,30631,0.148649,0.054054,0.000000,0.094595,0.000000,0.702703
17836,LRDA,4247287858,94.0,222.0,764146759,7746.0,7874.0,2315515890,30631,0.000000,0.000000,0.000000,0.562500,0.062500,0.375000
17848,Seizure,4249475128,0.0,256.0,617579510,0.0,256.0,3097261777,36301,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [44]:
# Truncate eeg_max / spec_max of the overlapping rows in `train` to the
# replacement offsets collected in df_temp.
# The write is positional (`.values` drops the index), so first check that
# df_temp lines up row-for-row with idx_fix; a stale or differently sorted
# df_temp would otherwise silently put offsets on the wrong eeg_id.
fix_eeg_ids = train.loc[idx_fix, 'eeg_id'].to_numpy()
assert np.array_equal(fix_eeg_ids, df_temp['eeg_id'].to_numpy()), (
    'df_temp is not aligned with idx_fix; re-run the cells that build them in order'
)
train.loc[idx_fix, ['eeg_max', 'spec_max']] = (
    df_temp[['eeg_label_offset_seconds', 'spectrogram_label_offset_seconds']].values
)

In [45]:
# Re-inspect the example rows (labels 75-85, end inclusive with .loc) to check the corrected window ends.
train.loc[75:85]

Unnamed: 0,target,eeg_id,eeg_min,eeg_max,spec_id,spec_min,spec_max,label_id,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
75,Other,19780434,0.0,24.0,1326122787,0.0,24.0,858348352,20406,0.0,0.0,0.0,0.0,0.066667,0.933333
76,Seizure,20299905,0.0,274.0,149028695,0.0,274.0,1387208195,26342,0.941176,0.029412,0.0,0.0,0.0,0.029412
77,Other,20697410,0.0,0.0,1456975380,0.0,0.0,2065691910,47284,0.0,0.0,0.0,0.0,0.0,1.0
78,Other,20915334,0.0,0.0,874919861,676.0,676.0,2327648838,44623,0.0,0.0,0.0,0.0,0.0,1.0
79,LRDA,21054661,0.0,366.0,1067342787,118.0,484.0,4174823112,37979,0.0,0.0,0.0,0.8,0.0,0.2
80,Other,21379701,0.0,12.0,1841034439,2204.0,2216.0,3556945898,32481,0.214286,0.0,0.0,0.214286,0.0,0.571429
81,Seizure,21379701,30.0,48.0,1841034439,2234.0,2252.0,1565893564,32481,0.6,0.0,0.0,0.1,0.0,0.3
82,LPD,21379701,62.0,110.0,1841034439,2266.0,2314.0,1468675763,32481,0.0,0.75,0.0,0.25,0.0,0.0
83,Other,21498048,0.0,0.0,1686467936,0.0,0.0,1227827692,16947,0.0,0.0,0.0,0.0,0.0,1.0
84,Other,21557190,0.0,8.0,1538429652,0.0,8.0,2191835398,23337,0.0,0.25,0.0,0.25,0.0,0.5
