In [1]:
from torch.utils.data import Dataset
from tqdm import tqdm
import pandas as pd
import glob
import numpy as np

# inference

In [2]:
class InferenceFHDataset(Dataset):

    def __init__(self, data_paths):
        self.data_paths=data_paths
        self.ds_type='ea'
        self.use_feat=True
        self.use_hour=True
        self.make_hour_emb=False
        self.rolling_ranges = [17, 33, 65]
        self.train_len = (6+24+6)*60*12
        self.X=[]
        self.Y=[]
        self.ids=[]
        for path in tqdm(self.data_paths):
            cur_id = int(path.split('/')[-1].split('.')[0][2:])
            xy = pd.read_csv(path)
            y = xy[['onset','wakeup']]
            # gen feat
            x = self._gen_feat(xy)
            # change columns order - hour 을 맨 뒤에 놓는다.
            cols=x.columns.tolist()
            cols=cols[1:2]+cols[3:9]+cols[0:1]+cols[9:]+cols[2:3]
            x=x[cols]
            # crop series into fixed size
            it=len(x)//self.train_len
            num_pad = self.train_len-len(x)%self.train_len
            for i in range(it+1):
                # 마지막 iteration zeropad로 길이 맞춰주기
                if i==it:
                    st= i*self.train_len
                    X = np.pad(x.to_numpy()[st:].copy(), ((0,num_pad),(0,0)),'constant',constant_values=0)
                    Y = np.pad(y.to_numpy()[st:].copy(), ((0,num_pad),(0,0)),'constant',constant_values=0)
                    self.X.append(X)
                    self.Y.append(Y)
                    self.ids.append((cur_id,i))
                else:
                    st = i*self.train_len
                    ed = (i+1)*self.train_len
                    self.X.append(x[st:ed])
                    self.Y.append(y[st:ed])
                    self.ids.append((cur_id,i))
    
    # before evaluate, change state according to 내 모델.
    def change_mode(self,ds_type, use_feat, use_hour, make_hour_emb):
        self.ds_type=ds_type
        self.use_feat=use_feat
        self.use_hour=use_hour
        self.make_hour_emb=make_hour_emb
            
    def __len__(self):
        return len(self.data_paths)
    
    def _gen_feat(self,xy):
        # enmo
        for r in self.rolling_ranges:
            tmp_feat = xy['enmo'].rolling(r, center=True)
            xy[f'enmo_mean_{r}'] = tmp_feat.mean()
            xy[f'enmo_std_{r}'] = tmp_feat.std()
        # anglez
        for r in self.rolling_ranges:
            tmp_feat = xy['anglez'].rolling(r, center=True)
            xy[f'anglez_mean_{r}'] = tmp_feat.mean()
            xy[f'anglez_std_{r}'] = tmp_feat.std()
        return xy.drop(columns=['onset','wakeup']).fillna(0)
    
    def __getitem__(self,i):
        
        x = self.X[i].copy()
        h = False
        y = self.Y[i].copy()
        id_ = self.ids[i]
        
        if not self.use_feat:
            x = x[['enmo','anglez','hour']]
        if not self.use_hour:
            x.drop(columns=['hour'], inplace=True)
        if self.use_hour and self.make_hour_emb:
            h = x[['hour']]
            x.drop(columns=['hour'], inplace=True)
        if self.ds_type=='e':
            x.drop(list(x.filter(regex='anglez')), axis=1, inplace=True)
        elif self.ds_type=='a':
            x.drop(list(x.filter(regex='enmo')), axis=1, inplace=True)
        x = x.to_numpy()
        y = y.to_numpy()
        if self.use_hour and self.make_hour_emb:
            h = h.to_numpy()
        return x, h, y, id_

In [3]:
class InferenceFHDataset_SanityCheck(Dataset):

    def __init__(self, data_paths):
        self.data_paths=data_paths
        self.ds_type='ea'
        self.use_feat=True
        self.use_hour=True
        self.make_hour_emb=False
        self.rolling_ranges = [17, 33, 65]
        self.train_len = (6+24+6)*60*12
        self.X=[]
        self.Y=[]
        self.ids=[]
        for path in tqdm(self.data_paths):
            cur_id = int(path.split('/')[-1].split('.')[0][2:])
            xy = pd.read_csv(path)
            y = xy[['onset','wakeup']]
            # gen feat
            x = self._gen_feat(xy)
            # change columns order - hour 을 맨 뒤에 놓는다.
            cols=x.columns.tolist()
            cols=cols[1:2]+cols[3:9]+cols[0:1]+cols[9:]+cols[2:3]
            x=x[cols]
            # crop series into fixed size
            it=len(x)//self.train_len
            num_pad = self.train_len-len(x)%self.train_len
            for i in range(it+1):
                # 마지막 iteration zeropad로 길이 맞춰주기
                if i==it:
                    st= i*self.train_len
                    X = np.pad(x.to_numpy()[st:].copy(), ((0,num_pad),(0,0)),'constant',constant_values=0)
                    Y = np.pad(y.to_numpy()[st:].copy(), ((0,num_pad),(0,0)),'constant',constant_values=0)
                    self.X.append(X)
                    self.Y.append(Y)
                    self.ids.append((cur_id,i))
                else:
                    st = i*self.train_len
                    ed = (i+1)*self.train_len
                    self.X.append(x[st:ed])
                    self.Y.append(y[st:ed])
                    self.ids.append((cur_id,i))
    
    # before evaluate, change state according to 내 모델.
    def change_mode(self,ds_type, use_feat, use_hour, make_hour_emb):
        self.ds_type=ds_type
        self.use_feat=use_feat
        self.use_hour=use_hour
        self.make_hour_emb=make_hour_emb
            
    def __len__(self):
        return len(self.data_paths)
    
    def _gen_feat(self,xy):
        # enmo
        for r in self.rolling_ranges:
            tmp_feat = xy['enmo'].rolling(r, center=True)
            xy[f'enmo_mean_{r}'] = tmp_feat.mean()
            xy[f'enmo_std_{r}'] = tmp_feat.std()
        # anglez
        for r in self.rolling_ranges:
            tmp_feat = xy['anglez'].rolling(r, center=True)
            xy[f'anglez_mean_{r}'] = tmp_feat.mean()
            xy[f'anglez_std_{r}'] = tmp_feat.std()
        return xy.drop(columns=['onset','wakeup']).fillna(0)
    
    def __getitem__(self,i):
        
        x = self.X[i].copy()
        h = False
        y = self.Y[i].copy()
        id_ = self.ids[i]
        
        if not self.use_feat:
            x = x[['enmo','anglez']]
        if self.ds_type=='e':
            x.drop(list(x.filter(regex='anglez')), axis=1, inplace=True)
        elif self.ds_type=='a':
            x.drop(list(x.filter(regex='enmo')), axis=1, inplace=True)
        display(x)
        display(h)
        x = x.to_numpy()
        y = y.to_numpy()
        return x, h, y, id_

# train

In [4]:
class TrainFHDataset(Dataset):
    # 
    def __init__(self, data_paths):
        self.data_paths=data_paths
        self.exclude_time=0
        self.ds_type='ea'
        self.use_feat=True
        self.use_hour=True
        self.make_hour_emb=False
        self.rolling_ranges = [17, 33, 65]
        self.train_len = (6+24+6)*60*12
        self.X=[]
        self.Y=[]
        for path in tqdm(self.data_paths):
            xy = pd.read_csv(path)
            y = xy[['onset','wakeup']]
            # gen feat
            x = self._gen_feat(xy)
            # change columns order - hour 을 맨 뒤에 놓는다.
            cols=x.columns.tolist()
            cols=cols[1:2]+cols[3:9]+cols[0:1]+cols[9:]+cols[2:3]
            x=x[cols]
            
            self.X.append(x)
            self.Y.append(y)
    
    def change_mode(self,ds_type, use_feat, use_hour, make_hour_emb, exclude_time):
        self.ds_type = ds_type
        self.use_feat = use_feat
        self.use_hour = use_hour
        self.make_hour_emb = make_hour_emb
        self.exclude_time = exclude_time
    
    def __len__(self):
        return len(self.data_paths)
    
    def _gen_feat(self,xy):
        # enmo
        for r in self.rolling_ranges:
            tmp_feat = xy['enmo'].rolling(r, center=True)
            xy[f'enmo_mean_{r}'] = tmp_feat.mean()
            xy[f'enmo_std_{r}'] = tmp_feat.std()
        # anglez
        for r in self.rolling_ranges:
            tmp_feat = xy['anglez'].rolling(r, center=True)
            xy[f'anglez_mean_{r}'] = tmp_feat.mean()
            xy[f'anglez_std_{r}'] = tmp_feat.std()
        return xy.drop(columns=['onset','wakeup']).fillna(0)
    
    def __getitem__(self,i):
        x = self.X[i].copy()
        h = False
        y = self.Y[i].copy()
        
        if not self.use_feat:
            x = x[['enmo','anglez','hour']]
        if not self.use_hour:
            x.drop(columns=['hour'], inplace=True)
        if self.use_hour and self.make_hour_emb:
            h = x[['hour']]
            x.drop(columns=['hour'], inplace=True)
        if self.ds_type=='e':
            x.drop(list(x.filter(regex='anglez')), axis=1, inplace=True)
        elif self.ds_type=='a':
            x.drop(list(x.filter(regex='enmo')), axis=1, inplace=True)
        
        if not self.exclude_time==0:
            # 36 - exclude_time 
            st = self.exclude_time*60*12
            ed = -self.exclude_time*60*12
            x = x[st:ed]
            y = y[st:ed]
            if self.use_hour and self.make_hour_emb:
                h = h[st:ed]
        x = x.to_numpy()
        y = y.to_numpy()
        if self.use_hour and self.make_hour_emb:
            h = h.to_numpy()
        
        return x, h, y

In [5]:
class TrainFHDataset_SanityCheck(Dataset):
    # 
    def __init__(self, data_paths):
        self.data_paths=data_paths
        self.exclude_time=0
        self.ds_type='ea'
        self.use_feat=True
        self.use_hour=True
        self.make_hour_emb=False
        self.rolling_ranges = [17, 33, 65]
        self.train_len = (6+24+6)*60*12
        self.X=[]
        self.Y=[]
        for path in tqdm(self.data_paths):
            xy = pd.read_csv(path)
            y = xy[['onset','wakeup']]
            # gen feat
            x = self._gen_feat(xy)
            # change columns order - hour 을 맨 뒤에 놓는다.
            cols=x.columns.tolist()
            cols=cols[1:2]+cols[3:9]+cols[0:1]+cols[9:]+cols[2:3]
            x=x[cols]
            
            self.X.append(x)
            self.Y.append(y)
    
    def change_mode(self,ds_type, use_feat, use_hour, make_hour_emb, exclude_time):
        self.ds_type = ds_type
        self.use_feat = use_feat
        self.use_hour = use_hour
        self.make_hour_emb = make_hour_emb
        self.exclude_time = exclude_time
    
    def __len__(self):
        return len(self.data_paths)
    
    def _gen_feat(self,xy):
        # enmo
        for r in self.rolling_ranges:
            tmp_feat = xy['enmo'].rolling(r, center=True)
            xy[f'enmo_mean_{r}'] = tmp_feat.mean()
            xy[f'enmo_std_{r}'] = tmp_feat.std()
        # anglez
        for r in self.rolling_ranges:
            tmp_feat = xy['anglez'].rolling(r, center=True)
            xy[f'anglez_mean_{r}'] = tmp_feat.mean()
            xy[f'anglez_std_{r}'] = tmp_feat.std()
        return xy.drop(columns=['onset','wakeup']).fillna(0)
    
    def __getitem__(self,i):
        x = self.X[i].copy()
        h = False
        y = self.Y[i].copy()
        
        if not self.use_feat:
            x = x[['enmo','anglez']]
        if self.ds_type=='e':
            x.drop(list(x.filter(regex='anglez')), axis=1, inplace=True)
        elif self.ds_type=='a':
            x.drop(list(x.filter(regex='enmo')), axis=1, inplace=True)
        
        if not self.exclude_time==0:
            # 36 - exclude_time 
            st = self.exclude_time*60*12
            ed = -self.exclude_time*60*12
            x = x[st:ed]
            y = y[st:ed]
        display(x)
        display(y)
        x = x.to_numpy()
        y = y.to_numpy()
        
        return x, h, y

# sanity check

In [6]:
# train enmo anglez hour
path1_1=glob.glob('/kaggle/input/gaussian-enmo-anglez-hour/fine/*.csv')[:2]

# train enmo anglez
path1_2=glob.glob('/kaggle/input/enmo-anglez/fine/*.csv')[:2]


# valid enmo anglez hour
path2_2=glob.glob('/kaggle/input/id-series-inference/archive/data/*.csv')[:2]

# valid enmo anglez
path2_1=glob.glob('/kaggle/input/id-series-hour-inference/archive/data/*.csv')[:2]

In [7]:
train_ds1 = TrainFHDataset(path1_1)
train_ds2 = TrainFHDataset_SanityCheck(path1_2)

val_ds1 = InferenceFHDataset(path2_1)
val_ds2 = InferenceFHDataset_SanityCheck(path2_2)

100%|██████████| 2/2 [00:00<00:00, 11.04it/s]
100%|██████████| 2/2 [00:00<00:00, 14.48it/s]
100%|██████████| 2/2 [00:01<00:00,  1.41it/s]
100%|██████████| 2/2 [00:01<00:00,  1.59it/s]


In [8]:
train_ds1.change_mode(ds_type='ea', use_feat=True, use_hour=False, make_hour_emb=False, exclude_time=0)
train_ds2.change_mode(ds_type='ea', use_feat=True, use_hour=False, make_hour_emb=False, exclude_time=0)

train_ds1[0][0], train_ds2[0][0] # same

Unnamed: 0,enmo,enmo_std_17,enmo_mean_33,enmo_std_33,enmo_mean_65,enmo_std_65,anglez_mean_17,anglez,anglez_std_17,anglez_mean_33,anglez_std_33,anglez_mean_65,anglez_std_65,enmo_mean_17
0,-0.160220,0.0,0.0,0.0,0.0,0.0,0.0,-0.055204,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.178879,0.0,0.0,0.0,0.0,0.0,0.0,-0.063385,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.172004,0.0,0.0,0.0,0.0,0.0,0.0,-0.080518,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.172004,0.0,0.0,0.0,0.0,0.0,0.0,-0.123325,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.182807,0.0,0.0,0.0,0.0,0.0,0.0,-0.126222,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25915,-0.404748,0.0,0.0,0.0,0.0,0.0,0.0,-1.105244,0.0,0.0,0.0,0.0,0.0,0.0
25916,-0.405730,0.0,0.0,0.0,0.0,0.0,0.0,-1.102609,0.0,0.0,0.0,0.0,0.0,0.0
25917,-0.405730,0.0,0.0,0.0,0.0,0.0,0.0,-1.102719,0.0,0.0,0.0,0.0,0.0,0.0
25918,-0.405730,0.0,0.0,0.0,0.0,0.0,0.0,-1.101387,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,onset,wakeup
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
...,...,...
25915,0.0,0.0
25916,0.0,0.0
25917,0.0,0.0
25918,0.0,0.0


(array([[-0.16022003,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.17887877,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.17200449,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [-0.4057298 ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.4057298 ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.4057298 ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]]),
 array([[-0.16022003,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.17887877,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.17200449,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [-0.4057298 ,  0.        ,  0.        , ...,  

In [9]:
val_ds1.change_mode(ds_type='ea', use_feat=True, use_hour=True, make_hour_emb=False)
val_ds2.change_mode(ds_type='ea', use_feat=True, use_hour=False, make_hour_emb=False)

val_ds1[0][0], val_ds2[0][0] # same

Unnamed: 0,enmo,enmo_std_17,enmo_mean_33,enmo_std_33,enmo_mean_65,enmo_std_65,anglez_mean_17,anglez,anglez_std_17,anglez_mean_33,anglez_std_33,anglez_mean_65,anglez_std_65,enmo_mean_17
0,-0.402784,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.249918,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,-0.030591,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.161677,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.662729,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.914919,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.667639,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.720298,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.844406,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.651459,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25915,-0.307526,0.002023,-0.307556,0.002361,-0.307601,0.002245,-1.611758,-1.612312,0.000685,-1.611198,0.001976,-1.611441,0.003433,-0.307179
25916,-0.308508,0.002175,-0.307704,0.002386,-0.307662,0.002269,-1.611777,-1.611794,0.000663,-1.610963,0.002052,-1.611236,0.003336,-0.307353
25917,-0.310472,0.001964,-0.307794,0.002280,-0.307707,0.002296,-1.611646,-1.611912,0.000550,-1.610856,0.001961,-1.611081,0.003193,-0.307526
25918,-0.309490,0.001960,-0.307764,0.002277,-0.307692,0.002294,-1.611327,-1.611707,0.001211,-1.610725,0.002022,-1.610910,0.003005,-0.307410


False

(array([[-4.02783700e-01,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e+00,  1.30000000e+01],
        [-3.05908700e-02,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e+00,  1.30000000e+01],
        [ 6.62728800e-01,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e+00,  1.30000000e+01],
        ...,
        [-3.10472000e-01, -3.07525906e-01,  1.96407594e-03, ...,
         -1.61108066e+00,  3.19298756e-03,  1.00000000e+00],
        [-3.09489970e-01, -3.07410372e-01,  1.96046223e-03, ...,
         -1.61091006e+00,  3.00502695e-03,  1.00000000e+00],
        [-3.07525930e-01, -3.07756973e-01,  2.44351680e-03, ...,
         -1.61071174e+00,  2.88199079e-03,  1.00000000e+00]]),
 array([[-0.4027837 ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.03059087,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.6627288 ,  0.

# Checking

In [10]:
val_ds1.change_mode(ds_type='ea', use_feat=False, use_hour=False, make_hour_emb=False)

val_ds1[0]

(array([[-0.4027837 ,  1.2499185 ],
        [-0.03059087,  1.1616771 ],
        [ 0.6627288 ,  0.9149194 ],
        ...,
        [-0.310472  , -1.6119123 ],
        [-0.30948997, -1.6117067 ],
        [-0.30752593, -1.6114984 ]]),
 False,
 array([[0., 0.],
        [0., 0.],
        [0., 0.],
        ...,
        [0., 0.],
        [0., 0.],
        [0., 0.]]),
 (148, 0))

In [11]:
train_ds1.change_mode(ds_type='ea', use_feat=True, use_hour=False, make_hour_emb=False, exclude_time=0)

train_ds1[0][0].shape

(25920, 14)