### Downloading data

In [1]:
!mkdir ~/.kaggle
!wget -O ~/.kaggle/kaggle.json "https://drive.google.com/uc?export=download&id=1eN1kfpt1EQS-KQk-A9M2hWO-uhEBMwE0"
!chmod 600 ~/.kaggle/kaggle.json

--2023-10-23 22:25:35--  https://drive.google.com/uc?export=download&id=1eN1kfpt1EQS-KQk-A9M2hWO-uhEBMwE0
Resolving drive.google.com (drive.google.com)... 142.251.163.138, 142.251.163.113, 142.251.163.139, ...
Connecting to drive.google.com (drive.google.com)|142.251.163.138|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-14-80-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/qrgokhjl9frs20s4udb4s5nmb6675g3r/1698099900000/12009704024140886017/*/1eN1kfpt1EQS-KQk-A9M2hWO-uhEBMwE0?e=download&uuid=41867bbc-e08a-4518-ba09-0bac15ebf2ba [following]
--2023-10-23 22:25:35--  https://doc-14-80-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/qrgokhjl9frs20s4udb4s5nmb6675g3r/1698099900000/12009704024140886017/*/1eN1kfpt1EQS-KQk-A9M2hWO-uhEBMwE0?e=download&uuid=41867bbc-e08a-4518-ba09-0bac15ebf2ba
Resolving doc-14-80-docs.googleusercontent.com (doc-14-80-docs.googleusercontent.com)... 172.253.62.132, 2

In [2]:
!kaggle competitions download 'child-mind-institute-detect-sleep-states'
!mkdir data
!unzip child-mind-institute-detect-sleep-states.zip -d data

Downloading child-mind-institute-detect-sleep-states.zip to /content
 99% 803M/811M [00:08<00:00, 89.4MB/s]
100% 811M/811M [00:09<00:00, 85.5MB/s]
Archive:  child-mind-institute-detect-sleep-states.zip
  inflating: data/sample_submission.csv  
  inflating: data/test_series.parquet  
  inflating: data/train_events.csv   
  inflating: data/train_series.parquet  


### Preprocessing

In [1]:
import gc
import os
import pyarrow as pa
import pyarrow.parquet
import pandas as pd
import numpy as np

In [2]:
# Load the event annotations with explicit column dtypes. 'night' and 'step' use
# pandas' nullable integer dtypes, so rows with no recorded value stay as <NA>.
train_events = pd.read_csv(
    'data/train_events.csv',
    dtype={
        'series_id': 'str',
        'night': 'Int32',
        'event': 'str',
        'step': 'Int64',
    }
)
# Distinct series ids, in order of first appearance in the events file.
series_list = train_events['series_id'].unique().tolist()
n_series = len(series_list)
print(series_list)

['038441c925bb', '03d92c9f6f8a', '0402a003dae9', '04f547b8017d', '05e1944c3818', '062cae666e2a', '062dbd4c95e6', '08db4255286f', '0a96f4993bd7', '0cd1e3d0ed95', '0ce74d6d2106', '0cfc06c129cc', '0d0ad1e77851', '0dee4fda51c3', '0ec9fc461819', '0ef7d94fde99', '0f572d690310', '0f9e60a8e56d', '10469f6765bf', '1087d7b0ff2e', '10f8bc1f7b07', '12d01911d509', '1319a1935f48', '137771d19ca2', '137b99e936ab', '13b4d6a01d27', '148471991ffb', '154fe824ed87', '16fe2798ed0f', '1716cd4163b2', '1762ab70ec76', '188d4b7cd28b', '18a0ca03431d', '18b61dd5aae8', '1955d568d987', '1b92be89db4c', '1c7c0bad1263', '1d4569cbac0f', '1e6717d93c1d', '1f96b9668bdf', '207eded97727', '25e2b3dd9c3b', '2654a87be968', '27f09a6a858f', '280e08693c6d', '292a75c0b94e', '29c75c018220', '29d3469bd15d', '2b0a1fa8eba8', '2b8d87addea9', '2cd2340ca14d', '2e9ced2c7976', '2f7504d0f426', '2fbbee1a38e3', '2fc653ca75c7', '31011ade7c0a', '3318a0e3ed6f', '33ceeba8918a', '3452b878e596', '349c5562ee2c', '35826366dfc7', '361366da569e', '3664fe

In [3]:
# Keep only the events that have a recorded step.
train_events = train_events.dropna(subset=['step'])

In [4]:
# Directory that holds the data files read by this notebook.
data_root = 'data'

In [5]:
# Explicit column types for the accelerometer table.
series_schema = pa.schema([
    pa.field('series_id', pa.string()),
    pa.field('step', pa.uint32()),
    pa.field('timestamp', pa.string()),
    pa.field('anglez', pa.float32()),
    pa.field('enmo', pa.float32()),
])

# Read the whole training table as a pyarrow Table, using the schema above.
train_series = pyarrow.parquet.read_table(
    os.path.join(data_root, 'train_series.parquet'),
    schema=series_schema,
)

In [16]:
# Accumulators filled by the per-series preprocessing loop:
# X collects the input data, Y collects the labels.
X, Y = [], []

In [7]:
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [8]:
def get_timeofday(date):
  """Return the index of the 5-second step within the day for `date`.

  Args:
    date: object exposing integer `hour`, `minute` and `second` attributes
      (e.g. a `datetime.datetime`).

  Returns:
    An int: 720 steps per hour + 12 steps per minute + one step per full
    5 seconds.
  """
  steps_from_hours = date.hour * 720
  steps_from_minutes = date.minute * 12
  steps_from_seconds = date.second // 5
  return steps_from_seconds + steps_from_minutes + steps_from_hours

In [9]:
def get_timeofday_series(len_series, first_step_of_day, steps_per_day=17280):
  """Build a time-of-day feature for a series of consecutive steps.

  The day is mapped onto `steps_per_day` evenly spaced values from 0 to 1
  (both ends included). Element `k` of the result is the value for step-of-day
  `(first_step_of_day + k) % steps_per_day`, so the feature wraps around at
  every day boundary.

  Args:
    len_series: number of steps in the series (may be 0 or shorter than the
      remainder of the first day).
    first_step_of_day: step-of-day index of the first sample.
    steps_per_day: number of steps in one day; the default 17280 corresponds
      to 720 steps per hour.

  Returns:
    A float numpy array of shape (len_series,).
  """
  values_for_day = np.linspace(0, 1, steps_per_day, endpoint=True)
  # Modular indexing handles every length uniformly, including series that end
  # before the first day boundary (the slice-assignment version raised a
  # broadcast ValueError there).
  step_of_day = (first_step_of_day + np.arange(len_series)) % steps_per_day
  return values_for_day[step_of_day]

In [10]:
def preprocess_series(series_id):
  """Build the model inputs and event labels for one series.

  Reads the notebook-level `train_series` (pyarrow Table) and `train_events`
  (pandas DataFrame) and keeps the rows whose 'series_id' equals `series_id`.

  Args:
    series_id: id of the series to process.

  Returns:
    A tuple `(series_data, labels)`:
      series_data: float array of shape [3, len_series]. Row 0 is the
        time-of-day feature from `get_timeofday_series`, row 1 is the
        standardized 'enmo' signal, row 2 is the standardized 'anglez' signal.
      labels: int32 array of shape [2, len_series]. Row 0 is 1 at the step of
        every 'onset' event, row 1 is 1 at the step of every 'wakeup' event,
        and 0 everywhere else.
  """
  # Imported here so the expression API is available even if nothing else has
  # loaded the pyarrow.compute submodule yet.
  import pyarrow.compute as pc

  series = train_series.filter(pc.field('series_id') == series_id)
  events = train_events[train_events['series_id'] == series_id].reset_index()

  len_series = len(series)
  series_data = np.empty([3, len_series])
  labels = np.zeros([2, len_series], dtype='int32')

  series_data[1] = series['enmo'].to_numpy()
  series_data[2] = series['anglez'].to_numpy()

  # Standardize each signal independently, per series.
  series_data[1] = StandardScaler().fit_transform(series_data[1].reshape(-1, 1)).reshape(-1,)
  series_data[2] = StandardScaler().fit_transform(series_data[2].reshape(-1, 1)).reshape(-1,)

  # Strip the last 5 characters of the first timestamp before parsing
  # (presumably a UTC offset such as '-0400' — TODO confirm).
  start_time_str = series['timestamp'][0].as_py()[:-5]
  start_time = get_timeofday(datetime.strptime(start_time_str, '%Y-%m-%dT%H:%M:%S'))

  series_data[0] = get_timeofday_series(len_series, start_time)

  # Pick the label row from the event name rather than from row parity: once
  # rows with a missing step are dropped, onset/wakeup rows no longer strictly
  # alternate if only one event of a pair was missing.
  for event_name, step in zip(events['event'], events['step']):
    if pd.isna(step):
      continue
    if event_name == 'onset':
      labels[0, int(step)] = 1
    elif event_name == 'wakeup':
      labels[1, int(step)] = 1

  return series_data, labels

In [11]:
# Preprocess every series and collect the results in X (inputs) and Y (labels).
# NOTE(review): X and Y are appended to, so re-running this cell without first
# re-running the cell that resets them duplicates every entry.
for idx, series_id in enumerate(series_list):
  print(f'[{idx + 1}/{n_series}] Processing series {series_id}')
  series, labels = preprocess_series(series_id)
  # temp memory deallocation to prevent crash in colab
  # NOTE(review): after the series at index 150 has been processed, the first
  # 50,000,000 rows of train_series are dropped. This assumes those rows only
  # belong to series that were already processed — TODO confirm the table is
  # ordered like series_list. It also mutates train_series, so the cell cannot
  # be re-run without reloading the table.
  if idx == 150:
    train_series = train_series[50_000_000:]
    gc.collect()
  X.append(series)
  Y.append(labels)

[1/277] Processing series 038441c925bb
[2/277] Processing series 03d92c9f6f8a
[3/277] Processing series 0402a003dae9
[4/277] Processing series 04f547b8017d
[5/277] Processing series 05e1944c3818
[6/277] Processing series 062cae666e2a
[7/277] Processing series 062dbd4c95e6
[8/277] Processing series 08db4255286f
[9/277] Processing series 0a96f4993bd7
[10/277] Processing series 0cd1e3d0ed95
[11/277] Processing series 0ce74d6d2106
[12/277] Processing series 0cfc06c129cc
[13/277] Processing series 0d0ad1e77851
[14/277] Processing series 0dee4fda51c3
[15/277] Processing series 0ec9fc461819
[16/277] Processing series 0ef7d94fde99
[17/277] Processing series 0f572d690310
[18/277] Processing series 0f9e60a8e56d
[19/277] Processing series 10469f6765bf
[20/277] Processing series 1087d7b0ff2e
[21/277] Processing series 10f8bc1f7b07
[22/277] Processing series 12d01911d509
[23/277] Processing series 1319a1935f48
[24/277] Processing series 137771d19ca2
[25/277] Processing series 137b99e936ab
[26/277] 

In [12]:
# Drop the reference to the raw table and run a garbage-collection pass so its
# memory can be reclaimed before the next steps.
train_series = None
gc.collect()

0

In [13]:
# Number of preprocessed inputs, then number of label arrays (one per line).
print(len(X), len(Y), sep='\n')

277
277


In [14]:
# Hold out 15% of the series for validation. The split is done per series (each
# element of X / Y is one whole series). A fixed random_state makes the split
# identical on every Restart & Run All.
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.15, random_state=42)

In [15]:
# Sizes of the training split and of the validation split (inputs, labels).
print(len(X_train), len(Y_train))
print(len(X_valid), len(Y_valid))

235 235
42 42
