[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/greentea-ai/DetectSleepStates/blob/milestone1/data_preprocess.ipynb)

### Downloading data

In [None]:
# Place a Kaggle API token file at ~/.kaggle/kaggle.json (presumably where the
# `kaggle` CLI used in the next cell looks for it).
# NOTE(review): the token is downloaded from a Google Drive share link that is
# embedded in this notebook, so anyone who has the link can fetch it — confirm
# this is intended, or supply the token privately instead.
!mkdir -p ~/.kaggle
!wget -O ~/.kaggle/kaggle.json "https://drive.google.com/uc?export=download&id=1eN1kfpt1EQS-KQk-A9M2hWO-uhEBMwE0"
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the competition archive with the Kaggle CLI and extract it into ./data.
!kaggle competitions download 'child-mind-institute-detect-sleep-states'
!mkdir -p data
# unzip flags: -qq = quiet output, -n = never overwrite files that already exist,
# so re-running this cell keeps previously extracted files.
!unzip -qqn child-mind-institute-detect-sleep-states.zip -d data

### Preprocessing

In [None]:
import gc
import os
import pyarrow as pa
import pyarrow.parquet
import pandas as pd
import numpy as np

In [None]:
# Load the event annotations. `night` and `step` use pandas' nullable integer
# dtypes so that rows with a missing value can still be read as integers.
train_events = pd.read_csv(
    'data/train_events.csv',
    dtype=dict(series_id='str', night='Int32', event='str', step='Int64'),
)

# Distinct series ids found in the events file, and how many there are.
series_list = train_events['series_id'].unique()
n_series = len(series_list)
print(series_list)

In [None]:
# Keep only the events that have a recorded step; rows with a missing `step` are dropped.
train_events = train_events.loc[pd.notna(train_events['step'])]

In [None]:
# Directory that holds the extracted competition files; joined with file names when reading.
data_root = 'data'

In [None]:
# Column names and Arrow types requested when reading the series file:
# 32-bit types for step/anglez/enmo and second-resolution UTC timestamps.
series_schema = pa.schema([
    ('series_id', pa.string()),
    ('step', pa.uint32()),
    ('timestamp', pa.timestamp('s', tz='UTC')),
    ('anglez', pa.float32()),
    ('enmo', pa.float32())
])

# Read the whole series file into one in-memory Arrow table.
# NOTE(review): the schema is passed to read_table rather than cast afterwards —
# confirm the column types stored in the file are compatible with it.
train_series = pyarrow.parquet.read_table(os.path.join(data_root, 'train_series.parquet'), schema=series_schema)

display(train_series)

In [None]:
# Accumulators filled by the preprocessing loop, one entry per series:
# X holds the input data arrays, Y holds the matching label arrays.
X, Y = [], []

In [None]:
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
def get_timeofday(date):
  """Return the index of the 5-second slot of the day that `date` falls in.

  Args:
    date: object exposing `hour`, `minute` and `second` attributes
      (e.g. a `datetime.datetime`).

  Returns:
    Integer slot index: 0 for 00:00:00-00:00:04, up to 17279 for 23:59:55-23:59:59.
  """
  slots_per_minute = 12                    # 60 s / 5 s
  slots_per_hour = 60 * slots_per_minute   # 720
  return date.hour * slots_per_hour + date.minute * slots_per_minute + date.second // 5

In [None]:
def get_timeofday_series(len_series, first_step_of_day, steps_per_day=17280):
  """Build a time-of-day feature in [0, 1] for a run of consecutive steps.

  The day is divided into `steps_per_day` slots whose values are evenly spaced
  from 0 (first slot of the day) to 1 (last slot of the day). The returned
  series starts at slot `first_step_of_day` and wraps back to slot 0 each time
  the end of a day is reached.

  Series shorter than the remainder of the first day are supported; the
  previous slice-based implementation raised a ValueError for them.

  Args:
    len_series: number of steps in the series (0 yields an empty array).
    first_step_of_day: slot index of the first step within its day.
    steps_per_day: number of slots in one day; the default of 17280
      corresponds to one slot every 5 seconds.

  Returns:
    1-D float array of length `len_series`.
  """
  valuesForDay = np.linspace(0, 1, steps_per_day, endpoint=True)
  # Slot of every step, wrapping at the day boundary; indexing the per-day
  # lookup table with it yields the whole series in one pass.
  slot_of_step = (first_step_of_day + np.arange(len_series)) % steps_per_day
  return valuesForDay[slot_of_step]

In [None]:
def preprocess_series(series_id):
  """Build the input features and event labels for one series.

  Reads the notebook-level `train_series` table and `train_events` frame.

  Args:
    series_id: value of the `series_id` column that selects the series.

  Returns:
    A `(series_data, labels)` tuple:
      * `series_data` - float array of shape `(3, n_steps)`. Row 0 is the
        time-of-day feature from `get_timeofday_series`, row 1 the `enmo`
        column and row 2 the `anglez` column, each standardised over this
        series with `StandardScaler`.
      * `labels` - int32 array of shape `(2, n_steps)`. Row 0 is 1 at the step
        of every `onset` event, row 1 is 1 at the step of every `wakeup`
        event, and 0 elsewhere.
  """
  series = train_series.filter(pa.compute.field('series_id') == series_id)
  events = train_events[train_events['series_id'] == series_id].reset_index()

  len_series = len(series)
  series_data = np.empty([3, len_series])
  labels = np.zeros([2, len_series], dtype='int32')

  series_data[1] = series['enmo'].to_numpy()
  series_data[2] = series['anglez'].to_numpy()

  # Standardise each sensor channel independently, using this series' own statistics.
  series_data[1] = StandardScaler().fit_transform(series_data[1].reshape(-1, 1)).reshape(-1,)
  series_data[2] = StandardScaler().fit_transform(series_data[2].reshape(-1, 1)).reshape(-1,)

  # Time-of-day slot of the first sample; later samples follow consecutively.
  start_time = get_timeofday(series['timestamp'][0].as_py())

  series_data[0] = get_timeofday_series(len_series, start_time)

  # Choose the label channel from the `event` column instead of the row
  # position: once rows with a missing step have been removed, events no longer
  # have to alternate onset/wakeup, so row parity can mark the wrong channel.
  # 'onset' / 'wakeup' are the event names given in the competition's data
  # description; rows carrying any other value are left unlabelled.
  for event, step in zip(events['event'], events['step']):
    if pd.isna(step):
      continue
    if event == 'onset':
      labels[0, step] = 1
    elif event == 'wakeup':
      labels[1, step] = 1

  return series_data, labels

In [None]:
# Preprocess every series in turn, collecting the inputs in X and the labels in Y.
for idx, series_id in enumerate(series_list):
  print(f'[{idx + 1}/{n_series}] Processing series {series_id}')
  series, labels = preprocess_series(series_id)
  # Temporary memory relief to prevent a crash in Colab: once the series at
  # index 150 has been processed, drop the first 50,000,000 rows of the table.
  # NOTE(review): this presumably relies on every later series lying entirely
  # beyond row 50,000,000 — confirm. Re-running this cell slices the table
  # again and appends every series to X / Y a second time.
  if idx == 150:
    train_series = train_series[50_000_000:]
    gc.collect()
  X.append(series)
  Y.append(labels)

In [None]:
# Preprocessing is finished: drop the notebook's reference to the raw series
# table and run a garbage-collection pass so that its memory can be reclaimed.
# After this cell `train_series` is None, so the cells above must be re-run
# before `preprocess_series` can be called again.
train_series = None
gc.collect()

In [None]:
# Sanity check: number of collected input arrays, then number of label arrays (one per line).
print(len(X), len(Y), sep='\n')

In [None]:
# Hold out 15% of the series for validation. Each element of X / Y is one whole
# series, so a series is never divided between the two sets.
# random_state is fixed so that the same split is produced on every
# Restart & Run All; without it the validation set changes on each run.
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.15, random_state=42)

In [None]:
# Report the number of inputs and labels in the training split, then in the validation split.
print(f'{len(X_train)} {len(Y_train)}')
print(f'{len(X_valid)} {len(Y_valid)}')