In [32]:
# Give the Kaggle API token owner-only read/write permissions (same mode as 600).
!chmod u=rw,go= ~/.kaggle/kaggle.json

In [3]:
# Fetch the competition archive into the current working directory.
!kaggle competitions download child-mind-institute-detect-sleep-states

Downloading child-mind-institute-detect-sleep-states.zip to /content
100% 809M/811M [00:21<00:00, 38.9MB/s]
100% 811M/811M [00:21<00:00, 38.7MB/s]


In [5]:
# Extract the competition archive into ./data.
# -p: do not fail when the directory already exists (cell stays re-runnable).
!mkdir -p data
# -o: overwrite previously extracted files instead of prompting, so a
# Restart & Run All does not stall waiting for keyboard input.
!unzip -o child-mind-institute-detect-sleep-states.zip -d data

Archive:  child-mind-institute-detect-sleep-states.zip
  inflating: data/sample_submission.csv  
  inflating: data/test_series.parquet  
  inflating: data/train_events.csv   
  inflating: data/train_series.parquet  


In [19]:
import gc
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet

In [26]:
# Load the event labels, then collect each distinct series id once,
# in order of first appearance.
train_events = pd.read_csv('data/train_events.csv')
series_list = pd.unique(train_events['series_id']).tolist()
print(series_list)

['038441c925bb', '03d92c9f6f8a', '0402a003dae9', '04f547b8017d', '05e1944c3818', '062cae666e2a', '062dbd4c95e6', '08db4255286f', '0a96f4993bd7', '0cd1e3d0ed95', '0ce74d6d2106', '0cfc06c129cc', '0d0ad1e77851', '0dee4fda51c3', '0ec9fc461819', '0ef7d94fde99', '0f572d690310', '0f9e60a8e56d', '10469f6765bf', '1087d7b0ff2e', '10f8bc1f7b07', '12d01911d509', '1319a1935f48', '137771d19ca2', '137b99e936ab', '13b4d6a01d27', '148471991ffb', '154fe824ed87', '16fe2798ed0f', '1716cd4163b2', '1762ab70ec76', '188d4b7cd28b', '18a0ca03431d', '18b61dd5aae8', '1955d568d987', '1b92be89db4c', '1c7c0bad1263', '1d4569cbac0f', '1e6717d93c1d', '1f96b9668bdf', '207eded97727', '25e2b3dd9c3b', '2654a87be968', '27f09a6a858f', '280e08693c6d', '292a75c0b94e', '29c75c018220', '29d3469bd15d', '2b0a1fa8eba8', '2b8d87addea9', '2cd2340ca14d', '2e9ced2c7976', '2f7504d0f426', '2fbbee1a38e3', '2fc653ca75c7', '31011ade7c0a', '3318a0e3ed6f', '33ceeba8918a', '3452b878e596', '349c5562ee2c', '35826366dfc7', '361366da569e', '3664fe

In [21]:
# Arrow schema used when reading train_series.parquet: one explicit field
# (column name + Arrow type) per column, in file order.
series_schema = pa.schema([
    pa.field('series_id', pa.string()),
    pa.field('step', pa.uint32()),
    pa.field('timestamp', pa.string()),
    pa.field('anglez', pa.float32()),
    pa.field('enmo', pa.float32()),
])

def create_separated_dataset(data_root, series_ids=None):
    """Split ``train_series.parquet`` into one parquet file per series.

    Reads ``<data_root>/train_series.parquet`` using ``series_schema`` and, for
    each series id, writes the matching rows to
    ``<data_root>/train_series/<series_id>.parquet``. The ``series_id`` column
    is dropped from each output file and ``timestamp`` is parsed from string
    to datetime before writing.

    Args:
        data_root: Directory that contains ``train_series.parquet``. The
            ``train_series`` output directory is created inside it.
        series_ids: Optional iterable of series ids to export. When omitted,
            the notebook-level ``series_list`` is used.

    Returns:
        None. The per-series files are written to disk.
    """
    # Import the submodule explicitly: ``import pyarrow as pa`` on its own does
    # not guarantee that ``pa.compute`` has been loaded.
    import pyarrow.compute as pc

    if series_ids is None:
        series_ids = series_list
    series_ids = list(series_ids)
    total = len(series_ids)

    data_series_dir = os.path.join(data_root, 'train_series')
    # exist_ok=True keeps the function re-runnable after an earlier
    # (possibly partial) export instead of raising FileExistsError.
    os.makedirs(data_series_dir, exist_ok=True)

    print('Loading data series')
    train_series = pyarrow.parquet.read_table(
        os.path.join(data_root, 'train_series.parquet'),
        schema=series_schema,
    )

    for i, series_id in enumerate(series_ids):
        print(f'[{i + 1}/{total}] Processing series {series_id}')
        series = train_series.filter(pc.field('series_id') == series_id)
        series = series.drop(['series_id']).to_pandas()
        # %z parses a numeric UTC offset; %Z is the time-zone *name* directive
        # and is only accepted for offsets by some pandas versions.
        # NOTE(review): a series whose rows mix different UTC offsets may not
        # parse to a single tz-aware dtype - confirm against the data.
        series['timestamp'] = pd.to_datetime(series['timestamp'], format='%Y-%m-%dT%H:%M:%S%z')
        out_path = os.path.join(data_series_dir, f'{series_id}.parquet')
        series.to_parquet(out_path)
        # Release the per-series frame before the next filter to limit peak memory.
        del series
        gc.collect()

    del train_series
    gc.collect()

In [None]:
# Write one parquet file per series under data/train_series.
create_separated_dataset(data_root='data')

In [32]:
# Spot-check: load a single per-series parquet file and show it.
series = pd.read_parquet(path='data/train_series/03d92c9f6f8a.parquet')
display(series)

Unnamed: 0,step,timestamp,anglez,enmo
0,0,2018-05-31 12:00:00-04:00,38.892899,0.0803
1,1,2018-05-31 12:00:05-04:00,29.374399,0.0752
2,2,2018-05-31 12:00:10-04:00,37.225101,0.1791
3,3,2018-05-31 12:00:15-04:00,46.937000,0.0922
4,4,2018-05-31 12:00:20-04:00,60.486698,0.0342
...,...,...,...,...
724135,724135,2018-07-12 09:44:35-04:00,-12.066200,0.0382
724136,724136,2018-07-12 09:44:40-04:00,-15.913500,0.0254
724137,724137,2018-07-12 09:44:45-04:00,-10.857800,0.0281
724138,724138,2018-07-12 09:44:50-04:00,-8.534700,0.0274
