In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

import json
import os
import shutil
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow.parquet as pq

# **Create Directories**

In [None]:
# Version tag that is embedded in the names of the output directories.
DATASET_VERSION = "v1"

In [None]:
# Competition parquet with the training series, and the local directory that
# will receive the cleaned training dataset.
train_series_path = Path("/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet")
train_output_directory = Path(f"detect-sleep-states/train-series_{DATASET_VERSION}/")

# Start from an empty directory: remove anything left by a previous run, then
# re-create it. Pure Python instead of `!rm -rf` so the step does not depend
# on a shell being available.
shutil.rmtree(train_output_directory, ignore_errors=True)
train_output_directory.mkdir(parents=True)

In [None]:
# Competition parquet with the test series, and the local directory that
# will receive the cleaned test dataset.
test_series_path = Path("/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet")
test_output_directory = Path(f"detect-sleep-states/test-series_{DATASET_VERSION}/")

# Start from an empty directory: remove anything left by a previous run, then
# re-create it. Pure Python instead of `!rm -rf` so the step does not depend
# on a shell being available.
shutil.rmtree(test_output_directory, ignore_errors=True)
test_output_directory.mkdir(parents=True)

# **Create CSV Files**

In [None]:
def parquet_by_chunks_to_csv(parquet_file, output_directory, batch_size=10000):
    """Split a parquet file into one CSV per ``series_id``, streaming by batch.

    The parquet file is read in record batches so it never has to fit in
    memory as a whole. Every batch is grouped by its ``series_id`` column and
    each group is written to ``<output_directory>/<series_id>.csv``.

    Parameters
    ----------
    parquet_file : str or path-like
        Source handed to ``pq.ParquetFile``.
    output_directory : str or pathlib.Path
        Directory receiving the CSV files; created if it does not exist.
    batch_size : int, default 10000
        Maximum number of rows read per batch.

    Notes
    -----
    * The first time a series is seen during a call its file is (re)created
      with a header row; later batches append without a header. A file left
      over from an earlier run is therefore overwritten, not appended to.
    * Rows are grouped on the actual column values, so non-string ids are
      handled too. Rows whose ``series_id`` is null are skipped.
    """
    output_directory = Path(output_directory)
    output_directory.mkdir(parents=True, exist_ok=True)

    # Series whose CSV has already been started during *this* call.
    started_series = set()

    reader = pq.ParquetFile(parquet_file)
    for batch in reader.iter_batches(batch_size=batch_size):
        batch_df = batch.to_pandas()
        # Hook: per-batch preprocessing can be applied to `batch_df` here.

        # sort=False keeps first-appearance order; observed=True avoids empty
        # groups when the id column is categorical.
        grouped = batch_df.groupby("series_id", sort=False, observed=True)
        for series_id, series_df in grouped:
            is_first_write = series_id not in started_series
            series_df.to_csv(
                output_directory / f"{series_id}.csv",
                mode="w" if is_first_write else "a",
                header=is_first_write,
                index=False,
            )
            started_series.add(series_id)

In [None]:
# Split the training parquet into one CSV per series.
parquet_by_chunks_to_csv(
    parquet_file=train_series_path,
    output_directory=train_output_directory,
)

In [None]:
# Split the test parquet into one CSV per series.
parquet_by_chunks_to_csv(
    parquet_file=test_series_path,
    output_directory=test_output_directory,
)

# **Create Dataset**

In [None]:
# Copy the other datasets as is given they are not as big.
!cp "/kaggle/input/child-mind-institute-detect-sleep-states/sample_submission.csv" {train_output_directory}
!cp "/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv" {train_output_directory}

In [None]:
!tree -h {output_directory.parent.parent}

In [None]:
# Write the credentials file used by the `kaggle` command-line client.
# The values come from the KAGGLE_USERNAME / KAGGLE_KEY environment variables
# so that real credentials never have to be typed into the notebook; when the
# variables are unset the same empty placeholders as before are written.
kaggle_config_dir = Path.home() / ".kaggle"
# The shell redirect this replaces failed when the directory did not exist.
kaggle_config_dir.mkdir(parents=True, exist_ok=True)

kaggle_credentials_path = kaggle_config_dir / "kaggle.json"
kaggle_credentials_path.write_text(
    json.dumps(
        {
            "username": os.environ.get("KAGGLE_USERNAME", ""),
            "key": os.environ.get("KAGGLE_KEY", ""),
        }
    )
)
# Restrict the file to its owner (equivalent to `chmod 600`).
kaggle_credentials_path.chmod(0o600)


In [None]:
# Metadata file for the dataset upload, written to the current directory.
# It is built with `json` rather than `!echo '{...}'`: IPython treats `{...}`
# inside shell lines as Python interpolation, so the literal JSON only got
# through because that expansion failed and was silently skipped.
dataset_metadata = {
    "title": "Detect sleep states",
    "id": "doubleman89/detect-sleep-states",
    "licenses": [{"name": "copyright-authors"}],
}
Path("dataset-metadata.json").write_text(json.dumps(dataset_metadata, indent=2) + "\n");

In [None]:
# Upload the csv files as is without compressing it.
!kaggle datasets create -r skip --dir-mode tar

