In [1]:
import gc
import pandas as pd
import numpy as np
import datetime as dt

# Uncomment the lines below to install any of these packages that are missing from your local environment.
# %pip install matplotlib
# %pip install seaborn
# %pip install plotly
# %pip install polars
# %pip install pyarrow

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import pyarrow

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.offline

from colorama import Fore, Style, init
from pprint import pprint

import warnings
# Install a blanket "ignore" filter: with no category or module argument this
# silences every Python warning raised after this point in the session.
# NOTE(review): this also hides deprecation warnings from pandas/polars —
# confirm that is intended rather than filtering specific categories.
warnings.filterwarnings('ignore')

In [8]:
%%time
# inspired by https://www.kaggle.com/code/enricomanosperti/detect-sleep-states-first-preprocessing-and-eda
import polars as pl
train_series_binary = (pl.scan_parquet('../data/Zzzs_train.parquet')
                .with_columns(
                    (
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.year().alias("year")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.month().alias("month")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.day().alias("day")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.hour().alias("hour")),
                    )
                )
                .collect()
                .to_pandas()
               )

CPU times: total: 8.94 s
Wall time: 15.1 s


In [9]:
# Preview the first five rows of the binary-label training frame.
train_series_binary.head(n=5)

Unnamed: 0,series_id,step,timestamp,anglez,enmo,awake,year,month,day,hour
0,08db4255286f,0,2018-11-05 10:00:00,-30.845301,0.0447,1,2018,11,5,10
1,08db4255286f,1,2018-11-05 10:00:05,-34.181801,0.0443,1,2018,11,5,10
2,08db4255286f,2,2018-11-05 10:00:10,-33.877102,0.0483,1,2018,11,5,10
3,08db4255286f,3,2018-11-05 10:00:15,-34.282101,0.068,1,2018,11,5,10
4,08db4255286f,4,2018-11-05 10:00:20,-34.385799,0.0768,1,2018,11,5,10


In [12]:
# Print the frame summary: row count, per-column dtypes and memory usage.
train_series_binary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13165560 entries, 0 to 13165559
Data columns (total 10 columns):
 #   Column     Dtype         
---  ------     -----         
 0   series_id  object        
 1   step       uint32        
 2   timestamp  datetime64[us]
 3   anglez     float32       
 4   enmo       float32       
 5   awake      int64         
 6   year       int32         
 7   month      uint32        
 8   day        uint32        
 9   hour       uint32        
dtypes: datetime64[us](1), float32(2), int32(1), int64(1), object(1), uint32(4)
memory usage: 652.9+ MB


In [11]:
%%time
# inspired by https://www.kaggle.com/code/enricomanosperti/detect-sleep-states-first-preprocessing-and-eda
import polars as pl
train_series_multi = (pl.scan_parquet('../data/Zzzs_train_multi.parquet')
                .with_columns(
                    (
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.year().alias("year")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.month().alias("month")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.day().alias("day")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.hour().alias("hour")),
                    )
                )
                .collect()
                .to_pandas()
               )

CPU times: total: 11.2 s
Wall time: 20.3 s


In [13]:
# Preview the first five rows of the multi-label training frame.
train_series_multi.head(n=5)

Unnamed: 0,series_id,step,timestamp,anglez,enmo,awake,year,month,day,hour
0,08db4255286f,0,2018-11-05 10:00:00,-30.845301,0.0447,1,2018,11,5,10
1,08db4255286f,1,2018-11-05 10:00:05,-34.181801,0.0443,1,2018,11,5,10
2,08db4255286f,2,2018-11-05 10:00:10,-33.877102,0.0483,1,2018,11,5,10
3,08db4255286f,3,2018-11-05 10:00:15,-34.282101,0.068,1,2018,11,5,10
4,08db4255286f,4,2018-11-05 10:00:20,-34.385799,0.0768,1,2018,11,5,10


In [14]:
# Print the frame summary: row count, per-column dtypes and memory usage.
train_series_multi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16289820 entries, 0 to 16289819
Data columns (total 10 columns):
 #   Column     Dtype         
---  ------     -----         
 0   series_id  object        
 1   step       uint32        
 2   timestamp  datetime64[us]
 3   anglez     float32       
 4   enmo       float32       
 5   awake      int64         
 6   year       int32         
 7   month      uint32        
 8   day        uint32        
 9   hour       uint32        
dtypes: datetime64[us](1), float32(2), int32(1), int64(1), object(1), uint32(4)
memory usage: 807.8+ MB


In [15]:
# Look at a longer leading slice (first 100 rows) of the multi-label frame;
# the notebook display abbreviates the middle rows.
train_series_multi.head(n=100)

Unnamed: 0,series_id,step,timestamp,anglez,enmo,awake,year,month,day,hour
0,08db4255286f,0,2018-11-05 10:00:00,-30.845301,0.0447,1,2018,11,5,10
1,08db4255286f,1,2018-11-05 10:00:05,-34.181801,0.0443,1,2018,11,5,10
2,08db4255286f,2,2018-11-05 10:00:10,-33.877102,0.0483,1,2018,11,5,10
3,08db4255286f,3,2018-11-05 10:00:15,-34.282101,0.0680,1,2018,11,5,10
4,08db4255286f,4,2018-11-05 10:00:20,-34.385799,0.0768,1,2018,11,5,10
...,...,...,...,...,...,...,...,...,...,...
95,08db4255286f,95,2018-11-05 10:07:55,-25.232300,0.0448,1,2018,11,5,10
96,08db4255286f,96,2018-11-05 10:08:00,-24.980301,0.0952,1,2018,11,5,10
97,08db4255286f,97,2018-11-05 10:08:05,-25.480801,0.0461,1,2018,11,5,10
98,08db4255286f,98,2018-11-05 10:08:10,-26.091999,0.0632,1,2018,11,5,10
