In [15]:
import gc
import pandas as pd
import numpy as np
import datetime as dt
from time import time

# pleaseee uncomment the lines below if it is not installed on your local
# %pip install matplotlib
# %pip install seaborn
# %pip install plotly
# %pip install polars
# %pip install pyarrow

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import pyarrow

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.offline

from colorama import Fore, Style, init
from pprint import pprint

import warnings
warnings.filterwarnings('ignore')

In [9]:
%%time
# inspired by https://www.kaggle.com/code/enricomanosperti/detect-sleep-states-first-preprocessing-and-eda
import polars as pl
train_series_multi = (pl.scan_parquet('../data/Zzzs_train_multi.parquet')
                .with_columns(
                    (
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.year().alias("year")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.month().alias("month")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.day().alias("day")),
                        (pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z").dt.hour().alias("hour")),
                    )
                )
                .collect()
                .to_pandas()
               )

CPU times: total: 10.8 s
Wall time: 20 s


In [10]:
train_series_multi.head()

Unnamed: 0,series_id,step,timestamp,anglez,enmo,awake,year,month,day,hour
0,08db4255286f,0,2018-11-05 10:00:00,-30.845301,0.0447,1,2018,11,5,10
1,08db4255286f,1,2018-11-05 10:00:05,-34.181801,0.0443,1,2018,11,5,10
2,08db4255286f,2,2018-11-05 10:00:10,-33.877102,0.0483,1,2018,11,5,10
3,08db4255286f,3,2018-11-05 10:00:15,-34.282101,0.068,1,2018,11,5,10
4,08db4255286f,4,2018-11-05 10:00:20,-34.385799,0.0768,1,2018,11,5,10


In [11]:
train_series_multi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16289820 entries, 0 to 16289819
Data columns (total 10 columns):
 #   Column     Dtype         
---  ------     -----         
 0   series_id  object        
 1   step       uint32        
 2   timestamp  datetime64[us]
 3   anglez     float32       
 4   enmo       float32       
 5   awake      int64         
 6   year       int32         
 7   month      uint32        
 8   day        uint32        
 9   hour       uint32        
dtypes: datetime64[us](1), float32(2), int32(1), int64(1), object(1), uint32(4)
memory usage: 807.8+ MB


In [12]:
%%time
# inspired by https://www.kaggle.com/code/renatoreggiani/reduce-memory-usage-zzzs-cmi
# with tweaks determined by the selected polars loading strategy
# tweaks inspired by https://github.com/softhints/Pandas-Tutorials/blob/master/column/3.check-dtype-column-columns-pandas-dataframe.ipynb
from pandas.api.types import is_datetime64_ns_dtype
def reduce_mem_usage(df):
    """ iterate through all numeric columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage of dataframe is {start_mem:.2f} MB')
    
    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object and 'datetime' not in str(col_type):
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float16)

    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after optimization is: {end_mem:.2f} MB')
    decrease = 100 * (start_mem - end_mem) / start_mem
    print(f'Decreased by {decrease:.2f}%')                                                                                    
    
    return df

train_series_multi = reduce_mem_usage(train_series_multi)

Memory usage of dataframe is 807.83 MB
Memory usage after optimization is: 512.66 MB
Decreased by 36.54%
CPU times: total: 672 ms
Wall time: 1.84 s


In [6]:
# features, feature_cols = [pl.col('hour')], ['hour']

# for mins in [5, 30, 60*2, 60*8] :
    
#     for var in ['enmo', 'anglez'] :
        
#         features += [
#             pl.col(var).rolling_mean(12 * mins, center=True, min_periods=1).abs().alias(f'{var}_{mins}m_mean'),
#             pl.col(var).rolling_max(12 * mins, center=True, min_periods=1).abs().alias(f'{var}_{mins}m_max'),
#             pl.col(var).rolling_std(12 * mins, center=True, min_periods=1).abs().alias(f'{var}_{mins}m_std')
#         ]

#         feature_cols += [ 
#             f'{var}_{mins}m_mean', f'{var}_{mins}m_max', f'{var}_{mins}m_std'
#         ]

#         # Getting first variations
#         features += [
#             (pl.col(var).diff().abs().rolling_mean(12 * mins, center=True, min_periods=1)*10).abs().alias(f'{var}_1v_{mins}m_mean'),
#             (pl.col(var).diff().abs().rolling_max(12 * mins, center=True, min_periods=1)*10).abs().alias(f'{var}_1v_{mins}m_max'),
#             (pl.col(var).diff().abs().rolling_std(12 * mins, center=True, min_periods=1)*10).abs().alias(f'{var}_1v_{mins}m_std')
#         ]

#         feature_cols += [ 
#             f'{var}_1v_{mins}m_mean', f'{var}_1v_{mins}m_max', f'{var}_1v_{mins}m_std'
#         ]

# id_cols = ['series_id', 'step', 'timestamp']

# train_series_multi = pl.DataFrame(train_series_multi)

# train_series_multi= train_series_multi.with_columns(features).select(id_cols + feature_cols)

In [7]:
# train_series_multi

series_id,step,timestamp,hour,enmo_5m_mean,enmo_5m_max,enmo_5m_std,enmo_1v_5m_mean,enmo_1v_5m_max,enmo_1v_5m_std,anglez_5m_mean,anglez_5m_max,anglez_5m_std,anglez_1v_5m_mean,anglez_1v_5m_max,anglez_1v_5m_std,enmo_30m_mean,enmo_30m_max,enmo_30m_std,enmo_1v_30m_mean,enmo_1v_30m_max,enmo_1v_30m_std,anglez_30m_mean,anglez_30m_max,anglez_30m_std,anglez_1v_30m_mean,anglez_1v_30m_max,anglez_1v_30m_std,enmo_120m_mean,enmo_120m_max,enmo_120m_std,enmo_1v_120m_mean,enmo_1v_120m_max,enmo_1v_120m_std,anglez_120m_mean,anglez_120m_max,anglez_120m_std,anglez_1v_120m_mean,anglez_1v_120m_max,anglez_1v_120m_std,enmo_480m_mean,enmo_480m_max,enmo_480m_std,enmo_1v_480m_mean,enmo_1v_480m_max,enmo_1v_480m_std,anglez_480m_mean,anglez_480m_max,anglez_480m_std,anglez_1v_480m_mean,anglez_1v_480m_max,anglez_1v_480m_std
str,u32,datetime[μs],u32,f32,f32,f32,u32,u32,u32,f32,f32,f32,u32,u32,u32,f32,f32,f32,u32,u32,u32,f32,f32,f32,u32,u32,u32,f32,f32,f32,u32,u32,u32,f32,f32,f32,u32,u32,u32,f32,f32,f32,u32,u32,u32,f32,f32,f32,u32,u32,u32
"""08db4255286f""",0,2018-11-05 10:00:00,10,0.058323,0.1073,0.014657,0,0,0,28.554617,18.2644,4.431949,18,65,16,0.075161,0.198,0.032024,0,1,0,27.203093,18.2644,3.414099,18,96,15,0.074588,0.1982,0.028441,0,1,0,27.999287,16.633101,3.721624,18,96,15,0.075437,0.992,0.085659,0,7,0,21.868397,89.286697,43.233776,42,1315,94
"""08db4255286f""",1,2018-11-05 10:00:05,10,0.058735,0.1073,0.014592,0,0,0,28.644566,18.2644,4.386139,19,65,17,0.075302,0.198,0.031992,0,1,0,27.164049,18.2644,3.444891,17,96,15,0.074584,0.1982,0.028421,0,1,0,27.998989,16.633101,3.719062,18,96,15,0.075414,0.992,0.085653,0,7,0,21.862902,89.286697,43.227276,42,1315,94
"""08db4255286f""",2,2018-11-05 10:00:10,10,0.059122,0.1073,0.01452,0,0,0,28.74365,18.2644,4.351065,19,65,17,0.075318,0.198,0.031904,0,1,0,27.140644,18.2644,3.449843,18,96,15,0.074574,0.1982,0.028403,0,1,0,27.999813,16.633101,3.716552,18,96,15,0.07539,0.992,0.085647,0,7,0,21.857286,89.286697,43.220825,42,1315,94
"""08db4255286f""",3,2018-11-05 10:00:15,10,0.059336,0.1073,0.014345,0,0,0,28.706924,18.2644,4.287739,19,65,17,0.075215,0.198,0.031847,0,1,0,27.101316,18.2644,3.481234,18,96,15,0.074628,0.1982,0.028419,0,1,0,27.998137,16.633101,3.714246,18,96,15,0.075367,0.992,0.085641,0,7,0,21.851757,89.286697,43.214348,42,1315,94
"""08db4255286f""",4,2018-11-05 10:00:20,10,0.059706,0.1073,0.014289,0,0,0,28.650814,18.2644,4.234933,19,65,17,0.075251,0.198,0.031763,0,1,0,27.068127,18.2644,3.500781,18,96,15,0.074633,0.1982,0.0284,0,1,0,27.997713,16.633101,3.711699,18,96,15,0.075343,0.992,0.085636,0,7,0,21.846146,89.286697,43.207901,42,1315,93
"""08db4255286f""",5,2018-11-05 10:00:25,10,0.059714,0.1073,0.014077,0,0,0,28.590364,18.2644,4.187486,18,65,17,0.075238,0.198,0.031677,0,1,0,27.033394,18.2644,3.523058,17,96,15,0.074638,0.1982,0.028381,0,1,0,27.999739,16.633101,3.709531,18,96,15,0.07532,0.992,0.08563,0,7,0,21.84042,89.286697,43.201504,42,1315,93
"""08db4255286f""",6,2018-11-05 10:00:30,10,0.059553,0.1073,0.013909,0,0,0,28.502232,18.2644,4.160964,18,65,17,0.075137,0.198,0.031622,0,1,0,27.008038,18.2644,3.530502,17,96,15,0.074625,0.1982,0.028363,0,1,0,27.997675,16.633101,3.70739,18,96,15,0.075313,0.992,0.085616,0,7,0,21.837666,89.286697,43.194267,42,1315,93
"""08db4255286f""",7,2018-11-05 10:00:35,10,0.059378,0.1073,0.013755,0,0,0,28.541035,18.2644,4.10955,19,65,17,0.075172,0.198,0.03154,0,1,0,26.988329,18.2644,3.531302,17,96,15,0.074631,0.1982,0.028344,0,1,0,27.993689,16.633101,3.706404,18,96,15,0.075288,0.992,0.085612,0,7,0,21.836229,89.286697,43.186855,42,1315,93
"""08db4255286f""",8,2018-11-05 10:00:40,10,0.059921,0.1073,0.013974,0,0,0,28.657349,18.2644,4.116571,19,65,17,0.075256,0.198,0.031477,0,1,0,26.969297,18.2644,3.531514,17,96,15,0.074695,0.1982,0.028378,0,1,0,27.983849,16.633101,3.713347,18,96,15,0.075264,0.992,0.085607,0,7,0,21.833025,89.286697,43.179718,42,1315,93
"""08db4255286f""",9,2018-11-05 10:00:45,10,0.060364,0.1073,0.014064,0,0,0,28.775024,18.2644,4.127983,19,65,17,0.075321,0.198,0.031406,0,1,0,26.948715,18.2644,3.533453,17,96,15,0.07473,0.1982,0.028374,0,1,0,27.974318,16.633101,3.719704,18,96,15,0.07524,0.992,0.085601,0,7,0,21.830107,89.286697,43.172527,42,1315,93


In [16]:
# signal_awake = dict(zip(range(1440), np.sin(np.linspace(0, np.pi, 1440) + 0.208 * np.pi) ** 24))
# signal_onset = dict(zip(range(1440), np.sin(np.linspace(0, np.pi, 1440) + 0.555 * np.pi) ** 24))

def diff_feature(df: pd.Series, col: str, window_size: int = 60):
    ret = df.groupby('series_id')[col].diff(periods=window_size)
    ret = ret.fillna(ret.loc[ret.first_valid_index()])
    gc.collect()
    return ret


def rolling_feature(df: pd.Series, col: str, agg: str, window_size: int = 60):
    ret = df.groupby('series_id')[col].rolling(window_size, center=True).agg(agg)
    ret = ret.fillna(method='bfill').fillna(method='ffill')
    gc.collect()
    return ret


def create_features(df_, window_size=60, verbose=False):
    start_time = time()
    df = df_.copy()
    
    if 'awake' in df.columns:
        df.awake = df.awake.astype(np.uint8)
    
    df['weekday'] = df['timestamp'].dt.weekday.astype(np.int8)
#     df['signal_onset'] = (df.timestamp.dt.hour * 60 + df.timestamp.dt.minute).map(signal_onset).astype(np.float16)
#     df['signal_awake'] = (df.timestamp.dt.hour * 60 + df.timestamp.dt.minute).map(signal_awake).astype(np.float16)
    
    df['anglez'] = abs(df['anglez'])
    
    df['enmo_rolling_mean']     = rolling_feature(df, col='enmo', agg='mean', window_size=window_size).values
#     df['enmo_rolling_min']      = rolling_feature(df, col='enmo', agg='min', window_size=window_size).astype('float16').values
    df['enmo_rolling_max']      = rolling_feature(df, col='enmo', agg='max', window_size=window_size).values
    df['enmo_rolling_std']      = rolling_feature(df, col='enmo', agg='std', window_size=window_size).astype('float16').values
    df['enmo_rolling_median']   = rolling_feature(df, col='enmo', agg='median', window_size=window_size).values
    
    df['anglez_rolling_mean']     = rolling_feature(df, col='anglez', agg='mean', window_size=window_size).values
#     df['anglez_rolling_min']      = rolling_feature(df, col='anglez', agg='min', window_size=window_size).astype('float16').values
    df['anglez_rolling_max']      = rolling_feature(df, col='anglez', agg='max', window_size=window_size).values
    df['anglez_rolling_std']      = rolling_feature(df, col='anglez', agg='std', window_size=window_size).astype('float16').values
    df['anglez_rolling_median']   = rolling_feature(df, col='anglez', agg='median', window_size=window_size).values

    df['enmo_prev_diff_rolling_mean']  = diff_feature(df, col='enmo_rolling_mean', window_size=window_size).astype('float16').values
    df['enmo_prev_diff_rolling_max']   = diff_feature(df, col='enmo_rolling_max', window_size=window_size).astype('float16').values
    
    df['anglez_prev_diff_rolling_mean']  = diff_feature(df, col='anglez_rolling_mean', window_size=window_size).astype('float16').values
    df['anglez_prev_diff_rolling_max']   = diff_feature(df, col='anglez_rolling_max', window_size=window_size).astype('float16').values

    gc.collect()
    
    x = (df.anglez - df.anglez_rolling_median).abs().to_frame(name='ad').assign(series_id=df.series_id)
    x = x.groupby('series_id')['ad'].rolling(window_size, center=True).agg('median')
    df['anglez_mad'] = x.fillna(method='bfill').fillna(method='ffill').astype('float16').values
    
    x = (df.enmo - df.enmo_rolling_median).abs().to_frame(name='ad').assign(series_id=df.series_id)
    x = x.groupby('series_id')['ad'].rolling(window_size, center=True).agg('median')
    df['enmo_mad'] = x.fillna(method='bfill').fillna(method='ffill').astype('float16').values

    z = np.maximum(0., df.enmo - 0.02)
    z = z.to_frame(name='lids').assign(series_id=df.series_id)
    z = rolling_feature(z, col='lids', agg='sum', window_size=120)
    z = 100 / (z + 1)
    z = z.to_frame().reset_index()
    z = rolling_feature(z, col='lids', agg='mean', window_size=360)
    df['lids'] = z.astype('float16').values
    
    del (df['enmo_rolling_median'], df['anglez_rolling_median'])
    
    cols_astype_float16 = df.columns[df.columns.str.contains('enmo|anglez|signal|lids')].tolist()
    for col in cols_astype_float16:
        df[col] = df[col].astype('float16')
    
    gc.collect()    
    
    end_time = time()
    eta = int(end_time - start_time)
    if verbose:
        print(f'Done in: {eta}s')
    
    return df

In [17]:
window_size = 36
# 1 step = 5s, 10 steps = 50s, 20 steps = 100s
# 1 min = 12 steps, 3 mins = 36 steps, 5 mins = 60 steps

train = create_features(train_series_multi, window_size=window_size, verbose=True)
train.info()

Done in: 421s
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16289820 entries, 0 to 16289819
Data columns (total 26 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   series_id                      object        
 1   step                           float32       
 2   timestamp                      datetime64[us]
 3   anglez                         float16       
 4   enmo                           float16       
 5   awake                          uint8         
 6   year                           int16         
 7   month                          float16       
 8   day                            float16       
 9   hour                           float16       
 10  weekday                        int8          
 11  signal_onset                   float16       
 12  signal_awake                   float16       
 13  enmo_rolling_mean              float16       
 14  enmo_rolling_max               float16       
 15 

In [18]:
train

Unnamed: 0,series_id,step,timestamp,anglez,enmo,awake,year,month,day,hour,...,anglez_rolling_mean,anglez_rolling_max,anglez_rolling_std,enmo_prev_diff_rolling_mean,enmo_prev_diff_rolling_max,anglez_prev_diff_rolling_mean,anglez_prev_diff_rolling_max,anglez_mad,enmo_mad,lids
0,08db4255286f,0.0,2018-11-05 10:00:00,30.843750,0.044708,1,2018,11.0,5.0,10.0,...,28.500000,34.93750,4.160156,0.022598,0.090698,-1.001953,-1.6875,2.81250,0.008675,13.039062
1,08db4255286f,1.0,2018-11-05 10:00:05,34.187500,0.044312,1,2018,11.0,5.0,10.0,...,28.500000,34.93750,4.160156,0.022598,0.090698,-1.001953,-1.6875,2.81250,0.008675,13.039062
2,08db4255286f,2.0,2018-11-05 10:00:10,33.875000,0.048309,1,2018,11.0,5.0,10.0,...,28.500000,34.93750,4.160156,0.022598,0.090698,-1.001953,-1.6875,2.81250,0.008675,13.039062
3,08db4255286f,3.0,2018-11-05 10:00:15,34.281250,0.067993,1,2018,11.0,5.0,10.0,...,28.500000,34.93750,4.160156,0.022598,0.090698,-1.001953,-1.6875,2.81250,0.008675,13.039062
4,08db4255286f,4.0,2018-11-05 10:00:20,34.375000,0.076782,1,2018,11.0,5.0,10.0,...,28.500000,34.93750,4.160156,0.022598,0.090698,-1.001953,-1.6875,2.81250,0.008675,13.039062
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16289815,390b487231ce,396175.0,2018-05-19 14:29:35,1.424805,0.050201,2,2018,5.0,19.0,14.0,...,14.914062,20.71875,3.445312,0.007843,0.010498,4.164062,2.1875,13.03125,0.082336,58.625000
16289816,390b487231ce,396176.0,2018-05-19 14:29:40,2.564453,0.049805,2,2018,5.0,19.0,14.0,...,14.914062,20.71875,3.445312,0.006889,0.010498,4.000000,2.1875,13.03125,0.082336,58.625000
16289817,390b487231ce,396177.0,2018-05-19 14:29:45,4.082031,0.042786,2,2018,5.0,19.0,14.0,...,14.914062,20.71875,3.445312,0.005825,0.010498,3.775391,2.1875,13.03125,0.082336,58.625000
16289818,390b487231ce,396178.0,2018-05-19 14:29:50,12.703125,0.064209,2,2018,5.0,19.0,14.0,...,14.914062,20.71875,3.445312,0.005043,0.010498,3.523438,2.1875,13.03125,0.082336,58.625000


In [None]:
train.to_csv("feature_engine_v1.csv")