In [1]:
import pandas as pd
import numpy as np
from itertools import groupby
import gc
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

train = pd.read_csv("new.csv")

In [2]:
train.head(10)

Unnamed: 0,series_id,step,timestamp,anglez,enmo,awake
0,038441c925bb,0,2018-08-14T15:30:00-0400,2.6367,0.0217,1
1,038441c925bb,1,2018-08-14T15:30:05-0400,2.6368,0.0215,1
2,038441c925bb,2,2018-08-14T15:30:10-0400,2.637,0.0216,1
3,038441c925bb,3,2018-08-14T15:30:15-0400,2.6368,0.0213,1
4,038441c925bb,4,2018-08-14T15:30:20-0400,2.6368,0.0215,1
5,038441c925bb,5,2018-08-14T15:30:25-0400,2.6367,0.0217,1
6,038441c925bb,6,2018-08-14T15:30:30-0400,2.6367,0.0217,1
7,038441c925bb,7,2018-08-14T15:30:35-0400,2.6367,0.0218,1
8,038441c925bb,8,2018-08-14T15:30:40-0400,2.798,0.0223,1
9,038441c925bb,9,2018-08-14T15:30:45-0400,3.0847,0.0217,1


In [108]:
def make_features(df):
    # parse the timestamp and create an "hour" feature
    df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)
    df["hour"] = df["timestamp"].dt.hour
    
    periods = 20
    df["anglez"] = abs(df["anglez"])
    df["anglez_diff"] = df.groupby('series_id')['anglez'].diff(periods=periods).fillna(method="bfill").astype('float16')
    df["enmo_diff"] = df.groupby('series_id')['enmo'].diff(periods=periods).fillna(method="bfill").astype('float16')
    df["anglez_rolling_mean"] = df["anglez"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["enmo_rolling_mean"] = df["enmo"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["anglez_rolling_max"] = df["anglez"].rolling(periods,center=True).max().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["enmo_rolling_max"] = df["enmo"].rolling(periods,center=True).max().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["anglez_rolling_std"] = df["anglez"].rolling(periods,center=True).std().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["enmo_rolling_std"] = df["enmo"].rolling(periods,center=True).std().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["anglez_diff_rolling_mean"] = df["anglez_diff"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["enmo_diff_rolling_mean"] = df["enmo_diff"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["anglez_diff_rolling_max"] = df["anglez_diff"].rolling(periods,center=True).max().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["enmo_diff_rolling_max"] = df["enmo_diff"].rolling(periods,center=True).max().fillna(method="bfill").fillna(method="ffill").astype('float16')
    
    return df

features = ["hour",
            "anglez",
            "anglez_rolling_mean",
            "anglez_rolling_max",
            "anglez_rolling_std",
            "anglez_diff",
            "anglez_diff_rolling_mean",
            "anglez_diff_rolling_max",
            "enmo",
            "enmo_rolling_mean",
            "enmo_rolling_max",
            "enmo_rolling_std",
            "enmo_diff",
            "enmo_diff_rolling_mean",
            "enmo_diff_rolling_max",
           ]

In [None]:
train   = make_features(train)

X_train = train[features]
y_train = train["awake"]

# save some memory
# del train
# gc.collect();

In [None]:
X_train.shape, y_train.shape

In [135]:
%%time

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=50,
                                    min_samples_leaf=300,
                                    random_state=42,n_jobs=-1)

X_train = lda.fit_transform(X_train, y_train)

classifier.fit(X_train, y_train)

# save some memory
# del X_train, y_train
# gc.collect();

CPU times: user 543 ms, sys: 36.9 ms, total: 580 ms
Wall time: 582 ms


In [None]:
# test  = pd.read_parquet("data/child-mind-institute-detect-sleep-states/test_series.parquet")
test  = pd.read_csv("test_standard.csv")

test  = make_features(test)

X_test = test[features]
X_test = lda.transform(X_test)

test["not_awake"] = classifier.predict_proba(X_test)[:,0]
test["awake"]     = classifier.predict_proba(X_test)[:,1]


In [141]:
# smoothing the predictions
smoothing_length = 50
test["score"]  = test["awake"].rolling(smoothing_length,center=True).mean().fillna(method="bfill").fillna(method="ffill")
test["smooth"] = test["not_awake"].rolling(smoothing_length,center=True).mean().fillna(method="bfill").fillna(method="ffill")
# re-binarize
test["smooth"] = test["smooth"].round()

# https://stackoverflow.com/questions/73777727/how-to-mark-start-end-of-a-series-of-non-null-and-non-0-values-in-a-column-of-a
def get_event(df):
    lstCV = zip(df.series_id, df.smooth)
    lstPOI = []
    for (c, v), g in groupby(lstCV, lambda cv: 
                            (cv[0], cv[1]!=0 and not pd.isnull(cv[1]))):
        llg = sum(1 for item in g)
        if v is False: 
            lstPOI.extend([0]*llg)
        else: 
            lstPOI.extend(['onset']+(llg-2)*[0]+['wakeup'] if llg > 1 else [0])
    return lstPOI

test["event"] = get_event(test)

  test["score"]  = test["awake"].rolling(smoothing_length,center=True).mean().fillna(method="bfill").fillna(method="ffill")
  test["smooth"] = test["not_awake"].rolling(smoothing_length,center=True).mean().fillna(method="bfill").fillna(method="ffill")


In [142]:
sample_submission = test.loc[test["event"] != 0][["series_id","step","event","score"]].copy().reset_index(drop=True).reset_index(names="row_id")
sample_submission.to_csv('submission.csv', index=False)

In [129]:


lda = LDA()