# Exploratory Data Analysis (EDA) and Initial Visualization

## Configuration

In [48]:
import os
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff

from pathlib import Path
from IPython.display import Image

from ts_clf_event.data_handler.feat_eng import FeatureEngineer

## Dataset loading

In [49]:
# Resolve the parent of the current working directory to an absolute path.
# NOTE(review): presumably this is the repository root — confirm the notebook
# is always launched from a direct sub-directory of the project.
project_dir = Path("../").resolve()

# Build the CSV location relative to that directory and load it, using the
# file's first column as the DataFrame index.
data_path = os.path.join(project_dir, "data/test_dataframe.csv")
data = pd.read_csv(filepath_or_buffer=data_path, index_col=0)

data

Unnamed: 0,start_value,value,speed,level,frequency,status,process,datetime,provider
0,0.0,30.36,0.0,1.0,0.0,0.0,0,2020-06-01 00:00:00,2
1,0.0,26.82,0.0,0.0,0.0,-1.0,0,2020-06-01 00:00:21,1
2,0.0,29.72,0.0,1.0,0.0,0.0,0,2020-06-01 00:01:01,2
3,0.0,27.57,0.0,0.0,0.0,-1.0,0,2020-06-01 00:01:22,1
4,0.0,30.79,0.0,1.0,0.0,0.0,0,2020-06-01 00:02:01,2
...,...,...,...,...,...,...,...,...,...
86931,0.0,28.35,0.0,1.0,0.0,0.0,0,2020-07-01 23:57:36,2
86932,0.0,25.70,0.0,0.0,0.0,-1.0,0,2020-07-01 23:58:19,1
86933,0.0,27.83,0.0,1.0,0.0,0.0,0,2020-07-01 23:58:37,2
86934,0.0,24.34,0.0,0.0,0.0,-1.0,0,2020-07-01 23:59:20,1


## Feature extraction

In [50]:

# Rolling-window sizes and the columns the rolling statistics are computed on.
# NOTE(review): the unit of a window (rows vs. time) is decided inside
# FeatureEngineer — confirm before interpreting the resulting features.
windows = [30, 40, 60]
features_to_roll = ["value", "level", "frequency", "speed"]

# Lagged differences use the same columns; an independent list is created so
# the two selections can later be changed without affecting each other.
diff_lags = [1, 2]
features_to_diff = list(features_to_roll)

feature_engineer = FeatureEngineer(windows, features_to_roll, diff_lags, features_to_diff)
df_engineered = feature_engineer.engineer_all_features(data)

df_engineered

Unnamed: 0_level_0,start_value,value,speed,level,frequency,status,process,provider,value_rolling_mean_30,value_rolling_std_30,...,speed_rolling_min_60,speed_rolling_max_60,value_diff_1,level_diff_1,frequency_diff_1,speed_diff_1,value_diff_2,level_diff_2,frequency_diff_2,speed_diff_2
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-01 00:00:00,0.0,30.36,0.0,1.0,0.0,0.0,0,2,,,...,,,,,,,,,,
2020-06-01 00:00:21,0.0,26.82,0.0,0.0,0.0,-1.0,0,1,,,...,,,,,,,,,,
2020-06-01 00:01:01,0.0,29.72,0.0,1.0,0.0,0.0,0,2,,,...,,,-0.64,0.0,0.0,0.0,,,,
2020-06-01 00:01:22,0.0,27.57,0.0,0.0,0.0,-1.0,0,1,,,...,,,0.75,0.0,0.0,0.0,,,,
2020-06-01 00:02:01,0.0,30.79,0.0,1.0,0.0,0.0,0,2,,,...,,,1.07,0.0,0.0,0.0,0.43,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-07-01 23:57:36,0.0,28.35,0.0,1.0,0.0,0.0,0,2,28.384667,0.501960,...,0.0,0.0,1.11,0.0,0.0,0.0,0.01,0.0,0.0,0.0
2020-07-01 23:58:19,0.0,25.70,0.0,0.0,0.0,-1.0,0,1,24.906667,0.509404,...,0.0,0.0,0.14,0.0,0.0,0.0,1.32,0.0,0.0,0.0
2020-07-01 23:58:37,0.0,27.83,0.0,1.0,0.0,0.0,0,2,28.389333,0.495893,...,0.0,0.0,-0.52,0.0,0.0,0.0,0.59,0.0,0.0,0.0
2020-07-01 23:59:20,0.0,24.34,0.0,0.0,0.0,-1.0,0,1,24.909000,0.506539,...,0.0,0.0,-1.36,0.0,0.0,0.0,-1.22,0.0,0.0,0.0


In [51]:
# List every engineered column, i.e. any column whose name contains
# "rolling" or "diff", one name per line and in DataFrame column order.
new_feature_names = [
    name for name in df_engineered.columns if 'rolling' in name or 'diff' in name
]
for feat in new_feature_names:
    print(feat)

value_rolling_mean_30
value_rolling_std_30
value_rolling_min_30
value_rolling_max_30
level_rolling_mean_30
level_rolling_std_30
level_rolling_min_30
level_rolling_max_30
frequency_rolling_mean_30
frequency_rolling_std_30
frequency_rolling_min_30
frequency_rolling_max_30
speed_rolling_mean_30
speed_rolling_std_30
speed_rolling_min_30
speed_rolling_max_30
value_rolling_mean_40
value_rolling_std_40
value_rolling_min_40
value_rolling_max_40
level_rolling_mean_40
level_rolling_std_40
level_rolling_min_40
level_rolling_max_40
frequency_rolling_mean_40
frequency_rolling_std_40
frequency_rolling_min_40
frequency_rolling_max_40
speed_rolling_mean_40
speed_rolling_std_40
speed_rolling_min_40
speed_rolling_max_40
value_rolling_mean_60
value_rolling_std_60
value_rolling_min_60
value_rolling_max_60
level_rolling_mean_60
level_rolling_std_60
level_rolling_min_60
level_rolling_max_60
frequency_rolling_mean_60
frequency_rolling_std_60
frequency_rolling_min_60
frequency_rolling_max_60
speed_rolling_mea