In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from collections import defaultdict

import time
from tqdm import tqdm

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Global seaborn theme for every later plot: dark grid background, fonts scaled by 1.5.
sns.set(style='darkgrid', font_scale=1.5)

In [2]:
# Input tables, read from the notebook's working directory.
# `data.csv`: the first CSV column becomes the index (the `_id` column used by the
# later group-bys stays a regular column).
data_df = pd.read_csv(
    './data.csv',
    index_col=0,
)
# `person.csv`: indexed by its `_id` column.
person_df = pd.read_csv(
    './person.csv',
    index_col='_id',
)

In [36]:
# Stride (in rows) used to subsample the rolled output; paired positionally with
# `window_sizes` below. Each stride is roughly a quarter of its window length.
steps = [
    13,
    25,
    38,
    50,
]
# Rolling-window lengths (in rows) evaluated by the feature extraction.
window_sizes = [
    50,
    100,
    150,
    200,
]

In [37]:
def mean_ser(series):
    """Return the arithmetic mean of one window of values.

    Parameters
    ----------
    series : array-like
        Values of a single window (anything `np.mean` accepts).

    Returns
    -------
    The result of `np.mean(series)`.
    """
    window_mean = np.mean(series)
    return window_mean

def std_ser(series):
    """Return the standard deviation of one window of values.

    Parameters
    ----------
    series : array-like
        Values of a single window (anything `np.std` accepts).

    Returns
    -------
    The result of `np.std(series)`, i.e. NumPy's default `ddof=0`
    (population) standard deviation.
    """
    window_std = np.std(series)
    return window_std

def max_ser(series):
    """Return the largest value of one window.

    Parameters
    ----------
    series : array-like
        Values of a single window (anything `np.max` accepts).

    Returns
    -------
    The result of `np.max(series)`.
    """
    window_max = np.max(series)
    return window_max

def min_ser(series):
    """Return the smallest value of one window.

    Parameters
    ----------
    series : array-like
        Values of a single window (anything `np.min` accepts).

    Returns
    -------
    The result of `np.min(series)`.
    """
    window_min = np.min(series)
    return window_min

def energy_ser(series):
    """Return the mean of the squared values of one window.

    Parameters
    ----------
    series : array-like
        Values of a single window. Besides pandas Series and NumPy arrays,
        plain Python sequences (list/tuple) are accepted as well.

    Returns
    -------
    `mean(series ** 2)` as computed by `np.mean`.
    """
    # np.square is a ufunc: it squares Series/ndarray inputs element-wise exactly
    # like `** 2`, and additionally converts plain sequences, on which `** 2`
    # would raise TypeError.
    return np.mean(np.square(series))
    
def iqr_ser(series):
    """Return the interquartile range (Q3 - Q1) of one window of values.

    Parameters
    ----------
    series : array-like
        Values of a single window (anything `np.quantile` accepts).

    Returns
    -------
    The 0.75 quantile minus the 0.25 quantile, using `np.quantile`'s
    default interpolation.
    """
    # Ask for both quartiles in one call so the window is processed once
    # instead of twice.
    lower_quartile, upper_quartile = np.quantile(series, [0.25, 0.75])
    return upper_quartile - lower_quartile

In [53]:
# Accumulators filled by the feature-extraction loop.
# One feature table per (step, window_size) configuration, in iteration order.
extracted_df_list = list()
# Feature name -> list of CPU timings, one entry appended per configuration.
time_measurements = defaultdict(list)

In [54]:
# Feature extraction: for every (step, window_size) pair build one table of
# rolling-window features per `_id` group, and record the CPU time
# (time.process_time) spent on each feature in `time_measurements`.

# zip() stops at the shorter input, so a length mismatch would silently skip
# configurations instead of failing.
if len(steps) != len(window_sizes):
    raise ValueError('steps and window_sizes must have the same length')

# Standard per-window metrics and the column suffix each one produces.
# They do not depend on the window configuration, so they are defined once.
standart_metrics = [mean_ser, std_ser, max_ser, min_ser, energy_ser, iqr_ser]
standart_metrics_name = ['mean', 'std', 'max', 'min', 'energy', 'iqr']

# Axis pairs for the rolling correlation features.
pairs = [['x', 'y'], ['x', 'z'], ['y', 'z']]

for step, window_size in tqdm(zip(steps, window_sizes), total=len(window_sizes)):
    extracted_df = pd.DataFrame()

    # standard metrics
    for metric_name, metric_func in tqdm(zip(standart_metrics_name, standart_metrics),
                                         total=len(standart_metrics)):
        start_time = time.process_time()

        # Rolling statistic inside each `_id` group. Rows without a full window are
        # NaN and get dropped. The [::step] stride then runs over the concatenated
        # result of all groups; it is not restarted per `_id`.
        # errors='ignore': the rolled frame only carries an `_id` column on pandas
        # versions that keep the grouping column in the result.
        extracted = data_df.groupby(by='_id', sort=False)\
            .rolling(window=window_size, min_periods=window_size)\
            .agg(metric_func)\
            .dropna(axis=0, how='any')[::step]\
            .drop('_id', axis=1, errors='ignore')
        extracted.columns = extracted.columns + '_' + metric_name

        time_measurements[metric_name].append(time.process_time() - start_time)
        # The first assignment defines the index of `extracted_df`; later columns
        # are aligned to it.
        extracted_df[extracted.columns] = extracted

    # correlation
    for pair in pairs:
        start_time = time.process_time()

        # Pairwise rolling correlation yields one row per (window, column of `pair`).
        # Every second row combined with the last column is the correlation between
        # the two axes of `pair`.
        # NOTE(review): dropna() also removes rows whose correlation itself is NaN
        # (presumably possible for constant windows), which would shift the stride
        # relative to the metrics above -- verify the indexes line up.
        corr_ser = data_df.groupby(by='_id', sort=False)[pair]\
            .rolling(window=window_size, min_periods=window_size)\
            .corr().iloc[0::2, -1]\
            .dropna(axis=0, how='any')[::step]

        # Remove the third index level added by the pairwise result so the index
        # has the same levels as the metric frames.
        corr_ser.index = corr_ser.index.droplevel(2)

        col_name = ''.join(pair) + '_corr'

        time_measurements[col_name].append(time.process_time() - start_time)
        extracted_df[col_name] = corr_ser

    # SMA: rolling mean of the Euclidean magnitude of (x, y, z)
    data = data_df.copy()

    start_time = time.process_time()

    data['magnitude'] = np.sqrt(data['x']**2 + data['y']**2 + data['z']**2)
    # agg(mean_ser) keeps flat column labels, so 'magnitude' can be selected as a
    # Series and stored under the 'sma' name below without any renaming step.
    SMA = data.groupby(by='_id', sort=False)\
        .rolling(window=window_size, min_periods=window_size)\
        .agg(mean_ser)\
        .dropna(axis=0, how='any')[::step]['magnitude']

    time_measurements['sma'].append(time.process_time() - start_time)
    extracted_df['sma'] = SMA

    extracted_df_list.append(extracted_df)

0it [00:00, ?it/s]
0it [00:00, ?it/s][A
1it [01:36, 96.35s/it][A
2it [02:55, 86.06s/it][A
3it [04:12, 82.17s/it][A
4it [05:33, 81.58s/it][A
5it [09:53, 146.07s/it][A
6it [14:38, 146.46s/it][A
1it [15:57, 957.58s/it]
0it [00:00, ?it/s][A
1it [00:56, 56.14s/it][A
2it [01:46, 52.84s/it][A
3it [02:36, 51.69s/it][A
4it [03:28, 51.53s/it][A
5it [06:11, 91.68s/it][A
6it [09:32, 95.38s/it] [A
2it [26:45, 775.41s/it]
0it [00:00, ?it/s][A
1it [00:54, 54.11s/it][A
2it [01:43, 51.25s/it][A
3it [02:32, 50.50s/it][A
4it [03:22, 50.05s/it][A
5it [06:02, 89.74s/it][A
6it [09:22, 93.69s/it] [A
3it [37:22, 712.00s/it]
0it [00:00, ?it/s][A
1it [00:53, 53.57s/it][A
2it [01:40, 49.79s/it][A
3it [02:28, 48.72s/it][A
4it [03:16, 48.46s/it][A
5it [05:52, 87.33s/it][A
6it [09:07, 91.25s/it] [A
4it [47:41, 715.40s/it]


In [57]:
# Display the collected CPU timings (seconds from time.process_time): one list per
# feature name, with one entry appended per (step, window_size) pass of the loop.
time_measurements

defaultdict(list,
            {'mean': [95.92640327499976,
              57.56708646600009,
              55.652058427000156,
              54.97909100299967],
             'std': [78.93823732999999,
              51.46836115999986,
              49.92602739100039,
              47.231076016999396],
             'max': [77.67433541799983,
              51.514375437000126,
              50.75487862600039,
              48.239064551999945],
             'min': [80.82940106099977,
              52.763987130999794,
              50.480267036999976,
              49.18580470100005],
             'energy': [260.9317656889998,
              167.38690551199943,
              167.91885368800013,
              162.44145394799943],
             'iqr': [284.8460654790001,
              201.1751206609997,
              199.72938628799966,
              195.03122535900002],
             'xy_corr': [1.907734125999923,
              1.9553242120000505,
              1.9337899799993465,
              1

Сохраняем датасеты с разными `window_size`

In [63]:
# Persist every feature table to its own CSV in the working directory; the file name
# carries the window size that produced the table.
for window_size, df in zip(window_sizes, extracted_df_list):
    save_path = ''.join(('data_windowed_', str(window_size), '.csv'))
    df.to_csv(path_or_buf=save_path)