In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from collections import defaultdict

import time
import timeit
from tqdm import tqdm

%matplotlib inline
sns.set(style='darkgrid', font_scale=1.5)

In [4]:
# Load the two input tables from the working directory.
# data.csv is read with its first column as the index; later cells use its
# '_id', 'x', 'y' and 'z' columns. person.csv is indexed by its '_id' column.
data_df = pd.read_csv('./data.csv', index_col=0)
person_df = pd.read_csv('./person.csv', index_col='_id')

In [5]:
# Rolling-window lengths (in rows) and the row strides used to subsample the
# rolling output. The extraction loop pairs them positionally with zip(), so
# the i-th step belongs to the i-th window size.
window_sizes = [50, 100, 150, 200]
steps = [13, 25, 38, 50]

In [6]:
def mean_ser(series):
    """Return the arithmetic mean of `series` as computed by `np.mean`."""
    average = np.mean(series)
    return average

def std_ser(series):
    """Return the standard deviation of `series` as computed by `np.std`."""
    spread = np.std(series)
    return spread

def max_ser(series):
    """Return the largest value in `series` as computed by `np.max`."""
    largest = np.max(series)
    return largest

def min_ser(series):
    """Return the smallest value in `series` as computed by `np.min`."""
    smallest = np.min(series)
    return smallest

def energy_ser(series):
    """Return the mean of the squared values of `series`."""
    squared = series ** 2
    return np.mean(squared)
    
def iqr_ser(series):
    """Return the interquartile range of `series`: 0.75 quantile minus 0.25 quantile."""
    upper_quartile = np.quantile(series, 0.75)
    lower_quartile = np.quantile(series, 0.25)
    return upper_quartile - lower_quartile

### Годится для извлечения данных, но не для замера времени

In [12]:
# Accumulators filled by the extraction loop:
#   time_measurements  - feature name -> list of timings, one appended per window size
#   extracted_df_list  - one feature DataFrame appended per window size
time_measurements = defaultdict(list)
extracted_df_list = []

In [13]:
# Build one feature table per (step, window_size) pair and record, for each
# feature, the CPU time spent divided by the number of rows produced.
# zip() has no length, so tqdm cannot show a total for either progress bar.
# NOTE(review): in every chain below `[::step]` is applied to the concatenated
# result of all '_id' groups, so the stride is not restarted per group - confirm
# this is intended.
for step, window_size in tqdm(zip(steps, window_sizes)):
    extracted_df = pd.DataFrame()

    # standard metrics: each function is applied to every rolling window
    standart_metrics = [mean_ser, std_ser, max_ser, min_ser, energy_ser, iqr_ser]
    standart_metrics_name = ['mean', 'std', 'max', 'min', 'energy', 'iqr']

    for metric_name, metric_func in tqdm(zip(standart_metrics_name, standart_metrics)):
        start_time = time.process_time()

        # Per '_id' group: full-length rolling windows only (min_periods == window),
        # aggregate with the metric, drop rows containing NaN, keep every
        # `step`-th row, then remove the '_id' column from the result.
        extracted = data_df.groupby(by='_id', sort=False)\
            .rolling(window=window_size, min_periods=window_size)\
            .agg(metric_func)\
            .dropna(axis=0, how='any')[::step]\
            .drop('_id', axis=1)
        # Suffix every column with the metric name, e.g. '<col>_mean'.
        extracted.columns = extracted.columns + '_' + metric_name

        # The timed span covers the whole pandas pipeline above, not only the
        # metric function; the result is normalised by the number of output rows.
        time_elapsed = time.process_time() - start_time
        time_measurements[metric_name].append(time_elapsed / extracted.shape[0])
        extracted_df[extracted.columns] = extracted

    # correlation between each pair of axes
    pairs = [['x', 'y'], ['x', 'z'], ['y', 'z']]

    for pair in pairs:
        start_time = time.process_time()

        # Rolling pairwise correlation of the two selected columns per '_id' group.
        # `.iloc[0::2, -1]` keeps every second row of the last column - presumably
        # the correlation of the first column of the pair with the second; TODO confirm.
        # NOTE(review): the name `corr_ser` is also used for a function defined in a
        # later cell; running this cell after that one rebinds the name to a Series.
        corr_ser = data_df.groupby(by='_id', sort=False)[pair]\
        .rolling(window=window_size)\
        .corr().iloc[0::2,-1]\
        .dropna(axis=0, how='any')[::step]

        # Remove the third index level so the index can align with extracted_df
        # (presumably the column-name level added by .corr() - TODO confirm).
        corr_ser.index = corr_ser.index.droplevel(2)

        # Column name is the two axis letters joined, e.g. 'xy_corr'.
        col_name = ''.join(pair) + '_corr'

        time_elapsed = time.process_time() - start_time
        time_measurements[col_name].append(time_elapsed / corr_ser.shape[0])
        extracted_df[col_name] = corr_ser

    # SMA: rolling mean of the vector magnitude sqrt(x^2 + y^2 + z^2).
    # The copy is taken before the timer starts, so it is not part of the timing.
    data = data_df.copy()

    start_time = time.process_time()

    data['magnitude'] = np.sqrt(data['x']**2 + data['y']**2 + data['z']**2)
    SMA = data.groupby(by='_id', sort=False)\
        .rolling(window=window_size, min_periods=window_size)\
        .agg([mean_ser])\
        .dropna(axis=0, how='any')[::step]\
        .drop('_id', axis=1)['magnitude']
    # NOTE(review): agg([mean_ser]) presumably labels the column 'mean_ser', not
    # 'mean', in which case this rename changes nothing - verify.
    SMA = SMA.rename(columns={'mean':'sma'})

    time_elapsed = time.process_time() - start_time
    time_measurements['sma'].append(time_elapsed / SMA.shape[0])
    extracted_df['sma'] = SMA

    # One finished feature table per window size, in window_sizes order.
    extracted_df_list.append(extracted_df)

0it [00:00, ?it/s]
0it [00:00, ?it/s][A
1it [00:59, 59.70s/it][A
2it [02:46, 83.29s/it][A
0it [02:46, ?it/s]


KeyboardInterrupt: 

In [None]:
# Display the timings collected so far (feature name -> list of timings).
# The original expression was the misspelled name `time_measurementsrim`,
# which raises NameError.
time_measurements

Сохраняем датасеты с разными `window_size`

In [63]:
# Write each extracted feature table to 'data_windowed_<window_size>.csv'.
# zip() stops at the shorter input, so only the tables actually present in
# extracted_df_list are saved.
for window_size, df in zip(window_sizes, extracted_df_list):
    save_path = ''.join(['data_windowed_', str(window_size), '.csv'])
    df.to_csv(save_path)

### Балдежное измерение времени

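Возьмём по 20 замеров (`repeat = 20`), в каждом по 1000 вызовов функции (`number = 1000`); для каждой функции берём минимум по замерам.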

In [46]:
def sma_ser(series):
    """Return the mean row-wise Euclidean norm of a 2-D `series`.

    Each row is squared element-wise, summed along axis 1 and square-rooted;
    the mean of those per-row magnitudes is returned.
    """
    squared = series ** 2
    row_norms = np.sqrt(squared.sum(axis=1))
    return np.mean(row_norms)

def corr_ser(series):
    """Return the correlation-coefficient matrix of the columns of `series`.

    `series` is transposed before calling `np.corrcoef`, so each column of the
    input is treated as one variable.
    """
    variables_as_rows = series.T
    return np.corrcoef(variables_as_rows)

In [47]:
# timeit settings used by the benchmarking loop: `repeat` independent
# measurements are taken, each one executing the statement `number` times.
repeat = 20
number = 1000

In [48]:
# Start from an empty mapping for the timeit benchmark; this rebinds the name
# used by the extraction loop, so any timings stored there are discarded.
time_measurements = defaultdict(list)

In [49]:
# Feature name -> statement string handed to timeit.Timer. Each statement
# calls one feature function on the module-level variable `series`, which the
# benchmarking loop rebinds before every measurement.
# Fix: 'min' previously pointed at 'max_ser(series)', so the "min" timings
# actually measured max_ser.
func_dict = {
    'mean': 'mean_ser(series)',
    'std': 'std_ser(series)',
    'max': 'max_ser(series)',
    'min': 'min_ser(series)',
    'energy': 'energy_ser(series)',
    'iqr': 'iqr_ser(series)',
    'sma': 'sma_ser(series)',
    'corr': 'corr_ser(series)',
}

In [50]:
# Benchmark every statement in func_dict on the first `window_size` rows of
# data_df and store the fastest of the repeated measurements per window size.
for func_name, func_call in func_dict.items():
    # 'sma' and 'corr' receive every column except '_id'; all other features
    # are timed on the 'z' column alone.
    uses_all_axes = func_name in ('sma', 'corr')
    for window_size in window_sizes:
        window_rows = data_df[:window_size]
        if uses_all_axes:
            series = window_rows.drop('_id', axis=1).to_numpy()
        else:
            series = window_rows['z'].to_numpy()
        # `series` must remain a module-level name: the timed statement looks
        # it up through globals().
        timer = timeit.Timer(func_call, globals=globals())
        time_measurement = min(timer.repeat(repeat=repeat, number=number))
        time_measurements[func_name].append(time_measurement)

In [51]:
# Show the collected timings: one list per feature name, with one entry per
# window size in the order of window_sizes.
time_measurements

defaultdict(list,
            {'mean': [0.006599732998438412,
              0.006530471997393761,
              0.00686915299957036,
              0.007195094000053359],
             'std': [0.02075718700143625,
              0.02029280100032338,
              0.02181401399866445,
              0.02148805399701814],
             'max': [0.0032749980018706992,
              0.003266619998612441,
              0.0033461340026406106,
              0.0033581530005903915],
             'min': [0.0032845709974935744,
              0.003343918000609847,
              0.0033848149978439324,
              0.0033657359999779146],
             'energy': [0.007898673000454437,
              0.007891338998888386,
              0.008521169002051465,
              0.008560580001358176],
             'iqr': [0.1424261239990301,
              0.14614813499792945,
              0.14543706299809855,
              0.13928175099863438],
             'sma': [0.012280238999665016,
              0.01234640999