In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import pandavro as pdx
from shl.prepare import fill_na, calculate_abs_values, calculate_change, calculate_pct_change, calculate_window, calculate_shift
from typing import List, Optional, Tuple

# Work from the parent directory so the relative data path below resolves.
# NOTE: the chdir is relative to the current working directory, so running
# this cell again without restarting the kernel moves up one more level.
os.chdir('../')

# Root folder holding the per-split (validate / train / test) data files
data_path = './data'

# Show every column when a DataFrame is displayed (no truncation)
pd.options.display.max_columns = None

In [7]:
def prepare_features(df, df_label, window_sizes, window_center, window_functions, shift_periods, shift_column_patterns):
    """Build per-second SNR percent-change features with windowed and shifted variants.

    Pipeline: round ``Epoch time [ms]`` to whole seconds, take the median ``SNR``
    per second, align the rows to the label timeline (when one is given), compute
    the percent change, then pass the result through ``calculate_window`` and
    ``calculate_shift``.

    Args:
        df: Raw frame with ``Epoch time [ms]`` and ``SNR`` columns. It is not
            modified; the rounded time column is built on a separate frame.
        df_label: Optional frame with an ``epoch_time`` column (a ``label``
            column, if present, is dropped). Its rows define the output rows.
            When ``None``, the timestamps found in ``df`` are used instead.
        window_sizes: Forwarded to ``calculate_window`` as ``window_sizes``.
        window_center: Forwarded to ``calculate_window`` as ``window_center``.
        window_functions: Forwarded to ``calculate_window`` as ``functions``.
        shift_periods: Forwarded to ``calculate_shift`` as ``periods``.
        shift_column_patterns: Forwarded to ``calculate_shift`` as
            ``columns_patterns``.

    Returns:
        The frame returned by ``calculate_window`` after ``calculate_shift`` has
        been applied to it.
    """
    # Add time rounded to seconds on a fresh frame, so the caller's `df` does
    # not silently gain an `epoch_time` column.
    snr_samples = df[['SNR']].assign(epoch_time=df['Epoch time [ms]'].round(-3))

    # Group values: one row per second, median SNR
    df = snr_samples.groupby('epoch_time')['SNR'].median().reset_index(name='SNR')

    # Join with label
    # (use the same timestamps if the label frame is None)
    if df_label is None:
        df_label = df[['epoch_time']]

    df = df_label.merge(df, on='epoch_time', how='left').drop('label', axis=1, errors='ignore')

    # Ensure that values are sorted before calculating diff
    df = df.sort_values(by='epoch_time')

    # Calculate percent change; the change of the time column itself is not a feature
    df_pct = calculate_pct_change(df)
    df_pct = df_pct.drop(['epoch_time_pct_change'], axis=1)
    # fill_na's return value is not used, so it is expected to work in place — TODO confirm
    fill_na(df_pct)

    # Merge new features back to the main dataframe (rows are matched by index)
    df = df[['epoch_time']].merge(df_pct, left_index=True, right_index=True, how='left')

    # Add moving windows features, honouring the caller's `window_center`
    # (previously hard-coded to True, which made the parameter a no-op)
    df = calculate_window(df, columns=None, functions=window_functions, window_sizes=window_sizes, window_center=window_center)

    # Add shifted features; return value is not used, so it is expected to
    # add columns to `df` in place — TODO confirm
    calculate_shift(df, periods=shift_periods, columns_patterns=shift_column_patterns)

    return df

In [12]:
# Run processing: build GPS features for every data split and write them to disk

# Keyword arguments forwarded to prepare_features
settings = {
    'window_sizes': [5, 10, 30, 60, 300],
    'window_center': True,
    'window_functions': ['mean', 'std'],
    'shift_periods': [5, 10, 30, 60, 300],
    'shift_column_patterns': ['window_5_', 'window_10_', 'window_30_', 'window_60_', 'window_300_'],
}

# When True (and the label file exists) the output rows follow the label timeline
join_label = False

input_file_name = 'GPS.parquet'
input_file_name_label = 'Label.parquet'
data_types = ['validate', 'train', 'test']
# data_types = ['validate']  # quick run on a single split
full_input_file_names = [os.path.join(data_path, data_type, input_file_name) for data_type in data_types]
full_input_file_names_label = [os.path.join(data_path, data_type, input_file_name_label) for data_type in data_types]

base_output_file_name = 'features_gps'
output_format = 'parquet'
output_file_name = base_output_file_name + '.' + output_format
full_output_file_names = [os.path.join(data_path, data_type, output_file_name) for data_type in data_types]

# Fail before any processing starts: an unknown format used to fall through
# the if/elif below and silently write nothing.
supported_output_formats = ('parquet', 'avro')
if output_format not in supported_output_formats:
    raise ValueError(
        'Unsupported output_format %r; expected one of %r' % (output_format, supported_output_formats)
    )

for file_in, file_in_label, file_out in zip(full_input_file_names, full_input_file_names_label, full_output_file_names):
    df = pd.read_parquet(file_in)

    # Labels are optional: without them prepare_features uses the GPS timestamps
    df_label = None
    if join_label and os.path.exists(file_in_label):
        df_label = pd.read_parquet(file_in_label)

    df = prepare_features(df, df_label, **settings)

    if output_format == 'parquet':
        df.to_parquet(file_out, index=False)
    elif output_format == 'avro':
        pdx.to_avro(file_out, df)


In [9]:
# Display the feature frame currently bound to `df`. The processing loop rebinds
# `df` on every iteration, so after that cell this is the last file it processed.
df

Unnamed: 0,epoch_time,SNR_pct_change,SNR_pct_change_window_5_mean,SNR_pct_change_window_5_std,SNR_pct_change_window_10_mean,SNR_pct_change_window_10_std,SNR_pct_change_window_30_mean,SNR_pct_change_window_30_std,SNR_pct_change_window_60_mean,SNR_pct_change_window_60_std,SNR_pct_change_window_300_mean,SNR_pct_change_window_300_std,SNR_pct_change_window_5_mean_shift_5_past,SNR_pct_change_window_5_std_shift_5_past,SNR_pct_change_window_5_mean_shift_5_future,SNR_pct_change_window_5_std_shift_5_future,SNR_pct_change_window_10_mean_shift_10_past,SNR_pct_change_window_10_std_shift_10_past,SNR_pct_change_window_10_mean_shift_10_future,SNR_pct_change_window_10_std_shift_10_future,SNR_pct_change_window_30_mean_shift_30_past,SNR_pct_change_window_30_std_shift_30_past,SNR_pct_change_window_30_mean_shift_30_future,SNR_pct_change_window_30_std_shift_30_future,SNR_pct_change_window_60_mean_shift_60_past,SNR_pct_change_window_60_std_shift_60_past,SNR_pct_change_window_60_mean_shift_60_future,SNR_pct_change_window_60_std_shift_60_future,SNR_pct_change_window_300_mean_shift_300_past,SNR_pct_change_window_300_std_shift_300_past,SNR_pct_change_window_300_mean_shift_300_future,SNR_pct_change_window_300_std_shift_300_future
0,1497426498000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,-0.034965,0.045225,0.000000,0.000000,-0.003980,0.031553,0.000000,0.000000,0.002251,0.111266,0.000000,0.000000,0.003318,0.085315,0.000000,0.000000,0.001101,0.034207
1,1497426499000,0.166667,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,-0.020150,0.041160,0.000000,0.000000,-0.013071,0.041733,0.000000,0.000000,0.005281,0.109871,0.000000,0.000000,0.003735,0.085235,0.000000,0.000000,0.001101,0.034207
2,1497426500000,-0.142857,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,-0.017484,0.036479,0.000000,0.000000,0.001277,0.053205,0.000000,0.000000,0.000281,0.108820,0.000000,0.000000,0.003735,0.085235,0.000000,0.000000,0.001101,0.034207
3,1497426501000,-0.074074,-0.026053,0.119037,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,-0.017484,0.036479,0.000000,0.000000,-0.000996,0.052669,0.000000,0.000000,0.000281,0.108820,0.000000,0.000000,0.000765,0.083919,0.000000,0.000000,0.001020,0.034239
4,1497426502000,-0.080000,-0.068082,0.052464,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,-0.008788,0.033819,0.000000,0.000000,-0.005541,0.054503,0.000000,0.000000,0.000919,0.108598,0.000000,0.000000,0.001522,0.083699,0.000000,0.000000,0.000936,0.034272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124645,7497460949000,0.000000,0.013439,0.020222,0.013923,0.019778,0.0,0.0,0.0,0.0,0.0,0.0,0.014407,0.021694,0.000000,0.000000,0.000058,0.011361,0.000000,0.000000,0.016395,0.123885,0.000000,0.000000,0.000088,0.086031,0.000000,0.000000,0.003691,0.065586,0.000000,0.000000
124646,7497460950000,0.000000,0.013439,0.020222,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.014407,0.021694,0.000000,0.000000,0.002497,0.017901,0.000000,0.000000,0.016132,0.123858,0.000000,0.000000,-0.000099,0.086157,0.000000,0.000000,0.003889,0.065552,0.000000,0.000000
124647,7497460951000,0.021739,0.004348,0.009722,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.023498,0.023588,0.000000,0.000000,0.004823,0.019017,0.000000,0.000000,0.018216,0.123013,0.000000,0.000000,0.001252,0.085499,0.000000,0.000000,0.003688,0.065538,0.000000,0.000000
124648,7497460952000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.013742,0.020388,0.000000,0.000000,0.004823,0.019017,0.000000,0.000000,0.018216,0.123013,0.000000,0.000000,0.006742,0.093681,0.000000,0.000000,0.003688,0.065538,0.000000,0.000000


In [11]:
# Sanity check: total number of missing values across all columns of `df`
df.isnull().sum().sum()

0

### Export for analysis

In [13]:
# Load the exported validate-split features for inspection.
# NOTE(review): the label join further down is commented out, so the frame
# displayed here carries no labels.
features_file = './data/validate/features_gps.parquet'
features = pd.read_parquet(features_file)


# Disabled code below. NOTE(review): it refers to Cells data and a
# cells_analysis.csv export, so it looks carried over from another notebook —
# confirm and remove or adapt.
# cells_raw = './data/train/Cells.parquet'
# cells_raw_df = pd.read_parquet(cells_raw)

# cells_raw_df['epoch_time_s'] = cells_raw_df['epoch_time'].round(-3)

# label_file = './data/validate/Label.parquet'
# label = pd.read_parquet(label_file)

# features = label.merge(features).sort_values(by='epoch_time')
# features.to_csv('./data/cells_analysis.csv')
features

Unnamed: 0,epoch_time,SNR_pct_change,SNR_pct_change_window_5_mean,SNR_pct_change_window_5_std,SNR_pct_change_window_10_mean,SNR_pct_change_window_10_std,SNR_pct_change_window_30_mean,SNR_pct_change_window_30_std,SNR_pct_change_window_60_mean,SNR_pct_change_window_60_std,SNR_pct_change_window_300_mean,SNR_pct_change_window_300_std,SNR_pct_change_window_5_mean_shift_5_past,SNR_pct_change_window_5_std_shift_5_past,SNR_pct_change_window_5_mean_shift_5_future,SNR_pct_change_window_5_std_shift_5_future,SNR_pct_change_window_10_mean_shift_10_past,SNR_pct_change_window_10_std_shift_10_past,SNR_pct_change_window_10_mean_shift_10_future,SNR_pct_change_window_10_std_shift_10_future,SNR_pct_change_window_30_mean_shift_30_past,SNR_pct_change_window_30_std_shift_30_past,SNR_pct_change_window_30_mean_shift_30_future,SNR_pct_change_window_30_std_shift_30_future,SNR_pct_change_window_60_mean_shift_60_past,SNR_pct_change_window_60_std_shift_60_past,SNR_pct_change_window_60_mean_shift_60_future,SNR_pct_change_window_60_std_shift_60_future,SNR_pct_change_window_300_mean_shift_300_past,SNR_pct_change_window_300_std_shift_300_past,SNR_pct_change_window_300_mean_shift_300_future,SNR_pct_change_window_300_std_shift_300_future
0,1497426498000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,-0.034965,0.045225,0.000000,0.000000,-0.003980,0.031553,0.000000,0.000000,0.002251,0.111266,0.000000,0.000000,0.003318,0.085315,0.000000,0.000000,0.001101,0.034207
1,1497426499000,0.166667,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,-0.020150,0.041160,0.000000,0.000000,-0.013071,0.041733,0.000000,0.000000,0.005281,0.109871,0.000000,0.000000,0.003735,0.085235,0.000000,0.000000,0.001101,0.034207
2,1497426500000,-0.142857,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,-0.017484,0.036479,0.000000,0.000000,0.001277,0.053205,0.000000,0.000000,0.000281,0.108820,0.000000,0.000000,0.003735,0.085235,0.000000,0.000000,0.001101,0.034207
3,1497426501000,-0.074074,-0.026053,0.119037,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,-0.017484,0.036479,0.000000,0.000000,-0.000996,0.052669,0.000000,0.000000,0.000281,0.108820,0.000000,0.000000,0.000765,0.083919,0.000000,0.000000,0.001020,0.034239
4,1497426502000,-0.080000,-0.068082,0.052464,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,-0.008788,0.033819,0.000000,0.000000,-0.005541,0.054503,0.000000,0.000000,0.000919,0.108598,0.000000,0.000000,0.001522,0.083699,0.000000,0.000000,0.000936,0.034272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124645,7497460949000,0.000000,0.013439,0.020222,0.013923,0.019778,0.0,0.0,0.0,0.0,0.0,0.0,0.014407,0.021694,0.000000,0.000000,0.000058,0.011361,0.000000,0.000000,0.016395,0.123885,0.000000,0.000000,0.000088,0.086031,0.000000,0.000000,0.003691,0.065586,0.000000,0.000000
124646,7497460950000,0.000000,0.013439,0.020222,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.014407,0.021694,0.000000,0.000000,0.002497,0.017901,0.000000,0.000000,0.016132,0.123858,0.000000,0.000000,-0.000099,0.086157,0.000000,0.000000,0.003889,0.065552,0.000000,0.000000
124647,7497460951000,0.021739,0.004348,0.009722,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.023498,0.023588,0.000000,0.000000,0.004823,0.019017,0.000000,0.000000,0.018216,0.123013,0.000000,0.000000,0.001252,0.085499,0.000000,0.000000,0.003688,0.065538,0.000000,0.000000
124648,7497460952000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.013742,0.020388,0.000000,0.000000,0.004823,0.019017,0.000000,0.000000,0.018216,0.123013,0.000000,0.000000,0.006742,0.093681,0.000000,0.000000,0.003688,0.065538,0.000000,0.000000
