In [1]:
import numpy as np
import pandas as pd
import os

# Move the working directory up one level so the relative data_path below
# resolves from there (presumably the project root — confirm against the repo layout).
# NOTE: the path is relative, so this is not idempotent — re-running this cell
# moves up one more directory each time.
os.chdir('../')
# Base directory of the parquet files; run_processing joins it with a data type
# sub-folder and a file name.
data_path = './data'

# Display every column of a DataFrame (None removes the column limit).
pd.set_option('display.max_columns', None)

In [27]:
def fill_empty_lines(input_file_name, data_type, df, df_label, time_column, interpolation_limit):
    """Insert interpolated feature rows for label timestamps that have no feature row.

    The feature timestamps are rounded to the nearest 1000 and outer-joined with
    the label timestamps. Label timestamps without a matching feature row show up
    as all-NaN feature rows; after sorting, ``DataFrame.interpolate`` fills them
    (at most ``interpolation_limit`` consecutive rows per gap). Rows whose
    ``time_column`` is still NaN afterwards are discarded.

    Progress figures (row counts before/after, matched rows, added percentage)
    are printed to stdout.

    Args:
        input_file_name: Name of the feature file; used only in the log output.
        data_type: Name of the data split; used only in the log output.
        df: Feature frame containing ``time_column``. It is not modified.
        df_label: Label frame; only its ``epoch_time`` column is read.
        time_column: Name of the timestamp column in ``df``.
            NOTE(review): rounding with ``round(-3)`` presumably maps millisecond
            timestamps to whole seconds — confirm the unit of each input file.
        interpolation_limit: Passed as ``limit`` to ``DataFrame.interpolate``.

    Returns:
        A new DataFrame with the columns of ``df``, the original rows plus the
        rows that could be interpolated.

    Notes:
        Interpolation is applied to every column of the merged frame, not only to
        the inserted rows, so NaN gaps already present in ``df`` are filled too.
    """
    print(f'Processing {input_file_name} - {data_type}')
    initial_shape = df.shape[0]
    print(f'Initial shape: {initial_shape}')

    # assign() builds a new frame, so the helper key never leaks into the caller's df.
    features = df.assign(epoch_time_s=df[time_column].round(-3))

    labels = df_label[['epoch_time']].rename(columns={'epoch_time': 'epoch_time_label'})
    merged = labels.merge(features, left_on='epoch_time_label', right_on='epoch_time_s', how='outer')

    # Rows where both join keys are present are feature rows that hit a label timestamp.
    has_both_keys = merged['epoch_time_s'].notna() & merged['epoch_time_label'].notna()
    matching_rows = int(has_both_keys.sum())
    print(f'Rows matching with labels: {matching_rows}')

    # Sorting places the empty label-only rows between their neighbouring feature
    # rows; feature rows without a label (NaN key) are sorted to the end.
    merged = merged.sort_values(by=['epoch_time_label', time_column])
    merged = merged.interpolate(limit=interpolation_limit)

    merged = merged.drop(columns=['epoch_time_label', 'epoch_time_s'])
    # Label-only rows that were out of interpolation reach still have no timestamp.
    merged = merged.dropna(subset=[time_column])

    final_shape = merged.shape[0]
    filled_rows = final_shape - initial_shape
    # Guard against an empty feature frame, which would otherwise divide by zero.
    added_pct = round(filled_rows / initial_shape * 100, 2) if initial_shape else 0.0
    print(f'Final shape: {final_shape}')
    print(f'Filled rows: {filled_rows}')
    print(f'Added data %: {added_pct}%\n')

    return merged

In [28]:
def run_processing(data_types, input_file_name, input_file_name_label, time_column, interpolation_limit, output_file_suffix):
    """Run fill_empty_lines on one feature file for each data type and save the result.

    For every entry of ``data_types`` the feature file and the label file are read
    from ``<data_path>/<data_type>/``, passed to ``fill_empty_lines``, and the
    returned frame is written next to the inputs as
    ``<input stem><output_file_suffix>.parquet`` (without the index).

    Args:
        data_types: Iterable of sub-folder names under ``data_path``.
        input_file_name: Feature parquet file name inside each sub-folder.
        input_file_name_label: Label parquet file name inside each sub-folder.
        time_column: Timestamp column of the feature file, forwarded to
            ``fill_empty_lines``.
        interpolation_limit: Forwarded to ``fill_empty_lines``.
        output_file_suffix: Text appended to the input file stem for the output name.

    Returns:
        None. Results are written to disk.
    """
    # splitext removes only the final extension, so a dotted name such as
    # 'Location.v2.parquet' keeps its full stem instead of collapsing to 'Location'.
    output_stem = os.path.splitext(input_file_name)[0]
    output_file_name = output_stem + output_file_suffix + '.parquet'

    for data_type in data_types:
        data_dir = os.path.join(data_path, data_type)
        df_feature = pd.read_parquet(os.path.join(data_dir, input_file_name))
        df_label = pd.read_parquet(os.path.join(data_dir, input_file_name_label))
        df_result = fill_empty_lines(input_file_name, data_type, df_feature, df_label, time_column, interpolation_limit)
        df_result.to_parquet(os.path.join(data_dir, output_file_name), index=False)

In [29]:
# Options shared by every run; the per-file keys are filled in by the loop below.
settings = dict(
    data_types=['validate', 'train'],
    input_file_name_label='Label.parquet',
    interpolation_limit=5,
    output_file_suffix='_i',
)

# (feature file, name of its timestamp column), processed in this order.
feature_files = [
    ('Location.parquet', 'epoch_time'),
    ('Cells.parquet', 'epoch_time'),
    ('WiFi.parquet', 'Epoch time [ms]'),
    ('GPS.parquet', 'Epoch time [ms]'),
]

for feature_file, feature_time_column in feature_files:
    settings.update(input_file_name=feature_file, time_column=feature_time_column)
    run_processing(**settings)



Processing Location.parquet - validate
Initial shape: 101524
Rows matching with labels: 84711
Final shape: 103413
Filled rows: 1889
Added data %: 1.86%

Processing Location.parquet - train
Initial shape: 911109
Rows matching with labels: 662506
Final shape: 924248
Filled rows: 13139
Added data %: 1.44%

Processing Cells.parquet - validate
Initial shape: 333901
Rows matching with labels: 273240
Final shape: 336811
Filled rows: 2910
Added data %: 0.87%

Processing Cells.parquet - train
Initial shape: 4474380
Rows matching with labels: 2813310
Final shape: 4498873
Filled rows: 24493
Added data %: 0.55%

Processing WiFi.parquet - validate
Initial shape: 1486478
Rows matching with labels: 1121591
Final shape: 1488122
Filled rows: 1644
Added data %: 0.11%

Processing WiFi.parquet - train
Initial shape: 12604135
Rows matching with labels: 8761269
Final shape: 12615960
Filled rows: 11825
Added data %: 0.09%

Processing GPS.parquet - validate
Initial shape: 1516199
Rows matching with labels: 12