In [1]:
import numpy as np
import pandas as pd
import os

os.chdir('../')
data_path = './data'

pd.set_option('display.max_columns', None)

In [230]:
def fill_empty_lines(input_file_name, data_type, df, df_label, time_column, fill_limit):   
    
    print(f'Processing {input_file_name} / {data_type}')
    initial_shape = df.shape[0]    
    print(f'Initial shape: {initial_shape}')
    
    col_types = df.dtypes
    
    df['epoch_time_s'] = df[time_column].round(-3)    
    
    df = df_label[['epoch_time']].rename(columns={'epoch_time': 'epoch_time_label'}).merge(df, left_on='epoch_time_label', right_on='epoch_time_s', how='outer')  
    
    diff = df[['epoch_time_label']].diff()
    diff.rename(columns={'epoch_time_label': 'time_diff'}, inplace=True)
    df = df.merge(diff, left_index=True, right_index=True)    
    
    df.sort_values(by=['epoch_time_label', time_column], inplace=True)    
    
    # Create segments with time diff greater than fill limit
    df['segment_change'] = df['time_diff'].apply(lambda x: 1 if x >= fill_limit * 1000 else 0)    
    df['segment_id'] = df['segment_change'].cumsum(axis=0)
    
    # Save original timestamp before it gets filled
    df_time = df[[time_column]]
    
    df = df.groupby('segment_id').ffill(limit=fill_limit).bfill(limit=fill_limit).drop(['epoch_time_s', time_column, 'time_diff', 'segment_change'], axis=1)
    
    df = df_time.merge(df, left_index=True, right_index=True) 
    df[time_column] = df[time_column].fillna(df['epoch_time_label'])
    
    df.drop('epoch_time_label', axis=1, inplace=True)   
        
    df = df.drop(df[df.iloc[:, 1].isna()].index)
    df.sort_values(by=[time_column], inplace=True)    
    df = df.astype(col_types)   
    
    final_shape = df.shape[0]    
    print(f'Final shape: {final_shape}')
    print(f'Added rows: {final_shape - initial_shape}\n')
    
    
    return df

In [231]:
def run_processing(data_types, input_file_name, input_file_name_label, time_column, fill_limit, output_file_suffix):
    for data_type in data_types:
        full_input_file_name = os.path.join(data_path, data_type, input_file_name)
        full_input_file_names_label = os.path.join(data_path, data_type, input_file_name_label)
        df_feature = pd.read_parquet(full_input_file_name)
        df_label = pd.read_parquet(full_input_file_names_label)
        df_result = fill_empty_lines(input_file_name, data_type, df_feature, df_label, time_column, fill_limit)
        df_result.to_parquet(os.path.join(data_path, data_type, input_file_name.split('.')[0] + output_file_suffix + '.parquet'), index=False)


In [232]:
settings = {
'data_types': ['validate', 'train'],
'input_file_name_label': 'Label.parquet',
'fill_limit': 30,
'output_file_suffix': '_i'
}

settings.update(input_file_name = 'Location.parquet', time_column = 'epoch_time')
df = run_processing(**settings)

settings.update(input_file_name = 'Cells.parquet', time_column = 'epoch_time')
run_processing(**settings)

settings.update(input_file_name = 'WiFi.parquet', time_column = 'Epoch time [ms]')
run_processing(**settings)

settings.update(input_file_name = 'GPS.parquet', time_column = 'Epoch time [ms]')
run_processing(**settings)



Processing Location.parquet / validate
Initial shape: 101524
Final shape: 107659
Added rows: 6135

Processing Location.parquet / train
Initial shape: 911109
Final shape: 954265
Added rows: 43156

Processing Cells.parquet / validate
Initial shape: 333901
Final shape: 340304
Added rows: 6403

Processing Cells.parquet / train
Initial shape: 4474380
Final shape: 4525006
Added rows: 50626

Processing WiFi.parquet / validate
Initial shape: 1486478
Final shape: 1489225
Added rows: 2747

Processing WiFi.parquet / train
Initial shape: 12604135
Final shape: 12623106
Added rows: 18971

Processing GPS.parquet / validate
Initial shape: 1516199
Final shape: 1521278
Added rows: 5079

Processing GPS.parquet / train
Initial shape: 14046969
Final shape: 14076453
Added rows: 29484

