# Libraries

In [None]:
import joblib

import numpy as np

import os

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Constants

In [None]:
# every data location lives under the shared ../data folder
_DATA_ROOT = os.path.join('..', 'data')

# path of the input CSV (a file path, despite the *_DIR name)
INPUT_DIR = os.path.join(_DATA_ROOT, 'interim', 'version_02.csv')
# folder the fitted scalers are pickled into
SCALER_DIR = os.path.join(_DATA_ROOT, 'scalers')
# folder the final .npy arrays are written to
OUTPUT_DIR = os.path.join(_DATA_ROOT, 'processed')

# group-by keys: the data is split and windowed separately per key combination
GROUPS = ['record_id', 'aircraft_id']
# columns windowed over the forecast horizon and fed to the model as inputs
ACTIONS_FEATURES = ['cord_x', 'cord_y', 'cord_z']

# fraction of each group's rows held out as the test partition
TEST_RATIO = 0.30
# seed handed to train_test_split
RANDOM_STATE = 42

# number of rows in each lookback window
LOOKBACK_LAG = 180
# number of rows in each forecast window
FORECAST_LAG = 60

# Import Data

In [None]:
# load the interim dataset; the file uses ';' as its field separator
source = pd.read_csv(
    filepath_or_buffer=INPUT_DIR,
    sep=';',
)

# Preprocess

## Exclude enemy perspective

In [None]:
# keep only the rows whose aircraft_id is "ally_1", then renumber the index
# from 0 so it is contiguous again after the filter
source = source\
    .query('aircraft_id == "ally_1"')\
    .reset_index(drop=True)

## Calculate distances instead of positions (currently disabled: the cell below is commented out)

In [None]:
# dx = source.loc[:, 'transform_matrix_3'] - source.loc[:, 'transform_matrix_3_bandit']
# dz = source.loc[:, 'transform_matrix_11'] - source.loc[:, 'transform_matrix_11_bandit']

# source.loc[:, 'horizontal_distance'] = np.sqrt(dx**2 + dz**2)

# source.drop(
#     columns=[
#         'transform_matrix_3', 'transform_matrix_3_bandit',
#         'transform_matrix_11', 'transform_matrix_11_bandit'],
#     inplace=True)

## Rename transform matrix fields

In [None]:
# map the transform_matrix_* column names (own aircraft and bandit)
# to the cord_x / cord_y / cord_z names used by the rest of the notebook
coordinate_names = {
    'transform_matrix_3': 'cord_x',
    'transform_matrix_7': 'cord_y',
    'transform_matrix_11': 'cord_z',
    'transform_matrix_3_bandit': 'cord_x_bandit',
    'transform_matrix_7_bandit': 'cord_y_bandit',
    'transform_matrix_11_bandit': 'cord_z_bandit',
}
source = source.rename(columns=coordinate_names)

## Identify training and testing datasets

In [None]:
grouped = source.groupby(GROUPS)

# split every group on its own, without shuffling: the leading rows of a
# group go to train and its trailing TEST_RATIO share goes to test.
# (random_state is passed through but has no effect when shuffle=False.)
train_parts = []
test_parts = []
for _, df_0 in grouped:
    df_0_train, df_0_test = train_test_split(
        df_0,
        test_size=TEST_RATIO,
        random_state=RANDOM_STATE,
        shuffle=False)
    train_parts.append(df_0_train)
    test_parts.append(df_0_test)

# concatenate once at the end: calling pd.concat inside the loop copies the
# accumulated frame on every iteration, and seeding it with an empty
# DataFrame relies on deprecated empty-entry handling
train = pd.concat(train_parts, ignore_index=True) if train_parts else pd.DataFrame()
test = pd.concat(test_parts, ignore_index=True) if test_parts else pd.DataFrame()

## Normalize

In [None]:
def normalize_field_w_bandit(
        df_train: pd.DataFrame,
        df_test: pd.DataFrame,
        cols: list,
        name: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    '''
    min-max normalizes fields using one scaler shared by our and the bandit aircraft.

    for every entry of cols, both the column itself and its '<col>_bandit'
    counterpart are scaled. the scaler is fitted on the training values only
    (all listed columns stacked into a single feature) and then applied to
    the training and the testing frame.

    df_train, df_test: frames holding every column in cols plus the _bandit twins.
                       both frames are modified in place.
    cols: base column names sharing the scaler.
    name: file stem of the pickled scaler, saved as SCALER_DIR/<name>.pkl.

    returns the (df_train, df_test) pair, i.e. the same objects that were passed in
    '''

    # every column that shares this scaler: own field followed by its bandit twin
    shared_cols = [c for col in cols for c in (col, f'{col}_bandit')]

    # identify values to calculate stats: stack the (n, 1) training columns
    # on top of each other so the scaler sees a single feature
    all_values = np.concatenate(
        [df_train.loc[:, [c]].to_numpy() for c in shared_cols])

    # fit
    scaler = MinMaxScaler()
    scaler.fit(all_values)

    # transform
    for frame in (df_train, df_test):
        for c in shared_cols:
            frame.loc[:, [c]] = scaler.transform(frame.loc[:, [c]].to_numpy())

    # save scaler; joblib.dump does not create missing folders
    os.makedirs(SCALER_DIR, exist_ok=True)
    joblib.dump(scaler, os.path.join(SCALER_DIR, f'{name}.pkl'))

    return df_train, df_test

In [None]:
# one scaler per quantity, in this order; the three velocity components
# share a single scaler, every coordinate axis gets its own
for scaler_name, fields in (
        ('cord_y', ['cord_y']),
        ('cord_x', ['cord_x']),
        ('cord_z', ['cord_z']),
        ('velocity', ['velocity_x', 'velocity_y', 'velocity_z'])):
    train, test = normalize_field_w_bandit(train, test, fields, scaler_name)

## Produce lags

In [None]:
def get_lookback_lags(
        df: pd.DataFrame,
        features: list,
        lookback: 'int | None' = None) -> np.ndarray:
    '''
    builds backward-looking windows for every row of df.

    out[t, j, i] holds features[i] at row t - lookback + j, so slot 0 is the
    oldest step and the last slot is row t - 1 (the current row is excluded).
    np.roll is circular: the first `lookback` rows contain values wrapped
    around from the end of the series and have to be discarded by the caller.

    lookback: window length in rows, defaults to LOOKBACK_LAG.

    returns an array of shape (len(df), lookback, len(features))
    '''
    if lookback is None:
        lookback = LOOKBACK_LAG

    data = df\
        .loc[:, features]\
        .to_numpy()

    lb = np.zeros((
        len(data),
        lookback,
        len(features)))
    for j in range(lookback):
        # a positive shift moves earlier rows down: row t receives row t - shift.
        # all features are rolled together along the row axis.
        lb[:, j, :] = np.roll(data, shift=lookback - j, axis=0)
    return lb


def get_forecast_lags(
        df: pd.DataFrame,
        features: list,
        horizon: 'int | None' = None) -> np.ndarray:
    '''
    builds forward-looking windows for every row of df.

    out[t, j, i] holds features[i] at row t + j, so slot 0 is the current
    row and the last slot is row t + horizon - 1.
    np.roll is circular: the last `horizon - 1` rows contain values wrapped
    around from the start of the series and have to be discarded by the caller.

    horizon: window length in rows, defaults to FORECAST_LAG.

    returns an array of shape (len(df), horizon, len(features))
    '''
    if horizon is None:
        horizon = FORECAST_LAG

    data = df\
        .loc[:, features]\
        .to_numpy()

    fc = np.zeros((
        len(data),
        horizon,
        len(features)))
    for j in range(horizon):
        # the shift has to be negative: it moves later rows up, so row t
        # receives row t + j. a positive shift would fetch row t - j, i.e.
        # past values, instead of the future ones a forecast window needs.
        fc[:, j, :] = np.roll(data, shift=-j, axis=0)
    return fc


def produce_input_and_labels(data: pd.DataFrame):
    '''
    turns a frame into windowed model inputs and labels.

    data is processed per GROUPS combination. within a group the columns
    'aircraft_id', 'record_id' and 'timestep' are dropped and the remaining
    columns are split into bandit features (name ends with '_bandit') and
    our features (all others). for every row the lookback windows of both
    feature sets and the forecast windows of the position columns are built.

    rows without a full lookback history (the first LOOKBACK_LAG rows of a
    group) and rows without a full forecast horizon (the last
    FORECAST_LAG - 1 rows of a group) are discarded.

    returns (inputs, labels) with
        inputs = (lb_state_our, lb_state_bandit, fc_action_our)
        labels = (fc_state_our, fc_state_bandit)
    every array stacks the windows of all groups along axis 0.
    '''
    # one list of per-group arrays for every output array
    per_group = {
        'lb_state_our': [],
        'lb_state_bandit': [],
        'fc_state_our': [],
        'fc_state_bandit': [],
        'fc_action_our': [],
    }

    for _, df_0 in data.groupby(GROUPS):
        # non-mutating drop: the sub-frame handed out by groupby is left untouched
        df_0 = df_0.drop(columns=['aircraft_id', 'record_id', 'timestep'])

        bandit_features = [col for col in df_0.columns if col.endswith('_bandit')]
        our_features = [col for col in df_0.columns if not col.endswith('_bandit')]

        # NOTE(review): as currently configured ACTIONS_FEATURES lists the same
        # columns as fc_state_our, so fc_action_our (an input) duplicates a
        # label - confirm this is intended.
        windows = {
            'lb_state_our': get_lookback_lags(df=df_0, features=our_features),
            'lb_state_bandit': get_lookback_lags(df=df_0, features=bandit_features),
            'fc_state_our': get_forecast_lags(
                df=df_0, features=['cord_x', 'cord_y', 'cord_z']),
            'fc_state_bandit': get_forecast_lags(
                df=df_0, features=['cord_x_bandit', 'cord_y_bandit', 'cord_z_bandit']),
            'fc_action_our': get_forecast_lags(df=df_0, features=ACTIONS_FEATURES),
        }

        # explicit stop index instead of a negative one: a negative slice end
        # degenerates to [:0] (drops everything) when FORECAST_LAG is 1.
        # clamping to cut_start keeps too-short groups empty.
        cut_start = LOOKBACK_LAG
        cut_stop = max(len(df_0) - (FORECAST_LAG - 1), cut_start)
        for key, window in windows.items():
            per_group[key].append(window[cut_start:cut_stop])

    final = {
        key: np.concatenate(chunks, axis=0)
        for key, chunks in per_group.items()}

    inputs = (final['lb_state_our'], final['lb_state_bandit'], final['fc_action_our'])
    labels = (final['fc_state_our'], final['fc_state_bandit'])

    return inputs, labels

In [None]:
# window the train partition first, then the test partition; each call
# yields an (inputs, labels) pair of array tuples
(train_X, train_Y), (test_X, test_Y) = map(
    produce_input_and_labels, (train, test))

# Save

In [None]:
# np.save does not create missing folders
os.makedirs(OUTPUT_DIR, exist_ok=True)

# dataset names, in the order produce_input_and_labels returns the arrays
input_names = ('lb_state_our', 'lb_state_bandit', 'fc_action_our')
label_names = ('fc_state_our', 'fc_state_bandit')

for names, train_arrays, test_arrays in (
        (input_names, train_X, test_X),
        (label_names, train_Y, test_Y)):
    # pair every name with its train and test array instead of indexing
    for ds, train_array, test_array in zip(names, train_arrays, test_arrays):
        np.save(os.path.join(OUTPUT_DIR, f'train_{ds}.npy'), train_array)
        np.save(os.path.join(OUTPUT_DIR, f'test_{ds}.npy'), test_array)