#### File Header Here

#### Preprocessing stuff

In [1]:
import glob
import os
import math
import torch
import random
import pandas as pd
import numpy as np

#list of dataframes into non overlapping sliding windows
def list_df_to_contiguous_sliding_windows(list_df, window_size, rem_beg=True):
    """Cut every dataframe of ``list_df`` into non-overlapping windows.

    Each dataframe is windowed on its own by
    ``df_to_contiguous_sliding_windows`` and the resulting windows are
    concatenated, in input order, into one flat list.

    Args:
        list_df: iterable of dataframes to window.
        window_size: number of rows per window.
        rem_beg: forwarded to the per-dataframe helper. When true, the rows
            that do not fill a whole window are dropped from the start of
            each dataframe, otherwise from its end.

    Returns:
        A flat list holding the windows of all dataframes.
    """
    all_windows = []
    for df in list_df:
        # Forward the caller's choice: this used to be hard-coded to True,
        # which silently ignored rem_beg=False.
        all_windows.extend(df_to_contiguous_sliding_windows(df, window_size, rem_beg=rem_beg))
    return all_windows

#single dataframe into non overlapping sliding windows
def df_to_contiguous_sliding_windows(df, window_size, rem_beg=True):
    """Cut one dataframe into consecutive, non-overlapping windows.

    The rows that do not fill a complete window (``len(df) % window_size``)
    are discarded first, then the remaining rows are grouped
    ``window_size`` at a time.

    Args:
        df: dataframe to window.
        window_size: number of rows per window; must be at least 1.
        rem_beg: when true the surplus rows are removed from the start of the
            dataframe, otherwise from its end.

    Returns:
        A list of windows; each window is a list of ``window_size`` rows and
        each row is a list of cell values. Empty when the dataframe has fewer
        rows than ``window_size``.

    Raises:
        ValueError: if ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError('window_size must be a positive integer, got {!r}'.format(window_size))

    n_rows = len(df)
    leftover = n_rows % window_size
    if rem_beg:
        trimmed = df.iloc[leftover:]
    else:
        # Written as an explicit stop so that leftover == 0 keeps every row
        # (a plain ``[:-0]`` would select nothing).
        trimmed = df.iloc[:n_rows - leftover]

    # One vectorized conversion instead of building a Series per row.
    rows = trimmed.values.tolist()
    return [rows[start:start + window_size] for start in range(0, len(rows), window_size)]

#list of dataframes into overlapping sliding windows
def list_df_to_overlapping_sliding_windows(list_df, window_size):
    """Collect the overlapping windows of every dataframe into one flat list.

    Each dataframe is handed to ``df_to_overlapping_sliding_windows`` and the
    windows it yields are appended in order, dataframe after dataframe.
    """
    return [
        window
        for frame in list_df
        for window in df_to_overlapping_sliding_windows(frame, window_size)
    ]

#single dataframe into overlapping sliding windows
def df_to_overlapping_sliding_windows(dataframe, window_size):
    """Build every window of ``window_size`` consecutive rows (stride 1).

    Window ``i`` covers rows ``i`` to ``i + window_size - 1`` of the
    dataframe's values. The windows are produced without copying: the result
    is a strided view over ``dataframe.values``.

    Args:
        dataframe: dataframe to window.
        window_size: number of rows per window; must be at least 1.

    Returns:
        A read-only ndarray of shape
        ``(n_rows - window_size + 1, window_size, n_columns)``. When the
        dataframe has fewer rows than ``window_size`` an empty array of shape
        ``(0, window_size, n_columns)`` is returned.

    Raises:
        ValueError: if ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError('window_size must be a positive integer, got {!r}'.format(window_size))

    values_array = dataframe.values
    n_rows, n_cols = values_array.shape

    # Too short for even one window: asking as_strided for a negative first
    # dimension would raise, so return an explicit empty result instead.
    if n_rows < window_size:
        return np.empty((0, window_size, n_cols), dtype=values_array.dtype)

    row_stride, col_stride = values_array.strides
    # The windows share memory with each other, so the view is made read-only:
    # writing through it would silently change several windows at once.
    return np.lib.stride_tricks.as_strided(
        values_array,
        shape=(n_rows - window_size + 1, window_size, n_cols),
        strides=(row_stride, row_stride, col_stride),
        writeable=False,
    )

#function to remove N starting rows from a dataframe
def remove_df_starting_rows(dataframe, amount_to_remove):
    """Return ``dataframe`` without its first ``amount_to_remove`` rows.

    When ``amount_to_remove`` is not positive the dataframe is handed back
    untouched (the very same object, not a copy).
    """
    return dataframe.iloc[amount_to_remove:] if amount_to_remove > 0 else dataframe

#function to remove N trailing rows from a dataframe
def remove_df_ending_rows(dataframe, amount_to_remove):
    """Return ``dataframe`` without its last ``amount_to_remove`` rows.

    When ``amount_to_remove`` is not positive the dataframe is handed back
    untouched (the very same object, not a copy).
    """
    return dataframe.iloc[:-amount_to_remove] if amount_to_remove > 0 else dataframe

#function for ordering files. Very basic (hardcoded positions)
def sortKeyFunc(s):
    """Return the integer that ends a file's name, for numeric ordering.

    The number is read from the trailing digits of the file name once the
    directory part and the extension are removed, e.g.
    ``'../DataBombardier/flight_test_12.csv'`` gives ``12``. Unlike a fixed
    character slice, this works for any prefix and any extension length.

    Args:
        s: path of the file.

    Returns:
        The trailing number of the file stem as an int.

    Raises:
        ValueError: if the file stem does not end with a digit.
    """
    stem = os.path.splitext(os.path.basename(s))[0]
    # Whatever rstrip removed from the end of the stem is the trailing number.
    digits = stem[len(stem.rstrip('0123456789')):]
    if not digits:
        raise ValueError('no trailing number in file name: {!r}'.format(s))
    return int(digits)

#list of dataframe rows into vectors
def list_df_rows_to_vectors(list_df):
    """Flatten the rows of several dataframes into one list of vectors.

    Every dataframe is converted by ``df_rows_to_vectors`` and the vectors are
    kept in order, dataframe after dataframe.
    """
    return [vector for frame in list_df for vector in df_rows_to_vectors(frame)]

#dataframe rows into a vector
def df_rows_to_vectors(df):
    """Convert a dataframe into a list of row vectors.

    Args:
        df: dataframe to convert.

    Returns:
        A list with one entry per row, each entry being the list of that
        row's cell values in column order. Empty for an empty dataframe.
    """
    # A single vectorized conversion replaces the former per-row ``iloc``
    # lookup, which built one Series per row.
    return df.values.tolist()

#splitting a python list or ndarray into 2 parts according to the specified ratio
def split_list_on_ratio(liste, ratio, shuffle=False, random_seed=42):
    """Split a sequence into two consecutive parts according to ``ratio``.

    The first part receives ``floor(len(liste) * ratio)`` elements, the second
    part receives the rest.

    Args:
        liste: list or ndarray to split. It is never modified.
        ratio: fraction of the elements that goes into the first part.
        shuffle: when true, the elements are shuffled before splitting.
        random_seed: seed of the shuffle, making the split reproducible.

    Returns:
        A ``(first_part, second_part)`` tuple of slices.
    """
    if shuffle:
        # Shuffle a copy with a private generator: the caller's sequence and
        # the global ``random`` state are left untouched, while the resulting
        # permutation is the same one ``random.seed(seed)`` followed by
        # ``random.shuffle`` would produce.
        liste = liste.copy()
        random.Random(random_seed).shuffle(liste)

    index_where_to_split = math.floor(len(liste) * ratio)
    return liste[:index_where_to_split], liste[index_where_to_split:]

#loading multiple csv files into a list of dataframes, one dataframe per csv file. Partly hardcoded for the Bombardier flight test CSV files.
def bomb_csv_to_df(csv_stringLoader):
    """Load every CSV file matching a glob pattern into its own dataframe.

    Files are processed in the order given by ``sortKeyFunc`` and each path is
    printed as it is loaded. For every file the 'Description' column is
    dropped and 'TIME OF DAY IN SECONDS' is used as index while the header
    rows are picked out: row 0 is kept as the 'Unit' series and row 1 as the
    'Type' series. The data itself starts at row 3 (row 2 is discarded) and
    gets its index reset, which turns the time back into a regular column.

    NOTE(review): the sample outputs in this notebook show the cell values as
    strings, so the columns appear not to be converted to numbers here —
    confirm whether a numeric conversion is expected downstream.

    Args:
        csv_stringLoader: glob pattern of the CSV files to load.

    Returns:
        Three lists with one entry per file: the data dataframes, the 'Unit'
        series and the 'Type' series.
    """
    frames, units_per_file, types_per_file = [], [], []

    for csv_path in sorted(glob.glob(csv_stringLoader), key=sortKeyFunc):
        print('Loading:{}'.format(csv_path))
        raw = pd.read_csv(csv_path).drop('Description', axis=1).set_index('TIME OF DAY IN SECONDS')

        units_row = raw.iloc[0]
        units_row.name = 'Unit'
        types_row = raw.iloc[1]
        types_row.name = 'Type'

        frames.append(raw.iloc[3:].reset_index())
        units_per_file.append(units_row)
        types_per_file.append(types_row)

    return frames, units_per_file, types_per_file

#TODO: Func to verify if list_data_units and list_data_label_type are all the same

#purely exogeneous (non-regressive)
def list_df_to_exogeneous_df(list_df, target_choice):
    """Select exogenous feature columns and bay-temperature target columns.

    ``target_choice`` picks the targets: 0 for both avionics bays, 1 for the
    first bay, 2 for the second bay; any other value behaves like 1. The
    features are the time column plus the exogenous measurements matching the
    chosen bay(s).

    Returns:
        ``(list_df_features, list_df_targets)``, each holding one dataframe
        per input dataframe.
    """
    columns_by_choice = {
        0: (
            ['TIME OF DAY IN SECONDS', '1st Cooling Sys MASS FLOW', '2nd Cooling Sys MASS FLOW',
             'ACS_Zone_Actual_Temperature', 'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach',
             '1st Underfloor flow', '2nd Underfloor flow'],
            ['1st AVIONICS BAY BULK TEMP', '2nd AVIONICS BAY BULK TEMP'],
        ),
        1: (
            ['TIME OF DAY IN SECONDS', '1st Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature',
             'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '1st Underfloor flow'],
            ['1st AVIONICS BAY BULK TEMP'],
        ),
        2: (
            ['TIME OF DAY IN SECONDS', '2nd Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature',
             'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '2nd Underfloor flow'],
            ['2nd AVIONICS BAY BULK TEMP'],
        ),
    }
    # Unknown choices fall back to the first bay.
    feature_columns, target_columns = columns_by_choice.get(target_choice, columns_by_choice[1])

    list_df_features = [frame[feature_columns] for frame in list_df]
    list_df_targets = [frame[target_columns] for frame in list_df]
    return list_df_features, list_df_targets

#purely endogenous (non-autoregressive)
#to verify if the model can learn copying data
def list_df_to_endogeneous_df(list_df, target_choice):
    """Select bay temperatures both as features (with time) and as targets.

    ``target_choice`` picks the bay(s): 0 for both avionics bays, 1 for the
    first bay, 2 for the second bay; any other value behaves like 1. The
    features are the time column followed by the chosen target columns.

    Returns:
        ``(list_df_features, list_df_targets)``, each holding one dataframe
        per input dataframe.
    """
    targets_by_choice = {
        0: ['1st AVIONICS BAY BULK TEMP', '2nd AVIONICS BAY BULK TEMP'],
        1: ['1st AVIONICS BAY BULK TEMP'],
        2: ['2nd AVIONICS BAY BULK TEMP'],
    }
    # Unknown choices fall back to the first bay.
    target_columns = targets_by_choice.get(target_choice, targets_by_choice[1])
    feature_columns = ['TIME OF DAY IN SECONDS'] + target_columns

    list_df_features = [frame[feature_columns] for frame in list_df]
    list_df_targets = [frame[target_columns] for frame in list_df]
    return list_df_features, list_df_targets

#exogeneous & endogenous (non-autoregressive)
#to verify if the model can learn copying data with more data
def list_df_to_endo_exo_df(list_df, target_choice):
    """Select exogenous plus bay-temperature features, and bay targets.

    ``target_choice`` picks the bay(s): 0 for both avionics bays, 1 for the
    first bay, 2 for the second bay; any other value behaves like 1. The
    features are the time column, the exogenous measurements of the chosen
    bay(s) and, last, the target columns themselves.

    Returns:
        ``(list_df_features, list_df_targets)``, each holding one dataframe
        per input dataframe.
    """
    columns_by_choice = {
        0: (
            ['TIME OF DAY IN SECONDS', '1st Cooling Sys MASS FLOW', '2nd Cooling Sys MASS FLOW',
             'ACS_Zone_Actual_Temperature', 'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach',
             '1st Underfloor flow', '2nd Underfloor flow'],
            ['1st AVIONICS BAY BULK TEMP', '2nd AVIONICS BAY BULK TEMP'],
        ),
        1: (
            ['TIME OF DAY IN SECONDS', '1st Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature',
             'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '1st Underfloor flow'],
            ['1st AVIONICS BAY BULK TEMP'],
        ),
        2: (
            ['TIME OF DAY IN SECONDS', '2nd Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature',
             'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '2nd Underfloor flow'],
            ['2nd AVIONICS BAY BULK TEMP'],
        ),
    }
    # Unknown choices fall back to the first bay.
    exo_columns, target_columns = columns_by_choice.get(target_choice, columns_by_choice[1])
    feature_columns = exo_columns + target_columns

    list_df_features = [frame[feature_columns] for frame in list_df]
    list_df_targets = [frame[target_columns] for frame in list_df]
    return list_df_features, list_df_targets

#forecasting (endogeneous)
def list_df_forecasting_endo_df(list_df, target_choice, shift_delta=1):
    """Build forecasting pairs from the bay temperatures alone.

    Row ``i`` of a features dataframe is paired with the target values found
    ``shift_delta`` rows later in the source dataframe. The last
    ``shift_delta`` rows, which have no future target, are removed from both
    the features and the targets.

    Args:
        list_df: dataframes to process, one per flight.
        target_choice: 0 for both avionics bays, 1 for the first bay, 2 for
            the second bay; any other value behaves like 1.
        shift_delta: how many rows ahead the targets lie; must be >= 0.

    Returns:
        ``(list_df_features, list_df_targets)``, each holding one dataframe
        per input dataframe.

    Raises:
        ValueError: if ``shift_delta`` is negative.
    """
    if shift_delta < 0:
        raise ValueError('shift_delta must be non-negative, got {!r}'.format(shift_delta))

    targets_by_choice = {
        0: ['1st AVIONICS BAY BULK TEMP', '2nd AVIONICS BAY BULK TEMP'],
        1: ['1st AVIONICS BAY BULK TEMP'],
        2: ['2nd AVIONICS BAY BULK TEMP'],
    }
    # Unknown choices fall back to the first bay.
    target_columns = targets_by_choice.get(target_choice, targets_by_choice[1])
    feature_columns = ['TIME OF DAY IN SECONDS'] + target_columns

    list_df_features = []
    list_df_targets = []
    for df in list_df:
        # Trim by position rather than by index label, so that repeated index
        # labels cannot cause more than shift_delta rows to be removed.
        rows_kept = max(len(df) - shift_delta, 0)
        list_df_features.append(df[feature_columns].iloc[:rows_kept])
        list_df_targets.append(df[target_columns].shift(-shift_delta).iloc[:rows_kept])

    return list_df_features, list_df_targets

#forecasting (exogeneous)
def list_df_forecasting_exo_df(list_df, target_choice, shift_delta=1):
    """Build forecasting pairs from the exogenous measurements.

    Row ``i`` of a features dataframe is paired with the bay-temperature
    values found ``shift_delta`` rows later in the source dataframe. The last
    ``shift_delta`` rows, which have no future target, are removed from both
    the features and the targets.

    Args:
        list_df: dataframes to process, one per flight.
        target_choice: 0 for both avionics bays, 1 for the first bay, 2 for
            the second bay; any other value behaves like 1.
        shift_delta: how many rows ahead the targets lie; must be >= 0.

    Returns:
        ``(list_df_features, list_df_targets)``, each holding one dataframe
        per input dataframe.

    Raises:
        ValueError: if ``shift_delta`` is negative.
    """
    if shift_delta < 0:
        raise ValueError('shift_delta must be non-negative, got {!r}'.format(shift_delta))

    columns_by_choice = {
        0: (
            ['TIME OF DAY IN SECONDS', '1st Cooling Sys MASS FLOW', '2nd Cooling Sys MASS FLOW',
             'ACS_Zone_Actual_Temperature', 'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach',
             '1st Underfloor flow', '2nd Underfloor flow'],
            ['1st AVIONICS BAY BULK TEMP', '2nd AVIONICS BAY BULK TEMP'],
        ),
        1: (
            ['TIME OF DAY IN SECONDS', '1st Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature',
             'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '1st Underfloor flow'],
            ['1st AVIONICS BAY BULK TEMP'],
        ),
        2: (
            ['TIME OF DAY IN SECONDS', '2nd Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature',
             'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '2nd Underfloor flow'],
            ['2nd AVIONICS BAY BULK TEMP'],
        ),
    }
    # Unknown choices fall back to the first bay.
    feature_columns, target_columns = columns_by_choice.get(target_choice, columns_by_choice[1])

    list_df_features = []
    list_df_targets = []
    for df in list_df:
        # Trim by position rather than by index label, so that repeated index
        # labels cannot cause more than shift_delta rows to be removed.
        rows_kept = max(len(df) - shift_delta, 0)
        list_df_features.append(df[feature_columns].iloc[:rows_kept])
        list_df_targets.append(df[target_columns].shift(-shift_delta).iloc[:rows_kept])

    return list_df_features, list_df_targets

#forecasting (endogeneous & exogeneous)
def list_df_forecasting_endo_exo_df(list_df, target_choice, shift_delta=1):
    """Build forecasting pairs from exogenous data plus bay temperatures.

    The features are the time column, the exogenous measurements of the
    chosen bay(s) and the current bay temperature(s). Row ``i`` of a features
    dataframe is paired with the bay-temperature values found ``shift_delta``
    rows later in the source dataframe. The last ``shift_delta`` rows, which
    have no future target, are removed from both the features and the
    targets.

    Args:
        list_df: dataframes to process, one per flight.
        target_choice: 0 for both avionics bays, 1 for the first bay, 2 for
            the second bay; any other value behaves like 1.
        shift_delta: how many rows ahead the targets lie; must be >= 0.

    Returns:
        ``(list_df_features, list_df_targets)``, each holding one dataframe
        per input dataframe.

    Raises:
        ValueError: if ``shift_delta`` is negative.
    """
    if shift_delta < 0:
        raise ValueError('shift_delta must be non-negative, got {!r}'.format(shift_delta))

    columns_by_choice = {
        0: (
            ['TIME OF DAY IN SECONDS', '1st Cooling Sys MASS FLOW', '2nd Cooling Sys MASS FLOW',
             'ACS_Zone_Actual_Temperature', 'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach',
             '1st Underfloor flow', '2nd Underfloor flow'],
            ['1st AVIONICS BAY BULK TEMP', '2nd AVIONICS BAY BULK TEMP'],
        ),
        1: (
            ['TIME OF DAY IN SECONDS', '1st Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature',
             'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '1st Underfloor flow'],
            ['1st AVIONICS BAY BULK TEMP'],
        ),
        2: (
            ['TIME OF DAY IN SECONDS', '2nd Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature',
             'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '2nd Underfloor flow'],
            ['2nd AVIONICS BAY BULK TEMP'],
        ),
    }
    # Unknown choices fall back to the first bay.
    exo_columns, target_columns = columns_by_choice.get(target_choice, columns_by_choice[1])
    feature_columns = exo_columns + target_columns

    list_df_features = []
    list_df_targets = []
    for df in list_df:
        # Trim by position rather than by index label, so that repeated index
        # labels cannot cause more than shift_delta rows to be removed.
        rows_kept = max(len(df) - shift_delta, 0)
        list_df_features.append(df[feature_columns].iloc[:rows_kept])
        list_df_targets.append(df[target_columns].shift(-shift_delta).iloc[:rows_kept])

    return list_df_features, list_df_targets


## Vector Datasets

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.utils.data as data
#from source.utils.preprocessing import *


def get_vectors_datasets(csv_files_path, forecasting, feature_endo, feature_exo, target_choice, shift_delta, train_test_ratio, train_valid_ratio, shuffle=False, random_seed=42):
    """Load the flight CSV files and build train/valid/test vector datasets.

    The split is made on whole flights (one dataframe per CSV file), not on
    the total amount of vectors: the flights are first divided into
    train+valid and test with ``train_test_ratio``, then the first part is
    divided again into train and valid with ``train_valid_ratio``.

    Returns:
        ``(train, valid, test)`` as three ``VectorsDataset`` objects.
    """
    # The unit and label-type rows returned by the loader are not needed here.
    flights, _units, _label_types = bomb_csv_to_df(csv_files_path)

    train_valid_flights, test_flights = split_list_on_ratio(flights, train_test_ratio, shuffle, random_seed)
    train_flights, valid_flights = split_list_on_ratio(train_valid_flights, train_valid_ratio, shuffle, random_seed)

    dataset_args = (forecasting, feature_endo, feature_exo, target_choice, shift_delta)
    return tuple(
        VectorsDataset(subset, *dataset_args)
        for subset in (train_flights, valid_flights, test_flights)
    )
    
class VectorsDataset(data.Dataset):
    """Dataset in which every sample is a single time step (one row).

    A sample is a ``(features, targets)`` pair of plain lists taken from the
    same row position of the feature and target dataframes.
    """

    def __init__(self, list_df, forecasting, feature_endo, feature_exo, target_choice, shift_delta=1, target_type_string='Regression'):
        """Select the feature/target columns and flatten all rows into vectors.

        Args:
            list_df: dataframes to draw the samples from.
            forecasting: if true the targets lie ``shift_delta`` steps ahead
                of the features, otherwise they belong to the same time step.
            feature_endo: use the bay temperature(s) as features.
            feature_exo: use the other data (not bay temperature) as features.
                When both flags are set, or neither, both kinds are used.
            target_choice: bay 1 (1), bay 2 (2) or both bays (0) as targets.
            shift_delta: N steps ahead for the targets; forecasting only.
            target_type_string: either 'Regression' or 'Classification'.
                Required for other objects down the training pipeline.
        """
        self.target_type_string = target_type_string

        endo_only = feature_endo and not feature_exo
        exo_only = feature_exo and not feature_endo

        if forecasting:  # forecasting task (N step ahead)
            if endo_only:
                build_frames = list_df_forecasting_endo_df
            elif exo_only:
                build_frames = list_df_forecasting_exo_df
            else:
                build_frames = list_df_forecasting_endo_exo_df
            list_df_features, list_df_targets = build_frames(list_df, target_choice, shift_delta)
        else:  # intra step prediction
            if endo_only:
                build_frames = list_df_to_endogeneous_df
            elif exo_only:
                build_frames = list_df_to_exogeneous_df
            else:
                build_frames = list_df_to_endo_exo_df
            list_df_features, list_df_targets = build_frames(list_df, target_choice)

        self.features = list_df_rows_to_vectors(list_df_features)
        self.targets = list_df_rows_to_vectors(list_df_targets)

    def __getitem__(self, index):
        """Return the ``(features, targets)`` lists of sample ``index``."""
        #TODO: Verify if returns as "Lists" are problematics for the dataloaders (might want Tensors and basetypes such as floats)
        return self.features[index], self.targets[index]

    def __len__(self):
        """Return the number of samples.

        Raises:
            ValueError: if the amounts of features and targets differ.
        """
        if len(self.features) != len(self.targets):
            # Raising a bare string is itself a TypeError; use a real exception.
            raise ValueError('The dataset does not have one target per feature and vice versa')
        return len(self.targets)

In [3]:
# Settings for the vector (single time step) datasets.
# Glob pattern of the flight-test CSV files to load.
csv_files_path = "../DataBombardier/flight_test_*.csv"
# False: the targets belong to the same time step as the features.
forecasting = False
# Exogenous features only (no bay temperature among the features).
feature_endo = False
feature_exo = True
# 0 selects both bays as targets (1 = bay 1, 2 = bay 2).
target_choice = 0
# Steps ahead for the targets; only used when forecasting is True.
shift_delta = 1
# Share of the flights kept for train+valid, then share of those kept for train.
train_test_ratio = 0.8
train_valid_ratio = 0.8

In [4]:
# Build the train/valid/test vector datasets from the settings above (flights are not shuffled).
sfv_ds_train, sfv_ds_valid, sfv_ds_test = get_vectors_datasets(csv_files_path,
                                                                forecasting, feature_endo, feature_exo, target_choice, shift_delta, 
                                                                train_test_ratio, train_valid_ratio, 
                                                                shuffle=False, random_seed=42)

Loading:../DataBombardier/flight_test_1.csv
Loading:../DataBombardier/flight_test_2.csv
Loading:../DataBombardier/flight_test_3.csv
Loading:../DataBombardier/flight_test_4.csv
Loading:../DataBombardier/flight_test_5.csv
Loading:../DataBombardier/flight_test_6.csv
Loading:../DataBombardier/flight_test_7.csv
Loading:../DataBombardier/flight_test_8.csv
Loading:../DataBombardier/flight_test_9.csv
Loading:../DataBombardier/flight_test_10.csv
Loading:../DataBombardier/flight_test_11.csv
Loading:../DataBombardier/flight_test_12.csv
Loading:../DataBombardier/flight_test_13.csv


In [8]:
sfvf, sfvt = sfv_ds_train[12]

In [9]:
sfvf

['17926',
 '5.159813',
 '17.693799',
 '29.449458',
 '26.194133',
 '2364.0',
 '0.008864',
 '8.542919',
 '15.155848']

In [10]:
sfvt

['35.230574', '33.70978508573387']

## Contiguous Datasets

In [11]:
import pandas as pd
import numpy as np
import torch
import torch.utils.data as data
#from source.utils.preprocessing import *


def get_contiguous_windows_datasets(csv_files_path, forecasting, feature_endo, feature_exo, target_choice, windows_size, shift_delta, train_test_ratio, train_valid_ratio, remove_beg_rows=True, shuffle=False, random_seed=42):
    """Load the flight CSV files and build train/valid/test window datasets.

    The split is made on whole flights (one dataframe per CSV file), not on
    the total amount of windows: the flights are first divided into
    train+valid and test with ``train_test_ratio``, then the first part is
    divided again into train and valid with ``train_valid_ratio``.

    Returns:
        ``(train, valid, test)`` as three ``ContiguousWindowsDataset`` objects.
    """
    # The unit and label-type rows returned by the loader are not needed here.
    flights, _units, _label_types = bomb_csv_to_df(csv_files_path)

    train_valid_flights, test_flights = split_list_on_ratio(flights, train_test_ratio, shuffle, random_seed)
    train_flights, valid_flights = split_list_on_ratio(train_valid_flights, train_valid_ratio, shuffle, random_seed)

    dataset_args = (forecasting, feature_endo, feature_exo, target_choice, windows_size, shift_delta, remove_beg_rows)
    return tuple(
        ContiguousWindowsDataset(subset, *dataset_args)
        for subset in (train_flights, valid_flights, test_flights)
    )
    
class ContiguousWindowsDataset(data.Dataset):
    """Dataset in which every sample is a non-overlapping window of rows.

    A sample pairs a whole window of feature rows with the target row found
    at the last position of the matching target window.
    """

    def __init__(self, list_df, forecasting, feature_endo, feature_exo, target_choice, windows_size, shift_delta=1, remove_beg_rows=True, target_type_string='Regression'):
        """Select the feature/target columns and cut them into windows.

        Args:
            list_df: dataframes to draw the samples from.
            forecasting: if true the targets lie ``shift_delta`` steps ahead
                of the features, otherwise they belong to the same time step.
            feature_endo: use the bay temperature(s) as features.
            feature_exo: use the other data (not bay temperature) as features.
                When both flags are set, or neither, both kinds are used.
            target_choice: bay 1 (1), bay 2 (2) or both bays (0) as targets.
            windows_size: number of rows per window.
            shift_delta: N steps ahead for the targets; forecasting only.
            remove_beg_rows: passed on as ``rem_beg`` to the windowing helper.
            target_type_string: either 'Regression' or 'Classification'.
                Required for other objects down the training pipeline.
        """
        self.target_type_string = target_type_string

        endo_only = feature_endo and not feature_exo
        exo_only = feature_exo and not feature_endo

        if forecasting:  # forecasting task (N step ahead)
            if endo_only:
                build_frames = list_df_forecasting_endo_df
            elif exo_only:
                build_frames = list_df_forecasting_exo_df
            else:
                build_frames = list_df_forecasting_endo_exo_df
            list_df_features, list_df_targets = build_frames(list_df, target_choice, shift_delta)
        else:  # intra step prediction
            if endo_only:
                build_frames = list_df_to_endogeneous_df
            elif exo_only:
                build_frames = list_df_to_exogeneous_df
            else:
                build_frames = list_df_to_endo_exo_df
            list_df_features, list_df_targets = build_frames(list_df, target_choice)

        self.features = list_df_to_contiguous_sliding_windows(list_df_features, windows_size, rem_beg=remove_beg_rows)
        self.targets = list_df_to_contiguous_sliding_windows(list_df_targets, windows_size, rem_beg=remove_beg_rows)

    def __getitem__(self, index):
        """Return the feature window ``index`` and the last row of its target window."""
        #TODO: Verify if returns as "Lists" are problematics for the dataloaders (might want Tensors and basetypes such as floats)
        return self.features[index], self.targets[index][-1]

    def __len__(self):
        """Return the number of windows.

        Raises:
            ValueError: if the amounts of feature and target windows differ.
        """
        if len(self.features) != len(self.targets):
            # Raising a bare string is itself a TypeError; use a real exception.
            raise ValueError('The dataset does not have one target per feature and vice versa')
        return len(self.targets)

In [12]:
# Settings for the contiguous (non-overlapping) windows datasets.
# Glob pattern of the flight-test CSV files to load.
csv_files_path = "../DataBombardier/flight_test_*.csv"
# False: the targets belong to the same time step as the features.
forecasting = False
# Endogenous features only (bay temperatures, no exogenous data).
feature_endo = True
feature_exo = False
# 0 selects both bays as targets (1 = bay 1, 2 = bay 2).
target_choice = 0
# Steps ahead for the targets; only used when forecasting is True.
shift_delta = 3
# Number of rows per window.
windows_size = 4
# Share of the flights kept for train+valid, then share of those kept for train.
train_test_ratio = 0.8
train_valid_ratio = 0.8

In [13]:
# Build the train/valid/test contiguous-window datasets from the settings above.
cw_ds_train, cw_ds_valid, cw_ds_test = get_contiguous_windows_datasets(csv_files_path, 
                                                                       forecasting, feature_endo, feature_exo, target_choice, 
                                                                       windows_size, shift_delta, 
                                                                       train_test_ratio, train_valid_ratio)

Loading:../DataBombardier/flight_test_1.csv
Loading:../DataBombardier/flight_test_2.csv
Loading:../DataBombardier/flight_test_3.csv
Loading:../DataBombardier/flight_test_4.csv
Loading:../DataBombardier/flight_test_5.csv
Loading:../DataBombardier/flight_test_6.csv
Loading:../DataBombardier/flight_test_7.csv
Loading:../DataBombardier/flight_test_8.csv
Loading:../DataBombardier/flight_test_9.csv
Loading:../DataBombardier/flight_test_10.csv
Loading:../DataBombardier/flight_test_11.csv
Loading:../DataBombardier/flight_test_12.csv
Loading:../DataBombardier/flight_test_13.csv


In [14]:
cwf, cwt = cw_ds_train[0]

In [15]:
cwf

[['17902', '35.240755', '33.76746076717663'],
 ['17904', '35.250933', '33.78781427556584'],
 ['17906', '35.233968', '33.77085186112588'],
 ['17908', '35.244148', '33.78781427556584']]

In [16]:
cwt

['35.244148', '33.78781427556584']

## Overlapping Datasets

In [8]:
import pandas as pd
import numpy as np
import torch
import torch.utils.data as data
#from source.utils.preprocessing import *


def get_overlapping_windows_datasets(csv_files_path, forecasting, feature_endo, feature_exo, target_choice, windows_size, shift_delta, train_test_ratio, train_valid_ratio, shuffle=False, random_seed=42):
    """Load the flight CSV files and build train/valid/test window datasets.

    The split is made on whole flights (one dataframe per CSV file), not on
    the total amount of windows: the flights are first divided into
    train+valid and test with ``train_test_ratio``, then the first part is
    divided again into train and valid with ``train_valid_ratio``.

    Returns:
        ``(train, valid, test)`` as three ``OverlappingWindowsDataset`` objects.
    """
    # The unit and label-type rows returned by the loader are not needed here.
    flights, _units, _label_types = bomb_csv_to_df(csv_files_path)

    train_valid_flights, test_flights = split_list_on_ratio(flights, train_test_ratio, shuffle, random_seed)
    train_flights, valid_flights = split_list_on_ratio(train_valid_flights, train_valid_ratio, shuffle, random_seed)

    dataset_args = (forecasting, feature_endo, feature_exo, target_choice, windows_size, shift_delta)
    return tuple(
        OverlappingWindowsDataset(subset, *dataset_args)
        for subset in (train_flights, valid_flights, test_flights)
    )
    
class OverlappingWindowsDataset(data.Dataset):
    """Dataset in which every sample is an overlapping window of rows.

    A sample pairs a whole window of feature rows with the target row found
    at the last position of the matching target window; both are returned as
    plain lists.
    """

    def __init__(self, list_df, forecasting, feature_endo, feature_exo, target_choice, windows_size, shift_delta=1, target_type_string='Regression'):
        """Select the feature/target columns and cut them into windows.

        Args:
            list_df: dataframes to draw the samples from.
            forecasting: if true the targets lie ``shift_delta`` steps ahead
                of the features, otherwise they belong to the same time step.
            feature_endo: use the bay temperature(s) as features.
            feature_exo: use the other data (not bay temperature) as features.
                When both flags are set, or neither, both kinds are used.
            target_choice: bay 1 (1), bay 2 (2) or both bays (0) as targets.
            windows_size: number of rows per window.
            shift_delta: N steps ahead for the targets; forecasting only.
            target_type_string: either 'Regression' or 'Classification'.
                Required for other objects down the training pipeline.
        """
        self.target_type_string = target_type_string

        endo_only = feature_endo and not feature_exo
        exo_only = feature_exo and not feature_endo

        if forecasting:  # forecasting task (N step ahead)
            if endo_only:
                build_frames = list_df_forecasting_endo_df
            elif exo_only:
                build_frames = list_df_forecasting_exo_df
            else:
                build_frames = list_df_forecasting_endo_exo_df
            list_df_features, list_df_targets = build_frames(list_df, target_choice, shift_delta)
        else:  # intra step prediction
            if endo_only:
                build_frames = list_df_to_endogeneous_df
            elif exo_only:
                build_frames = list_df_to_exogeneous_df
            else:
                build_frames = list_df_to_endo_exo_df
            list_df_features, list_df_targets = build_frames(list_df, target_choice)

        self.features = list_df_to_overlapping_sliding_windows(list_df_features, windows_size)
        self.targets = list_df_to_overlapping_sliding_windows(list_df_targets, windows_size)

    def __getitem__(self, index):
        """Return the feature window ``index`` and the last row of its target window, as lists."""
        #TODO: Verify if returns as "Lists" are problematics for the dataloaders (might want Tensors and basetypes such as floats)
        return self.features[index].tolist(), self.targets[index][-1].tolist()

    def __len__(self):
        """Return the number of windows.

        Raises:
            ValueError: if the amounts of feature and target windows differ.
        """
        if len(self.features) != len(self.targets):
            # Raising a bare string is itself a TypeError; use a real exception.
            raise ValueError('The dataset does not have one target per feature and vice versa')
        return len(self.targets)

In [9]:
# Settings for the overlapping windows datasets.
# Glob pattern of the flight-test CSV files to load.
csv_files_path = "../DataBombardier/flight_test_*.csv"
# False: the targets belong to the same time step as the features.
forecasting = False
# Endogenous features only (bay temperatures, no exogenous data).
feature_endo = True
feature_exo = False
# 0 selects both bays as targets (1 = bay 1, 2 = bay 2).
target_choice = 0
# Steps ahead for the targets; only used when forecasting is True.
shift_delta = 1
# Number of rows per window.
windows_size = 4
# Share of the flights kept for train+valid, then share of those kept for train.
train_test_ratio = 0.8
train_valid_ratio = 0.8

In [10]:
# Build the train/valid/test overlapping-window datasets from the settings above.
ow_ds_train, ow_ds_valid, ow_ds_test = get_overlapping_windows_datasets(csv_files_path, 
                                                                       forecasting, feature_endo, feature_exo, target_choice, 
                                                                       windows_size, shift_delta, 
                                                                       train_test_ratio, train_valid_ratio)

Loading:../DataBombardier\flight_test_1.csv
Loading:../DataBombardier\flight_test_2.csv
Loading:../DataBombardier\flight_test_3.csv
Loading:../DataBombardier\flight_test_4.csv
Loading:../DataBombardier\flight_test_5.csv
Loading:../DataBombardier\flight_test_6.csv
Loading:../DataBombardier\flight_test_7.csv
Loading:../DataBombardier\flight_test_8.csv
Loading:../DataBombardier\flight_test_9.csv
Loading:../DataBombardier\flight_test_10.csv
Loading:../DataBombardier\flight_test_11.csv
Loading:../DataBombardier\flight_test_12.csv
Loading:../DataBombardier\flight_test_13.csv


In [11]:
owf, owt = ow_ds_train[0]

In [12]:
owf

[['17902', '35.240755', '33.76746076717663'],
 ['17904', '35.250933', '33.78781427556584'],
 ['17906', '35.233968', '33.77085186112588'],
 ['17908', '35.244148', '33.78781427556584']]

In [13]:
owt

['35.244148', '33.78781427556584']