#### File Header Here

#### Preprocessing stuff

In [5]:
import glob
import os
import math
import torch
import random
import pandas as pd
import numpy as np

#list of dataframes into non overlapping sliding windows
def list_df_to_contiguous_sliding_windows(list_df, window_size, rem_beg=True):
    all_windows = []
    for dfs in list_df:
        windows = df_to_contiguous_sliding_windows(dfs, window_size, rem_beg=True)
        all_windows.extend(windows)
    return all_windows

#single dataframe into non overlapping sliding windows
def df_to_contiguous_sliding_windows(df, window_size, rem_beg=True):
    amount_to_remove = len(df)%window_size
    if rem_beg:
        df_rem = remove_df_starting_rows(df, amount_to_remove)
    elif not rem_beg:
        df_rem = remove_df_ending_rows(df, amount_to_remove)
    
    rows = []
    windows = []
    for index, row in df_rem.iterrows():
        rows.append(row.values.tolist())
        if len(rows)==window_size:
            windows.append(rows)
            rows = []
    return windows

#list of dataframes into overlapping sliding windows
def list_df_to_overlapping_sliding_windows(list_df, window_size):
    all_windows = []
    for dfs in list_df:
        windows = df_to_overlapping_sliding_windows(dfs, window_size)
        all_windows.extend(windows)
    return all_windows

#single dataframe into overlapping sliding windows
def df_to_overlapping_sliding_windows(dataframe, window_size):
    values_array = dataframe.values
    s0, s1 = values_array.strides
    row, col = values_array.shape
    windows = np.lib.stride_tricks.as_strided(values_array, shape=(row-window_size+1, window_size, col), strides=(s0, s0, s1))
    return windows

#function to remove N starting rows from a dataframe
def remove_df_starting_rows(dataframe, amount_to_remove):
    if amount_to_remove > 0:
        df = dataframe.iloc[amount_to_remove:]                 
        return df
    else:
        return dataframe

#function to remove N trailing rows from a dataframe
def remove_df_ending_rows(dataframe, amount_to_remove):
    if amount_to_remove > 0:
        df = dataframe.iloc[:-amount_to_remove]                 
        return df
    else:
        return dataframe

#function for ordering files. Very basic (hardcoded positions)
def sortKeyFunc(s):
    return int(os.path.basename(s)[12:-4])

#list of dataframe rows into vectors
def list_df_rows_to_vectors(list_df):
    all_vectors = []
    for dfs in list_df:
        vectors = df_rows_to_vectors(dfs)
        all_vectors.extend(vectors)
    return all_vectors

#dataframe rows into a vector
def df_rows_to_vectors(df):
    vectors = []
    for rows in range(len(df)):
        row = df.iloc[rows]
        vectors.append(row.values.tolist())
    return vectors

#splitting a pythong list or ndarray into 2 parts according to specificied ratio
def split_list_on_ratio(liste, ratio, shuffle=False, random_seed=42):
    len_list = len(liste)
    if shuffle:
        random.seed(random_seed)
        random.shuffle(liste)
    
    index_where_to_split = math.floor(len_list * ratio)

    list_part_a = liste[:index_where_to_split]
    list_part_b = liste[index_where_to_split:]
    
    return list_part_a, list_part_b

#loading multiple csv into multiple dataframes, where each csv is a dataframe. Partly hardcoded for a bombardier flight test CSV files.
def bomb_csv_to_df(csv_stringLoader):
    list_df = []
    list_data_units = []
    list_data_label_type = []
    allFiles = sorted(glob.glob(csv_stringLoader), key=sortKeyFunc)

    for files in allFiles:
        print('Loading:{}'.format(files))
        df = pd.read_csv(files)
        df = df.drop('Description', axis=1)
        df = df.set_index('TIME OF DAY IN SECONDS')

        data_units = df.iloc[0]
        data_units.name = 'Unit'

        data_label_type = df.iloc[1]
        data_label_type.name = 'Type'

        df = df.iloc[3:].reset_index()
        df = df.apply(pd.to_numeric)
        
        list_df.append(df)
        list_data_units.append(data_units)
        list_data_label_type.append(data_label_type)
        
    return list_df, list_data_units, list_data_label_type

#TODO: Func to verify if list_data_units and list_data_label_type are all the same

#List df to max array
def list_df_to_max_array(list_df):
    maxdflist = []
    for df in list_df:
        maxdflist.append(df.describe(include='all').loc[ "max", :].to_numpy())
        #print(df.describe(include='all').loc[ "max", :].to_numpy())
    for m in range(len(maxdflist)):
        #print(m)
        if m==0:
            maxarray = maxdflist[m]
        else:
            maxarray = np.maximum(maxarray, maxdflist[m])
    #print(maxarray)
    return maxarray

#list df to min array
def list_df_to_min_array(list_df):
    mindflist = []
    for df in list_df:
        mindflist.append(df.describe(include='all').loc[ "min", :].to_numpy())
        #print(df.describe(include='all').loc[ "max", :].to_numpy())
    for m in range(len(mindflist)):
        #print(m)
        if m==0:
            minarray = mindflist[m]
        else:
            minarray = np.minimum(minarray, mindflist[m])
    #print(minarray)
    return minarray

#max and min array to rescaling array (bigger values of both, column-wise)
def min_max_arrays_to_rescaling_array(minarray, maxarray):
    rescaling_array = np.maximum(np.absolute(minarray), maxarray)
    return rescaling_array

#to rescale columns of a single df from a rescaling array
def rescale_single_df(df, rescaling_array):
    rescaled_df = pd.DataFrame()
    for e, (columnName, columnData) in enumerate(df.iteritems()):
        #print(e)
        #print(columnName)
        #print(columnData)
        rescaled_df[columnName] = columnData/(rescaling_array[e]+1)
    return rescaled_df

#rescale a list of df, column-wise
def rescale_list_of_df(list_df):
    minarray = list_df_to_min_array(list_df)
    maxarray = list_df_to_max_array(list_df)
    rescaling_array = min_max_arrays_to_rescaling_array(minarray, maxarray)
    
    rescaled_list_df = []
    for df in list_df:
        rescaled_list_df.append(rescale_single_df(df, rescaling_array))
    return rescaled_list_df

#purely exogeneous (non-regressive)
def list_df_to_exogeneous_df(list_df, target_choice):
    list_df_features = []
    list_df_targets = []

    for df in list_df:
        if target_choice==0:
            features_list = ['TIME OF DAY IN SECONDS', '1st Cooling Sys MASS FLOW', '2nd Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature', 'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '1st Underfloor flow', '2nd Underfloor flow']
            targets_list = ['1st AVIONICS BAY BULK TEMP', '2nd AVIONICS BAY BULK TEMP']
        elif target_choice==1:
            features_list = ['TIME OF DAY IN SECONDS', '1st Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature', 'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '1st Underfloor flow']
            targets_list = ['1st AVIONICS BAY BULK TEMP']
        elif target_choice==2:
            features_list = ['TIME OF DAY IN SECONDS', '2nd Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature', 'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '2nd Underfloor flow']
            targets_list = ['2nd AVIONICS BAY BULK TEMP']
        else:
            target_choice=1
            features_list = ['TIME OF DAY IN SECONDS', '1st Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature', 'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '1st Underfloor flow']
            targets_list = ['1st AVIONICS BAY BULK TEMP']
        
        df_features = df[features_list]
        df_targets = df[targets_list]
        
        list_df_features.append(df_features)
        list_df_targets.append(df_targets)
        
    return list_df_features, list_df_targets

#purely endogenous (non-autoregressive)
#to verify if the model can learn copying data
def list_df_to_endogeneous_df(list_df, target_choice):
    list_df_features = []
    list_df_targets = []

    for df in list_df:
        if target_choice==0:
            features_list = ['TIME OF DAY IN SECONDS', '1st AVIONICS BAY BULK TEMP', '2nd AVIONICS BAY BULK TEMP']
            targets_list = ['1st AVIONICS BAY BULK TEMP', '2nd AVIONICS BAY BULK TEMP']
        elif target_choice==1:
            features_list = ['TIME OF DAY IN SECONDS', '1st AVIONICS BAY BULK TEMP']
            targets_list = ['1st AVIONICS BAY BULK TEMP']
        elif target_choice==2:
            features_list = ['TIME OF DAY IN SECONDS', '2nd AVIONICS BAY BULK TEMP']
            targets_list = ['2nd AVIONICS BAY BULK TEMP']
        else:
            target_choice=1
            features_list = ['TIME OF DAY IN SECONDS', '1st AVIONICS BAY BULK TEMP']
            targets_list = ['1st AVIONICS BAY BULK TEMP']
        
        df_features = df[features_list]
        df_targets = df[targets_list]
        
        list_df_features.append(df_features)
        list_df_targets.append(df_targets)
        
    return list_df_features, list_df_targets

#exogeneous & endogenous (non-autoregressive)
#to verify if the model can learn copying data with more data
def list_df_to_endo_exo_df(list_df, target_choice):
    list_df_features = []
    list_df_targets = []

    for df in list_df:
        if target_choice==0:
            features_list = ['TIME OF DAY IN SECONDS', '1st Cooling Sys MASS FLOW', '2nd Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature', 'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '1st Underfloor flow', '2nd Underfloor flow'] + ['1st AVIONICS BAY BULK TEMP', '2nd AVIONICS BAY BULK TEMP']
            targets_list = ['1st AVIONICS BAY BULK TEMP', '2nd AVIONICS BAY BULK TEMP']
        elif target_choice==1:
            features_list = ['TIME OF DAY IN SECONDS', '1st Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature', 'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '1st Underfloor flow'] + ['1st AVIONICS BAY BULK TEMP']
            targets_list = ['1st AVIONICS BAY BULK TEMP']
        elif target_choice==2:
            features_list = ['TIME OF DAY IN SECONDS', '2nd Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature', 'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '2nd Underfloor flow'] + ['2nd AVIONICS BAY BULK TEMP']
            targets_list = ['2nd AVIONICS BAY BULK TEMP']
        else:
            target_choice=1
            features_list = ['TIME OF DAY IN SECONDS', '1st Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature', 'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '1st Underfloor flow'] + ['1st AVIONICS BAY BULK TEMP']
            targets_list = ['1st AVIONICS BAY BULK TEMP']
        
        df_features = df[features_list]
        df_targets = df[targets_list]
        
        list_df_features.append(df_features)
        list_df_targets.append(df_targets)
        
    return list_df_features, list_df_targets

#forecasting (endogeneous)
def list_df_forecasting_endo_df(list_df, target_choice, shift_delta=1):
    list_df_features = []
    list_df_targets = []

    for df in list_df:
        if target_choice==0:
            features_list = ['TIME OF DAY IN SECONDS', '1st AVIONICS BAY BULK TEMP', '2nd AVIONICS BAY BULK TEMP']
            targets_list = ['1st AVIONICS BAY BULK TEMP', '2nd AVIONICS BAY BULK TEMP']
        elif target_choice==1:
            features_list = ['TIME OF DAY IN SECONDS', '1st AVIONICS BAY BULK TEMP']
            targets_list = ['1st AVIONICS BAY BULK TEMP']
        elif target_choice==2:
            features_list = ['TIME OF DAY IN SECONDS', '2nd AVIONICS BAY BULK TEMP']
            targets_list = ['2nd AVIONICS BAY BULK TEMP']
        else:
            target_choice=1
            features_list = ['TIME OF DAY IN SECONDS', '1st AVIONICS BAY BULK TEMP']
            targets_list = ['1st AVIONICS BAY BULK TEMP']
        
        df_features = df[features_list]
        df_targets = df[targets_list].shift(-shift_delta)
        
        df_features = df_features.drop(df_features.tail(shift_delta).index)
        df_targets = df_targets.drop(df_targets.tail(shift_delta).index)
        
        list_df_features.append(df_features)
        list_df_targets.append(df_targets)
        
    return list_df_features, list_df_targets

#forecasting (exogeneous)
def list_df_forecasting_exo_df(list_df, target_choice, shift_delta=1):
    list_df_features = []
    list_df_targets = []

    for df in list_df:
        if target_choice==0:
            features_list = ['TIME OF DAY IN SECONDS', '1st Cooling Sys MASS FLOW', '2nd Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature', 'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '1st Underfloor flow', '2nd Underfloor flow']
            targets_list = ['1st AVIONICS BAY BULK TEMP', '2nd AVIONICS BAY BULK TEMP']
        elif target_choice==1:
            features_list = ['TIME OF DAY IN SECONDS', '1st Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature', 'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '1st Underfloor flow']
            targets_list = ['1st AVIONICS BAY BULK TEMP']
        elif target_choice==2:
            features_list = ['TIME OF DAY IN SECONDS', '2nd Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature', 'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '2nd Underfloor flow']
            targets_list = ['2nd AVIONICS BAY BULK TEMP']
        else:
            target_choice=1
            features_list = ['TIME OF DAY IN SECONDS', '1st Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature', 'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '1st Underfloor flow']
            targets_list = ['1st AVIONICS BAY BULK TEMP']
        
        df_features = df[features_list]
        df_targets = df[targets_list].shift(-shift_delta)
        
        df_features = df_features.drop(df_features.tail(shift_delta).index)
        df_targets = df_targets.drop(df_targets.tail(shift_delta).index)
        
        list_df_features.append(df_features)
        list_df_targets.append(df_targets)
        
    return list_df_features, list_df_targets

#forecasting (endogeneous & exogeneous)
def list_df_forecasting_endo_exo_df(list_df, target_choice, shift_delta=1):
    list_df_features = []
    list_df_targets = []

    for df in list_df:
        if target_choice==0:
            features_list = ['TIME OF DAY IN SECONDS', '1st Cooling Sys MASS FLOW', '2nd Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature', 'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '1st Underfloor flow', '2nd Underfloor flow'] + ['1st AVIONICS BAY BULK TEMP', '2nd AVIONICS BAY BULK TEMP']
            targets_list = ['1st AVIONICS BAY BULK TEMP', '2nd AVIONICS BAY BULK TEMP']
        elif target_choice==1:
            features_list = ['TIME OF DAY IN SECONDS', '1st Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature', 'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '1st Underfloor flow'] + ['1st AVIONICS BAY BULK TEMP']
            targets_list = ['1st AVIONICS BAY BULK TEMP']
        elif target_choice==2:
            features_list = ['TIME OF DAY IN SECONDS', '2nd Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature', 'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '2nd Underfloor flow'] + ['2nd AVIONICS BAY BULK TEMP']
            targets_list = ['2nd AVIONICS BAY BULK TEMP']
        else:
            target_choice=1
            features_list = ['TIME OF DAY IN SECONDS', '1st Cooling Sys MASS FLOW', 'ACS_Zone_Actual_Temperature', 'Outside Air Temperature_OAT', 'Pressure Altitude', 'Mach', '1st Underfloor flow'] + ['1st AVIONICS BAY BULK TEMP']
            targets_list = ['1st AVIONICS BAY BULK TEMP']
        
        df_features = df[features_list]
        df_targets = df[targets_list].shift(-shift_delta)
        
        df_features = df_features.drop(df_features.tail(shift_delta).index)
        df_targets = df_targets.drop(df_targets.tail(shift_delta).index)
        
        list_df_features.append(df_features)
        list_df_targets.append(df_targets)
        
    return list_df_features, list_df_targets


## Vector Datasets

In [81]:
import pandas as pd
import numpy as np
import torch
import torch.utils.data as data
from source.utils.preprocessing import *


def get_vectors_datasets(csv_files_path,
                            forecasting, feature_endo, feature_exo, target_choice, 
                            shift_delta, 
                            train_test_ratio, train_valid_ratio, 
                            shuffle=False, random_seed=42):
    #loading CSV
    list_df, list_data_units, list_data_label_type = bomb_csv_to_df(csv_files_path)
    rescaled_list_df = rescale_list_of_df(list_df)
    
    #Note: Train test and valid are made on flights, not the amount of vectors in total!
    
    #Splitting list of df into train-test
    list_df_train, list_df_test = split_list_on_ratio(rescaled_list_df, train_test_ratio, shuffle, random_seed)
    #Splitting list of train into train-valid
    list_df_train, list_df_valid = split_list_on_ratio(list_df_train, train_valid_ratio, shuffle, random_seed)
    
    sfv_ds_train = VectorsDataset(list_df_train, forecasting, feature_endo, feature_exo, target_choice, shift_delta)
    sfv_ds_valid = VectorsDataset(list_df_valid, forecasting, feature_endo, feature_exo, target_choice, shift_delta)
    sfv_ds_test = VectorsDataset(list_df_test, forecasting, feature_endo, feature_exo, target_choice, shift_delta)
    
    return sfv_ds_train, sfv_ds_valid, sfv_ds_test
    
class VectorsDataset(data.Dataset):
    def __init__(self, list_df, forecasting, feature_endo, feature_exo, target_choice, shift_delta=1, target_type_string='Regression'):
        #target_choice is a parameter to pick if we use bay 1 (1), bay 2 (2) or both bays (0) as targets
        #Forecasting decides if the targets are N steps ahead(if True), or if we predict the current time step (if False)
        #Endo is if we want to use the bay temperature in the features
        #Exo is if we want to use the other data (the data that arent bay temp) in the features
        #Target type string is either Regression or Classification. Required for other objects down the training pipeline.
        self.target_type_string = target_type_string
        #Shift Delta is the parameter for forecasting that decides the N step ahead for target prediction
        
        if forecasting: #forecasting task (N step ahead)
            if feature_endo and not feature_exo:
                list_df_features, list_df_targets = list_df_forecasting_endo_df(list_df, target_choice, shift_delta)
            elif feature_exo and not feature_endo:
                list_df_features, list_df_targets = list_df_forecasting_exo_df(list_df, target_choice, shift_delta)
            elif feature_endo and feature_exo:
                list_df_features, list_df_targets = list_df_forecasting_endo_exo_df(list_df, target_choice, shift_delta)
            else:
                list_df_features, list_df_targets = list_df_forecasting_endo_exo_df(list_df, target_choice, shift_delta)

        if not forecasting: #Intra step prediction
            if feature_endo and not feature_exo:
                list_df_features, list_df_targets = list_df_to_endogeneous_df(list_df, target_choice)
            elif feature_exo and not feature_endo:
                list_df_features, list_df_targets = list_df_to_exogeneous_df(list_df, target_choice)
            elif feature_endo and feature_exo:
                list_df_features, list_df_targets = list_df_to_endo_exo_df(list_df, target_choice)
            else:
                list_df_features, list_df_targets = list_df_to_endo_exo_df(list_df, target_choice)

        all_vectors_features = list_df_rows_to_vectors(list_df_features)
        all_vectors_targets = list_df_rows_to_vectors(list_df_targets)
        
        self.features = all_vectors_features
        self.targets = all_vectors_targets

    def __getitem__(self, index):
        features_item = self.features[index]
        targets_item = self.targets[index]
        
        #TODO: Verify if returns typage is "ok"
        return torch.Tensor(features_item), torch.Tensor(targets_item)

    def __len__(self):
        if len(self.features)==len(self.targets):
            return len(self.targets)
        else:
            raise('The dataset does not have one target per feature and vice versa')

    def features_size(self, index=0):
        return len(self.features[index])

    def labels_size(self, index=0):
        return len(self.targets[index])

In [7]:
csv_files_path = "../DataBombardier/2sec/flight_test_*.csv"
forecasting = True
feature_endo = True
feature_exo = True
target_choice = 0
shift_delta = 2
train_test_ratio = 0.8
train_valid_ratio = 0.8

In [8]:
sfv_ds_train, sfv_ds_valid, sfv_ds_test = get_vectors_datasets(csv_files_path,
                                                                forecasting, feature_endo, feature_exo, target_choice, shift_delta, 
                                                                train_test_ratio, train_valid_ratio, 
                                                                shuffle=False, random_seed=42)

NameError: name 'get_vectors_datasets' is not defined

In [84]:
sfvf, sfvt = sfv_ds_train[0]

In [85]:
sfvf

tensor([1.7902e+04, 5.1191e+00, 1.7831e+01, 2.9710e+01, 2.6151e+01, 2.3650e+03,
        8.8510e-03, 8.6523e+00, 1.5142e+01, 3.5241e+01, 3.3767e+01])

In [86]:
sfvt

tensor([35.2340, 33.7709])

In [87]:
sfv_ds_train.features_size()

11

In [88]:
sfv_ds_train.labels_size()

2

## Contiguous Datasets

In [73]:
import pandas as pd
import numpy as np
import torch
import torch.utils.data as data
from source.utils.preprocessing import *


def get_contiguous_windows_datasets(csv_files_path, forecasting, feature_endo, feature_exo, target_choice, windows_size, shift_delta, train_test_ratio, train_valid_ratio, remove_beg_rows=True, shuffle=False, random_seed=42):
    #loading CSV
    list_df, list_data_units, list_data_label_type = bomb_csv_to_df(csv_files_path)
    rescaled_list_df = rescale_list_of_df(list_df)

    #Note: Train test and valid are made on flights, not the amount of vectors in total!
    
    #Splitting list of df into train-test
    list_df_train, list_df_test = split_list_on_ratio(rescaled_list_df, train_test_ratio, shuffle, random_seed)
    #Splitting list of train into train-valid
    list_df_train, list_df_valid = split_list_on_ratio(list_df_train, train_valid_ratio, shuffle, random_seed)
    
    cw_ds_train = ContiguousWindowsDataset(list_df_train, forecasting, feature_endo, feature_exo, target_choice, windows_size, shift_delta, remove_beg_rows)
    cw_ds_valid = ContiguousWindowsDataset(list_df_valid, forecasting, feature_endo, feature_exo, target_choice, windows_size, shift_delta, remove_beg_rows)
    cw_ds_test = ContiguousWindowsDataset(list_df_test, forecasting, feature_endo, feature_exo, target_choice, windows_size, shift_delta, remove_beg_rows)
    
    return cw_ds_train, cw_ds_valid, cw_ds_test
    
class ContiguousWindowsDataset(data.Dataset):
    def __init__(self, list_df, forecasting, feature_endo, feature_exo, target_choice, windows_size, shift_delta=1, remove_beg_rows=True, target_type_string='Regression'):
        #target_choice is a parameter to pick if we use bay 1 (1), bay 2 (2) or both bays (0) as targets
        #Forecasting decides if the targets are N steps ahead(if True), or if we predict the current time step (if False)
        #Endo is if we want to use the bay temperature in the features
        #Exo is if we want to use the other data (the data that arent bay temp) in the features
        #Target type string is either Regression or Classification. Required for other objects down the training pipeline.
        self.target_type_string = target_type_string
        #Shift Delta is the parameter for forecasting that decides the N step ahead for target prediction
        
        if forecasting: #forecasting task (N step ahead)
            if feature_endo and not feature_exo:
                list_df_features, list_df_targets = list_df_forecasting_endo_df(list_df, target_choice, shift_delta)
            elif feature_exo and not feature_endo:
                list_df_features, list_df_targets = list_df_forecasting_exo_df(list_df, target_choice, shift_delta)
            elif feature_endo and feature_exo:
                list_df_features, list_df_targets = list_df_forecasting_endo_exo_df(list_df, target_choice, shift_delta)
            else:
                list_df_features, list_df_targets = list_df_forecasting_endo_exo_df(list_df, target_choice, shift_delta)

        if not forecasting: #Intra step prediction
            if feature_endo and not feature_exo:
                list_df_features, list_df_targets = list_df_to_endogeneous_df(list_df, target_choice)
            elif feature_exo and not feature_endo:
                list_df_features, list_df_targets = list_df_to_exogeneous_df(list_df, target_choice)
            elif feature_endo and feature_exo:
                list_df_features, list_df_targets = list_df_to_endo_exo_df(list_df, target_choice)
            else:
                list_df_features, list_df_targets = list_df_to_endo_exo_df(list_df, target_choice)

        all_contiguous_features_windows = list_df_to_contiguous_sliding_windows(list_df_features, windows_size, rem_beg=remove_beg_rows)
        all_contiguous_targets_windows = list_df_to_contiguous_sliding_windows(list_df_targets, windows_size, rem_beg=remove_beg_rows)
        
        self.features = all_contiguous_features_windows
        self.targets = all_contiguous_targets_windows

    def __getitem__(self, index):
        features_item = self.features[index]
        targets_item = self.targets[index][-1]
        
        #TODO: Verify if returns typage is "ok"
        return torch.Tensor(features_item), torch.Tensor(targets_item)

    def __len__(self):
        if len(self.features)==len(self.targets):
            return len(self.targets)
        else:
            raise('The dataset does not have one target per feature and vice versa')

    def features_size(self, index=0):
        return len(self.features[index][-1])

    def labels_size(self, index=0):
        return len(self.targets[index][-1])

In [74]:
csv_files_path = "../DataBombardier/2sec/flight_test_*.csv"
forecasting = False
feature_endo = True
feature_exo = False
target_choice = 0
shift_delta = 3
windows_size = 4
train_test_ratio = 0.8
train_valid_ratio = 0.8

In [75]:
cw_ds_train, cw_ds_valid, cw_ds_test = get_contiguous_windows_datasets(csv_files_path, 
                                                                       forecasting, feature_endo, feature_exo, target_choice, 
                                                                       windows_size, shift_delta, 
                                                                       train_test_ratio, train_valid_ratio)

Loading:../DataBombardier/2sec\flight_test_1.csv
Loading:../DataBombardier/2sec\flight_test_3.csv
Loading:../DataBombardier/2sec\flight_test_4.csv
Loading:../DataBombardier/2sec\flight_test_5.csv
Loading:../DataBombardier/2sec\flight_test_6.csv
Loading:../DataBombardier/2sec\flight_test_7.csv
Loading:../DataBombardier/2sec\flight_test_8.csv
Loading:../DataBombardier/2sec\flight_test_9.csv
Loading:../DataBombardier/2sec\flight_test_10.csv
Loading:../DataBombardier/2sec\flight_test_11.csv


In [76]:
cwf, cwt = cw_ds_train[0]

In [77]:
cwf

tensor([[17902.0000,    35.2408,    33.7675],
        [17904.0000,    35.2509,    33.7878],
        [17906.0000,    35.2340,    33.7709],
        [17908.0000,    35.2441,    33.7878]])

In [78]:
cwt

tensor([35.2441, 33.7878])

In [79]:
cw_ds_train.features_size()

3

In [80]:
cw_ds_train.labels_size()

2

## Overlapping Datasets

In [9]:
import pandas as pd
import numpy as np
import torch
import torch.utils.data as data
#from source.utils.preprocessing import *


def get_overlapping_windows_datasets(csv_files_path, forecasting, feature_endo, feature_exo, target_choice, windows_size, shift_delta, train_test_ratio, train_valid_ratio, shuffle=False, random_seed=42):
    #loading CSV
    list_df, list_data_units, list_data_label_type = bomb_csv_to_df(csv_files_path)
    rescaled_list_df = rescale_list_of_df(list_df)

    #Note: Train test and valid are made on flights, not the amount of vectors in total!
    
    #Splitting list of df into train-test
    list_df_train, list_df_test = split_list_on_ratio(rescaled_list_df, train_test_ratio, shuffle, random_seed)
    #Splitting list of train into train-valid
    list_df_train, list_df_valid = split_list_on_ratio(list_df_train, train_valid_ratio, shuffle, random_seed)
    
    ow_ds_train = OverlappingWindowsDataset(list_df_train, forecasting, feature_endo, feature_exo, target_choice, windows_size, shift_delta)
    ow_ds_valid = OverlappingWindowsDataset(list_df_valid, forecasting, feature_endo, feature_exo, target_choice, windows_size, shift_delta)
    ow_ds_test = OverlappingWindowsDataset(list_df_test, forecasting, feature_endo, feature_exo, target_choice, windows_size, shift_delta)
    
    return ow_ds_train, ow_ds_valid, ow_ds_test
    
class OverlappingWindowsDataset(data.Dataset):
    def __init__(self, list_df, forecasting, feature_endo, feature_exo, target_choice, windows_size, shift_delta=1, target_type_string='Regression'):
        #target_choice is a parameter to pick if we use bay 1 (1), bay 2 (2) or both bays (0) as targets
        #Forecasting decides if the targets are N steps ahead(if True), or if we predict the current time step (if False)
        #Endo is if we want to use the bay temperature in the features
        #Exo is if we want to use the other data (the data that arent bay temp) in the features
        #Target type string is either Regression or Classification. Required for other objects down the training pipeline.
        self.target_type_string = target_type_string
        #Shift Delta is the parameter for forecasting that decides the N step ahead for target prediction
        
        if forecasting: #forecasting task (N step ahead)
            if feature_endo and not feature_exo:
                list_df_features, list_df_targets = list_df_forecasting_endo_df(list_df, target_choice, shift_delta)
            elif feature_exo and not feature_endo:
                list_df_features, list_df_targets = list_df_forecasting_exo_df(list_df, target_choice, shift_delta)
            elif feature_endo and feature_exo:
                list_df_features, list_df_targets = list_df_forecasting_endo_exo_df(list_df, target_choice, shift_delta)
            else:
                list_df_features, list_df_targets = list_df_forecasting_endo_exo_df(list_df, target_choice, shift_delta)

        if not forecasting: #Intra step prediction
            if feature_endo and not feature_exo:
                list_df_features, list_df_targets = list_df_to_endogeneous_df(list_df, target_choice)
            elif feature_exo and not feature_endo:
                list_df_features, list_df_targets = list_df_to_exogeneous_df(list_df, target_choice)
            elif feature_endo and feature_exo:
                list_df_features, list_df_targets = list_df_to_endo_exo_df(list_df, target_choice)
            else:
                list_df_features, list_df_targets = list_df_to_endo_exo_df(list_df, target_choice)

        all_overlapping_features_windows = list_df_to_overlapping_sliding_windows(list_df_features, windows_size)
        all_overlapping_targets_windows = list_df_to_overlapping_sliding_windows(list_df_targets, windows_size)
        
        self.features = all_overlapping_features_windows
        self.targets = all_overlapping_targets_windows

    def __getitem__(self, index):
        features_item = self.features[index].tolist()
        targets_item = self.targets[index][-1].tolist()
        
        #TODO: Verify if returns typage is "ok"
        return torch.Tensor(features_item), torch.Tensor(targets_item)

    def __len__(self):
        if len(self.features)==len(self.targets):
            return len(self.targets)
        else:
            raise('The dataset does not have one target per feature and vice versa')
        
    def features_size(self, index=0):
        return len(self.features[index][-1].tolist())

    def labels_size(self, index=0):
        return len(self.targets[index][-1].tolist())

In [17]:
csv_files_path = "../DataBombardier/2sec/flight_test_*.csv"
forecasting = True
feature_endo = False
feature_exo = True
target_choice = 1
shift_delta = 1
windows_size = 32
stride = 30
train_test_ratio = 0.8
train_valid_ratio = 0.8

In [18]:
ow_ds_train, ow_ds_valid, ow_ds_test = get_overlapping_windows_datasets(csv_files_path, 
                                                                       forecasting, feature_endo, feature_exo, target_choice, 
                                                                       windows_size, shift_delta, stride,
                                                                       train_test_ratio, train_valid_ratio)

Loading:../DataBombardier/2sec\flight_test_1.csv
Loading:../DataBombardier/2sec\flight_test_3.csv
Loading:../DataBombardier/2sec\flight_test_4.csv
Loading:../DataBombardier/2sec\flight_test_5.csv
Loading:../DataBombardier/2sec\flight_test_6.csv
Loading:../DataBombardier/2sec\flight_test_7.csv
Loading:../DataBombardier/2sec\flight_test_8.csv
Loading:../DataBombardier/2sec\flight_test_9.csv
Loading:../DataBombardier/2sec\flight_test_10.csv
Loading:../DataBombardier/2sec\flight_test_11.csv


In [19]:
owf, owt = ow_ds_train[0]

In [20]:
owf

tensor([[0.5283, 0.3999, 0.7865, 0.6450, 0.0540, 0.0012, 0.2709],
        [0.5283, 0.3990, 0.7865, 0.6453, 0.0540, 0.0044, 0.2698],
        [0.5283, 0.3985, 0.7865, 0.6458, 0.0540, 0.0047, 0.2681],
        [0.5283, 0.3987, 0.7865, 0.6455, 0.0540, 0.0044, 0.2682],
        [0.5284, 0.3992, 0.7865, 0.6458, 0.0540, 0.0026, 0.2713],
        [0.5284, 0.3987, 0.7865, 0.6458, 0.0540, 0.0021, 0.2750],
        [0.5284, 0.3990, 0.7865, 0.6461, 0.0540, 0.0035, 0.2770],
        [0.5284, 0.3997, 0.7865, 0.6463, 0.0540, 0.0033, 0.2785],
        [0.5285, 0.4000, 0.7865, 0.6466, 0.0540, 0.0025, 0.2805],
        [0.5285, 0.4002, 0.7865, 0.6466, 0.0540, 0.0030, 0.2810],
        [0.5285, 0.4007, 0.7837, 0.6469, 0.0539, 0.0023, 0.2799],
        [0.5285, 0.4011, 0.7865, 0.6469, 0.0539, 0.0036, 0.2787],
        [0.5285, 0.4016, 0.7837, 0.6466, 0.0540, 0.0035, 0.2770],
        [0.5286, 0.4011, 0.7837, 0.6466, 0.0540, 0.0049, 0.2755],
        [0.5286, 0.4000, 0.7865, 0.6469, 0.0540, 0.0038, 0.2745],
        [0

In [21]:
owt

tensor([0.8601])

In [22]:
ow_ds_train.features_size()

7

In [23]:
ow_ds_train.labels_size()

1

In [26]:
ow_ds_test

<__main__.OverlappingWindowsDataset at 0x22875b08ef0>