In [1]:
# working dir
import os
import sys

cwd = os.getcwd()
root_dir = os.path.dirname(os.path.dirname(cwd))
sys.path.append(root_dir)

print(root_dir)

c:\Users\david\Desktop\Main\03_Uni\WWI21DSA\02_Vorlesungen\06_Projektrealisierung\Projektrealisierung


In [2]:
# Imports and settings
import pandas as pd
import random

random.seed(42)

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

In [3]:
import pandas as pd
from IPython.display import display, HTML

# Display Dataframe (with scrollbars)
def ddf(df, max_height=500, max_width=1500):
    """
    Display a pandas DataFrame with horizontal and vertical scrollbars in a Jupyter notebook.
    
    Parameters:
    df (pd.DataFrame): The DataFrame to display.
    max_height (int): The maximum height of the scrollable area in pixels.
    max_width (int): The maximum width of the scrollable area in pixels.
    """
    style = f"""
    <style>
    .scrollable-dataframe {{
        max-height: {max_height}px;
        max-width: {max_width}px;
        overflow: auto;
        display: inline-block;
        position: relative;
    }}
    .scrollable-dataframe thead th {{
        position: sticky;
        top: 0;
        background-color: white;
        z-index: 1;
    }}
    </style>
    """
    html = style + df.to_html(classes='scrollable-dataframe')
    display(HTML(html))


In [4]:
main = pd.concat([
    pd.read_csv("../../src/data/ABCD_tripfiles_preprocessed.csv"),
    pd.read_csv("../../src/data/MNOP_tripfiles_preprocessed.csv"),
    pd.read_csv("../../src/data/ZYXW_tripfiles_preprocessed.csv"),
])

meta_cols = ['flight_id', 'id', 'creation_time', 'airline_code', 'flight_date', 'action_name',]
action_cols = ['departureAirport', 'departureTime', 'arrivalAirport', 'arrivalTime', 'aircraftRegistration', 'aircraftSubtype', 'aircraftVersion',]

main = main.loc[
    main["action_name"] == "CalculateWeightAndTrimAction",
    meta_cols + action_cols
]

  pd.read_csv("../../src/data/ABCD_tripfiles_preprocessed.csv"),
  pd.read_csv("../../src/data/MNOP_tripfiles_preprocessed.csv"),
  pd.read_csv("../../src/data/ZYXW_tripfiles_preprocessed.csv"),


In [5]:
import warnings
warnings.filterwarnings('ignore') # disable future deprecation warning of .fillna() method

def fill_na_within_group(group):
    group[action_cols] = group[action_cols].fillna(method='bfill')
    group[action_cols] = group[action_cols].fillna(method='ffill')
    return group

main = main.groupby("flight_id").apply(fill_na_within_group).reset_index(drop=True) # replaces NaN values with the values from the previous or next row within the same flight
main.dropna(inplace=True) # drops 203828 rows - these are likely flights without an ASMMsgProcessor action

warnings.filterwarnings('default')

In [6]:
def correct_year(date_str):
    date_part, time_part = date_str.split('T')
    year, month, day = date_part.split('-')
    hours, minutes, seconds = time_part.split(':')

    # Correct year
    if len(year) != 4:
        year = year[1:]
    
    # Correct the minutes
    if len(minutes) > 2:
        minutes = minutes[:2]

    # Correct the seconds
    if len(seconds) > 6:
        seconds = "00.000Z"
    
    date_part = f"{year}-{month}-{day}"
    time_part = f"{hours}:{minutes}:{seconds}"
    date_str = f"{date_part}T{time_part}"
    
    return date_str

# Fix year in departureTime and arrivalTime
main["departureTime"] = main["departureTime"].apply(lambda x: correct_year(str(x)))
main["arrivalTime"] = main["arrivalTime"].apply(lambda x: correct_year(str(x)))


main["creation_time"] = pd.to_datetime(main["creation_time"])
main["departureTime"] = pd.to_datetime(main["departureTime"], format="%Y-%m-%dT%H:%M:%S.%fZ")
main["arrivalTime"] = pd.to_datetime(main["arrivalTime"], format="%Y-%m-%dT%H:%M:%S.%fZ")

main["minutes_till_dep"] = (main["departureTime"] - main["creation_time"]).dt.total_seconds() / 60

In [7]:
cwata = pd.concat([
    pd.read_csv("../../src/data/extracted/abcd_CalculateWeightAndTrimAction.csv"), 
    pd.read_csv("../../src/data/extracted/mnop_CalculateWeightAndTrimAction.csv"), 
    pd.read_csv("../../src/data/extracted/zyxw_CalculateWeightAndTrimAction.csv")
    ])

cwata = cwata[[
    'id', 'START_WI_weight', 
    'DO_WI_weight', 'PAX_WI_weight', 'TOTAL_DEADLOAD_WI_weight', 'TOTAL_LOAD_WI',
    'TOTAL_TRAFFIC_LOAD', 'AZFW', 'ATOW', 'ALAW', 'ATXW',
    'LIZFW', 'LITOW', 'LILAW',
    'DEADLOAD_MAC', 'UNDERLOAD',
    'ALLOWED_TOW', 'ALLOWED_ZFW', 'ALLOWED_LAW',
    'ALLOWED_TXW',
    'ESTIMATED_TRAFFIC_LOAD', 'ESTIMATED_ZFW',
    'DELTA_ZFW'
    ]]

In [8]:
df = pd.merge(cwata, main, on="id", how="left")

df.dropna(inplace=True)
df.sort_values(by=["flight_id", "creation_time", "id"], inplace=True, ascending=True)

In [9]:
azfw_all = df.groupby('flight_id')['ATOW'].last()
azfw_all = azfw_all.to_dict()

df["target_ATOW"] = df["flight_id"].map(azfw_all)

In [10]:
df["delta"] = df["target_ATOW"] - df["ATOW"]
df["delta"].describe()

count    2.468920e+05
mean    -2.048361e+03
std      7.825867e+05
min     -2.189566e+07
25%      0.000000e+00
50%      2.600000e+01
75%      1.722000e+03
max      1.708310e+07
Name: delta, dtype: float64

In [11]:
special_cols = ['flight_id', 'creation_time', 'id', 'action_name', "departureTime", "arrivalTime", "delta"]
cat_cols = ["airline_code", "departureAirport", "arrivalAirport", "aircraftRegistration", "aircraftSubtype", "aircraftVersion"]
target_col = "target_ATOW"
num_cols = list(set(df.columns) - set(cat_cols) - set(special_cols) - {target_col})

In [12]:
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split

# Separate features and target
X = df.drop(columns=[target_col] + cat_cols + special_cols)
y = df[target_col]

# Change one hot bools to floats
X = X.astype(float)

X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32)

dataset = TensorDataset(X_tensor, y_tensor)

# Split into train and test sets
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [13]:
import torch.nn as nn

class SimpleNN(nn.Module):
    def __init__(self, input_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1) 

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
class Regression(nn.Module):
    def __init__(self, input_size):
        super(Regression, self).__init__()
        self.fc = nn.Linear(input_size, 1)
        self.bn = nn.BatchNorm1d(input_size)

    def forward(self, x):
        # x = self.fc(self.bn(x))
        x = self.fc(x)
        return x

input_size = X_tensor.shape[1] # 620
model = SimpleNN(input_size)
# model = Regression(input_size)

In [14]:
import torch.optim as optim
from tqdm import tqdm
import numpy as np
from torch.optim.lr_scheduler import ReduceLROnPlateau


# criterion = nn.MSELoss()
criterion = nn.L1Loss()  # Using Mean Absolute Error as criterion to avoid inf loss when not scaling

optimizer = optim.Adam(model.parameters(), lr=0.01)
# optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)


# Training Loop 
num_epochs = 50
for epoch in range(num_epochs):

    # print("params in epoch", epoch+1)
    # for name, param in model.named_parameters():
    #     print(name, param.data)

    model.train()
    losses = []
    train_loader_tqdm = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', unit='batch')
    for inputs, labels in train_loader_tqdm:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        # if torch.isnan(loss) or torch.isinf(loss):
        #     print("NaN or Inf loss detected")
        #     break

        loss.backward()
        optimizer.step()
        # scheduler.step(loss)
        losses.append(loss.item())

        # # Print gradients for debugging
        # for name, param in model.named_parameters():
        #     if param.requires_grad and param.grad is not None:
        #         print(f"Epoch {epoch+1}, {name}, grad mean: {param.grad.mean().item()}, grad std: {param.grad.std().item()}")
        
        
        train_loader_tqdm.set_postfix(loss=np.mean(losses))

  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
Epoch 1/50: 100%|██████████| 1544/1544 [00:19<00:00, 79.34batch/s, loss=8.98e+4]
Epoch 2/50: 100%|██████████| 1544/1544 [00:20<00:00, 76.85batch/s, loss=8.65e+4]
Epoch 3/50: 100%|██████████| 1544/1544 [00:20<00:00, 76.23batch/s, loss=8.6e+4] 
Epoch 4/50: 100%|██████████| 1544/1544 [00:20<00:00, 76.04batch/s, loss=8.64e+4]
Epoch 5/50: 100%|██████████| 1544/1544 [00:20<00:00, 75.76batch/s, loss=8.64e+4]
Epoch 6/50: 100%|██████████| 1544/1544 [00:21<00:00, 73.44batch/s, loss=8.63e+4]
Epoch 7/50: 100%|██████████| 1544/1544 [00:20<00:00, 74.07batch/s, loss=8.63e+4]
Epoch 8/50: 100%|██████████| 1544/1544 [00:20<00:00, 74.59batch/s, loss=8.62e+4]
Epoch 9/50: 100%|██████████| 1544/1544 [00:21<00:00, 72.82batch/s, loss=8.6e+4] 
Epoch 10/50: 100%|██████████| 1544/1544 [00:21<00:00, 71.69batch/s, loss=8.61e+4]
Epoch 11/50: 100%|██████████| 1544/1544 [00:21<00:00, 71.02batch/s, l

KeyboardInterrupt: 

In [None]:
# Evaluation
model.eval()
losses = []
unscaled_losses = []
test_loader_tqdm = tqdm(test_loader, desc='Evaluating', unit='batch')

with torch.no_grad():
    for inputs, labels in test_loader_tqdm:
        # Compute scaled outputs
        outputs = model(inputs)
        
        # Calculate scaled loss
        loss = criterion(outputs.reshape(-1, 1), labels.reshape(-1, 1))
        losses.append(loss.item())

# Calculate mean losses
mean_loss = np.mean(losses)

print(f'Test Loss: {mean_loss}')


#### MinMaxScaler & MSE
Evaluating: 100%|██████████| 386/386 [00:02<00:00, 159.17batch/s]
Test Loss: 0.0031778891372756485
Unscaled Test Loss: 939906541196.6011

#### StandardScaler & MSE
Evaluating: 100%|██████████| 386/386 [00:02<00:00, 164.46batch/s]
Test Loss: 0.8869521320083328
Unscaled Test Loss: 783619205189.9689

--> not only does the loss suck, it doesnt decrease over the epochs either

#### No Scaler & MAE 
Evaluating: 100%|██████████| 386/386 [00:00<00:00, 551.40batch/s]
Test Loss: 375176577.83937824
Unscaled Test Loss: nan

--> still sucks but at least the loss decreases

In [None]:
df.drop(columns=[target_col] + cat_cols + special_cols).columns

In [None]:
import random

random_row = df.iloc[[random.choice(range(len(df)))]]

X = random_row.drop(columns=[target_col] + cat_cols + special_cols + ["delta"]).astype(float)
y = random_row[target_col]

X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32)

pred = model(X_tensor).detach().numpy()

# pred_it = target_scaler.inverse_transform(pred).item()
# y_it = target_scaler.inverse_transform(y_tensor.reshape(-1, 1)).item()
# pred_it, y_it

pred, y