In [1]:
# working dir
import os
import sys

cwd = os.getcwd()
root_dir = os.path.dirname(os.path.dirname(cwd))
sys.path.append(root_dir)

print(root_dir)

c:\Users\david\Desktop\Main\03_Uni\WWI21DSA\02_Vorlesungen\06_Projektrealisierung\Projektrealisierung


In [2]:
# Imports and settings
import pandas as pd
import random

random.seed(42)

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

In [3]:
import pandas as pd
from IPython.display import display, HTML

# Display Dataframe (with scrollbars)
def ddf(df, max_height=500, max_width=1500):
    """
    Display a pandas DataFrame with horizontal and vertical scrollbars in a Jupyter notebook.
    
    Parameters:
    df (pd.DataFrame): The DataFrame to display.
    max_height (int): The maximum height of the scrollable area in pixels.
    max_width (int): The maximum width of the scrollable area in pixels.
    """
    style = f"""
    <style>
    .scrollable-dataframe {{
        max-height: {max_height}px;
        max-width: {max_width}px;
        overflow: auto;
        display: inline-block;
        position: relative;
    }}
    .scrollable-dataframe thead th {{
        position: sticky;
        top: 0;
        background-color: white;
        z-index: 1;
    }}
    </style>
    """
    html = style + df.to_html(classes='scrollable-dataframe')
    display(HTML(html))


In [4]:
main = pd.concat([
    pd.read_csv("../../src/data/ABCD_tripfiles_preprocessed.csv"),
    pd.read_csv("../../src/data/MNOP_tripfiles_preprocessed.csv"),
    pd.read_csv("../../src/data/ZYXW_tripfiles_preprocessed.csv"),
])

meta_cols = ['flight_id', 'id', 'creation_time', 'airline_code', 'flight_date', 'action_name',]
action_cols = ['departureAirport', 'departureTime', 'arrivalAirport', 'arrivalTime', 'aircraftRegistration', 'aircraftSubtype', 'aircraftVersion',]

main = main.loc[
    main["action_name"] == "CalculateWeightAndTrimAction",
    meta_cols + action_cols
]

  pd.read_csv("../../src/data/ABCD_tripfiles_preprocessed.csv"),
  pd.read_csv("../../src/data/MNOP_tripfiles_preprocessed.csv"),
  pd.read_csv("../../src/data/ZYXW_tripfiles_preprocessed.csv"),


In [5]:
import warnings
warnings.filterwarnings('ignore') # disable future deprecation warning of .fillna() method

def fill_na_within_group(group):
    group[action_cols] = group[action_cols].fillna(method='bfill')
    group[action_cols] = group[action_cols].fillna(method='ffill')
    return group

main = main.groupby("flight_id").apply(fill_na_within_group).reset_index(drop=True) # replaces NaN values with the values from the previous or next row within the same flight
main.dropna(inplace=True) # drops 203828 rows - these are likely flights without an ASMMsgProcessor action

warnings.filterwarnings('default')

In [6]:
def correct_year(date_str):
    date_part, time_part = date_str.split('T')
    year, month, day = date_part.split('-')
    hours, minutes, seconds = time_part.split(':')

    # Correct year
    if len(year) != 4:
        year = year[1:]
    
    # Correct the minutes
    if len(minutes) > 2:
        minutes = minutes[:2]

    # Correct the seconds
    if len(seconds) > 6:
        seconds = "00.000Z"
    
    date_part = f"{year}-{month}-{day}"
    time_part = f"{hours}:{minutes}:{seconds}"
    date_str = f"{date_part}T{time_part}"
    
    return date_str

# Fix year in departureTime and arrivalTime
main["departureTime"] = main["departureTime"].apply(lambda x: correct_year(str(x)))
main["arrivalTime"] = main["arrivalTime"].apply(lambda x: correct_year(str(x)))


main["creation_time"] = pd.to_datetime(main["creation_time"])
main["departureTime"] = pd.to_datetime(main["departureTime"], format="%Y-%m-%dT%H:%M:%S.%fZ")
main["arrivalTime"] = pd.to_datetime(main["arrivalTime"], format="%Y-%m-%dT%H:%M:%S.%fZ")

main["minutes_till_dep"] = (main["departureTime"] - main["creation_time"]).dt.total_seconds() / 60

In [7]:
cwata = pd.concat([
    pd.read_csv("../../src/data/extracted/abcd_CalculateWeightAndTrimAction.csv"), 
    pd.read_csv("../../src/data/extracted/mnop_CalculateWeightAndTrimAction.csv"), 
    pd.read_csv("../../src/data/extracted/zyxw_CalculateWeightAndTrimAction.csv")
    ])

cwata = cwata[[
    'id', 'START_WI_weight', 
    'DO_WI_weight', 'PAX_WI_weight', 'TOTAL_DEADLOAD_WI_weight', 'TOTAL_LOAD_WI',
    'TOTAL_TRAFFIC_LOAD', 'AZFW', 'ATOW', 'ALAW', 'ATXW',
    'LIZFW', 'LITOW', 'LILAW',
    'DEADLOAD_MAC', 'UNDERLOAD',
    'ALLOWED_TOW', 'ALLOWED_ZFW', 'ALLOWED_LAW',
    'ALLOWED_TXW',
    'ESTIMATED_TRAFFIC_LOAD', 'ESTIMATED_ZFW',
    'DELTA_ZFW'
    ]]

In [8]:
df = pd.merge(cwata, main, on="id", how="left")

df.dropna(inplace=True)
df.sort_values(by=["flight_id", "creation_time", "id"], inplace=True, ascending=True)

In [9]:
azfw_all = df.groupby('flight_id')['ATOW'].last()
azfw_all = azfw_all.to_dict()

df["target_ATOW"] = df["flight_id"].map(azfw_all)

In [10]:
special_cols = ['flight_id', 'creation_time', 'id', 'action_name', "departureTime", "arrivalTime"]
cat_cols = ["airline_code", "departureAirport", "arrivalAirport", "aircraftRegistration", "aircraftSubtype", "aircraftVersion"]
target_col = "target_ATOW"
num_cols = list(set(df.columns) - set(cat_cols) - set(special_cols) - {target_col})

In [11]:
# Transform data (dft = dataframe transformed)
from sklearn.preprocessing import StandardScaler, MinMaxScaler


dft = pd.get_dummies(df, columns=cat_cols, drop_first=True)

scaler = StandardScaler()
dft[num_cols] = scaler.fit_transform(dft[num_cols])

target_scaler = StandardScaler()
dft[target_col] = target_scaler.fit_transform(dft[[target_col]])

In [12]:
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split

# Separate features and target
X = dft.drop(columns=[target_col] + special_cols)
y = dft[target_col]

# Change one hot bools to floats
X = X.astype(float)

# Convert to PyTorch tensors
X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32)

dataset = TensorDataset(X_tensor, y_tensor)

# Split into train and test sets
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [24]:
# print(
#     f"X shape: {X_tensor.shape}",
#     f"y shape: {y_tensor.shape}",
#     f"Feature variance:\n{X.var()}",
#     f"Target variance:\n{y.var()}",
#     sep="\n"
# )

# #> Looking good

In [26]:
import torch.nn as nn

class SimpleNN(nn.Module):
    def __init__(self, input_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1) 

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
class Regression(nn.Module):
    def __init__(self, input_size):
        super(Regression, self).__init__()
        self.fc = nn.Linear(input_size, 1) 

    def forward(self, x):
        x = self.fc(x)
        return x

input_size = X_tensor.shape[1] # 620
# model = SimpleNN(input_size)
model = Regression(input_size)

In [27]:
# def weights_init(m):
#     if isinstance(m, nn.Linear):
#         nn.init.kaiming_uniform_(m.weight, nonlinearity='relu')
#         if m.bias is not None:
#             nn.init.constant_(m.bias, 0)

# model.apply(weights_init)


In [28]:
print("Before training:")
for name, param in model.named_parameters():
    print(name, param.data)


Before training:
fc.weight tensor([[ 3.4475e-02, -2.0496e-02,  3.5146e-03,  1.8507e-02,  2.8835e-02,
          2.5805e-02, -3.6599e-02, -4.5504e-03, -2.3241e-02,  3.5710e-03,
          9.6225e-03,  2.1764e-03, -3.9652e-02,  2.4956e-03, -1.9120e-02,
          2.0348e-02, -3.2615e-02,  2.9448e-02,  2.1593e-02,  2.3275e-02,
         -1.7564e-02, -3.7917e-02,  2.1190e-02, -2.1341e-02, -1.7831e-02,
          2.3429e-02,  1.1258e-02,  3.5971e-03,  1.6428e-02,  7.7486e-03,
          5.6097e-04,  1.8416e-02,  1.1188e-02,  3.1521e-02, -1.6455e-02,
         -3.5281e-02, -3.6845e-02, -2.2806e-02, -3.5456e-03, -7.7592e-03,
          3.0267e-02,  6.7069e-03,  8.6650e-03,  1.3783e-02, -2.0547e-02,
         -3.8929e-02, -1.1527e-02, -3.3035e-03, -3.2195e-02,  2.5192e-02,
         -1.1705e-02, -3.4028e-02,  1.7513e-02, -1.8764e-02,  1.4112e-02,
          2.6227e-02,  3.5817e-02, -2.3400e-02,  1.0692e-02,  4.2719e-03,
          2.4125e-02, -2.2551e-02,  3.6035e-02, -6.5121e-03,  2.7966e-03,
         -3

In [34]:
import torch.optim as optim
from tqdm import tqdm
import numpy as np
from torch.optim.lr_scheduler import ReduceLROnPlateau


criterion = nn.MSELoss()
# optimizer = optim.Adam(model.parameters(), lr=0.01)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)


# Training Loop 
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    losses = []
    train_loader_tqdm = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', unit='batch')
    for inputs, labels in train_loader_tqdm:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        if torch.isnan(loss) or torch.isinf(loss):
            print("NaN or Inf loss detected")
            break

        loss.backward()
        optimizer.step()
        scheduler.step(loss)
        losses.append(loss.item())

        # # Print gradients for debugging
        # for name, param in model.named_parameters():
        #     if param.requires_grad and param.grad is not None:
        #         print(f"Epoch {epoch+1}, {name}, grad mean: {param.grad.mean().item()}, grad std: {param.grad.std().item()}")
        
        
        train_loader_tqdm.set_postfix(loss=np.mean(losses))

Epoch 1/3: 100%|██████████| 1544/1544 [00:14<00:00, 105.56batch/s, loss=1.14]
Epoch 2/3: 100%|██████████| 1544/1544 [00:14<00:00, 108.64batch/s, loss=1.14]
Epoch 3/3: 100%|██████████| 1544/1544 [00:14<00:00, 108.82batch/s, loss=1.14]


In [32]:
print("\nAfter training:")
for name, param in model.named_parameters():
    print(name, param.data)


After training:
fc.weight tensor([[-6.6770e-02, -1.6219e-01, -1.9123e-01,  3.3264e-02, -5.5131e-02,
          1.9856e-01, -7.4966e-02,  1.0415e-02,  4.2963e-03, -1.1996e-01,
          1.4478e-04, -2.4387e-01, -4.0595e-02,  1.0008e-02, -4.5290e-03,
         -6.7988e-02,  8.3328e-02,  2.5317e-02, -6.4764e-04,  2.5841e-01,
          4.4663e-03, -1.9136e-02, -1.8745e-02, -2.1727e-02,  2.3597e-02,
         -4.3400e-03,  1.4128e-02,  2.2811e-01,  3.1540e-02,  7.8307e-03,
          5.5843e-04,  1.8418e-02,  1.7266e-01, -1.9064e-01,  8.6679e-02,
          3.4993e-01, -1.3642e-02, -4.1345e-02,  4.5309e-02,  2.0498e-01,
          3.1002e-02, -4.3140e-02, -2.1724e-01,  1.3784e-02, -2.0480e-02,
         -3.6071e-02, -1.0989e-02, -5.0587e-02, -3.2193e-02, -6.1329e-02,
          7.8805e-03, -4.2035e-01, -1.7160e-01, -4.7590e-02,  1.4113e-02,
          2.6231e-02, -1.2820e-01, -7.5579e-02,  1.7730e-01,  1.9761e-01,
          2.5551e-02, -1.4347e-02,  3.6029e-02,  1.4598e-02,  2.4404e-03,
          5

In [None]:
# Evaluation
model.eval()
losses = []
unscaled_losses = []
test_loader_tqdm = tqdm(test_loader, desc='Evaluating', unit='batch')

with torch.no_grad():
    for inputs, labels in test_loader_tqdm:
        # Compute scaled outputs
        outputs = model(inputs)
        
        # Calculate scaled loss
        loss = criterion(outputs.reshape(-1, 1), labels.reshape(-1, 1))
        losses.append(loss.item())

        # Inverse transform the scaled outputs and labels to get unscaled values
        unscaled_outputs = target_scaler.inverse_transform(outputs.numpy().reshape(-1, 1))
        unscaled_labels = target_scaler.inverse_transform(labels.numpy().reshape(-1, 1))

        # Ensure the shapes are correct for unscaled loss calculation
        unscaled_outputs_tensor = torch.tensor(unscaled_outputs, dtype=torch.float32).reshape(-1)
        unscaled_labels_tensor = torch.tensor(unscaled_labels, dtype=torch.float32).reshape(-1)

        # Calculate unscaled loss
        unscaled_loss = criterion(unscaled_outputs_tensor, unscaled_labels_tensor)
        unscaled_losses.append(unscaled_loss.item())

# Calculate mean losses
mean_loss = np.mean(losses)
mean_unscaled_loss = np.mean(unscaled_losses)

print(f'Test Loss: {mean_loss}')
print(f'Unscaled Test Loss: {mean_unscaled_loss}')


# MinMaxScaler
Evaluating: 100%|██████████| 386/386 [00:02<00:00, 159.17batch/s]
Test Loss: 0.0031778891372756485
Unscaled Test Loss: 939906541196.6011


# StandardScaler
Evaluating: 100%|██████████| 386/386 [00:02<00:00, 164.46batch/s]
Test Loss: 0.8869521320083328
Unscaled Test Loss: 783619205189.9689

In [None]:
import random

random_row = dft.iloc[[random.choice(range(len(df)))]]

X = random_row.drop(columns=[target_col] + special_cols).astype(float)
y = random_row[target_col]

X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32)

pred = model(X_tensor).detach().numpy()
pred_it = target_scaler.inverse_transform(pred).item()

y_it = target_scaler.inverse_transform(y_tensor.reshape(-1, 1)).item()

pred_it, y_it