Baseline Models: Logistic Regression and Decision Trees

In [1]:
# necessary modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


In [None]:
# Load the merged dataset; the relative path is resolved against the
# current working directory of the kernel.
merged_df = pd.read_csv(
    filepath_or_buffer='merged_df.csv',
)

In [None]:
# Multinomial logistic regression: build the feature matrix and the labels.
#
# Target: DEP_DELAY_GROUP. The target itself plus DEP_DATE_TIME, ACC_DEP_TIME
# and DEP_DELAY_NEW are excluded from the features.
# NOTE(review): presumably these are dropped because they are timestamps or
# reveal the realised delay (target leakage) — confirm against the data dictionary.
X = merged_df.drop(columns= ['DEP_DATE_TIME', 'ACC_DEP_TIME', 'DEP_DELAY_NEW', 'DEP_DELAY_GROUP'])
Y = merged_df['DEP_DELAY_GROUP']

# ORIGIN and DEST are one-hot encoded; every other remaining column is
# treated as numerical and standardised.
categorical_cols = ['ORIGIN', 'DEST']
numerical_cols = [col for col in X.columns if col not in categorical_cols]


# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        # handle_unknown='ignore': an ORIGIN/DEST value that only occurs in the
        # test split is encoded as all zeros instead of raising at transform
        # time (combining it with drop='first' needs scikit-learn >= 1.0).
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
    ])

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Fit the preprocessing on the training split only, then apply it to both
# splits so that no test-set statistics leak into training.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


def _as_dense(matrix):
    """Return ``matrix`` as a dense array.

    ColumnTransformer returns a sparse matrix only when the stacked output is
    sparse enough; otherwise it already returns a dense ndarray, which has no
    ``toarray()`` method. Densify only when the method exists.
    """
    return matrix.toarray() if hasattr(matrix, 'toarray') else matrix


# Convert to PyTorch tensors (float32 features, int64 class labels).
# From here on X_train/X_test/Y_train/Y_test are tensors, not pandas objects.
X_train = torch.as_tensor(_as_dense(X_train_processed), dtype=torch.float32)
Y_train = torch.as_tensor(Y_train.to_numpy(), dtype=torch.long)
X_test = torch.as_tensor(_as_dense(X_test_processed), dtype=torch.float32)
Y_test = torch.as_tensor(Y_test.to_numpy(), dtype=torch.long)


# Define multinomial logistic regression model
class MultinomialLogisticRegression(nn.Module):
    """Multinomial (softmax) logistic regression expressed as one linear layer.

    The module returns raw logits. It is meant to be paired with
    ``nn.CrossEntropyLoss``, which applies the softmax itself.
    """

    def __init__(self, input_dim, output_dim):
        """Create the linear map from ``input_dim`` features to ``output_dim`` class scores."""
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return the unnormalised class scores (logits) for the batch ``x``."""
        logits = self.linear(x)
        return logits

# Model, loss, optimizer
# Seed torch so the weight initialisation (and therefore the reported loss and
# accuracy) is reproducible; 42 matches the random_state of the data split.
torch.manual_seed(42)

input_dim = X_train.shape[1]  # Number of features after preprocessing

# CrossEntropyLoss expects integer class indices in [0, output_dim). Size the
# output layer from the largest label rather than from the number of distinct
# labels, so a label value that is absent from the training split cannot push
# a target out of range.
if Y_train.min().item() < 0:
    raise ValueError('Class labels must be non-negative integers for CrossEntropyLoss')
output_dim = int(Y_train.max().item()) + 1
model = MultinomialLogisticRegression(input_dim, output_dim)

criterion = nn.CrossEntropyLoss()  # Includes softmax
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training loop: full-batch gradient descent, one optimizer step per epoch
num_epochs = 100
model.train()  # Training mode only needs to be set once, before the loop
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, Y_train)
    loss.backward()
    optimizer.step()

    # Report the training loss every 10th epoch
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluate accuracy on the held-out split
model.eval()  # Set model to evaluation mode
with torch.no_grad():
    # Predicted class = index of the largest logit
    y_pred = torch.argmax(model(X_test), dim=1)
    accuracy = (y_pred == Y_test).float().mean()
    print(f'Accuracy: {accuracy:.4f}')

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Neural network: build the feature matrix and the labels from merged_df.
#
# Uses the same feature definition as the logistic-regression model above:
#   * DEP_DELAY_NEW, DEP_DATE_TIME and ACC_DEP_TIME are dropped together with
#     the target, so the network cannot read the delay off its own inputs.
#   * ORIGIN and DEST are one-hot encoded rather than passed through raw.
#     NOTE(review): presumably these are string columns, which cannot be
#     converted to a float32 tensor — confirm against the data.
X = merged_df.drop(columns=['DEP_DATE_TIME', 'ACC_DEP_TIME', 'DEP_DELAY_NEW', 'DEP_DELAY_GROUP'])
labels = merged_df['DEP_DELAY_GROUP'].to_numpy()

# Full label tensor (long dtype for classification); the model-setup code
# below derives the number of classes from it.
Y = torch.as_tensor(labels, dtype=torch.long)

nn_categorical_cols = ['ORIGIN', 'DEST']
nn_numerical_cols = [col for col in X.columns if col not in nn_categorical_cols]

# Split data into train and test sets (done on pandas/NumPy objects; tensors
# are created once, after preprocessing)
X_train_df, X_test_df, Y_train_np, Y_test_np = train_test_split(
    X, labels, test_size=0.2, random_state=42
)

# Normalize numerical columns and one-hot encode categorical ones.
# sparse_threshold=0 makes the transformer always return a dense array, and
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# (combining it with drop='first' needs scikit-learn >= 1.0).
nn_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), nn_numerical_cols),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), nn_categorical_cols),
    ],
    sparse_threshold=0,
)

# Fit on the training split only, then convert each split to a tensor once
X_train = torch.as_tensor(nn_preprocessor.fit_transform(X_train_df), dtype=torch.float32)
X_test = torch.as_tensor(nn_preprocessor.transform(X_test_df), dtype=torch.float32)
Y_train = torch.as_tensor(Y_train_np, dtype=torch.long)
Y_test = torch.as_tensor(Y_test_np, dtype=torch.long)

# Keep the `scaler` name available: the fitted StandardScaler for the numerical columns
scaler = nn_preprocessor.named_transformers_['num']

# Define a simple neural network model
class SimpleNN(nn.Module):
    """Feed-forward classifier with a single hidden layer of 64 units.

    Architecture: Linear(input_size, 64) -> ReLU -> Linear(64, num_classes).
    The output is raw logits, which is what ``nn.CrossEntropyLoss`` expects.
    """

    def __init__(self, input_size, num_classes):
        """Build the hidden layer, its activation and the output layer."""
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=64)  # hidden layer
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=64, out_features=num_classes)  # output layer

    def forward(self, x):
        """Return the logits for the batch ``x`` (no softmax applied)."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Initialize model
# Seed torch so the weight initialisation (and therefore the reported loss and
# accuracy) is reproducible; 42 matches the random_state of the data split.
torch.manual_seed(42)

input_size = X_train.shape[1]

# CrossEntropyLoss expects integer class indices in [0, num_classes). Derive
# the class count from the largest label rather than the number of distinct
# labels, so a gap in the label values cannot push a target out of range.
if Y.min().item() < 0:
    raise ValueError('Class labels must be non-negative integers for CrossEntropyLoss')
num_classes = int(Y.max().item()) + 1
model = SimpleNN(input_size, num_classes)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop: full-batch, one optimizer step per epoch
num_epochs = 100
model.train()  # explicit training mode, matching the logistic-regression cell
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, Y_train)
    loss.backward()
    optimizer.step()

    # Report the training loss every 10th epoch
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Make predictions on the held-out split
model.eval()  # explicit evaluation mode before inference
with torch.no_grad():
    y_pred = model(X_test)
    # Predicted class = index of the largest logit
    y_pred_classes = torch.argmax(y_pred, dim=1)

# Evaluate accuracy
accuracy = accuracy_score(Y_test.numpy(), y_pred_classes.numpy())
print(f'Accuracy: {accuracy:.2f}')


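# TODO: not sure whether this is needed, but here is how to visualise a decision tree.
# NOTE: `clf` and `data` are not defined in the cells above; fit a DecisionTreeClassifier first.
# plt.figure(figsize=(12,8))
# tree.plot_tree(clf, filled=True, feature_names = data.feature_names, class_names = data.target_names)
# plt.show()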