Baseline Models: Logistic Regression and Decision Trees

In [2]:
# necessary modules
import pandas as pandas
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


In [None]:
# import dataset
# The pandas module is imported in this file under the name `pandas` (not `pd`),
# so the reader has to be called through that name.
merged_df = pandas.read_csv('merged_df.csv')

In [None]:
# Implement multinomial logistic regression
# Define target variables and feature variables
X = merged_df.drop(columns=['DEP_DELAY_NEW'])
y = merged_df['DEP_DELAY_NEW']
# NOTE(review): every distinct value of DEP_DELAY_NEW is treated as its own
# class below -- confirm the column has already been discretised into classes.
# NOTE(review): if X still contains a column derived from the target
# (e.g. DEP_DEL15), the model sees the answer -- confirm and drop if so.

# CrossEntropyLoss expects integer class indices in 0..C-1, so map the labels
# to contiguous codes (in sorted order). Labels that are already exactly
# 0..C-1 are left unchanged by this mapping.
# assumes the target has no missing values (they would be coded -1) -- TODO confirm
y_codes = y.astype('category').cat.codes

# Column groups for the preprocessing pipeline, derived from the dtypes:
# numeric columns are scaled, everything else is one-hot encoded.
numerical_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(exclude='number').columns.tolist()

# Split data into train and test sets (same ratio and seed as the decision tree cell)
X_train_raw, X_test_raw, y_train_codes, y_test_codes = train_test_split(
    X, y_codes, test_size=0.2, random_state=42)

# Preprocessing pipeline
# sparse_threshold=0 makes the transformer always return a dense array, so the
# tensor conversion below does not depend on whether the output was sparse.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first'), categorical_cols)
    ],
    sparse_threshold=0)

# Preprocess the data (fit on the training split only, then reuse on the test split)
X_train_processed = preprocessor.fit_transform(X_train_raw)
X_test_processed = preprocessor.transform(X_test_raw)

# Convert to PyTorch tensors
X_train = torch.tensor(X_train_processed, dtype=torch.float32)
y_train = torch.tensor(y_train_codes.to_numpy(), dtype=torch.long)
X_test = torch.tensor(X_test_processed, dtype=torch.float32)
y_test = torch.tensor(y_test_codes.to_numpy(), dtype=torch.long)

# Multinomial logistic regression expressed as a single linear layer
class MultinomialLogisticRegression(nn.Module):
    """Linear classifier producing one raw score per class.

    Args:
        input_dim: number of input features per sample.
        output_dim: number of scores produced per sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return the un-normalised scores (logits) for ``x``."""
        # No softmax is applied here; it is included in CrossEntropyLoss.
        logits = self.linear(x)
        return logits

# Model, loss, optimizer
input_dim = X_train.shape[1]  # Number of features after preprocessing
# CrossEntropyLoss requires every target index to be smaller than the number
# of model outputs, so size the output layer from the largest label rather
# than from the count of distinct labels. Both give the same value when the
# labels are exactly 0..C-1.
output_dim = int(y_train.max().item()) + 1
model = MultinomialLogisticRegression(input_dim, output_dim)

criterion = nn.CrossEntropyLoss()  # Includes softmax
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training loop: each epoch is one gradient step on the full training tensor
num_epochs = 100
model.train()  # Set model to training mode (once; nothing in the loop changes it)
for epoch in range(num_epochs):
    optimizer.zero_grad()  # Clear gradients left over from the previous step
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    # Report the loss every 10th epoch
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluate accuracy
model.eval()  # Set model to evaluation mode
with torch.no_grad():  # No gradients are needed for inference
    # Predicted class = index of the largest logit in each row
    y_pred = torch.argmax(model(X_test), dim=1)
    accuracy = (y_pred == y_test).float().mean()
    print(f'Accuracy: {accuracy:.4f}')

In [None]:
# implement decision trees
# define target variables and feature variables
# (the features must be bound to `X`, the name used in the split below)
X = merged_df.drop(columns=['DEP_DEL15'])
Y = merged_df['DEP_DEL15']
# NOTE(review): DEP_DELAY_NEW is still among the features here; if DEP_DEL15
# is derived from it, the tree is given the answer -- confirm and drop if so.
# NOTE(review): DecisionTreeClassifier needs numeric features; if merged_df
# holds string/categorical columns they must be encoded first -- confirm.

# split data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# create and train decision tree classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, Y_train)

# make predictions
y_pred = clf.predict(X_test)

# evaluate accuracy
accuracy = accuracy_score(Y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Optional: visualise the fitted tree (left disabled)
# plt.figure(figsize=(12,8))
# tree.plot_tree(clf, filled=True, feature_names=list(X.columns), class_names=[str(c) for c in clf.classes_])
# plt.show()