In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder

In [3]:
# Load Dataset 1
file_path_1 = '/content/drive/MyDrive/476 project/datasets/flight_data_0-1.csv'
df_01 = pd.read_csv(file_path_1)

# Print the first few rows of Dataset 1
print("Dataset 1 (flight_data_0-1.csv) Head:")
print(df_01.head())


Dataset 1 (flight_data_0-1.csv) Head:
                  AIRLINE ORIGIN DEST  ARR_DELAY  CRS_ELAPSED_TIME  AIR_TIME  \
0   United Air Lines Inc.    FLL  EWR        0.0             186.0     153.0   
1    Delta Air Lines Inc.    MSP  SEA        0.0             235.0     189.0   
2   United Air Lines Inc.    DEN  MSP        0.0             118.0      87.0   
3    Delta Air Lines Inc.    MSP  SFO        1.0             260.0     249.0   
4  American Airlines Inc.    DCA  BOS        0.0             109.0      58.0   

   DISTANCE  MONTH  DAY_OF_WEEK  DEP_HOUR  MONTHLY_DELAY_INDICATOR  \
0    1065.0      1            2        11                -0.626652   
1    1399.0     11            5        21                -0.908221   
2     680.0      7            4         9                 0.980383   
3    1589.0      3            0        16                -0.345450   
4     399.0      6            6        10                 1.000000   

   ROUTE_DELAY_INDICATOR  
0              -0.445906  
1     

In [4]:
# Load Dataset 2
file_path_2 = '/content/drive/MyDrive/476 project/datasets/flight_data_15+.csv'
df_15 = pd.read_csv(file_path_2)

# Print the first few rows of Dataset 2
print("Dataset 2 (flight_data_15+.csv) Head:")
print(df_15.head())

Dataset 2 (flight_data_15+.csv) Head:
                  AIRLINE ORIGIN DEST  ARR_DELAY  CRS_ELAPSED_TIME  AIR_TIME  \
0   United Air Lines Inc.    FLL  EWR        0.0             186.0     153.0   
1    Delta Air Lines Inc.    MSP  SEA        0.0             235.0     189.0   
2   United Air Lines Inc.    DEN  MSP        0.0             118.0      87.0   
3    Delta Air Lines Inc.    MSP  SFO       24.0             260.0     249.0   
4  American Airlines Inc.    DCA  BOS        0.0             109.0      58.0   

   DISTANCE  MONTH  DAY_OF_WEEK  DEP_HOUR  MONTHLY_DELAY_INDICATOR  \
0    1065.0      1            2        11                -0.626652   
1    1399.0     11            5        21                -0.908221   
2     680.0      7            4         9                 0.980383   
3    1589.0      3            0        16                -0.345450   
4     399.0      6            6        10                 1.000000   

   ROUTE_DELAY_INDICATOR  
0              -0.445906  
1     

In [5]:
# Define categorical columns
categorical_cols = ['AIRLINE', 'ORIGIN', 'DEST']

# Dataset 1
X_01 = df_01.drop(columns=['ARR_DELAY'])  # Features
y_01 = df_01['ARR_DELAY']                # Target

# Dataset 2
X_15 = df_15.drop(columns=['ARR_DELAY'])  # Features
y_15 = df_15['ARR_DELAY']                 # Target

# Apply Ordinal Encoding to both datasets
encoder = OrdinalEncoder()
X_01[categorical_cols] = encoder.fit_transform(X_01[categorical_cols])
X_15[categorical_cols] = encoder.fit_transform(X_15[categorical_cols])

# Split data for Dataset 1
X_train_01, X_test_01, y_train_01, y_test_01 = train_test_split(X_01, y_01, test_size=0.2, random_state=42)

# Split data for Dataset 2
X_train_15, X_test_15, y_train_15, y_test_15 = train_test_split(X_15, y_15, test_size=0.2, random_state=42)



In [6]:
# Train Decision Tree for Dataset 1
dt_01 = DecisionTreeRegressor(random_state=42)
dt_01.fit(X_train_01, y_train_01)

# Predict on the test set
y_pred_01 = dt_01.predict(X_test_01)

# Evaluate using Mean Squared Error (MSE)
mse_01 = mean_squared_error(y_test_01, y_pred_01)
print(f"Dataset 1 (flight_data_0-1.csv) MSE: {mse_01}")


Dataset 1 (flight_data_0-1.csv) MSE: 0.2990882772512609


In [7]:
# Train Decision Tree for Dataset 2
dt_15 = DecisionTreeRegressor(random_state=42)
dt_15.fit(X_train_15, y_train_15)

# Predict on the test set
y_pred_15 = dt_15.predict(X_test_15)

# Evaluate using Mean Squared Error (MSE)
mse_15 = mean_squared_error(y_test_15, y_pred_15)
print(f"Dataset 2 (flight_data_15+.csv) MSE: {mse_15}")


Dataset 2 (flight_data_15+.csv) MSE: 3071.2422852338723


In [10]:
# Recompute MSE for Decision Tree

# Dataset 1: Recompute MSE for 0-1 regression
dt_best_01 = DecisionTreeRegressor(random_state=42)
dt_best_01.fit(X_train_01, y_train_01)
y_test_pred_01_dt = dt_best_01.predict(X_test_01)
mse_01_dt = mean_squared_error(y_test_01, y_test_pred_01_dt)
print(f"Decimal MSE for Dataset 1 (0-1 regression): {mse_01_dt:.4f}")

# Dataset 2: Recompute predictions for 15+ dataset
dt_best_15 = DecisionTreeRegressor(random_state=42)
dt_best_15.fit(X_train_15, y_train_15)
y_test_pred_15_dt = dt_best_15.predict(X_test_15)

# Convert predictions and test labels to 0-1 binary for Dataset 2
y_test_pred_15_binary = (y_test_pred_15_dt >= 15).astype(int)  # Predictions: 0 or 1
y_test_15_binary = (y_test_15 >= 15).astype(int)  # Ground truth: 0 or 1
mse_15_dt_binary = mean_squared_error(y_test_15_binary, y_test_pred_15_binary)
print(f"Binary MSE for Dataset 2 (15+ converted to 0-1): {mse_15_dt_binary:.4f}")

# Compare Binary MSE
if mse_01_dt < mse_15_dt_binary:
    print("Dataset 1 (0-1 regression) model is better for binary classification.")
else:
    print("Dataset 2 (15+ converted to 0-1) model is better for binary classification.")

Decimal MSE for Dataset 1 (0-1 regression): 0.2991
Binary MSE for Dataset 2 (15+ converted to 0-1): 0.3030
Dataset 1 (0-1 regression) model is better for binary classification.
