# 0-1 Dataset: linear regression on `data/flight_data_0-1.csv`

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

In [10]:
# Read the 0-1 flight dataset from its CSV file into a DataFrame.
# The path is relative, so the notebook must be run from the directory
# that contains the `data/` folder.
file_path = "data/flight_data_0-1.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

In [11]:
# Column roles used by the cells below:
#   - categorical_features are one-hot encoded,
#   - numerical_features are standardized,
#   - target is the column the regression predicts.
# NOTE(review): ELAPSED_TIME and AIR_TIME look like values that are only known
# after the flight has landed — confirm they are intended as predictors.
# NOTE(review): MONTH, DAY_OF_WEEK and DEP_HOUR are treated as plain numbers
# here — confirm that a linear effect is what is wanted for them.
categorical_features = [
    "AIRLINE",
    "ORIGIN",
    "DEST",
]
numerical_features = [
    "ELAPSED_TIME",
    "AIR_TIME",
    "DISTANCE",
    "MONTH",
    "DAY_OF_WEEK",
    "DEP_HOUR",
    "MONTHLY_DELAY_INDICATOR",
    "ROUTE_DELAY_INDICATOR",
]
target = "ARR_DELAY"

In [12]:
# Select the model inputs (categorical columns first, then numerical ones)
# and the column to predict.
X = data[[*categorical_features, *numerical_features]]
y = data[target]

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Preprocessing. Both transformers learn their parameters from the training
# split only and are then applied, unchanged, to the test split.

# Numerical columns: standardize with StandardScaler.
scaler = StandardScaler()
scaler.fit(X_train[numerical_features])
X_train_num = scaler.transform(X_train[numerical_features])
X_test_num = scaler.transform(X_test[numerical_features])

# Categorical columns: one-hot encode.
#   handle_unknown='ignore' -> a category absent from the training split does
#                              not raise when it shows up in the test split
#   sparse_output=False     -> dense arrays, so they can be stacked with the
#                              scaled numerical block below
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(X_train[categorical_features])
X_train_cat = encoder.transform(X_train[categorical_features])
X_test_cat = encoder.transform(X_test[categorical_features])

# Final design matrices: numerical block on the left, one-hot block on the right.
X_train_final = np.hstack((X_train_num, X_train_cat))
X_test_final = np.hstack((X_test_num, X_test_cat))

In [14]:
# Fit a linear regression on the preprocessed training matrix
# (`fit` returns the estimator itself, so it can be assigned directly).
model = LinearRegression().fit(X_train_final, y_train)

# Predict the target for the held-out test rows.
y_pred = model.predict(X_test_final)

In [15]:
# Score the test-set predictions against the true target values:
# mean squared error and the R^2 coefficient of determination.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics.
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 0.13753838874633395
R^2 Score: 0.1685314295226874


# 15+ Dataset: linear regression on `data/flight_data_15+.csv`

In [16]:
# Read the second dataset (flight_data_15+.csv) into its own DataFrame.
# The path is relative, so the notebook must be run from the directory
# that contains the `data/` folder.
file_path_15_plus = "data/flight_data_15+.csv"
data_15_plus = pd.read_csv(filepath_or_buffer=file_path_15_plus)

In [17]:
# Select the same input columns and target column as for the first dataset
# (categorical columns first, then numerical ones).
X_15_plus = data_15_plus[[*categorical_features, *numerical_features]]
y_15_plus = data_15_plus[target]

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train_15_plus, X_test_15_plus, y_train_15_plus, y_test_15_plus = train_test_split(
    X_15_plus,
    y_15_plus,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Preprocessing for the 15+ dataset. Fresh transformers are created here, so
# nothing learned from the first dataset is reused. Each one is fitted on the
# training split only and then applied, unchanged, to the test split.

# Numerical columns: standardize with StandardScaler.
scaler_15_plus = StandardScaler()
scaler_15_plus.fit(X_train_15_plus[numerical_features])
X_train_num_15_plus = scaler_15_plus.transform(X_train_15_plus[numerical_features])
X_test_num_15_plus = scaler_15_plus.transform(X_test_15_plus[numerical_features])

# Categorical columns: one-hot encode.
#   handle_unknown='ignore' -> a category absent from the training split does
#                              not raise when it shows up in the test split
#   sparse_output=False     -> dense arrays, so they can be stacked with the
#                              scaled numerical block below
encoder_15_plus = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder_15_plus.fit(X_train_15_plus[categorical_features])
X_train_cat_15_plus = encoder_15_plus.transform(X_train_15_plus[categorical_features])
X_test_cat_15_plus = encoder_15_plus.transform(X_test_15_plus[categorical_features])

# Final design matrices: numerical block on the left, one-hot block on the right.
X_train_final_15_plus = np.hstack((X_train_num_15_plus, X_train_cat_15_plus))
X_test_final_15_plus = np.hstack((X_test_num_15_plus, X_test_cat_15_plus))

In [20]:
# Fit a separate linear regression on the preprocessed 15+ training matrix
# (`fit` returns the estimator itself, so it can be assigned directly).
model_15_plus = LinearRegression().fit(X_train_final_15_plus, y_train_15_plus)

# Predict the target for the held-out 15+ test rows.
y_pred_15_plus = model_15_plus.predict(X_test_final_15_plus)

In [21]:
# Score the 15+ test-set predictions against the true target values:
# mean squared error and the R^2 coefficient of determination.
mse_15_plus = mean_squared_error(y_true=y_test_15_plus, y_pred=y_pred_15_plus)
r2_15_plus = r2_score(y_true=y_test_15_plus, y_pred=y_pred_15_plus)

# Report both metrics under a header naming the dataset.
print("Dataset 15+:")
print("Mean Squared Error:", mse_15_plus)
print("R^2 Score:", r2_15_plus)

Dataset 15+:
Mean Squared Error: 1337.7629515480642
R^2 Score: 0.08999619408772153
