In [1]:
# Colab-only setup: attach the user's Google Drive to this runtime's filesystem
# so that files under the mount point can be read by later cells.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [3]:
# Both project CSVs live in the same Google Drive folder. Keeping the folder in
# a single constant guarantees the two paths stay consistent and makes it a
# one-line change to point the notebook at a different location.
# Requires Drive to be mounted at /content/drive beforehand.
DATA_DIR = '/content/drive/My Drive/476 project/datasets'

# One DataFrame per source file; the suffix mirrors the file name.
df_01 = pd.read_csv(f'{DATA_DIR}/flight_data_0-1.csv')
df_15 = pd.read_csv(f'{DATA_DIR}/flight_data_15+.csv')

In [6]:
# Model input columns, grouped by how the preprocessing step treats them.

# One-hot encoded downstream.
categorical_features = [
    'AIRLINE',
    'ORIGIN',
    'DEST',
]

# Standardised (zero mean / unit variance) downstream.
numerical_features = [
    'AIR_TIME',
    'DISTANCE',
    'MONTH',
    'DAY_OF_WEEK',
    'DEP_HOUR',
    'MONTHLY_DELAY_INDICATOR',
    'ROUTE_DELAY_INDICATOR',
]

## Dataset 0-1: KNN regression on `flight_data_0-1.csv`

In [7]:
# --- Data -------------------------------------------------------------------
# Work on a reproducible 10% random subsample of the 0-1 dataset.
data_sample_01 = df_01.sample(frac=0.1, random_state=42)

# Feature matrix (categorical columns first, then numerical) and target column.
feature_columns = categorical_features + numerical_features
X1 = data_sample_01[feature_columns]
y1 = data_sample_01['ARR_DELAY']

# Hold out 20% of the sampled rows for evaluation; fixed seed for repeatability.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=42,
)

# --- Model ------------------------------------------------------------------
# Numerical columns are standardised; categorical columns are one-hot encoded.
# handle_unknown='ignore' keeps prediction from failing on categories that were
# not present in the training split.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Preprocessing and the 5-nearest-neighbours regressor are chained so that the
# scaler/encoder are fitted on the training split only.
knn_regressor = KNeighborsRegressor(n_neighbors=5)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('knn', knn_regressor),
])

# --- Fit and evaluate -------------------------------------------------------
pipeline.fit(X1_train, y1_train)
y1_pred = pipeline.predict(X1_test)

# Mean squared error between held-out targets and predictions.
mse = mean_squared_error(y1_test, y1_pred)
print(f"Mean Squared Error (MSE): {mse}")

Mean Squared Error (MSE): 0.18833991861768004


## Dataset 15+: KNN regression on `flight_data_15+.csv`

In [8]:
# --- Data -------------------------------------------------------------------
# Reproducible 10% random subsample of the 15+ dataset.
data_sample_15 = df_15.sample(frac=0.1, random_state=42)

# Select model inputs (categorical columns first, then numerical) and the target.
model_columns = categorical_features + numerical_features
X2 = data_sample_15[model_columns]
y2 = data_sample_15['ARR_DELAY']

# 80/20 train/test split with a fixed seed.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.2,
    random_state=42,
)

# --- Model ------------------------------------------------------------------
# Scale the numerical columns and one-hot encode the categorical ones;
# categories unseen during fitting are ignored instead of raising an error.
column_steps = [
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Chain preprocessing with a 5-nearest-neighbours regressor so both are fitted
# together on the training split.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('knn', KNeighborsRegressor(n_neighbors=5)),
]
pipeline = Pipeline(steps=pipeline_steps)

# --- Fit and evaluate -------------------------------------------------------
pipeline.fit(X2_train, y2_train)
y2_pred = pipeline.predict(X2_test)

# Mean squared error on the held-out rows.
mse = mean_squared_error(y2_test, y2_pred)
print(f"Mean Squared Error (MSE): {mse}")

Mean Squared Error (MSE): 1682.7809659727543


In [10]:
# Convert the continuous predictions and the true targets of the 15+ model into
# 0/1 labels: 1 when the value is at least the threshold, 0 otherwise.
delay_threshold = 15

pred_at_or_above = y2_pred >= delay_threshold
true_at_or_above = y2_test >= delay_threshold
y2_pred_01 = pred_at_or_above.astype(int)
y2_test_01 = true_at_or_above.astype(int)

# On 0/1 labels each squared difference is 1 for a mismatch and 0 for a match,
# so this MSE is the fraction of rows where the two labels disagree.
mse = mean_squared_error(y2_test_01, y2_pred_01)
print(f"Mean Squared Error (MSE): {mse}")

Mean Squared Error (MSE): 0.35118240254762045
