In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np

# Load the first dataset (flight_data_0-1.csv) from the project-relative data directory.
file_path_0_1 = "data/flight_data_0-1.csv"
data_0_1 = pd.read_csv(file_path_0_1)

# Feature columns, grouped by how they are preprocessed further down:
# categorical columns are one-hot encoded, numerical columns are standardised.
categorical_features = ["AIRLINE", "ORIGIN", "DEST"]
numerical_features = [
    "ELAPSED_TIME", "AIR_TIME", "DISTANCE", "MONTH", "DAY_OF_WEEK",
    "DEP_HOUR", "MONTHLY_DELAY_INDICATOR", "ROUTE_DELAY_INDICATOR"
]
# NOTE(review): ELAPSED_TIME and AIR_TIME look like post-flight actuals; if so they
# would not be available at prediction time (target leakage) -- confirm with the
# dataset documentation.

# Binary target: 1 = arrived late (ARR_DELAY > 0), 0 = on time or early.
target = "ARR_DELAY_BINARY"

# `NaN > 0` evaluates to False, so a missing ARR_DELAY would silently be labelled
# "on time". Remember which rows actually carry a label before deriving the target.
has_arr_delay_0_1 = data_0_1["ARR_DELAY"].notna()
data_0_1[target] = np.where(data_0_1["ARR_DELAY"] > 0, 1, 0)

n_unlabelled_0_1 = int((~has_arr_delay_0_1).sum())
if n_unlabelled_0_1:
    # Surface the exclusion instead of dropping rows silently.
    print(f"Excluding {n_unlabelled_0_1} rows with missing ARR_DELAY from X_0_1 / y_0_1")

# Model inputs and labels, restricted to rows whose outcome is known.
# When ARR_DELAY has no missing values this selects every row, as before.
labelled_rows_0_1 = data_0_1.loc[has_arr_delay_0_1]
X_0_1 = labelled_rows_0_1[categorical_features + numerical_features]
y_0_1 = labelled_rows_0_1[target]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on the target -- consider
# stratify=y_0_1 if the class balance should be preserved in both parts.
X_train_0_1, X_test_0_1, y_train_0_1, y_test_0_1 = train_test_split(
    X_0_1,
    y_0_1,
    test_size=0.2,
    random_state=42,
)

# Both transformers are fitted on the training rows only and then applied
# unchanged to the test rows, so no test-set statistics reach the model.

# Categorical columns -> dense one-hot indicators. Categories that appear only
# in the test rows are encoded as all zeros (handle_unknown='ignore').
# NOTE(review): sparse_output=False materialises a dense matrix; with many
# distinct ORIGIN/DEST values this may be memory-heavy.
encoder_0_1 = OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit(
    X_train_0_1[categorical_features]
)
X_train_cat_0_1 = encoder_0_1.transform(X_train_0_1[categorical_features])
X_test_cat_0_1 = encoder_0_1.transform(X_test_0_1[categorical_features])

# Numerical columns -> standardised with the training mean and standard deviation.
scaler_0_1 = StandardScaler().fit(X_train_0_1[numerical_features])
X_train_num_0_1 = scaler_0_1.transform(X_train_0_1[numerical_features])
X_test_num_0_1 = scaler_0_1.transform(X_test_0_1[numerical_features])

# Final design matrices: scaled numerical columns first, one-hot columns after.
X_train_final_0_1 = np.hstack([X_train_num_0_1, X_train_cat_0_1])
X_test_final_0_1 = np.hstack([X_test_num_0_1, X_test_cat_0_1])

# Fit a logistic-regression classifier on the preprocessed training matrix,
# allowing the solver up to 1000 iterations. `fit` returns the estimator itself,
# so construction and training are chained.
model_0_1 = LogisticRegression(max_iter=1000).fit(X_train_final_0_1, y_train_0_1)

# Predicted class labels (0 = on time, 1 = delayed) for the held-out rows.
y_pred_0_1 = model_0_1.predict(X_test_final_0_1)

# Score the held-out predictions: overall accuracy, the 2x2 confusion matrix
# (rows = true class, columns = predicted class) and the per-class
# precision / recall / F1 report.
accuracy_0_1 = accuracy_score(y_test_0_1, y_pred_0_1)
conf_matrix_0_1 = confusion_matrix(y_test_0_1, y_pred_0_1)
class_report_0_1 = classification_report(y_test_0_1, y_pred_0_1)

# Report the results; each multi-line value is printed on the line(s) after its header.
print("Dataset 0-1 Logistic Regression Results:")
print("Accuracy:", accuracy_0_1)
print("Confusion Matrix:", conf_matrix_0_1, sep="\n")
print("Classification Report:", class_report_0_1, sep="\n")


Dataset 0-1 Logistic Regression Results:
Accuracy: 0.8258051579648862
Confusion Matrix:
[[130836   3260]
 [ 26277   9190]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.98      0.90    134096
           1       0.74      0.26      0.38     35467

    accuracy                           0.83    169563
   macro avg       0.79      0.62      0.64    169563
weighted avg       0.81      0.83      0.79    169563

