In [2]:
#libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import os

def load_features(filepath):
    """Load a feature matrix from a ``.npy`` or ``.csv`` file.

    Parameters
    ----------
    filepath : str or os.PathLike
        Location of the file to read. The format is chosen from the file
        name's suffix, compared case-insensitively (``.NPY`` and ``.npy``
        are treated alike).

    Returns
    -------
    numpy.ndarray
        For ``.npy`` files, whatever ``np.load`` returns. For ``.csv`` files,
        the ``.values`` of the DataFrame produced by ``pd.read_csv`` with its
        default options.

    Raises
    ------
    ValueError
        If the file name ends in neither ``.npy`` nor ``.csv``.
    """
    # os.fspath lets callers pass pathlib.Path objects as well as plain
    # strings; a bare str has .endswith but a Path does not.
    path = os.fspath(filepath)
    suffix_probe = path.lower()

    if suffix_probe.endswith('.npy'):
        return np.load(path)
    if suffix_probe.endswith('.csv'):
        return pd.read_csv(path).values
    raise ValueError("Unsupported file format: must be .npy or .csv")


In [6]:
#file paths
import os


# Directory that holds every saved artefact read (and later written) by this
# notebook. The original machine-specific location stays as the default, but
# it can be overridden without editing the notebook by setting the
# FLIGHT_DELAY_RESULTS_DIR environment variable before starting the kernel.
RESULTS_DIR = os.environ.get(
    'FLIGHT_DELAY_RESULTS_DIR',
    r'C:/Users/Welcome/OneDrive - NSBM/Desktop/3rd_year/flight delay/flight-delay-prediction-ml/flight-delay-prediction-ml/results',
)

# Fail early with a readable message instead of at the first np.load call.
if not os.path.isdir(RESULTS_DIR):
    raise FileNotFoundError(
        f"RESULTS_DIR does not exist: {RESULTS_DIR!r}. "
        "Set the FLIGHT_DELAY_RESULTS_DIR environment variable to the results folder."
    )


def _in_results(filename):
    """Return the full path of ``filename`` inside ``RESULTS_DIR``."""
    return os.path.join(RESULTS_DIR, filename)


# 1. Standard pipeline (already split)
X_train_std = np.load(_in_results('X_train_std.npy'))
X_test_std = np.load(_in_results('X_test_std.npy'))
y_train_std = np.load(_in_results('y_train_std.npy'))
y_test_std = np.load(_in_results('y_test_std.npy'))

# 2. Advanced pipeline (already split)
# Features are stored as CSV, targets as .npy.
X_train_adv = pd.read_csv(_in_results('X_train_adv.csv')).values
X_test_adv = pd.read_csv(_in_results('X_test_adv.csv')).values
y_train_adv = np.load(_in_results('y_train_adv.npy'))
y_test_adv = np.load(_in_results('y_test_adv.npy'))

# 3. Preprocessed features sample (split now)
X_sample = load_features(_in_results('processed_features_sample.csv'))
# .ravel() flattens the target frame to 1-D.
# NOTE(review): assumes target_sample.csv holds a single column - confirm.
y_sample = pd.read_csv(_in_results('target_sample.csv')).values.ravel()

# 80/20 hold-out split, stratified on the target so both parts keep the same
# class proportions; the fixed random_state makes the split repeatable.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_sample, y_sample, test_size=0.2, random_state=42, stratify=y_sample
)

# 4. Preprocessed features advanced (split now)
X_adv_sample = load_features(_in_results('processed_features_advanced.csv'))
# NOTE(review): assumes target_advanced.csv holds a single column - confirm.
y_adv_sample = pd.read_csv(_in_results('target_advanced.csv')).values.ravel()

# Same split settings as for the sample features above.
X_train_adv_s, X_test_adv_s, y_train_adv_s, y_test_adv_s = train_test_split(
    X_adv_sample, y_adv_sample, test_size=0.2, random_state=42, stratify=y_adv_sample
)


In [7]:
# Gather every train/test split under a display name so the evaluation loop
# can treat all four sources uniformly.
def _bundle(name, X_train, X_test, y_train, y_test):
    """Pack one named train/test split into the dict layout used downstream."""
    return {
        'name': name,
        'X_train': X_train, 'X_test': X_test,
        'y_train': y_train, 'y_test': y_test,
    }


datasets = [
    _bundle('Standard Pipeline (Split)',
            X_train_std, X_test_std, y_train_std, y_test_std),
    _bundle('Advanced Pipeline (Split)',
            X_train_adv, X_test_adv, y_train_adv, y_test_adv),
    _bundle('Preprocessed Features Sample',
            X_train_s, X_test_s, y_train_s, y_test_s),
    _bundle('Preprocessed Features Advanced',
            X_train_adv_s, X_test_adv_s, y_train_adv_s, y_test_adv_s),
]


In [8]:
#model training and evaluation loop
def _fit_and_score(dataset_name, model_name, model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` and return one summary row of hold-out metrics.

    Parameters
    ----------
    dataset_name : str
        Label stored under the 'Preprocessing' key of the returned row.
    model_name : str
        Label stored under the 'Model' key and used in the progress message.
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``; it is
        fitted in place.
    X_fit, y_fit
        Training features and labels passed to ``model.fit``.
    X_eval, y_eval
        Held-out features and labels used for the metrics.

    Returns
    -------
    dict
        Keys 'Preprocessing', 'Model', 'Accuracy', 'F1' and 'ROC-AUC'.
    """
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_eval)
    # Column 1 of predict_proba is used as the score for ROC-AUC.
    # NOTE(review): assumes a binary target whose positive class is the
    # second entry of model.classes_ - confirm.
    y_proba = model.predict_proba(X_eval)[:, 1]
    row = {
        'Preprocessing': dataset_name,
        'Model': model_name,
        'Accuracy': accuracy_score(y_eval, y_pred),
        'F1': f1_score(y_eval, y_pred),
        'ROC-AUC': roc_auc_score(y_eval, y_proba)
    }
    print(f"{model_name} trained & evaluated.")
    return row


# Re-created on every run of this cell so re-running does not duplicate rows.
results = []

for dataset in datasets:
    print(f"\n=== {dataset['name']} ===")
    X_train, X_test = dataset['X_train'], dataset['X_test']
    y_train, y_test = dataset['y_train'], dataset['y_test']

    # -- Logistic Regression (scaling required)
    # The scaler is fitted on the training part only and then applied to the
    # test part, so no test-set statistics reach the model.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    # NOTE(review): the recorded run reports F1 = 0.0 with ~0.80 accuracy for
    # this model on two datasets, which looks like it never predicts the
    # positive class; class_weight='balanced' may be worth evaluating.
    model_lr = LogisticRegression(max_iter=1000, random_state=42)
    results.append(_fit_and_score(
        dataset['name'], 'Logistic Regression', model_lr,
        X_train_scaled, y_train, X_test_scaled, y_test,
    ))

    # -- Random Forest (trained on the unscaled features)
    model_rf = RandomForestClassifier(random_state=42)
    results.append(_fit_and_score(
        dataset['name'], 'Random Forest', model_rf,
        X_train, y_train, X_test, y_test,
    ))

    # -- XGBoost (trained on the unscaled features)
    # `use_label_encoder` is no longer passed: the installed XGBoost reports
    # it as an unused parameter on every fit.
    model_xgb = XGBClassifier(eval_metric='logloss', random_state=42)
    results.append(_fit_and_score(
        dataset['name'], 'XGBoost', model_xgb,
        X_train, y_train, X_test, y_test,
    ))



=== Standard Pipeline (Split) ===
Logistic Regression trained & evaluated.
Random Forest trained & evaluated.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost trained & evaluated.

=== Advanced Pipeline (Split) ===
Logistic Regression trained & evaluated.
Random Forest trained & evaluated.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost trained & evaluated.

=== Preprocessed Features Sample ===
Logistic Regression trained & evaluated.
Random Forest trained & evaluated.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost trained & evaluated.

=== Preprocessed Features Advanced ===
Logistic Regression trained & evaluated.
Random Forest trained & evaluated.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost trained & evaluated.


In [9]:
# Turn the list of per-model metric rows into a table and show it inline.
results_df = pd.DataFrame(results)
display(results_df)

# Write the same table next to the other artefacts in RESULTS_DIR; the row
# index is left out of the file.
summary_path = os.path.join(RESULTS_DIR, 'model_results_summary.csv')
results_df.to_csv(summary_path, index=False)
print("Results saved as model_results_summary.csv")


Unnamed: 0,Preprocessing,Model,Accuracy,F1,ROC-AUC
0,Standard Pipeline (Split),Logistic Regression,0.8017,0.0,0.622821
1,Standard Pipeline (Split),Random Forest,0.79305,0.14182,0.636793
2,Standard Pipeline (Split),XGBoost,0.8002,0.108434,0.660425
3,Advanced Pipeline (Split),Logistic Regression,0.8006,0.033915,0.631607
4,Advanced Pipeline (Split),Random Forest,0.78445,0.176819,0.636056
5,Advanced Pipeline (Split),XGBoost,0.797,0.130249,0.65206
6,Preprocessed Features Sample,Logistic Regression,0.798435,0.0,0.627297
7,Preprocessed Features Sample,Random Forest,0.790405,0.145816,0.641279
8,Preprocessed Features Sample,XGBoost,0.800061,0.131567,0.668149
9,Preprocessed Features Advanced,Logistic Regression,0.79995,0.081286,0.695895


Results saved as model_results_summary.csv
