In [1]:
#automated pipelines

In [2]:
import pandas as pd
import numpy as np
import os

DATA_PATH = r'C:/Users/Welcome/OneDrive - NSBM/Desktop/3rd_year/flight delay/flight_data_2024.csv'
RESULTS_DIR = r'C:/Users/Welcome/OneDrive - NSBM/Desktop/3rd_year/flight delay/flight-delay-prediction-ml/flight-delay-prediction-ml/results'
os.makedirs(RESULTS_DIR, exist_ok=True)  # Ensures output directory exists

# --- Load the raw table and draw a reproducible random sample ---
# NOTE(review): the saved output of this cell echoes the read_csv line, which
# is how Python prints the source of a warning -- presumably a DtypeWarning
# about mixed-type columns. Confirm, then pass an explicit `dtype=` mapping.
data = pd.read_csv(DATA_PATH)

# * Cap the sample at the table size: DataFrame.sample raises when asked for
#   more rows than exist (without replacement).
# * Drop sampled rows that have no `arr_delay`: `NaN > 15` evaluates to False,
#   so those rows would otherwise be labelled "not delayed" although their
#   outcome is unknown.
data_sample = (
    data.sample(n=min(100_000, len(data)), random_state=42)
    .dropna(subset=['arr_delay'])
)
features = ['month', 'day_of_week', 'op_unique_carrier', 'origin', 'dest', 'crs_dep_time', 'distance']
y = (data_sample['arr_delay'] > 15).astype(int)  # Binary target: 1 when arr_delay exceeds 15, else 0

# Split into train/test sets (80/20, stratified on the target so both parts
# keep the same delayed / not-delayed ratio)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    data_sample[features], y, test_size=0.2, random_state=42, stratify=y
)


  data = pd.read_csv(DATA_PATH)


In [3]:
# Standard automated pipeline

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

# Column groups handled by the standard preprocessor.
num_features = ['crs_dep_time', 'distance']
cat_features = ['month', 'day_of_week', 'op_unique_carrier', 'origin', 'dest']

# Numeric columns: fill missing values with the column median.
# Categorical columns: ordinal-encode; categories not seen during fit map to -1.
standard_preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), num_features),
        (
            'cat',
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            cat_features,
        ),
    ]
)

standard_pipeline = Pipeline(steps=[('preprocess', standard_preprocessor)])

# Fit on the training split only, then apply the fitted transform to the test split.
X_train_std = standard_pipeline.fit_transform(X_train)
X_test_std = standard_pipeline.transform(X_test)

# Persist the transformed features and the matching targets as .npy files.
for _npy_name, _npy_values in (
    ('X_train_std.npy', X_train_std),
    ('X_test_std.npy', X_test_std),
    ('y_train_std.npy', y_train),
    ('y_test_std.npy', y_test),
):
    np.save(os.path.join(RESULTS_DIR, _npy_name), _npy_values)



In [5]:
#advanced pipeline

In [8]:
from sklearn.impute import KNNImputer
from category_encoders import TargetEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np
import os

# --- Setup paths and features ---
# Feature groups consumed by the advanced pipeline.
numeric_features = ['crs_dep_time', 'distance']
categorical_features = ['month', 'day_of_week', 'op_unique_carrier', 'origin', 'dest']

# Numeric columns first, then categorical ones.
all_features = [*numeric_features, *categorical_features]

# Columns that get target-encoded; the last two are created by FeatureEngineer.
te_features = ['op_unique_carrier', 'origin', 'dest', 'dep_time_bin', 'carrier_origin']

# Output folder for the processed datasets; created if it is missing.
RESULTS_DIR = r'C:/Users/Welcome/OneDrive - NSBM/Desktop/3rd_year/flight delay/flight-delay-prediction-ml/flight-delay-prediction-ml/results'
os.makedirs(RESULTS_DIR, exist_ok=True)

# --- Custom Transformer (handles DataFrame conversion) ---
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Append two derived categorical columns to the feature table.

    * ``dep_time_bin`` -- bucket of ``crs_dep_time`` (presumably an HHMM
      clock value; confirm against the data dictionary):
      ``'morning'`` for [500, 1200), ``'afternoon'`` for [1200, 1700),
      ``'evening'`` for [1700, 2359], ``'night'`` for everything else.
    * ``carrier_origin`` -- ``op_unique_carrier`` and ``origin`` joined by ``'_'``.

    Parameters
    ----------
    columns : list of str
        Column names applied when ``transform`` receives something other than
        a DataFrame (for example the array output of a preceding step).
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``dep_time_bin`` and ``carrier_origin`` added.

        Parameters
        ----------
        X : pandas.DataFrame or array-like
            Must provide ``crs_dep_time``, ``op_unique_carrier`` and ``origin``
            (by name, or positionally via ``self.columns``).

        Returns
        -------
        pandas.DataFrame
            The input columns plus the two derived columns; the input object
            itself is not modified.
        """
        # Ensure input is a DataFrame
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.columns)
        X = X.copy()

        # Vectorised replacement for a per-row int() conversion: missing
        # values count as 0 and fractional values are truncated toward zero,
        # so every row lands in the same bucket as with the row-wise loop.
        dep_time = pd.to_numeric(X['crs_dep_time']).astype('float64').fillna(0.0)
        dep_time = np.trunc(dep_time).to_numpy()

        in_bucket = [
            (dep_time >= 500) & (dep_time < 1200),
            (dep_time >= 1200) & (dep_time < 1700),
            (dep_time >= 1700) & (dep_time <= 2359),
        ]
        bucket_labels = ['morning', 'afternoon', 'evening']
        # Anything outside the three ranges above (including the 0 used for
        # missing values) falls through to 'night'.
        X['dep_time_bin'] = np.select(in_bucket, bucket_labels, default='night')

        X['carrier_origin'] = X['op_unique_carrier'].astype(str) + '_' + X['origin'].astype(str)
        return X

# --- Main preprocessing pipeline ---
# Step 1: KNN-impute the numeric columns (5 neighbours) and forward the
# categorical columns unchanged.
imputer = ColumnTransformer(
    transformers=[
        ('num', KNNImputer(n_neighbors=5), numeric_features),
        ('cat', 'passthrough', categorical_features),
    ]
)

# Steps run in order: imputation -> derived features -> target encoding.
advanced_pipeline = Pipeline(
    steps=[
        ('imputer', imputer),
        ('feature_engineer', FeatureEngineer(columns=all_features)),
        ('target_encoder', TargetEncoder(cols=te_features)),
    ]
)

# --- Load and sample your data ---
# NOTE(review): the saved output of this cell echoes the read_csv line, which
# is how Python prints the source of a warning -- presumably a DtypeWarning
# about mixed-type columns. Confirm, then pass an explicit `dtype=` mapping.
data = pd.read_csv(r'C:/Users/Welcome/OneDrive - NSBM/Desktop/3rd_year/flight delay/flight_data_2024.csv')

# * Cap the sample at the table size: DataFrame.sample raises when asked for
#   more rows than exist (without replacement).
# * Drop sampled rows that have no `arr_delay`: `NaN > 15` evaluates to False,
#   so those rows would otherwise be labelled "not delayed" although their
#   outcome is unknown.
data_sample = (
    data.sample(n=min(100_000, len(data)), random_state=42)
    .dropna(subset=['arr_delay'])
)
features = all_features
y = (data_sample['arr_delay'] > 15).astype(int)  # 1 when arr_delay exceeds 15, else 0

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    data_sample[features], y, test_size=0.2, random_state=42, stratify=y
)

# Give features and targets a fresh 0..n-1 index so they stay row-aligned
# inside the pipeline (the target encoder receives both X and y).
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = pd.Series(y_train).reset_index(drop=True)
y_test = pd.Series(y_test).reset_index(drop=True)

# --- Execute pipeline ---
# Fit on the training split only; the test split is transformed with the
# fitted steps and never sees its own targets.
X_train_adv = advanced_pipeline.fit_transform(X_train, y_train)
X_test_adv = advanced_pipeline.transform(X_test)

# Convert to DataFrame for saving
X_train_adv_df = pd.DataFrame(X_train_adv)
X_test_adv_df = pd.DataFrame(X_test_adv)

X_train_adv_df.to_csv(os.path.join(RESULTS_DIR, 'X_train_adv.csv'), index=False)
X_test_adv_df.to_csv(os.path.join(RESULTS_DIR, 'X_test_adv.csv'), index=False)
np.save(os.path.join(RESULTS_DIR, 'y_train_adv.npy'), y_train)
np.save(os.path.join(RESULTS_DIR, 'y_test_adv.npy'), y_test)

print("Advanced preprocessing pipeline complete and data saved.")



  data = pd.read_csv(r'C:/Users/Welcome/OneDrive - NSBM/Desktop/3rd_year/flight delay/flight_data_2024.csv')


Advanced preprocessing pipeline complete and data saved.
