In [4]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# Load datasets
df_reg = pd.read_csv('/content/drive/My Drive/flight_data_regression_with_features.csv')
df_cla = pd.read_csv('/content/drive/My Drive/flight_data_classification_with_features.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Define a function to preprocess, train, and evaluate the models
def process_and_evaluate(dataframe, target_col, dataset_name):
    print(f"\n--- Processing {dataset_name} ---")

    # Splitting data into features (X) and target (y)
    X = dataframe.drop(columns=[target_col])
    y = dataframe[target_col]

    # Identify categorical and numerical columns
    categorical_cols = X.select_dtypes(include=['object']).columns
    numerical_cols = X.select_dtypes(include=['number']).columns

    # Preprocessing pipeline: One-hot encode categorical data, scale numerical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ]
    )

    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Transform the data
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Lasso Regression
    lasso = Lasso(alpha=1.0, random_state=42)
    lasso.fit(X_train_processed, y_train)
    lasso_preds = lasso.predict(X_test_processed)
    lasso_rmse = np.sqrt(mean_squared_error(y_test, lasso_preds))
    lasso_r2 = r2_score(y_test, lasso_preds)

    print(f"Lasso Results for {dataset_name}:")
    print(f"RMSE: {lasso_rmse:.4f}")
    print(f"R^2: {lasso_r2:.4f}")

    # Ridge Regression
    ridge = Ridge(alpha=1.0, random_state=42)
    ridge.fit(X_train_processed, y_train)
    ridge_preds = ridge.predict(X_test_processed)
    ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_preds))
    ridge_r2 = r2_score(y_test, ridge_preds)

    print(f"Ridge Results for {dataset_name}:")
    print(f"RMSE: {ridge_rmse:.4f}")
    print(f"R^2: {ridge_r2:.4f}")

# Process and evaluate both datasets
process_and_evaluate(df_01, target_col='ARR_DELAY', dataset_name='df_reg')
process_and_evaluate(df_15, target_col='ARR_DELAY', dataset_name='df_cla')



--- Processing df_reg ---
Lasso Results for df_reg:
RMSE: 0.4067
R^2: -0.0000
Ridge Results for df_reg:
RMSE: 0.3709
R^2: 0.1685

--- Processing df_cla ---
Lasso Results for df_cla:
RMSE: 37.8813
R^2: 0.0239
Ridge Results for df_cla:
RMSE: 36.5753
R^2: 0.0900
