In [17]:
#helping functions
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

import math
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector, make_column_transformer

from sklearn import set_config

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDRegressor

from sklearn.metrics import mean_squared_error
import numpy as np
import sys
import io
import matplotlib.pyplot as plt

In [18]:
# The full data set is too large to process here, so the analysis uses a subset of it.
# NOTE(review): the original note says the run is limited to 30 million data points,
# but this call applies no row limit -- it loads whatever flight_price.csv contains.
# Confirm how the CSV was sampled.
df = pd.read_csv("flight_price.csv")


In [19]:
# - Drop columns with complex string formats (they might be helpful, but cleaning them takes time).
# - Given the limited resources of this project, we will use the following features:
#   + baseFare, seatsRemaining, totalTravelDistance, elapsedDays, flightDate,
#     startingAirport, destinationAirport, isBasicEconomy, isRefundable, isNonStop,
#     segmentsAirlineCode

In [20]:
# Columns kept for modelling, one per line so additions/removals diff cleanly.
# NOTE(review): baseFare looks like a near-direct component of totalFare --
# confirm it is really meant to be used as a predictor.
chosen_features = [
    # numeric
    "baseFare",
    "seatsRemaining",
    "totalTravelDistance",
    "elapsedDays",
    # date / route
    "flightDate",
    "startingAirport",
    "destinationAirport",
    # flags
    "isBasicEconomy",
    "isRefundable",
    "isNonStop",
    # carrier
    "segmentsAirlineCode",
    # price column (the default stratification column of split_train_test_dev)
    "totalFare",
]
df_processed = df[chosen_features]

In [21]:
# Show the selected columns as the cell output.
df_processed

Unnamed: 0,baseFare,seatsRemaining,totalTravelDistance,elapsedDays,flightDate,startingAirport,destinationAirport,isBasicEconomy,isRefundable,isNonStop,segmentsAirlineCode,totalFare
0,217.67,9,947.0,0,2022-04-17,ATL,BOS,False,False,True,DL,248.6
1,217.67,4,947.0,0,2022-04-17,ATL,BOS,False,False,True,DL,248.6
2,217.67,9,947.0,0,2022-04-17,ATL,BOS,False,False,True,DL,248.6
3,217.67,8,947.0,0,2022-04-17,ATL,BOS,False,False,True,DL,248.6
4,217.67,9,947.0,0,2022-04-17,ATL,BOS,False,False,True,DL,248.6
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,278.14,4,1806.0,0,2022-04-18,DEN,BOS,False,False,False,UA||UA,322.6
9996,278.14,9,1764.0,0,2022-04-18,DEN,BOS,False,False,False,DL||DL,322.6
9997,278.14,4,1815.0,0,2022-04-18,DEN,BOS,False,False,False,DL||DL,322.6
9998,278.14,9,1806.0,0,2022-04-18,DEN,BOS,False,False,False,DL||DL,322.6


In [22]:
def split_train_test_dev(X, y, flight_price_col='totalFare', test_ratio=0.2, dev_ratio=0.1):
    """Split features and targets into train / test / dev sets, stratified by price.

    The price column is binned into (up to) four equal-frequency bins and both
    splits are stratified on those bins, so every subset has a similar price
    distribution. Both splits use ``random_state=42``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns.
    y : pandas.DataFrame
        Target column(s).
    flight_price_col : str, default 'totalFare'
        Name of the column (looked up in ``X`` and ``y`` combined) whose values
        drive the stratification.
    test_ratio : float, default 0.2
        Fraction of ALL rows that goes to the test set.
    dev_ratio : float, default 0.1
        Fraction of ALL rows that goes to the dev set.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test, X_dev, y_train, y_test, y_dev)``.
        Note the order: the test split comes BEFORE the dev split here, whereas
        ``prepare_for_train`` in this file takes its arguments as
        ``(X_train, X_dev, X_test, y_train, y_dev, y_test)``.

    Raises
    ------
    ValueError
        If ``X`` or ``y`` is not a DataFrame, if the ratios are not each strictly
        between 0 and 1 with a sum below 1, or if ``flight_price_col`` is missing.
    """
    # Ensure the input is a pandas DataFrame
    if not isinstance(X, pd.DataFrame) or not isinstance(y, pd.DataFrame):
        raise ValueError("X and y must be pandas DataFrame objects")

    # Reject impossible ratios up front: test_ratio == 1 would otherwise divide by
    # zero below, and the other cases would only fail deep inside scikit-learn.
    if not (0 < test_ratio < 1 and 0 < dev_ratio < 1 and test_ratio + dev_ratio < 1):
        raise ValueError(
            "test_ratio and dev_ratio must each lie strictly between 0 and 1 "
            f"and sum to less than 1 (got test_ratio={test_ratio}, dev_ratio={dev_ratio})"
        )

    # Concatenate X and y so that rows stay paired through both splits
    X_columns = X.columns
    y_columns = y.columns
    data = pd.concat([X, y], axis=1)

    # Check if the price column exists
    if flight_price_col not in data.columns:
        raise ValueError(f"{flight_price_col} column does not exist in the DataFrame")

    # Equal-frequency binning of the price. The bins are kept in a separate
    # Series (not as a helper column) so nothing has to be dropped afterwards.
    # duplicates='drop' merges quartile edges that coincide, which happens when
    # many rows share the same fare; without it qcut raises.
    price_strata = pd.qcut(data[flight_price_col], q=4, labels=False, duplicates='drop')

    # dev_ratio is relative to the full data; the second split only sees the
    # rows left after the test set was removed, so rescale it.
    adjusted_dev_ratio = dev_ratio / (1 - test_ratio)

    # First split: (train + dev) vs. test. The strata are split alongside the
    # data so the second split can be stratified as well.
    data_temp, data_test, strata_temp, _ = train_test_split(
        data, price_strata, test_size=test_ratio, stratify=price_strata, random_state=42
    )

    # Second split: train vs. dev
    data_train, data_dev = train_test_split(
        data_temp, test_size=adjusted_dev_ratio, stratify=strata_temp, random_state=42
    )

    # Column selection returns new frames, so no in-place edits are needed
    X_train = data_train[X_columns]
    y_train = data_train[y_columns]
    X_dev = data_dev[X_columns]
    y_dev = data_dev[y_columns]
    X_test = data_test[X_columns]
    y_test = data_test[y_columns]

    return X_train, X_test, X_dev, y_train, y_test, y_dev

def fill_na(X, strategy = 'median'):
    """Fill the missing values of ``X`` using a SimpleImputer.

    X: array of shape (n_samples, n_features)
    strategy: imputation strategy forwarded to SimpleImputer (default 'median')

    The imputer is fitted on ``X`` itself and then applied to it; the imputed
    data is returned.
    """
    return SimpleImputer(strategy=strategy).fit_transform(X)

def get_outlier_indices(X):
    """Label every row of ``X`` with an IsolationForest fitted on ``X`` itself.

    X: array of shape (n_samples, n_features)

    Despite the function name, the return value is NOT a list of indices: it is
    the raw ``fit_predict`` output, one label per row (IsolationForest uses -1
    for outliers and 1 for inliers). The forest is seeded with random_state=42.
    """
    return IsolationForest(random_state=42).fit_predict(X)

def standard_scaler(X):
    """Standardise every column of ``X`` to zero mean and unit standard deviation.

    A fresh StandardScaler is fitted on ``X`` and immediately applied to it; the
    fitted scaler itself is not returned.
    """
    fitted_scaler = StandardScaler().fit(X)
    return fitted_scaler.transform(X)

def one_hot_encoder(df_one_column):
    """Fit a dense-output OneHotEncoder on a single categorical column.

    df_one_column: a dataframe with one categorical column

    Returns the fitted encoder (not the encoded data).
    """
    try:
        # scikit-learn >= 1.2 names the dense-output switch ``sparse_output``;
        # the old ``sparse`` keyword was removed in 1.4 and raises TypeError there.
        cat_encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older scikit-learn releases only know the original ``sparse`` keyword.
        cat_encoder = OneHotEncoder(sparse=False)
    return cat_encoder.fit(df_one_column)



def _dense_one_hot_encoder(**encoder_kwargs):
    """Build a OneHotEncoder that returns dense arrays on old and new scikit-learn."""
    try:
        # ``sparse_output`` exists from scikit-learn 1.2; ``sparse`` was removed in 1.4.
        return OneHotEncoder(sparse_output=False, **encoder_kwargs)
    except TypeError:
        # Older releases only accept the original ``sparse`` keyword.
        return OneHotEncoder(sparse=False, **encoder_kwargs)


def prepare_for_train(X_train, X_dev, X_test, y_train, y_dev, y_test):
    """Impute, scale and one-hot encode the features, then drop training outliers.

    The preprocessing is fitted on ``X_train`` only and then applied to the dev
    and test features. Rows flagged as outliers by an IsolationForest (run on the
    scaled numeric columns) are removed from the TRAINING set only.

    Parameters
    ----------
    X_train, X_dev, X_test : pandas.DataFrame
        Feature frames. Columns of dtype object/category are treated as
        categorical, columns selected by ``np.number`` as numeric.
        NOTE(review): columns of any other dtype (e.g. bool) are selected by
        neither group and therefore do not appear to reach the output --
        confirm this is intended.
    y_train, y_dev, y_test : pandas.DataFrame or pandas.Series
        Targets matching the feature frames row for row.

    Returns
    -------
    tuple
        ``(X_train_prepared_no_outliers, X_dev_prepared, X_test_prepared,
        y_train_no_outliers, y_dev, y_test)``; the targets are returned as
        ``.values`` arrays.

    Notes
    -----
    Argument order is train, dev, test. ``split_train_test_dev`` in this file
    returns train, TEST, DEV, so its result must not be unpacked straight into
    this function.
    """
    # Define categorical and numerical features
    categorical_features = X_train.select_dtypes(include=['object', 'category']).columns
    numerical_features = X_train.select_dtypes(include=[np.number]).columns

    # Create pipelines for both numerical and categorical preprocessing
    numerical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', _dense_one_hot_encoder(handle_unknown='ignore'))
    ])

    # Combine pipelines into a single ColumnTransformer ('num' first, so the
    # numeric columns are the leading columns of the transformed matrix)
    preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

    # Fit on the training data only; dev/test are merely transformed
    X_train_prepared = preprocessor.fit_transform(X_train)
    X_dev_prepared = preprocessor.transform(X_dev)
    X_test_prepared = preprocessor.transform(X_test)

    # Outlier handling on the (already scaled) numeric part of the training set.
    # Note: it is usually recommended to handle outliers before scaling; the
    # original order is kept here.
    n_numeric = len(numerical_features)
    if n_numeric == 0:
        # Nothing to run the IsolationForest on (it rejects zero-feature input),
        # so keep every training row.
        inlier_indices = np.arange(X_train_prepared.shape[0])
    else:
        isolation_forest = IsolationForest(random_state=42)
        outlier_preds = isolation_forest.fit_predict(X_train_prepared[:, :n_numeric])
        # fit_predict labels inliers with 1; these are the rows to KEEP
        inlier_indices = np.where(outlier_preds == 1)[0]

    # Filter the training set to remove outliers (features and targets alike)
    X_train_prepared_no_outliers = X_train_prepared[inlier_indices]
    y_train_no_outliers = y_train.iloc[inlier_indices]

    return X_train_prepared_no_outliers, X_dev_prepared, X_test_prepared, y_train_no_outliers.values, y_dev.values, y_test.values

