### Read CSV file

In [2]:
# Import package
import pandas as pd
import re

# Load data
file_path = '../data/processed/sampled.csv'
df = pd.read_csv(file_path)

# Display the first few rows for inspection
df.head()

Unnamed: 0,legId,searchDate,flightDate,startingAirport,destinationAirport,travelDuration,isBasicEconomy,isRefundable,isNonStop,totalFare,...,segmentsArrivalTimeEpochSeconds,segmentsArrivalTimeRaw,segmentsArrivalAirportCode,segmentsDepartureAirportCode,segmentsAirlineName,segmentsAirlineCode,segmentsEquipmentDescription,segmentsDurationInSeconds,segmentsDistance,segmentsCabinCode
0,e1b137527b9175d7d930c3af82e70ae0,2022-04-19,2022-05-20,OAK,ATL,PT7H52M,False,False,False,103.98,...,1653107460||1653126600,2022-05-20T22:31:00.000-06:00||2022-05-21T05:5...,DEN||ATL,OAK||DEN,Frontier Airlines||Frontier Airlines,F9||F9,||Airbus A320,9180||10620,943||1207,coach||coach
1,d813ebd107e3fa700206c0d96015da7a,2022-04-19,2022-05-20,OAK,ATL,PT6H15M,False,False,False,216.58,...,1653067080||1653084660,2022-05-20T10:18:00.000-07:00||2022-05-20T18:1...,LAX||ATL,OAK||LAX,Spirit Airlines||Spirit Airlines,NK||NK,||AIRBUS INDUSTRIE A320 SHARKLETS,4920||15600,None||None,coach||coach
2,e8ece5ad6f5962c696e06e031fc2a24a,2022-04-19,2022-05-20,OAK,ATL,PT9H6M,False,False,False,216.58,...,1653056820||1653084660,2022-05-20T07:27:00.000-07:00||2022-05-20T18:1...,LAX||ATL,OAK||LAX,Spirit Airlines||Spirit Airlines,NK||NK,AIRBUS INDUSTRIE A320 SHARKLETS||AIRBUS INDUST...,4920||15600,None||None,coach||coach
3,c004a54681335100f326c9613b3c9448,2022-04-19,2022-05-20,OAK,ATL,PT6H17M,False,False,False,237.58,...,1653110940||1653127980,2022-05-20T22:29:00.000-07:00||2022-05-21T06:1...,LAS||ATL,OAK||LAS,Spirit Airlines||Spirit Airlines,NK||NK,AIRBUS INDUSTRIE A320 SHARKLETS||Airbus A319,5580||13980,None||None,coach||coach
4,4a42bbf77211b4afa7b9e14005949120,2022-04-19,2022-05-20,OAK,ATL,PT14H12M,False,False,False,307.21,...,1653115560||1653159180,2022-05-20T23:46:00.000-07:00||2022-05-21T14:5...,SEA||ATL,OAK||SEA,Alaska Airlines||Alaska Airlines,AS||AS,Boeing 737-900||Boeing 737-900,7500||17580,672||2178,coach||coach


### Understand Datatype for each column

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13519999 entries, 0 to 13519998
Data columns (total 23 columns):
 #   Column                             Dtype  
---  ------                             -----  
 0   legId                              object 
 1   searchDate                         object 
 2   flightDate                         object 
 3   startingAirport                    object 
 4   destinationAirport                 object 
 5   travelDuration                     object 
 6   isBasicEconomy                     bool   
 7   isRefundable                       bool   
 8   isNonStop                          bool   
 9   totalFare                          float64
 10  totalTravelDistance                float64
 11  segmentsDepartureTimeEpochSeconds  object 
 12  segmentsDepartureTimeRaw           object 
 13  segmentsArrivalTimeEpochSeconds    object 
 14  segmentsArrivalTimeRaw             object 
 15  segmentsArrivalAirportCode         object 
 16  segmentsDepartur

In [4]:
df.dropna(inplace=True)

### Data preprocessing

In [5]:
# Define preprocessing function
def preprocessor(df):
        
    def extract_segment(s, position):
        segments = s.strip().split('||')
        if position == "first":
            return segments[0]
        elif position == "last":
            return segments[-1]
        else:
            return s
    
    # Helper function to parse the timestamp
    def parse_timestamp(ts):
        date, time_zone = ts.split('T')
        time, time_zone = time_zone.split('.')
        time = time.split('+')[0] if '+' in time else time.split('-')[0] if '-' in time else time

        # Extract hour from the time
        hour = int(time.split(':')[0])

        return date, time, time_zone, hour

    
    def is_float(value):
        """Check if the string can be converted to a float."""
        try:
            float(value)
            return True
        except ValueError:
            return False
    
    def convert_to_floats(segment_list):
        """Convert a list of strings to floats, ignoring invalid strings."""
        return [float(val) for val in segment_list if val and is_float(val)]

    def compute_sum_for_segment(segment_str, separator='|'):
        """Compute the sum for a given segment string."""

        # Check if the segment_str is not a string
        if not isinstance(segment_str, str):
            print(f"Unexpected type: {type(segment_str)}")
            return segment_str  # Return the original value

        segments = segment_str.split(separator)
        return sum(convert_to_floats(segments))

    def compute_summation_optimized(df, sum_cols):
        """Optimized function to compute the summation for specific columns."""
        for col in sum_cols:
            df[col] = df[col].apply(compute_sum_for_segment)
        return df

 
    # Extracting the first segment's data for departures
    df['segmentsDepartureTimeRaw'] = df['segmentsDepartureTimeRaw'].apply(lambda x: extract_segment(x, "first"))
    df['segmentsDepartureAirportCode'] = df['segmentsDepartureAirportCode'].apply(lambda x: extract_segment(x, "first"))

    # Extracting the last segment's data for arrivals
    df['segmentsArrivalTimeRaw'] = df['segmentsArrivalTimeRaw'].apply(lambda x: extract_segment(x, "last"))
    df['segmentsArrivalAirportCode'] = df['segmentsArrivalAirportCode'].apply(lambda x: extract_segment(x, "last"))
    
    # Apply the parsing for 'segmentsDepartureTimeRaw'
    df['departure_date'], df['departure_time'], df['departure_time_zone'], df['departure_hour'] = zip(*df['segmentsDepartureTimeRaw'].apply(parse_timestamp))

    # Apply the parsing for 'segmentsArrivalTimeRaw'
    df['arrival_date'], df['arrival_time'], df['arrival_time_zone'], df['arrival_hour'] = zip(*df['segmentsArrivalTimeRaw'].apply(parse_timestamp))

     # Drop specified columns
    df = df.drop(columns=['segmentsDepartureTimeRaw','segmentsArrivalTimeRaw',
                          'segmentsDepartureTimeEpochSeconds','segmentsArrivalTimeEpochSeconds', 
                          'segmentsAirlineName','arrival_date','arrival_time_zone','arrival_time',
                          'departure_date','departure_time_zone','departure_time'
                         ], errors='ignore')
    
    # Apply the optimized summation computation
    df = compute_summation_optimized(df,['segmentsDurationInSeconds', 'segmentsDistance'])

    return df

In [6]:
# Use function to clean the data
df_cleaned = preprocessor(df.copy())

In [11]:
# Display the first few rows of the preprocessed DataFrame
df_cleaned.head()

Unnamed: 0.1,Unnamed: 0,legId,searchDate,flightDate,startingAirport,destinationAirport,travelDuration,isBasicEconomy,isRefundable,isNonStop,...,totalTravelDistance,segmentsArrivalAirportCode,segmentsDepartureAirportCode,segmentsAirlineCode,segmentsEquipmentDescription,segmentsDurationInSeconds,segmentsDistance,segmentsCabinCode,departure_hour,arrival_hour
0,33553,6b7d0a0d603b8a3d02b44421fed9d801,2022-04-17,2022-05-30,OAK,ORD,PT7H31M,False,False,False,...,2285.0,ORD,OAK,AS||UA,Embraer 175||Boeing 737-800,20100.0,2285.0,coach||coach,20,5
1,9427,ffb0b62073c15f83dbd4effb820e14d9,2022-05-02,2022-05-26,OAK,LAX,PT5H37M,False,False,False,...,1628.0,ONT,OAK,AS||AS,Embraer 175||Boeing 737-900,16320.0,1628.0,coach||coach,17,23
2,199,6267c0895a65dc8b6ae2a74e2bd34563,2022-04-19,2022-05-20,OAK,EWR,PT9H14M,False,False,False,...,2796.0,EWR,OAK,DL||UA,Embraer 175 (Enhanced Winglets)||Boeing 757-200,24840.0,2796.0,coach||coach,7,19
4,39489,143f6dfae663c5919a6f1eb957188637,2022-04-18,2022-05-25,OAK,SFO,PT5H,False,False,False,...,1187.0,SFO,OAK,DL||UA,Embraer 175 (Enhanced Winglets)||Boeing 737-800,14580.0,1187.0,coach||coach,10,15
6,10822,68369821b5340abd2fe499175546cd43,2022-05-01,2022-05-17,OAK,ATL,PT15H30M,False,False,False,...,2412.0,ATL,OAK,DL||UA||UA,Embraer 175 (Enhanced Winglets)||Boeing 757-30...,23400.0,2412.0,coach||coach||coach,16,10


In [12]:
# Store processed data
df_cleaned.to_csv('../data/processed/processed.csv')

### Build pipeline

In [13]:
# Seperate features and target column
X = df_cleaned.drop('totalFare', axis=1)
y = df_cleaned['totalFare']

In [16]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Define columns for numerical processing and categorical processing
numerical_features = X.select_dtypes(include=['float64', 'float32', 'int64', 'int32']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# Numerical preprocessing: fill missing values, then standardize
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Combine the above preprocessing using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the LightGBM model
dt_model = DecisionTreeRegressor()

# Create and evaluate the full pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', dt_model)
                          ])

In [17]:
# Split data into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
import numpy as np
# Define  hyperparameter for the Decision Tree
param_dist = {
    'model__max_depth': [None] + list(np.arange(2, 20)),
    'model__min_samples_split': np.arange(2, 11),
    'model__min_samples_leaf': np.arange(1, 11),
    'model__max_features': ['auto', 'sqrt', 'log2', None] + list(np.arange(0.1, 1.0, 0.1))
}

# Create the Randomized Search CV object
random_search = RandomizedSearchCV(
    pipeline, param_distributions=param_dist, n_iter=15,
    cv=5, n_jobs=-1, verbose=1, random_state=42
)

# Fit the Randomized Search to your training data
random_search.fit(X_train, y_train)  # X and y are your training data

# Print the best hyperparameters found
print("Best hyperparameters:", random_search.best_params_)

# Save the best model
best_model = random_search.best_estimator_

Fitting 5 folds for each of 15 candidates, totalling 75 fits
Best hyperparameters: {'model__min_samples_split': 7, 'model__min_samples_leaf': 8, 'model__max_features': 0.8, 'model__max_depth': 19}


In [20]:
# Store model
import joblib
joblib.dump(best_model, '../models/decision_tree_regression.pkl')

['../models/decision_tree_regression_1.pkl']

In [21]:
# load the model
loaded_pipeline = joblib.load('../models/decision_tree_regression.pkl')

In [22]:
# Predict model
y_pred = loaded_pipeline.predict(X_test)

from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("mse:", mse)
print("rmse:", rmse)

mse: 15981.908963617672
rmse: 126.41957508083023
