In [11]:
import os

import pandas as pd
from sklearn.preprocessing import TargetEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [13]:
# Location of the raw fares CSV. Set the FLIGHT_DATA_PATH environment variable
# to point at your own copy of the file; when it is not set, the original
# author's local path is used so existing setups keep working unchanged.
data_path = os.environ.get(
    "FLIGHT_DATA_PATH",
    "C:\\Users\\gxavi\\OneDrive\\Desktop\\Flight Project\\US Airline Flight Routes and Fares 1993-2024.csv",
)

# low_memory=False: parse the file in a single pass so column dtypes are
# inferred from the whole column rather than chunk by chunk.
df = pd.read_csv(data_path, low_memory=False)

# Drop the geocoding and table-key columns; they are not used downstream.
df = df.drop(columns=['Geocoded_City1', 'Geocoded_City2', 'tbl1apk', 'tbl'])

df.head()

Unnamed: 0,Year,quarter,citymarketid_1,citymarketid_2,city1,city2,airportid_1,airportid_2,airport_1,airport_2,nsmiles,passengers,fare,carrier_lg,large_ms,fare_lg,carrier_low,lf_ms,fare_low
0,2021,3,30135,33195,"Allentown/Bethlehem/Easton, PA","Tampa, FL (Metropolitan Area)",10135,14112,ABE,PIE,970,180,81.43,G4,1.0,81.43,G4,1.0,81.43
1,2021,3,30135,33195,"Allentown/Bethlehem/Easton, PA","Tampa, FL (Metropolitan Area)",10135,15304,ABE,TPA,970,19,208.93,DL,0.4659,219.98,UA,0.1193,154.11
2,2021,3,30140,30194,"Albuquerque, NM","Dallas/Fort Worth, TX",10140,11259,ABQ,DAL,580,204,184.56,WN,0.9968,184.44,WN,0.9968,184.44
3,2021,3,30140,30194,"Albuquerque, NM","Dallas/Fort Worth, TX",10140,11298,ABQ,DFW,580,264,182.64,AA,0.9774,183.09,AA,0.9774,183.09
4,2021,3,30140,30466,"Albuquerque, NM","Phoenix, AZ",10140,14107,ABQ,PHX,328,398,177.11,WN,0.6061,184.49,AA,0.3939,165.77


In [15]:
# --- Cleaning ---------------------------------------------------------------
# Remove exact duplicate rows. Surviving rows keep their original index labels,
# so from here on df.index may contain gaps.
df = df.drop_duplicates()

# Carrier codes are categorical: mark missing values with an explicit label.
df['carrier_lg'] = df['carrier_lg'].fillna('Unknown')
df['carrier_low'] = df['carrier_low'].fillna('Unknown')

# Numeric market-share / fare columns: impute missing values with the column mean.
missing_columns = ['large_ms', 'fare_lg', 'lf_ms', 'fare_low']
df[missing_columns] = df[missing_columns].fillna(df[missing_columns].mean())

# Treat Year as a category so it is target-encoded together with the string columns.
df['Year'] = df['Year'].astype('object')

categorical_columns = [col for col in df.columns if df[col].dtype == 'object']

# --- Target encoding --------------------------------------------------------
# NOTE(review): the encoder (and the scaler below) are fit on the full dataset,
# before the train/validation/test split performed in the modelling cell, so
# statistics from held-out rows leak into the features. Consider splitting
# first and fitting these transformers on the training rows only.
encoder = TargetEncoder()

categorical_encoded = encoder.fit_transform(df[categorical_columns], df['fare'])

# Build the encoded frame on df's own index. fit_transform returns a bare array;
# without index=df.index the frame would get a fresh 0..n-1 RangeIndex, and the
# axis=1 concat below would misalign rows (and add NaN rows) whenever
# drop_duplicates() removed anything.
encoded_df = pd.DataFrame(
    categorical_encoded,
    columns=list(encoder.feature_names_in_),
    index=df.index,
)

df = df.drop(columns=categorical_columns)
df_numerical = pd.concat([encoded_df, df], axis=1)

# Guard against silent row misalignment in the concat above.
assert len(df_numerical) == len(df), "row count changed while joining encoded and numeric columns"

# --- Standardisation --------------------------------------------------------
# Every column is standardised, including the target 'fare', so downstream
# error metrics are expressed in standard-deviation units of fare.
scaler = StandardScaler()
standerdized_data = scaler.fit_transform(df_numerical)

# fit_transform returns an array, so df_clean gets a fresh 0..n-1 index.
df_clean = pd.DataFrame(
    standerdized_data,
    columns=df_numerical.columns,
)

df_clean.head()

Unnamed: 0,Year,city1,city2,airport_1,airport_2,carrier_lg,carrier_low,quarter,citymarketid_1,citymarketid_2,airportid_1,airportid_2,nsmiles,passengers,fare,large_ms,fare_lg,lf_ms,fare_low
0,-0.570968,0.031572,-1.883039,0.028899,-3.01572,-3.562131,-3.843604,0.464153,-1.304219,0.82346,-1.607991,0.604647,-0.312614,-0.233632,-1.669852,1.494881,-1.626384,1.657423,-1.489663
1,-0.612007,0.113346,-1.904267,0.10126,-1.386212,0.505254,1.01514,0.464153,-1.304219,0.82346,-1.607991,1.440665,-0.312614,-0.548461,-0.122002,-0.890241,0.015034,-0.998676,-0.498608
2,-0.604322,-0.423699,-0.528032,-0.37397,-0.861434,-0.828965,-0.596053,0.464153,-1.299631,-1.611504,-1.604498,-1.396324,-0.867268,-0.186701,-0.417854,1.48059,-0.406012,1.647773,-0.085032
3,-0.612007,-0.386574,-0.502676,-0.341118,-0.050818,1.04188,1.05083,0.464153,-1.299631,-1.611504,-1.604498,-1.368972,-0.867268,-0.069373,-0.441163,1.393956,-0.422006,1.589264,-0.103441
4,-0.604322,-0.423699,-0.672293,-0.37397,-0.23681,-0.828965,1.080545,0.464153,-1.299631,-1.390808,-1.604498,0.60114,-1.225659,0.192658,-0.508297,-0.264153,-0.40542,-0.170511,-0.339614


In [31]:
# 70 / 10 / 20 train / validation / test split:
# first hold out 20% for testing, then 12.5% of the remaining 80% (= 10% of
# the full data) for validation. Fixed seeds keep the split reproducible.
train_df, test_df = train_test_split(df_clean, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.125, random_state=42)

# Separate features from the target for each split.
X_train, y_train = train_df.drop(columns='fare'), train_df['fare']
X_test, y_test = test_df.drop(columns='fare'), test_df['fare']
# Validation features must come from val_df (previously taken from test_df,
# which paired test features with validation targets of a different length).
X_val, y_val = val_df.drop(columns='fare'), val_df['fare']

# Shallow random forest; random_state fixes the bootstrap/feature sampling.
model = RandomForestRegressor(max_depth=2, random_state=42)

model.fit(X_train, y_train)

predictions_test = model.predict(X_test)
predictions_val = model.predict(X_val)

# 'fare' in df_clean is standardised, so MSE is in squared standard-deviation
# units of fare rather than squared dollars.
mse_test = mean_squared_error(y_test, predictions_test)
r2_test = r2_score(y_test, predictions_test)

mse_val = mean_squared_error(y_val, predictions_val)
r2_val = r2_score(y_val, predictions_val)

print('Testing Metrics', mse_test, r2_test)
print('Validation Metrics', mse_val, r2_val)

Testing Metrics 0.26451042261553037 0.72877130413839
