In [1]:
# Imports
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE


In [99]:
# Read the realized flight schedule and keep only complete rows:
# any row containing at least one missing value is discarded.
df = pd.read_excel("data/Realized Schedule 20210101-20220208.xlsx").dropna()

In [100]:
# Extract features and targets
# The target is the load factor; the feature matrix X is built from the
# remaining columns plus calendar features and one-hot encoded categoricals.
y = df["LoadFactor"]

# Convert schedule time to separate calendar columns.
# The timestamp is parsed once and every calendar feature is derived from it.
schedule_time = pd.to_datetime(df["ScheduleTime"])
calendar_df = pd.DataFrame(
    {
        "Year": schedule_time.dt.year,
        "Month": schedule_time.dt.month,
        "Week": schedule_time.dt.isocalendar().week,
        "Day": schedule_time.dt.day,
        # Integer weekday (pandas convention: Monday=0 ... Sunday=6). It is only
        # used to build the one-hot weekday columns and is dropped afterwards.
        "Weekday": schedule_time.dt.weekday,
        "Hour": schedule_time.dt.hour,
    }
)

# Include dummy variables for categorical features
cols = ["Airline", "Destination", "FlightType", "Sector"]
col_prefix = ["AIR", "DEST", "FLT", "SECT"]
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
cols_to_drop = ["FlightNumber", "AircraftType", "ScheduleTime", "Weekday"]

category_dummies = [
    pd.get_dummies(df[col], prefix=prefix) for col, prefix in zip(cols, col_prefix)
]

# Fix the weekday categories to 0..6 so that get_dummies always yields seven
# columns in Monday..Sunday order, even if some weekday never occurs in the
# data. Renaming the columns by position is then always valid.
weekday_dtype = pd.CategoricalDtype(categories=list(range(len(weekdays))))
weekday_df = pd.get_dummies(calendar_df["Weekday"].astype(weekday_dtype))
weekday_df.columns = weekdays

# Assemble everything in one concat (all pieces share df's index), then remove
# the raw categorical columns and the helper / identifier columns.
X = pd.concat(
    [df.drop(columns=["LoadFactor"]), calendar_df, *category_dummies, weekday_df],
    axis=1,
).drop(columns=cols + cols_to_drop)

print(X.columns)
print(X.shape)



Index(['SeatCapacity', 'Year', 'Month', 'Week', 'Day', 'Hour', 'AIR_AY',
       'AIR_BJ', 'AIR_BT', 'AIR_BZ',
       ...
       'SECT_QA', 'SECT_SG', 'SECT_US', 'Monday', 'Tuesday', 'Wednesday',
       'Thursday', 'Friday', 'Saturday', 'Sunday'],
      dtype='object', length=353)
(36768, 353)


In [101]:
# Linear regression with recursive feature elimination (RFE)

# Ordinary least squares fitted on the full feature matrix.
model = LinearRegression().fit(X, y)

# Rank features with RFE around the same estimator: keep a 0.2 fraction of the
# features, eliminating one feature per round (step=1).
# NOTE(review): RFE is documented to fit its own clone of the estimator, so the
# fit above presumably only matters to code that uses `model` directly - confirm.
f_selector = RFE(estimator=model, n_features_to_select=0.2, step=1).fit(X, y)

# Boolean mask over X's columns: True marks a feature that was kept.
print(f_selector.support_)

[False False False False False False False False False  True False False
 False False False False  True False False  True  True  True False False
 False  True False False False False False False False False False False
 False False  True False  True False False False False False False False
 False False False  True False False False False False False False False
 False False False  True False False False False  True False False False
 False  True False  True False False False False False False False False
 False False False False False False False False False False False False
 False False False False False  True False False False False False  True
 False False False False False False False  True False False False False
  True False False False False False False False False False False  True
 False False False False False False  True  True False  True False False
 False False False False False False False  True False  True False False
 False False False False False False False False Fa