In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [10]:
# Load the datasets from the local `data/` folder.
# `df` is the frame every later cell works on; `df2` holds the contents of airports.csv.
train_path, airports_path = 'data/Train.csv', 'data/airports.csv'
df = pd.read_csv(train_path)
df2 = pd.read_csv(airports_path)

In [11]:
# Convert the date/time columns to datetime64 so the `.dt` accessors used by
# the feature-engineering cell work on them.

# DATOP uses the default error mode, so an unparseable value raises.
df['DATOP'] = pd.to_datetime(df['DATOP'])

# STD and STA are parsed leniently: anything that cannot be parsed becomes NaT.
# NOTE(review): pandas appears to have warned while parsing STA — confirm the
# raw STD/STA format and consider passing an explicit `format=`.
for datetime_col in ('STD', 'STA'):
    df[datetime_col] = pd.to_datetime(df[datetime_col], errors='coerce')

  df['STA'] = pd.to_datetime(df['STA'], errors='coerce')


In [12]:
# Feature engineering: derive calendar and duration features from the parsed
# DATOP / STD / STA columns.
df['month'] = df['DATOP'].dt.month
df['day_of_week'] = df['DATOP'].dt.dayofweek  # Monday=0 ... Sunday=6
df['hour_of_day'] = df['STD'].dt.hour  # NaN wherever STD failed to parse (NaT)

# Weekend flag as a vectorized comparison instead of a per-row Python lambda;
# produces the same 0/1 values (Saturday=5, Sunday=6 -> 1, everything else -> 0).
df['is_weekend'] = (df['day_of_week'] >= 5).astype('int64')

# STA minus STD expressed in hours; rows where either timestamp is NaT get 0.
df['flight_duration_hours'] = ((df['STA'] - df['STD']).dt.total_seconds() / 3600).fillna(0)

In [13]:
# Encoding categorical variables: replace each listed column of `df` with
# integer labels produced by scikit-learn's LabelEncoder.
# The same encoder instance is refit for every column, so after the loop it
# only holds the classes of the last column in `categorical_vars`.
encoder = LabelEncoder()
categorical_vars = ['DEPSTN', 'ARRSTN', 'FLTID', 'AC', 'STATUS']
for var in categorical_vars:
    # Fit on `df` itself: the previous `train_df` is not defined anywhere in
    # the notebook, so the cell raised NameError on a fresh kernel.
    df[var] = encoder.fit_transform(df[var])

In [14]:
# Removing outliers with the 1.5 * IQR rule on `target`: a row is an outlier
# when its target is below Q1 - multiplier * IQR or above Q3 + multiplier * IQR.
multiplier = 1.5
Q1, Q3 = (df['target'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_fence = Q1 - multiplier * IQR
upper_fence = Q3 + multiplier * IQR

target = df['target']
outliers = df[(target < lower_fence) | (target > upper_fence)]

# Drop the flagged rows by index label; everything else is kept in order.
df_no = df.drop(index=outliers.index)

# Features and target for the outlier-free data. The identifier, the raw
# datetime columns and the target itself are excluded from the feature matrix.
non_feature_cols = ['ID', 'DATOP', 'STD', 'STA', 'target']
X_no = df_no.drop(columns=non_feature_cols)
y_no = df_no['target']

# Any NaN still present in the features is replaced with 0.
X_no.fillna(0, inplace=True)

In [15]:
# Hold out 20% of the outlier-free rows for evaluation; the fixed seed keeps
# the split the same across runs.
split_parts = train_test_split(X_no, y_no, test_size=0.2, random_state=42)
X_train_no, X_test_no, y_train_no, y_test_no = split_parts

# Gradient Boosting Regressor, fitted on the training portion only.
gbm_params = {'n_estimators': 100, 'learning_rate': 0.1, 'random_state': 42}
gbm_regressor = GradientBoostingRegressor(**gbm_params)
gbm_regressor.fit(X_train_no, y_train_no)

In [17]:

# Score the fitted model on the held-out, outlier-free test rows.
y_pred_gbm_no_outliers = gbm_regressor.predict(X_test_no)

mae_gbm_no_outliers = mean_absolute_error(y_test_no, y_pred_gbm_no_outliers)
mse_gbm_no_outliers = mean_squared_error(y_test_no, y_pred_gbm_no_outliers)
rmse_gbm_no_outliers = np.sqrt(mse_gbm_no_outliers)  # RMSE is the square root of MSE
r2_gbm_no_outliers = r2_score(y_test_no, y_pred_gbm_no_outliers)

# Emit the four-line report in one call; the text is the same as printing each line.
report_lines = [
    "GBM Results After Removing Outliers:",
    f"Mean Absolute Error (MAE): {mae_gbm_no_outliers}",
    f"Root Mean Squared Error (RMSE): {rmse_gbm_no_outliers}",
    f"R^2 Score: {r2_gbm_no_outliers}",
]
print("\n".join(report_lines))


GBM Results After Removing Outliers:
Mean Absolute Error (MAE): 13.725547039445795
Root Mean Squared Error (RMSE): 19.400473042349834
R^2 Score: 0.243617101415449
