# Data Preprocessing

In [36]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from scipy import stats
from scipy.sparse import hstack, csr_matrix

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import learning_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate



import xgboost as xgb
from xgboost import plot_importance

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

## Load preprocessed data

In [38]:
# Load preprocessed data set from /data folder
file_path = '/content/drive/My Drive/Colab Notebooks/lateguru/data/Top_5_Airports.csv'

preprocessed_df = pd.read_csv(file_path)

print(preprocessed_df.head())

                  Time Origin Dest                 Carrier  Cancelled  \
0  2021-01-01 09:00:00    LAX  JFK  American Airlines Inc.      False   
1  2021-01-02 09:00:00    LAX  JFK  American Airlines Inc.      False   
2  2021-01-03 09:00:00    LAX  JFK  American Airlines Inc.      False   
3  2021-01-03 09:00:00    LAX  JFK  American Airlines Inc.      False   
4  2021-01-04 09:00:00    LAX  JFK  American Airlines Inc.      False   

  CancellationReason  Delayed  DepDelayMinutes  CarrierDelay  WeatherDelay  \
0                NaN    False              0.0           NaN           NaN   
1                NaN     True              2.0           NaN           NaN   
2                NaN     True             28.0          28.0           0.0   
3                NaN     True             28.0          28.0           0.0   
4                NaN    False              0.0           NaN           NaN   

   ...  LateAircraftDelay  Temperature  Feels_Like_Temperature  \
0  ...                NaN 

In [39]:
preprocessed_df.columns

Index(['Time', 'Origin', 'Dest', 'Carrier', 'Cancelled', 'CancellationReason',
       'Delayed', 'DepDelayMinutes', 'CarrierDelay', 'WeatherDelay',
       'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'Temperature',
       'Feels_Like_Temperature', 'Altimeter_Pressure', 'Sea_Level_Pressure',
       'Visibility', 'Wind_Speed', 'Wind_Gust', 'Precipitation',
       'Ice_Accretion_3hr'],
      dtype='object')

In [40]:
preprocessed_df['Origin'].unique()

array(['LAX', 'DFW', 'DEN', 'ATL', 'ORD'], dtype=object)

Generate 'mean' delay for each carrier

In [41]:
#Create carrier_avg_delay
preprocessed_df['Carrier'] = preprocessed_df['Carrier'].str.replace('.', '')
carrier_avg_delay = preprocessed_df.groupby('Carrier')['DepDelayMinutes'].mean()
preprocessed_df['CarrierAvgDelay'] = preprocessed_df['Carrier'].map(carrier_avg_delay)

In [42]:
#Inspect CarrierAvgDelay
preprocessed_df['CarrierAvgDelay']

Unnamed: 0,CarrierAvgDelay
0,20.101924
1,20.101924
2,20.101924
3,20.101924
4,20.101924
...,...
4618830,11.582883
4618831,11.582883
4618832,11.582883
4618833,11.582883


Create 'Route' feature

In [43]:
preprocessed_df['Route'] = preprocessed_df['Origin'] + '_' + preprocessed_df['Dest']
preprocessed_df['Route']

Unnamed: 0,Route
0,LAX_JFK
1,LAX_JFK
2,LAX_JFK
3,LAX_JFK
4,LAX_JFK
...,...
4618830,ORD_DLH
4618831,ORD_IND
4618832,ORD_IND
4618833,ORD_SDF


In [44]:
preprocessed_df['Time'] = pd.to_datetime(preprocessed_df['Time'])
preprocessed_df['DayOfWeek'] = preprocessed_df['Time'].dt.dayofweek
preprocessed_df['HourOfDay'] = preprocessed_df['Time'].dt.hour
preprocessed_df['Month'] = preprocessed_df['Time'].dt.month

In [45]:
#Check features
preprocessed_df.columns

Index(['Time', 'Origin', 'Dest', 'Carrier', 'Cancelled', 'CancellationReason',
       'Delayed', 'DepDelayMinutes', 'CarrierDelay', 'WeatherDelay',
       'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'Temperature',
       'Feels_Like_Temperature', 'Altimeter_Pressure', 'Sea_Level_Pressure',
       'Visibility', 'Wind_Speed', 'Wind_Gust', 'Precipitation',
       'Ice_Accretion_3hr', 'CarrierAvgDelay', 'Route', 'DayOfWeek',
       'HourOfDay', 'Month'],
      dtype='object')

Create new 'Features' set

In [46]:
features = [
    'Origin', 'Dest', 'Route', 'Carrier', 'DayOfWeek', 'HourOfDay',
    'Temperature', 'Feels_Like_Temperature', 'Altimeter_Pressure',
    'Sea_Level_Pressure', 'Visibility', 'Wind_Speed', 'Wind_Gust',
    'Precipitation', 'Ice_Accretion_3hr', 'CarrierAvgDelay', 'Month'
]

In [47]:
# Check available columns in the DataFrame
print(preprocessed_df.columns)

# Check if specific columns are in the DataFrame
missing_features = [col for col in ['DayOfWeek', 'HourOfDay', 'Month'] if col not in preprocessed_df.columns]
if missing_features:
    print(f"Missing features: {missing_features}")
else:
    print("All features are present.")

Index(['Time', 'Origin', 'Dest', 'Carrier', 'Cancelled', 'CancellationReason',
       'Delayed', 'DepDelayMinutes', 'CarrierDelay', 'WeatherDelay',
       'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'Temperature',
       'Feels_Like_Temperature', 'Altimeter_Pressure', 'Sea_Level_Pressure',
       'Visibility', 'Wind_Speed', 'Wind_Gust', 'Precipitation',
       'Ice_Accretion_3hr', 'CarrierAvgDelay', 'Route', 'DayOfWeek',
       'HourOfDay', 'Month'],
      dtype='object')
All features are present.


### Modelling for y_pred = 'Weather_Delayed'

In [48]:
# Define y_pred and drop X columns based on 'delayed' feature engineering
X = preprocessed_df[features]

In [49]:
# Downcast integer and float columns to reduce memory usage
for col in X.select_dtypes(include=['int']).columns:
    X[col] = X[col].astype('int32')

for col in X.select_dtypes(include=['float']).columns:
    X[col] = X[col].astype('float32')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('int32')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('int32')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('int32')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [50]:
len(X)

4618835

In [51]:
X.isna().sum()

Unnamed: 0,0
Origin,0
Dest,0
Route,0
Carrier,0
DayOfWeek,0
HourOfDay,0
Temperature,0
Feels_Like_Temperature,0
Altimeter_Pressure,0
Sea_Level_Pressure,0


In [52]:
categorical_features = ['Origin', 'Dest', 'Route', 'Carrier']
numerical_features = [
    'DayOfWeek', 'HourOfDay', 'Temperature', 'Feels_Like_Temperature',
    'Altimeter_Pressure', 'Sea_Level_Pressure', 'Visibility',
    'Wind_Speed', 'Wind_Gust', 'Precipitation', 'Ice_Accretion_3hr',
    'CarrierAvgDelay', 'Month'
]

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [53]:
# # Apply SMOTE to balance the training data
# smote = SMOTE(random_state=42)
# X_train_balanced, y_train_balanced = smote.fit_resample(X_train_preprocessed, y_train)

In [54]:
y = preprocessed_df['Delayed'].astype(int)

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### Split into train/test (applicable to all model variations)

In [56]:
len(X_train)

3695068

In [57]:
len(X_test)

923767

In [58]:
len(y_train)

3695068

In [59]:
len(y_test)

923767

In [60]:
# Define the XGBClassifier with the specified parameters
xgb_model_top5 = xgb.XGBClassifier(
    use_label_encoder=False,  # Disable the use of the label encoder
    eval_metric='logloss',    # Logloss as the evaluation metric
    random_state=42,          # For reproducibility
    max_depth=7,              # Maximum depth of the trees
    n_estimators=500,         # Number of boosting rounds
    learning_rate=0.01,       # Learning rate for the boosting process
    n_jobs=4,                 # Number of parallel threads
    min_child_weight=5,       # Minimum sum of instance weight (hessian) needed in a child
    gamma=0.3                 # Minimum loss reduction required to make a further partition
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb_model_top5)
])

In [61]:
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [62]:
cv_results = cross_validate(
    pipeline,
    X_train,
    y_train,
    cv=stratified_kfold,  # Use StratifiedKFold for cross-validation
    scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
)

# Print the cross-validation results for each metric
for metric in ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']:
    print(f"{metric}: {cv_results['test_' + metric].mean():.4f} (+/- {cv_results['test_' + metric].std():.4f})")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



accuracy: 0.6715 (+/- 0.0005)
precision: 0.6630 (+/- 0.0013)
recall: 0.3657 (+/- 0.0007)
f1: 0.4714 (+/- 0.0007)
roc_auc: 0.7002 (+/- 0.0006)


In [None]:
# After cross-validation, fit the pipeline on the entire training data
pipeline.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [None]:
import os
import joblib

# Define the directory path where models should be saved and loaded
model_directory = '/content/drive/My Drive/Colab Notebooks/lateguru/data'

# Save the model
def save_model(model, filename):
    filepath = os.path.join(model_directory, filename)
    joblib.dump(model, filepath)
    print(f"Model saved to {filepath}")

# Load the model
def load_model(filename):
    filepath = os.path.join(model_directory, filename)
    model = joblib.load(filepath)
    print(f"Model loaded from {filepath}")
    return model

# Save the model
save_model(pipeline, 'xgb_model_top5.pkl')

### Learning Curves

In [None]:
# # Define more granular training sizes and reduce to a smaller range
# train_sizes = np.linspace(0.1, 1.0, 5)  # Choose 5 points from 10% to 100% of the training data

# # Calculate learning curve using a smaller subset for faster computation
# train_sizes, train_scores, test_scores = learning_curve(
#     estimator=xgb_model_weather,
#     X=X_train_balanced.sample(frac=0.1, random_state=42),  # Use 10% of the training data for speed
#     y=y_train_balanced.sample(frac=0.1, random_state=42),
#     train_sizes=train_sizes,
#     cv=3,  # 3-fold cross-validation
#     scoring='recall',  # Use Recall score to measure performance
#     n_jobs=4,



# )

In [None]:
# # Calculate mean and standard deviation for training scores
# train_mean = np.mean(train_scores, axis=1)
# train_std = np.std(train_scores, axis=1)

In [None]:
# # Calculate mean and standard deviation for test scores
# test_mean = np.mean(test_scores, axis=1)
# test_std = np.std(test_scores, axis=1)

In [None]:
# # Plot learning curve
# plt.figure(figsize=(10, 6))

# # Plot mean training recall scores
# plt.plot(train_sizes, train_mean, 'o-', color='r', label='Training Recall', linestyle='--')
# # Plot mean cross-validation recall scores
# plt.plot(train_sizes, test_mean, 'o-', color='g', label='Cross-Validation Recall')

# # Add error bars to indicate standard deviation
# plt.errorbar(train_sizes, train_mean, yerr=train_std, fmt='o', color='r', alpha=0.5)
# plt.errorbar(train_sizes, test_mean, yerr=test_std, fmt='o', color='g', alpha=0.5)

# # Fill between the recall scores to indicate standard deviation
# plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='r')
# plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1, color='g')

# # Add titles and labels
# plt.title('Learning Curve (Recall)')
# plt.xlabel('Training Size')
# plt.ylabel('Recall Score')

# # Add grid lines
# plt.grid(True)

# # Dynamic legend positioning
# plt.legend(loc='best')

# # Show plot
# plt.show()

In [None]:
# # Generate Classifiation Report
# from sklearn.metrics import classification_report
# report = classification_report(y_test, y_pred)
# print(f"Classification Report:\n{report}")

#### Plot Feature Importance

In [None]:
# # Plot the feature importance
# plot_importance(xgb_model_weather, max_num_features=10)
# plt.show()

In [None]:
# # Get the feature names from the encoder
# encoded_feature_names = encoder.get_feature_names_out(categorical_features)

In [None]:
# # Combine feature names
# all_feature_names = np.hstack([
#     encoded_feature_names,  # Encoded categorical feature names
#     numeric_features,        # Numeric feature names
#     # binary_features          # Binary feature names
# ])

In [None]:
# # Mapping indices to feature names
# important_features = [(all_feature_names[i], importance) for i, importance in enumerate(xgb_model_weather.feature_importances_)]

# # Sort by importance
# important_features.sort(key=lambda x: x[1], reverse=True)

# # Display top important features
# for feature, importance in important_features[:30]:
#     print(f"Feature: {feature}, Importance: {importance}")

In [None]:
# # Sort the important features by importance
# important_features.sort(key=lambda x: x[1], reverse=True)

# # Extract the top 10 features and their importance
# top_features = important_features[:30]
# features = [f[0] for f in top_features]
# importances = [f[1] for f in top_features]

# # Plot
# plt.figure(figsize=(10, 6))
# plt.barh(features, importances)
# plt.xlabel('Importance')
# plt.title('Top 30 Feature Importances')
# plt.gca().invert_yaxis()
# plt.show()

In [None]:
from google.colab import runtime
runtime.unassign()