In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv("C:/Users/cj.alonzo/OneDrive - Nice Systems Ltd/Documents/Personal/Data Science/personal projects/delays/flight_data.csv")

In [3]:
# prepare the features
features = ['ORIGIN', 'ORIGIN_CITY_NAME', 'DEST', 'DEST_CITY_NAME', 'Description', 'YEAR', 'DAY_OF_WEEK', 'MONTH']
X = df[features].copy()  # Create a copy to avoid SettingWithCopyWarning

# label encoding for categorical features
label_encoders = {}
for feature in ['ORIGIN', 'ORIGIN_CITY_NAME', 'DEST', 'DEST_CITY_NAME', 'Description']:
    le = LabelEncoder()
    X.loc[:, feature] = le.fit_transform(X[feature])  # Use .loc to avoid SettingWithCopyWarning
    label_encoders[feature] = le

In [4]:
# standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [5]:
# prepare the target variable
y = df['DEP_DEL15']

# split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [6]:
# find missing values in X_train
missing_indices = np.isnan(X_train).any(axis=1)

# remove corresponding rows from X_train and y_train
X_train = X_train[~missing_indices]
y_train = y_train[~missing_indices]

In [7]:
# anything missing in target values
nan_indices = np.isnan(y_train)

# remove rows with missing target values from X_train and y_train
X_train = X_train[~nan_indices]
y_train = y_train[~nan_indices]

# initialize and train the XGBoost classifier
xgb = XGBClassifier(eval_metric='logloss')
xgb.fit(X_train, y_train)

In [8]:
# drop rows with missing values in y_test
X_test = X_test[~np.isnan(y_test)]
y_test = y_test[~np.isnan(y_test)]

# make predictions
y_pred = xgb.predict(X_test)

# evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.8198772127974923
              precision    recall  f1-score   support

         0.0       0.82      1.00      0.90   6657736
         1.0       0.53      0.00      0.00   1462980

    accuracy                           0.82   8120716
   macro avg       0.67      0.50      0.45   8120716
weighted avg       0.77      0.82      0.74   8120716



In [9]:
# define hyperparameters grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 6, 9]
}

# XGBoost classifier
xgb = XGBClassifier(eval_metric='logloss')

# GridSearchCV
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=3, scoring='accuracy')

In [10]:
chunk_size = 10000

# split X_train and y_train into chunks and perform grid search on each chunk
for i in range(0, len(X_train), chunk_size):
    chunk_X_train = X_train[i:i+chunk_size]
    chunk_y_train = y_train[i:i+chunk_size]
    
    # perform grid search on the current chunk
    grid_search.fit(chunk_X_train, chunk_y_train)

In [11]:
# get best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 200}
Best Score: 0.8191326946082528


In [13]:
# Instantiate XGBClassifier with the best parameters found
best_learning_rate = 0.01
best_max_depth = 6
best_n_estimators = 200
best_xgb_model = XGBClassifier(learning_rate=best_learning_rate,
                               max_depth=best_max_depth,
                               n_estimators=best_n_estimators,
                               eval_metric='logloss')

# Train the final model on the entire training dataset
best_xgb_model.fit(X_train, y_train)

# Evaluate the model on the test dataset
y_pred = best_xgb_model.predict(X_test)

# Calculate accuracy score (or other appropriate evaluation metric)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy on test set:", accuracy)

Accuracy on test set: 0.8198459347673284


In [14]:
# evaluate the model
print(classification_report(y_test, y_pred))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.82      1.00      0.90   6657736
         1.0       0.00      0.00      0.00   1462980

    accuracy                           0.82   8120716
   macro avg       0.41      0.50      0.45   8120716
weighted avg       0.67      0.82      0.74   8120716



  _warn_prf(average, modifier, msg_start, len(result))
