In [1]:
import os

import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Location of the merged flight-delay CSV.
# The DELAYS_DATA_PATH environment variable overrides the default so the
# notebook can be run on another machine without editing this cell; when it is
# unset (or empty) the original local path is used, so existing runs behave
# exactly as before.
DEFAULT_DATA_PATH = "C:/Users/cj.alonzo/OneDrive - Nice Systems Ltd/Documents/Personal/Data Science/personal projects/delays/merged_data_with_airlines.csv"
DATA_PATH = os.environ.get("DELAYS_DATA_PATH") or DEFAULT_DATA_PATH

# Read the first DataFrame
df = pd.read_csv(DATA_PATH)

In [3]:
# Assemble a single datetime column, DATE, from the YEAR / MONTH / DAY parts,
# then print the frame's column summary.
date_parts = df[['YEAR', 'MONTH', 'DAY']]
df['DATE'] = pd.to_datetime(date_parts)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20604118 entries, 0 to 20604117
Data columns (total 23 columns):
 #   Column               Dtype         
---  ------               -----         
 0   YEAR                 int64         
 1   MONTH                int64         
 2   DAY                  int64         
 3   DAY_OF_WEEK          int64         
 4   TAIL_NUM             object        
 5   OP_CARRIER_FL_NUM    int64         
 6   ORIGIN               object        
 7   ORIGIN_CITY_NAME     object        
 8   DEST                 object        
 9   DEST_CITY_NAME       object        
 10  DEP_TIME             object        
 11  DEP_DELAY            float64       
 12  DEP_DEL15            float64       
 13  ARR_TIME             object        
 14  ARR_DELAY            float64       
 15  ARR_DEL15            float64       
 16  CARRIER_DELAY        float64       
 17  WEATHER_DELAY        float64       
 18  NAS_DELAY            float64       
 19  SECURITY_DELAY     

In [4]:
# Preview the first rows of the loaded frame (including the new DATE column).
df.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,...,ARR_TIME,ARR_DELAY,ARR_DEL15,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,AIRLINE,DATE
0,2021,1,1,5,N131EV,4656,ATL,"Atlanta, GA",JAN,"Jackson/Vicksburg, MS",...,15:02:00,249.0,1.0,49.0,0.0,0.0,0.0,200.0,Endeavor Air,2021-01-01
1,2021,1,1,5,N131EV,4656,JAN,"Jackson/Vicksburg, MS",ATL,"Atlanta, GA",...,17:49:00,218.0,1.0,0.0,0.0,0.0,0.0,218.0,Endeavor Air,2021-01-01
2,2021,1,1,5,N131EV,4889,ATL,"Atlanta, GA",GSP,"Greer, SC",...,21:28:00,7.0,0.0,0.0,0.0,0.0,0.0,0.0,Endeavor Air,2021-01-01
3,2021,1,1,5,N131EV,4936,OKC,"Oklahoma City, OK",ATL,"Atlanta, GA",...,13:01:00,238.0,1.0,0.0,176.0,62.0,0.0,0.0,Endeavor Air,2021-01-01
4,2021,1,1,5,N132EV,4698,BHM,"Birmingham, AL",ATL,"Atlanta, GA",...,17:01:00,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,Endeavor Air,2021-01-01


In [5]:
# Convert DEP_TIME to string and then to datetime using the '%H:%M:%S' format;
# values that do not match the format become NaT (errors='coerce').
# The dtype guard keeps this cell safe to re-run: once the column is already
# datetime64, casting it back to str produces text that includes a date part,
# which no longer matches the format, so every value would be coerced to NaT.
if not pd.api.types.is_datetime64_any_dtype(df['DEP_TIME']):
    df['DEP_TIME'] = pd.to_datetime(df['DEP_TIME'].astype(str), format='%H:%M:%S', errors='coerce')

In [6]:
# Convert ARR_TIME to string and then to datetime using the '%H:%M:%S' format;
# values that do not match the format become NaT (errors='coerce').
# The dtype guard keeps this cell safe to re-run: once the column is already
# datetime64, casting it back to str produces text that includes a date part,
# which no longer matches the format, so every value would be coerced to NaT.
if not pd.api.types.is_datetime64_any_dtype(df['ARR_TIME']):
    df['ARR_TIME'] = pd.to_datetime(df['ARR_TIME'].astype(str), format='%H:%M:%S', errors='coerce')

In [7]:
# Halve the width of the numeric columns to reduce memory:
# int64 -> int32 and float64 -> float32.
int_columns = df.select_dtypes(include='int64').columns
float_columns = df.select_dtypes(include='float64').columns

for columns, narrow_dtype in ((int_columns, 'int32'), (float_columns, 'float32')):
    df[columns] = df[columns].astype(narrow_dtype)

In [8]:
# Re-print the column summary to confirm the datetime and 32-bit dtype conversions.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20604118 entries, 0 to 20604117
Data columns (total 23 columns):
 #   Column               Dtype         
---  ------               -----         
 0   YEAR                 int32         
 1   MONTH                int32         
 2   DAY                  int32         
 3   DAY_OF_WEEK          int32         
 4   TAIL_NUM             object        
 5   OP_CARRIER_FL_NUM    int32         
 6   ORIGIN               object        
 7   ORIGIN_CITY_NAME     object        
 8   DEST                 object        
 9   DEST_CITY_NAME       object        
 10  DEP_TIME             datetime64[ns]
 11  DEP_DELAY            float32       
 12  DEP_DEL15            float32       
 13  ARR_TIME             datetime64[ns]
 14  ARR_DELAY            float32       
 15  ARR_DEL15            float32       
 16  CARRIER_DELAY        float32       
 17  WEATHER_DELAY        float32       
 18  NAS_DELAY            float32       
 19  SECURITY_DELAY     

In [9]:
# Columns used as model inputs, and the subset of them that hold text labels
# and therefore need to be integer-encoded before modelling.
features = ['ORIGIN', 'ORIGIN_CITY_NAME', 'DEST', 'DEST_CITY_NAME', 'AIRLINE', 'YEAR', 'DAY_OF_WEEK', 'MONTH', 'TAIL_NUM']
categorical_features = ['ORIGIN', 'ORIGIN_CITY_NAME', 'DEST', 'DEST_CITY_NAME', 'AIRLINE', 'TAIL_NUM']

# Explicit copy: X is an independent frame, so assigning columns on it below
# cannot trigger SettingWithCopyWarning or write back into df.
X = df[features].copy()

# Label-encode each categorical column and keep the fitted encoder per column
# so the integer codes can be mapped back to the original labels later.
label_encoders = {}
for feature in categorical_features:
    le = LabelEncoder()
    # Plain column assignment replaces the column, so it takes the integer
    # dtype returned by the encoder. Writing through X.loc[:, feature] on
    # pandas 2.x sets values in place and leaves the column as object dtype,
    # which is far more memory-hungry on a frame of this size.
    X[feature] = le.fit_transform(X[feature])
    label_encoders[feature] = le

In [10]:
# Standardize every feature column; the fitted scaler is kept so the same
# transformation can be re-applied later.
# NOTE(review): the scaler is fitted on the full X, before the train/test
# split in the following cell — confirm this is intended.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [11]:
# Target: the DEP_DEL15 column of df.
y = df.loc[:, 'DEP_DEL15']

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Flag training rows that contain at least one NaN feature value ...
missing_indices = np.any(np.isnan(X_train), axis=1)

# ... and drop those rows from the features and the target together so the
# two stay aligned.
X_train, y_train = X_train[~missing_indices], y_train[~missing_indices]

In [13]:
# Flag training rows whose target value is NaN ...
nan_indices = np.isnan(y_train)

# ... and drop them from both the features and the target in one step.
X_train, y_train = X_train[~nan_indices], y_train[~nan_indices]

# initialize and train the XGBoost classifier
#xgb = XGBClassifier(eval_metric='logloss')
#xgb.fit(X_train, y_train)

In [14]:
# Keep only the test rows whose target is present; the mask is computed once
# and applied to the features and the target so they stay aligned.
test_target_present = ~np.isnan(y_test)
X_test = X_test[test_target_present]
y_test = y_test[test_target_present]

In [15]:
# Random over-sampler with a fixed seed so the resampling is repeatable.
oversampler = RandomOverSampler(random_state=42)

# Resample the training set only; the test set is left untouched.
# NOTE(review): the resampled data is later passed to GridSearchCV, so
# duplicated rows may end up in both the fitting and validation folds —
# consider resampling inside each fold instead.
resampled = oversampler.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled

In [16]:
# Ratio of negative (0) to positive (1) instances in the resampled training target.
# NOTE(review): this is measured after oversampling; with the default
# RandomOverSampler strategy the classes are presumably balanced, so the value
# is expected to be ~1.0 — confirm whether the pre-resampling ratio was intended.
negative_count = (y_train_resampled == 0).sum()
positive_count = (y_train_resampled == 1).sum()
ratio = negative_count / positive_count

In [17]:
# Hyperparameter search space: learning rate and tree depth are varied, while
# the row and column subsampling fractions are held at a single value each
# (still wrapped in a list, as the grid expects).
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    max_depth=[3, 4, 5, 6, 7],
    subsample=[0.8],
    colsample_bytree=[0.8],
)

In [18]:
# Base estimator for the grid search; log-loss is set as its evaluation metric.
xgb_model = XGBClassifier(
    eval_metric='logloss',
)

In [19]:
# Exhaustive search over param_grid with 3-fold cross-validation, ranking
# candidates by accuracy.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
)

In [20]:
# Run the search on the oversampled training data.
grid_search.fit(X=X_train_resampled, y=y_train_resampled)

In [21]:
# Pull the winning parameter combination and its cross-validated score
# out of the fitted search object.
best_params, best_score = grid_search.best_params_, grid_search.best_score_

In [22]:
# Report the grid-search outcome, one labelled line per value.
for label, value in (("Best Parameters:", best_params), ("Best Score:", best_score)):
    print(label, value)

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 7, 'subsample': 0.8}
Best Score: 0.6212984154173985


In [23]:
# Score the held-out test rows by calling predict on the fitted grid search.
y_pred = grid_search.predict(X=X_test)

In [24]:
# Overall accuracy on the test set.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy Score:", accuracy)

# Per-class precision / recall / F1 breakdown.
print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy Score: 0.6207715259732473
Classification Report:
              precision    recall  f1-score   support

         0.0       0.87      0.62      0.72   3250513
         1.0       0.29      0.62      0.39    793706

    accuracy                           0.62   4044219
   macro avg       0.58      0.62      0.56   4044219
weighted avg       0.75      0.62      0.66   4044219



In [25]:
# evaluate the model
#accuracy = accuracy_score(y_test, y_pred)
#print(f"Model Accuracy: {accuracy}")
#print(classification_report(y_test, y_pred))