# Loading the Data

In [None]:
import pandas as pd

# Location of the merged flight-records CSV.
csv_path = "/content/drive/MyDrive/G3merged.csv"

# Column labels in file order, taken from the Hive table definition.
# The rename below is purely positional, so this list must have exactly one
# entry per column in the file, in the same order.
hive_columns = [
    "Year", "Month", "DayofMonth", "DayOfWeek",
    "DepTime", "CRSDepTime", "ArrTime", "CRSArrTime",
    "UniqueCarrier", "FlightNum", "TailNum",
    "ActualElapsedTime", "CRSElapsedTime", "AirTime",
    "ArrDelay", "DepDelay",
    "Origin", "Dest", "Distance",
    "TaxiIn", "TaxiOut",
    "Cancelled", "CancellationCode", "Diverted",
    "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay",
    "Delayed",
]

# Read the whole file in one pass (low_memory=False) rather than in chunks.
df = pd.read_csv(csv_path, low_memory=False)

# Replace whatever header the file carries with the Hive column labels.
df.columns = hive_columns

# Preview the first few rows
df.head()


Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,Delayed
0,2002,4,11,4,1125,1123,1256,1240,AA,3059,...,18,0,,0,,,,,,Y
1,2002,6,2,7,615,615,812,825,WN,1441,...,11,0,,0,,,,,,N
2,2002,2,6,3,1337,1340,1426,1434,US,122,...,12,0,,0,,,,,,N
3,2002,3,24,7,1534,1540,1632,1635,MQ,3672,...,10,0,,0,,,,,,N
4,2002,1,31,4,710,710,738,740,WN,2003,...,8,0,,0,,,,,,N


In [None]:
# Columns that should be numeric but may have been read with mixed types.
numeric_cols = [
    "DepTime", "ArrTime", "ActualElapsedTime", "CRSElapsedTime", "AirTime",
    "ArrDelay", "DepDelay", "TaxiIn", "TaxiOut", "CarrierDelay", "WeatherDelay",
    "NASDelay", "SecurityDelay", "LateAircraftDelay"
]

# Coerce to numeric; entries that cannot be parsed become NaN instead of raising.
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Drop rows with missing essential values.
# The explicit .copy() yields an independent frame, so the column assignments
# below no longer emit SettingWithCopyWarning.
df = df.dropna(
    subset=["DepTime", "ArrTime", "ArrDelay", "DepDelay", "Distance", "Delayed"]
).copy()

# Categorical columns (including the target "Delayed") are replaced by integer
# category codes. Codes are positions in the sorted list of distinct values
# present in THIS frame; missing values are encoded as -1. Any other dataset
# must be encoded against the same category lists to obtain matching codes.
# For "Delayed", the preview shows labels "N"/"Y", which sort to N -> 0, Y -> 1
# (assumes no other label values occur — TODO confirm).
categorical_cols = ["UniqueCarrier", "Origin", "Dest", "CancellationCode", "Delayed"]
for col in categorical_cols:
    df[col] = df[col].astype("category").cat.codes

# Final check
df.info()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype("category")


<class 'pandas.core.frame.DataFrame'>
Index: 147067 entries, 0 to 150395
Data columns (total 30 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Year               147067 non-null  int64  
 1   Month              147067 non-null  int64  
 2   DayofMonth         147067 non-null  int64  
 3   DayOfWeek          147067 non-null  int64  
 4   DepTime            147067 non-null  float64
 5   CRSDepTime         147067 non-null  int64  
 6   ArrTime            147067 non-null  float64
 7   CRSArrTime         147067 non-null  int64  
 8   UniqueCarrier      147067 non-null  int8   
 9   FlightNum          147067 non-null  int64  
 10  TailNum            147067 non-null  object 
 11  ActualElapsedTime  147067 non-null  float64
 12  CRSElapsedTime     147067 non-null  float64
 13  AirTime            147065 non-null  float64
 14  ArrDelay           147067 non-null  float64
 15  DepDelay           147067 non-null  float64
 16  Origin 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Columns kept out of the feature matrix: identifiers (FlightNum, TailNum),
# the target itself, and the delay measurements treated as label leakage.
# NOTE(review): DepTime/CRSDepTime, ArrTime/CRSArrTime and
# ActualElapsedTime/CRSElapsedTime remain as features; if "Delayed" is derived
# from ArrDelay/DepDelay, those pairs may still reveal the label — confirm.
excluded_cols = ["FlightNum", "TailNum", "Delayed", "ArrDelay", "DepDelay"]

X = df.drop(columns=excluded_cols)
y = df["Delayed"]

# 80/20 split, stratified on the target so both parts keep the class balance;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Baseline XGBoost classifier with default hyperparameters.
model = XGBClassifier(eval_metric="logloss")
model.fit(X_train, y_train)

# Score the held-out split.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report_text)
print("\nConfusion Matrix:\n", conf_mat)


Accuracy: 0.8947779968722377

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.95      0.90     14183
           1       0.94      0.85      0.89     15231

    accuracy                           0.89     29414
   macro avg       0.90      0.90      0.89     29414
weighted avg       0.90      0.89      0.89     29414


Confusion Matrix:
 [[13415   768]
 [ 2327 12904]]


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np


# Hyperparameter space sampled by the randomized search.
param_dist = {
    'n_estimators': [100, 150, 200, 250],
    'max_depth': [3, 5, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.3]
}


# Base estimator. `use_label_encoder` is intentionally not passed: the installed
# XGBoost reports it as an unused parameter, and the baseline model omits it too.
xgb = XGBClassifier(eval_metric="logloss")


# Randomized search: 30 sampled configurations, each scored by 3-fold
# cross-validated accuracy on the training split only. The fixed random_state
# makes the sampled configurations repeatable.
random_search = RandomizedSearchCV(estimator=xgb, param_distributions=param_dist,
                                   n_iter=30, scoring='accuracy', cv=3, verbose=2,
                                   random_state=42, n_jobs=-1)


# Run the search.
random_search.fit(X_train, y_train)


# Best configuration found.
print("Best Parameters:", random_search.best_params_)


# best_estimator_ is the winning configuration refit on the full training split;
# evaluate it on the held-out test split.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Fitting 3 folds for each of 30 candidates, totalling 90 fits


Parameters: { "use_label_encoder" } are not used.



Best Parameters: {'subsample': 1.0, 'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.2, 'gamma': 0, 'colsample_bytree': 0.8}
Accuracy: 0.9299313252192833

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.96      0.93     14183
           1       0.96      0.90      0.93     15231

    accuracy                           0.93     29414
   macro avg       0.93      0.93      0.93     29414
weighted avg       0.93      0.93      0.93     29414


Confusion Matrix:
 [[13660   523]
 [ 1538 13693]]


In [None]:
import pandas as pd

TRAIN_CSV_PATH = "/content/drive/MyDrive/G3merged.csv"
TARGET_CSV_PATH = "/content/drive/MyDrive/TargetDataSet.csv"
OUTPUT_CSV_PATH = "/content/drive/MyDrive/TargetDS_Predicted1.csv"

# Column labels of the target file, in file order (training schema without the
# "Delayed" label). The rename is positional — make sure these are correct.
target_columns = [
    "Year", "Month", "DayofMonth", "DayOfWeek", "DepTime", "CRSDepTime", "ArrTime",
    "CRSArrTime", "UniqueCarrier", "FlightNum", "TailNum", "ActualElapsedTime",
    "CRSElapsedTime", "AirTime", "ArrDelay", "DepDelay", "Origin", "Dest", "Distance",
    "TaxiIn", "TaxiOut", "Cancelled", "CancellationCode", "Diverted", "CarrierDelay",
    "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay"
]

# Load the target dataset
target_df = pd.read_csv(TARGET_CSV_PATH, low_memory=False)
target_df.columns = target_columns

# Convert numeric columns; unparseable entries become NaN.
numeric_cols = [
    "DepTime", "ArrTime", "ActualElapsedTime", "CRSElapsedTime", "AirTime",
    "ArrDelay", "DepDelay", "TaxiIn", "TaxiOut", "CarrierDelay", "WeatherDelay",
    "NASDelay", "SecurityDelay", "LateAircraftDelay"
]
for col in numeric_cols:
    target_df[col] = pd.to_numeric(target_df[col], errors='coerce')

# Drop rows with missing critical values. The .copy() gives an independent
# frame so the assignments below do not emit SettingWithCopyWarning.
target_df = target_df.dropna(subset=["DepTime", "ArrTime", "Distance"]).copy()

# Rebuild the category vocabularies the model was trained on.
# Category codes are positions in the sorted list of distinct values, so
# encoding the target file on its own would give the same carrier/airport a
# different integer than it had during training. The training file is re-read
# and filtered with the same row filter as the training preprocessing so the
# category lists match the ones that produced the training codes.
train_ref = pd.read_csv(TRAIN_CSV_PATH, low_memory=False)
train_ref.columns = target_columns + ["Delayed"]
for col in ["DepTime", "ArrTime", "ArrDelay", "DepDelay"]:
    train_ref[col] = pd.to_numeric(train_ref[col], errors='coerce')
train_ref = train_ref.dropna(
    subset=["DepTime", "ArrTime", "ArrDelay", "DepDelay", "Distance", "Delayed"]
)

# Encode categorical columns against the training vocabularies.
# Values that never occurred in training (and missing values) are encoded as -1.
for col in ["UniqueCarrier", "Origin", "Dest", "CancellationCode"]:
    train_levels = train_ref[col].astype("category").cat.categories
    codes = pd.Categorical(target_df[col], categories=train_levels).codes
    unseen = int(((codes == -1) & target_df[col].notna().to_numpy()).sum())
    if unseen:
        print(f"Warning: {unseen} rows have a {col} value not seen in training (encoded as -1)")
    target_df[col] = codes

# The training reference frame is only needed for the vocabularies.
del train_ref

# Prepare features: exactly the training feature columns, in training order.
# Raises KeyError if the target file lacks a column the model was trained on.
target_X = target_df[list(X_train.columns)]

# Predict using best_model
predictions = best_model.predict(target_X)

# Add predictions to DataFrame
target_df["Predicted_Delayed"] = predictions

# Save to a new CSV
target_df.to_csv(OUTPUT_CSV_PATH, index=False)

print("predictions done! File saved as 'TargetDS_Predicted1.csv'")


predictions done! File saved as 'TargetDS_Predicted1.csv'
