In [4]:
import pandas as pd


# Step 1: Load the dataset


file_path = r'C:\Users\Welcome\OneDrive - NSBM\Desktop\3rd_year\flight delay\flight_data_2024.csv'

print("Loading dataset. This might take a few minutes depending on size...")
data = pd.read_csv(file_path)
print(f"Dataset loaded successfully with {len(data)} rows and {len(data.columns)} columns.\n")

# ===========================
# Step 2: Inspect missing values
# ===========================
# In real-world datasets, missing data is common and must be handled before modeling.
missing_counts = data.isnull().sum()
print("Columns with missing values:\n", missing_counts[missing_counts > 0], "\n")

# ===========================
# Step 3: Handle missing values
# ===========================
# For 'arr_delay' and related critical features, drop rows where missing as target or important input is unknown
data = data.dropna(subset=['arr_delay'])

# For delay cause columns, fill missing with 0, which means no delay from that reason
delay_cause_cols = ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
for col in delay_cause_cols:
    data[col] = data[col].fillna(0)

# For timing columns like 'dep_time', fill missing values with median (simple imputation)
timing_cols = ['dep_time', 'arr_time', 'actual_elapsed_time', 'air_time', 'taxi_out', 'taxi_in']
for col in timing_cols:
    data[col] = data[col].fillna(data[col].median())

print("Missing values handled.\n")

# ===========================
# Step 4: Create binary target variable 'is_delayed'
# ===========================
# Define a flight as delayed if actual arrival delay is more than 15 minutes
data['is_delayed'] = (data['arr_delay'] > 15).astype(int)
print("Binary target variable 'is_delayed' created.\n")
print(f"Delay distribution:\n{data['is_delayed'].value_counts(normalize=True) * 100}\n")

# ===========================
# Step 5: Select relevant features for modeling
# ===========================
# We choose features that can influence delay prediction based on domain knowledge
features = ['month', 'day_of_week', 'op_unique_carrier', 'origin', 'dest', 'crs_dep_time', 'distance']
X = data[features]
y = data['is_delayed']

print("Selected features and target variable.\n")

# ===========================
# Step 6: One-hot encoding of categorical variables
# ===========================
# Convert categorical features to dummy/indicator variables for ML models
X_encoded = pd.get_dummies(X, columns=['op_unique_carrier', 'origin', 'dest'], drop_first=True)

print(f"Features after encoding have shape: {X_encoded.shape}\n")

# ===========================
# Step 7: Save processed data for modeling
# ===========================
# Save the clean and processed feature matrix and the target variable to CSVs
X_encoded.to_csv('Data/processed_features.csv', index=False)
y.to_csv('Data/target.csv', index=False)

print("Processed features and target saved to 'data/' folder.")
print("You are now ready to move on to the modeling step!")


Loading dataset. This might take a few minutes depending on size...


  data = pd.read_csv(file_path)


Dataset loaded successfully with 7079081 rows and 35 columns.

Columns with missing values:
 op_carrier_fl_num            1
dep_time                 92659
dep_delay                92970
taxi_out                 95734
wheels_off               95734
wheels_on                97856
taxi_in                  97856
arr_time                 97854
arr_delay               113814
cancellation_code      6982766
crs_elapsed_time             1
actual_elapsed_time     113814
air_time                113814
dtype: int64 

Missing values handled.

Binary target variable 'is_delayed' created.

Delay distribution:
is_delayed
0    79.851627
1    20.148373
Name: proportion, dtype: float64

Selected features and target variable.



MemoryError: Unable to allocate 2.25 GiB for an array with shape (347, 6965267) and data type bool

In [5]:
#memory issue

In [9]:
# Import required libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os

# ===========================
# Define paths
# ===========================

# Your script folder (Notebooks), relative to project root
script_folder = r'C:\Users\Welcome\OneDrive - NSBM\Desktop\3rd_year\flight delay\flight-delay-prediction-ml\flight-delay-prediction-ml\Notebooks'

# Define the path to the results folder (one level up from Notebooks, then 'results' folder)
results_folder = os.path.join(os.path.dirname(script_folder), 'results')

# Create the results folder if it doesn't exist
os.makedirs(results_folder, exist_ok=True)

print(f"'results' folder ensured at: {results_folder}\n")

# ===========================
# Step 1: Load the full dataset
# ===========================

file_path = r'C:\Users\Welcome\OneDrive - NSBM\Desktop\3rd_year\flight delay\flight_data_2024.csv'

print("Loading full dataset. This may take a while...")
data = pd.read_csv(file_path)
print(f"Full dataset loaded with {len(data)} rows.\n")

# ===========================
# Step 2: Sample the dataset
# ===========================

# Randomly sample 100,000 rows to reduce memory usage during processing
data_sample = data.sample(100000, random_state=42)
print(f"Sampled {len(data_sample)} rows for preprocessing.\n")

# ===========================
# Step 3: Handle missing values
# ===========================

# Drop rows with missing 'arr_delay' since it's your target variable
data_sample = data_sample.dropna(subset=['arr_delay'])

# Fill NaNs in delay reason columns with 0 (means no delay from that cause)
delay_cols = ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
for col in delay_cols:
    data_sample[col] = data_sample[col].fillna(0)

# Fill missing timing columns with median to maintain consistency
timing_cols = ['dep_time', 'arr_time', 'actual_elapsed_time', 'air_time', 'taxi_out', 'taxi_in']
for col in timing_cols:
    data_sample[col] = data_sample[col].fillna(data_sample[col].median())

print("Missing values handled.\n")

# ===========================
# Step 4: Create binary delay target
# ===========================

data_sample['is_delayed'] = (data_sample['arr_delay'] > 15).astype(int)
print(f"Delay target created with distribution:\n{data_sample['is_delayed'].value_counts(normalize=True) * 100}\n")

# ===========================
# Step 5: Select features and target
# ===========================

features = ['month', 'day_of_week', 'op_unique_carrier', 'origin', 'dest', 'crs_dep_time', 'distance']
X = data_sample[features]
y = data_sample['is_delayed']

print("Selected feature columns and target variable.\n")

# ===========================
# Step 6: Label encode categorical variables
# ===========================

categorical_cols = ['op_unique_carrier', 'origin', 'dest']
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

print("Categorical variables encoded using LabelEncoder.\n")

# ===========================
# Step 7: Save processed data for modeling
# ===========================

# Define full save paths
features_save_path = os.path.join(results_folder, 'processed_features_sample.csv')
target_save_path = os.path.join(results_folder, 'target_sample.csv')

X.to_csv(features_save_path, index=False)
y.to_csv(target_save_path, index=False)

print(f"Processed features saved to: {features_save_path}")
print(f"Target saved to: {target_save_path}\n")

print("Preprocessing complete. You can now proceed to model training.")


'results' folder ensured at: C:\Users\Welcome\OneDrive - NSBM\Desktop\3rd_year\flight delay\flight-delay-prediction-ml\flight-delay-prediction-ml\results

Loading full dataset. This may take a while...


  data = pd.read_csv(file_path)


Full dataset loaded with 7079081 rows.

Sampled 100000 rows for preprocessing.

Missing values handled.

Delay target created with distribution:
is_delayed
0    79.844276
1    20.155724
Name: proportion, dtype: float64

Selected feature columns and target variable.

Categorical variables encoded using LabelEncoder.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])


Processed features saved to: C:\Users\Welcome\OneDrive - NSBM\Desktop\3rd_year\flight delay\flight-delay-prediction-ml\flight-delay-prediction-ml\results\processed_features_sample.csv
Target saved to: C:\Users\Welcome\OneDrive - NSBM\Desktop\3rd_year\flight delay\flight-delay-prediction-ml\flight-delay-prediction-ml\results\target_sample.csv

Preprocessing complete. You can now proceed to model training.
