<a href="https://colab.research.google.com/github/goodu001/1000_flights_project/blob/main/1000_flight.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

path = "/content/drive/MyDrive/flight 1000.csv"
# --- 1. Data Loading and Initial Inspection ---

# Define the expected 15 column names after the ULD_Details split
column_names = [
    'FlightID', 'FlightNumber', 'Date_Local', 'Departure_Local', 'Arrival_Local',
    'Date_UTC', 'Departure_UTC', 'Arrival_UTC', 'Origin', 'Destination',
    'Aircraft', 'ULD_Details_Part1', 'ULD_Details_Part2', 'Total_ULDs', 'Status'
]

# Load the dataset without a header, and using the defined column names
# Using 'engine=python' and 'on_bad_lines=skip' for robustness against parsing errors
df = pd.read_csv(path, header=None, names=column_names, on_bad_lines='skip', engine='python')

# The first row of the data is actually the header, so drop it and reset index
df = df.iloc[1:].copy()
df.reset_index(drop=True, inplace=True)

# Combine ULD_Details_Part1 and ULD_Details_Part2 into a single ULD_Details column
df['ULD_Details'] = df['ULD_Details_Part1'].astype(str) + ',' + df['ULD_Details_Part2'].astype(str)

# Drop the individual ULD_Details_Part columns
df.drop(columns=['ULD_Details_Part1', 'ULD_Details_Part2'], inplace=True)

# Reorder columns to have ULD_Details in its original position
original_cols = [
    'FlightID', 'FlightNumber', 'Date_Local', 'Departure_Local', 'Arrival_Local',
    'Date_UTC', 'Departure_UTC', 'Arrival_UTC', 'Origin', 'Destination',
    'Aircraft', 'ULD_Details', 'Total_ULDs', 'Status'
]
df = df[original_cols]

print("First 5 rows of the DataFrame after initial processing:")
print(df.head().to_markdown(index=False, numalign="left", stralign="left"))

print("\nInformation about the DataFrame after initial processing:")
print(df.info())

# --- 2. Data Preprocessing ---

# Convert 'Total_ULDs' to numeric, coercing errors to NaN
df['Total_ULDs'] = pd.to_numeric(df['Total_ULDs'], errors='coerce')

# Drop rows where 'Total_ULDs' is NaN, as it is our target variable
df.dropna(subset=['Total_ULDs'], inplace=True)
print("\nShape of DataFrame after dropping NaN 'Total_ULDs' rows:", df.shape)


# Convert date and time columns to datetime objects and engineer features
# Function to safely combine date and time, handling potential errors
def combine_date_time(date_col, time_col):
    return pd.to_datetime(df[date_col] + ' ' + df[time_col], errors='coerce')

df['Departure_Local_DT'] = combine_date_time('Date_Local', 'Departure_Local')
df['Arrival_Local_DT'] = combine_date_time('Date_Local', 'Arrival_Local')
df['Departure_UTC_DT'] = combine_date_time('Date_UTC', 'Departure_UTC')
df['Arrival_UTC_DT'] = combine_date_time('Date_UTC', 'Arrival_UTC')

# Calculate flight durations in hours
df['Flight_Duration_Local_Hours'] = (df['Arrival_Local_DT'] - df['Departure_Local_DT']).dt.total_seconds() / 3600
df['Flight_Duration_UTC_Hours'] = (df['Arrival_UTC_DT'] - df['Departure_UTC_DT']).dt.total_seconds() / 3600

# Extract time-based features
df['Departure_Local_DayOfWeek'] = df['Departure_Local_DT'].dt.dayofweek
df['Departure_Local_Month'] = df['Departure_Local_DT'].dt.month
df['Departure_Local_Hour'] = df['Departure_Local_DT'].dt.hour
df['Departure_Local_Minute'] = df['Departure_Local_DT'].dt.minute

# Drop original date and time columns and the intermediate datetime objects
df.drop(columns=['Date_Local', 'Departure_Local', 'Arrival_Local',
                 'Date_UTC', 'Departure_UTC', 'Arrival_UTC',
                 'Departure_Local_DT', 'Arrival_Local_DT',
                 'Departure_UTC_DT', 'Arrival_UTC_DT'], inplace=True)

# Drop FlightID as it's an identifier and not directly predictive
df.drop(columns=['FlightID'], inplace=True)

# Parse ULD_Details to extract individual ULD types and their counts
def parse_uld_details(uld_string):
    if pd.isna(uld_string):
        return {}
    uld_types = uld_string.split(',')
    uld_counts = {}
    for item in uld_types:
        if '×' in item:
            try:
                uld_type, count = item.split('×')
                uld_counts[uld_type.strip()] = int(count)
            except ValueError:
                continue
    return uld_counts

df['Parsed_ULD_Details'] = df['ULD_Details'].apply(parse_uld_details)

# Create new features for each ULD type
all_uld_types = set()
for detail_dict in df['Parsed_ULD_Details']:
    all_uld_types.update(detail_dict.keys())

for uld_type in all_uld_types:
    df[f'ULD_{uld_type}_Count'] = df['Parsed_ULD_Details'].apply(lambda x: x.get(uld_type, 0))

# Drop the original 'ULD_Details' and the intermediate 'Parsed_ULD_Details'
df.drop(columns=['ULD_Details', 'Parsed_ULD_Details'], inplace=True)

# Identify categorical columns for one-hot encoding
categorical_cols = df.select_dtypes(include='object').columns

# Handle missing values in Status column: fill with 'Unknown'
if 'Status' in categorical_cols:
    df['Status'].fillna('Unknown', inplace=True)

# Apply One-Hot Encoding to categorical columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Impute any remaining NaN values in numerical features (e.g., from duration calculation issues)
for col in df.select_dtypes(include=np.number).columns:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].mean())

print("\nInformation about the DataFrame after full preprocessing:")
print(df.info())

print("\nFirst 5 rows of the DataFrame after full preprocessing:")
print(df.head().to_markdown(index=False, numalign="left", stralign="left"))


# --- 3. Model Training ---

# Define target variable (y) and features (X)
y = df['Total_ULDs']
X = df.drop(columns=['Total_ULDs'])

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# Initialize and train the RandomForestRegressor model
# n_estimators: number of trees in the forest
# random_state: for reproducibility
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# --- 4. Model Evaluation ---

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"\n--- Model Evaluation Results ---")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

print("\n--- Project Summary ---")
print("This project successfully built a RandomForestRegressor model to predict 'Total_ULDs' based on various flight details.")
print(f"The model achieved an R-squared of {r2:.2f}, indicating a reasonable fit to the data.")
print(f"On average, the predictions are off by approximately {mae:.2f} units from the actual 'Total_ULDs' values.")

First 5 rows of the DataFrame after initial processing:
| FlightID   | FlightNumber   | Date_Local   | Departure_Local   | Arrival_Local   | Date_UTC   | Departure_UTC   | Arrival_UTC   | Origin   | Destination   | Aircraft   | ULD_Details   | Total_ULDs   | Status    |
|:-----------|:---------------|:-------------|:------------------|:----------------|:-----------|:----------------|:--------------|:---------|:--------------|:-----------|:--------------|:-------------|:----------|
| 1000       | RG559          | 2025-01-18   | 7:30              | 15:15           | 2025-01-18 | 1:30            | 8:15          | SIN      | DXB           | B737       | AKE×17,P1P×12 | 29           | Cancelled |
| 1001       | RG891          | 2025-06-16   | 8:15              | 11:15           | 2025-06-16 | 1:15            | 2:15          | SYD      | LAX           | B787       | RKN×9,9       | Cancelled    |           |
| 1002       | RG967          | 2025-06-06   | 9:45              | 14:45           |

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Status'].fillna('Unknown', inplace=True)



--- Model Evaluation Results ---
Mean Absolute Error (MAE): 3.35
Mean Squared Error (MSE): 16.19
Root Mean Squared Error (RMSE): 4.02
R-squared (R2): 0.73

--- Project Summary ---
This project successfully built a RandomForestRegressor model to predict 'Total_ULDs' based on various flight details.
The model achieved an R-squared of 0.73, indicating a reasonable fit to the data.
On average, the predictions are off by approximately 3.35 units from the actual 'Total_ULDs' values.
