# 🧹 Data Cleaning & Feature Engineering

This notebook handles data preprocessing and feature engineering for flight delay prediction.

**Dataset:** Kaggle Flight Analytics Dataset  
**Source:** https://www.kaggle.com/datasets/goyaladi/flight-dataset

**Objectives:**
- Handle missing values and outliers
- Encode categorical variables  
- Create derived features (temporal, route, carrier)
- Prepare data for modeling


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import warnings

# Put the parent of the current working directory at the front of the import
# path so the `src` package imported below can be found (presumably the
# repository root when the notebook runs from its own folder -- confirm).
sys.path.insert(0, os.path.abspath('..'))
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Project helpers. The star imports bring every public name of these modules
# into the notebook namespace; they must run after the sys.path change above.
from src.data_processing import *
from src.features import *

print("âœ“ Libraries imported")


In [None]:
# Load data. Source priority: Kaggle CSV, then a cached sample CSV, and as a
# last resort a freshly generated sample dataset.
from src.sample_data import load_kaggle_dataset, generate_sample_dataset

kaggle_path = '../data/raw/Flight_data.csv'
sample_path = '../data/raw/flights.csv'

kaggle_available = os.path.exists(kaggle_path)
sample_available = os.path.exists(sample_path)

if not (kaggle_available or sample_available):
    # Nothing on disk: synthesise a dataset (the generator is handed the
    # sample path as its save location).
    print("Generating sample dataset...")
    df = generate_sample_dataset(n_flights=50000, save_path=sample_path)
elif kaggle_available:
    # The Kaggle file wins whenever it is present.
    print("Loading Kaggle Flight Analytics Dataset...")
    df = pd.read_csv(kaggle_path)
    print(f"âœ“ Loaded {len(df):,} records from Kaggle dataset")
else:
    print("Loading sample dataset...")
    df = pd.read_csv(sample_path)
    print(f"âœ“ Loaded {len(df):,} records from sample dataset")

print(f"\nColumns: {list(df.columns)}")


## 1. Data Overview & Missing Values


In [None]:
# Normalise headers: trim surrounding whitespace, then turn spaces into underscores
trimmed_names = df.columns.str.strip()
df.columns = trimmed_names.str.replace(' ', '_')
print(f"Columns after standardization: {list(df.columns)}")

# Per-column null tally and the percentage of rows it represents
print("\nðŸ“Š Missing Values:")
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100).round(2)
missing_df = pd.DataFrame({'Count': missing, 'Percentage': missing_pct})

# Only show the columns that actually contain gaps
has_gaps = missing_df['Count'] > 0
display(missing_df.loc[has_gaps])


In [None]:
# Build the delay target.
# Any column whose name mentions "delay" is a candidate source (names vary by dataset).
delay_cols = [col for col in df.columns if 'delay' in col.lower()]
print(f"Delay columns found: {delay_cols}")

if not delay_cols:
    # No delay information at all: fabricate a reproducible one
    print("Creating synthetic delay column...")
    np.random.seed(42)
    synthetic_delay = np.random.normal(5, 25, len(df))
    df['arrival_delay'] = synthetic_delay.clip(-30, 180)
else:
    # Prefer an arrival-related delay column; otherwise take the first candidate
    delay_col = next(
        (c for c in delay_cols if 'arrival' in c.lower()),
        delay_cols[0],
    )
    # Unparseable or missing values become 0
    numeric_delay = pd.to_numeric(df[delay_col], errors='coerce')
    df['arrival_delay'] = numeric_delay.fillna(0)

# Binary target: 1 when the arrival delay is 15 or more, else 0
df['is_delayed'] = df['arrival_delay'].ge(15).astype(int)
print(f"\nâœ“ Delay target created: {df['is_delayed'].mean()*100:.1f}% delayed")


## 2. Feature Engineering


In [None]:
# Derive route and temporal features from whatever columns the dataset offers.

# --- Route: split the first "route"-named column into origin/destination ---
route_col = [col for col in df.columns if 'route' in col.lower()]
if route_col:
    route_col = route_col[0]
    route_split = df[route_col].str.split('-', expand=True)
    # Only usable when at least one row actually contained a '-' separator
    if route_split.shape[1] >= 2:
        df['origin'] = route_split[0].str.strip()
        df['destination'] = route_split[1].str.strip()
        print(f"âœ“ Extracted origin/destination from route")

# --- Temporal: use the first candidate column that parses as a datetime ---
# Candidates are matched by name only, so non-date columns (for example a
# departure airport or a departure delay) can end up in this list.
date_cols = [col for col in df.columns if 'date' in col.lower() or 'departure' in col.lower()]
for col in date_cols:
    # pd.to_datetime reads plain numbers as epoch offsets, so a numeric column
    # would "parse" into meaningless 1970 timestamps. Skip those outright.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    try:
        # Parse into a temporary first: the original column must survive
        # untouched when it turns out not to hold dates.
        parsed = pd.to_datetime(df[col], errors='coerce')
    except (ValueError, TypeError, OverflowError) as exc:
        print(f"  Skipping {col}: could not parse as datetime ({exc})")
        continue
    if parsed.notna().sum() > 0:
        df[col] = parsed
        df['hour'] = parsed.dt.hour
        df['day_of_week'] = parsed.dt.dayofweek
        df['month'] = parsed.dt.month
        print(f"âœ“ Extracted temporal features from {col}")
        break

# If no datetime column produced an hour, fall back to an existing hour-like column
if 'hour' not in df.columns:
    hour_col = [c for c in df.columns if 'hour' in c.lower()]
    if hour_col:
        df['hour'] = df[hour_col[0]]

print(f"\nDataset shape after feature engineering: {df.shape}")


## 3. Encode Categorical Variables


In [None]:
from sklearn.preprocessing import LabelEncoder

# Candidate columns: every object-dtype column, except those whose
# lower-cased name contains "name" or "id" (substring match).
object_columns = df.select_dtypes(include=['object']).columns.tolist()
cat_cols = []
for candidate in object_columns:
    lowered = candidate.lower()
    if 'name' in lowered or 'id' in lowered:
        continue
    cat_cols.append(candidate)

print(f"Categorical columns to encode: {cat_cols}")

# One LabelEncoder per column, kept in `encoders` so the mapping can be reused
encoders = {column: LabelEncoder() for column in cat_cols}
for column, encoder in encoders.items():
    # Values are cast to str first, so missing values become their own category
    codes = encoder.fit_transform(df[column].astype(str))
    df[f'{column}_encoded'] = codes
    print(f"  âœ“ Encoded {column}: {df[column].nunique()} unique values")

print(f"\nâœ“ Encoding complete. New shape: {df.shape}")


## 4. Prepare Final Dataset


In [None]:
# Select features for modeling and build the final X / y pair.

# Columns that are, or are derived from, the outcome must never be predictors.
# The same rule is applied to every candidate, encoded or numeric, so an
# encoded delay/satisfaction column cannot slip through.
target_cols = {'is_delayed', 'arrival_delay'}
leak_markers = ('delay', 'satisfaction')
leaky_cols = {
    c for c in df.columns
    if c in target_cols or any(marker in c.lower() for marker in leak_markers)
}

# Engineered features: label-encoded categoricals plus the temporal columns
temporal_cols = ['hour', 'day_of_week', 'month']
feature_cols = [col for col in df.columns if col.endswith('_encoded') or col in temporal_cols]

# Add numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [c for c in numeric_cols if c not in leaky_cols]

# De-duplicate while preserving order: a set would shuffle the column order
# from run to run and make the saved feature matrix non-reproducible.
feature_cols = [c for c in dict.fromkeys(feature_cols + numeric_cols) if c not in leaky_cols]
print(f"Features for modeling ({len(feature_cols)}): {feature_cols}")

# Create final dataset
X = df[feature_cols].copy()
y = df['is_delayed'].copy()

# Fill remaining NaN with each numeric column's median. numeric_only=True keeps
# this from raising if a non-numeric column (e.g. a text "hour") was selected.
X = X.fillna(X.median(numeric_only=True))

print(f"\nâœ“ Final feature matrix: {X.shape}")
print(f"âœ“ Target distribution: {y.value_counts().to_dict()}")


In [None]:
# Save processed data: the full cleaned frame, the feature matrix and the target.
processed_dir = '../data/processed'
# to_csv does not create missing folders, so make sure the target exists first
# (a fresh checkout may not have data/processed yet).
os.makedirs(processed_dir, exist_ok=True)

df.to_csv(f'{processed_dir}/flights_cleaned.csv', index=False)
X.to_csv(f'{processed_dir}/features.csv', index=False)
y.to_csv(f'{processed_dir}/target.csv', index=False)

print("âœ“ Saved processed data to data/processed/")
print("  - flights_cleaned.csv")
print("  - features.csv")
print("  - target.csv")
