# 02 - Data Cleaning and Feature Engineering
**Project**: Addis Ababa Light Rail Transit Analysis  
**Author**: Gosaye Emshaw  
**Purpose**: Clean data, handle anomalies, and create features for modeling

# 1. Setup and Load Previous Results
## 1.1 Import Libraries

In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical analysis and preprocessing
from scipy import stats
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.cluster import DBSCAN

# Date handling
from datetime import datetime, timedelta
import calendar

import warnings
# Silence every warning for the rest of the session so cell output stays
# readable. NOTE(review): this also hides pandas/sklearn deprecation and
# dtype warnings - consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')

# Display settings
# None removes the column cap, so wide frames are printed without truncation.
pd.set_option('display.max_columns', None)
# Reset matplotlib to its default style before seaborn's palette is applied.
# NOTE(review): the palette is presumably set after the style on purpose so
# the style does not override it - keep this order unless verified.
plt.style.use('default')
sns.set_palette("husl")

# Simple confirmation that the setup cell ran to completion.
print("Libraries imported successfully!")

Libraries imported successfully!


## 1.2 Load Explored Data
For cleaning and feature engineering, let's load the explored data, which already includes the added date features.

In [2]:
# Read the dataset exported by the previous (exploration) notebook and
# convert its 'date' column from text to datetime in a single step.
data = pd.read_csv('../data/processed/explored_transit_data.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Summarise what was loaded: dimensions, covered period and column names.
first_day = data['date'].min()
last_day = data['date'].max()
column_names = data.columns.tolist()

print("=== DATA LOADED FOR CLEANING ===")
print(f"Shape: {data.shape}")
print(f"Date range: {first_day} to {last_day}")
print(f"Columns: {column_names}")

# Preview the first rows (rendered as the cell's output)
data.head()

=== DATA LOADED FOR CLEANING ===
Shape: (2824, 19)
Date range: 2017-07-08 00:00:00 to 2025-03-31 00:00:00
Columns: ['date', 'passengers_ns', 'passengers_ew', 'passengers_total', 'train_ew_single', 'train_ew_couple', 'train_ns_single', 'train_ns_couple', 'train_total', 'year', 'month', 'month_name', 'day', 'day_of_week', 'day_name', 'is_weekend', 'is_monday', 'is_friday', 'quarter']


Unnamed: 0,date,passengers_ns,passengers_ew,passengers_total,train_ew_single,train_ew_couple,train_ns_single,train_ns_couple,train_total,year,month,month_name,day,day_of_week,day_name,is_weekend,is_monday,is_friday,quarter
0,2017-07-08,53759,64872,118631,1.0,7.0,1.0,7.0,16,2017,7,July,8,5,Saturday,True,False,False,3
1,2017-07-09,52556,60074,112630,1.0,7.0,1.0,7.0,16,2017,7,July,9,6,Sunday,True,False,False,3
2,2017-07-10,63700,71569,135269,1.0,7.0,1.0,7.0,16,2017,7,July,10,0,Monday,False,True,False,3
3,2017-07-11,57504,67418,124922,1.0,7.0,1.0,7.0,16,2017,7,July,11,1,Tuesday,False,False,False,3
4,2017-07-12,69832,66576,136408,1.0,7.0,1.0,7.0,16,2017,7,July,12,2,Wednesday,False,False,False,3


# 2. Data Quality Handling
## 2.1 Missing Values Handling
The data exploration notebook showed that our dataset contains missing values, so we need to handle them here.

In [7]:
# Missing-value audit: per-column NaN count and its share of all rows
print("=== MISSING VALUES ANALYSIS ===")
missing_summary = data.isna().sum()
missing_percentage = missing_summary.div(len(data)).mul(100)

# One row per column, worst offenders first
missing_data = pd.DataFrame(
    {
        'Missing_Count': missing_summary,
        'Missing_Percentage': missing_percentage,
    }
).sort_values('Missing_Count', ascending=False)

# Show only the columns that actually contain gaps
has_gaps = missing_data['Missing_Count'] > 0
print(missing_data.loc[has_gaps])

# Report the overall total, or confirm that the data is complete
total_missing = missing_data['Missing_Count'].sum()
if total_missing != 0:
    print(f"Total missing values: {total_missing}")
else:
    print("No missing values found!")

=== MISSING VALUES ANALYSIS ===
                 Missing_Count  Missing_Percentage
train_ew_couple             61            2.160057
train_ns_couple             61            2.160057
train_ns_single              7            0.247875
train_ew_single              7            0.247875
Total missing values: 136


In [9]:
# Handle missing values
# Work on a copy so the originally loaded `data` frame stays untouched.
data_clean = data.copy()

# Example: Fill missing passenger data with interpolation
passenger_cols = ['passengers_ns', 'passengers_ew', 'passengers_total']

# interpolate(method='time') only works on a DatetimeIndex, but the frame
# carries the default integer index. Build the date index once and
# interpolate each column on a date-indexed view; results are written back
# positionally so the frame's own index is left unchanged.
date_index = pd.DatetimeIndex(data_clean['date'])

for col in passenger_cols:
    if col in data_clean.columns and data_clean[col].isnull().sum() > 0:
        print(f"Handling missing values in {col}...")

        # Use time-aware interpolation (weights gaps by elapsed time)
        by_date = pd.Series(data_clean[col].to_numpy(), index=date_index)
        data_clean[col] = by_date.interpolate(method='time').to_numpy()

        # Fill any remaining NaNs at edges
        data_clean[col] = data_clean[col].fillna(data_clean[col].median())

        print(f"{col} missing values handled")

# NOTE(review): the analysis cell reports missing values only in the
# train_* columns, which this loop does not touch - confirm they are
# handled in a later step before relying on data_clean being complete.
print("Missing values treatment completed")

Missing values treatment completed
