In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
PATH = "/Users/beratzengin/Desktop/Github/EcoAir SmartCity Predictor/Feature Engineering"

train_df = pd.read_csv(os.path.join(PATH, 'train.csv'))
test_df = pd.read_csv(os.path.join(PATH, 'test.csv'))

In [6]:
train_df.columns

Index(['date', 'PM10', 'SO2', 'O3', 'NO2', 'CO', 'temp', 'humidity',
       'wind_speed', 'wind_direction', 'avg_speed', 'total_vehicles', 'hour',
       'day_of_week', 'is_weekend', 'month', 'prev_PM10', 'traffic_rolling_3h',
       'station_Aksaray', 'station_Alibeyköy', 'station_Arnavutköy',
       'station_Avcılar', 'station_Bağcılar', 'station_Beşiktaş',
       'station_D-100_', 'station_Esenler', 'station_Kadıköy',
       'station_Kandilli_1', 'station_Kartal', 'station_Kağıthane_1',
       'station_Kumköy', 'station_Maslak', 'station_Mobil',
       'station_Sancaktepe', 'station_Sarıyer', 'station_Selimiye',
       'station_Sultangazi_1', 'station_Sultangazi_2', 'station_Sultangazi_3',
       'station_Tuzla', 'station_Yenibosna', 'station_Çatladıkapı',
       'station_Ümraniye_1', 'station_Üsküdar_1'],
      dtype='str')

In [7]:
# Cell 3: Advanced Feature Engineering (Fixed)

# Re-loading to make sure we have all columns (Run this if you deleted columns before)
train_df = pd.read_csv(os.path.join(PATH, 'train.csv'))
test_df = pd.read_csv(os.path.join(PATH, 'test.csv'))

# 1. Handle Categorical Data (Station_Name)
if 'Station_Name' in train_df.columns:
    train_df = pd.get_dummies(train_df, columns=['Station_Name'], prefix='station')
    test_df = pd.get_dummies(test_df, columns=['Station_Name'], prefix='station')
else:
    print("Warning: Station_Name column not found! Check your CSV files.")

# 2. Extract Features from Date
train_df['date'] = pd.to_datetime(train_df['date'])
test_df['date'] = pd.to_datetime(test_df['date'])

for df_temp in [train_df, test_df]:
    df_temp['hour'] = df_temp['date'].dt.hour
    df_temp['day_of_week'] = df_temp['date'].dt.dayofweek
    df_temp['month'] = df_temp['date'].dt.month
    df_temp['is_weekend'] = df_temp['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)

# 3. Align columns (ensure both have the same dummy columns)
train_df, test_df = train_df.align(test_df, join='inner', axis=1)

# 4. Define Target and ALL Numeric Features
target = 'PM10'
# Automatically pick all numeric columns except the target and date
features = train_df.select_dtypes(include=[np.number]).columns.tolist()
if target in features:
    features.remove(target)

X_train = train_df[features].fillna(0)
y_train = train_df[target].fillna(train_df[target].mean())

X_test = test_df[features].fillna(0)
y_test = test_df[target].fillna(test_df[target].mean())

print(f"Total features used: {len(features)}")

Total features used: 16
