In [1]:
import pandas as pd
import os

In [2]:
# --- CONFIGURATION ---
# Root data directory. It can be overridden with the ECOAIR_DATA_DIR environment
# variable so the notebook is not tied to a single machine; when the variable is
# unset or empty, the original local path is used as the default.
BASE_PATH = os.environ.get("ECOAIR_DATA_DIR") or (
    "/Users/beratzengin/Desktop/Github/EcoAir SmartCity Predictor/Data"
)
# CSV file listing the stations to process; it lives directly under BASE_PATH.
STATIONS_CSV = os.path.join(BASE_PATH, "stations_info.csv")

In [3]:
# Lookup used for output file naming: zero-padded month number -> lowercase month name
month_names = {
    f"{number:02d}": label
    for number, label in enumerate(
        (
            "january", "february", "march", "april", "may", "june",
            "july", "august", "september", "october", "november", "december",
        ),
        start=1,
    )
}

In [4]:
# Read the station list CSV (path comes from the configuration cell) into a DataFrame
stations = pd.read_csv(filepath_or_buffer=STATIONS_CSV)

In [5]:
# For every station and every month, join the merged air/weather CSV with the
# traffic CSV on their shared `date` column and write the result as
# raw_<monthname>.csv inside the station's folder.
# Rows with a missing station name are skipped: there is no folder to look up.
for raw_name in stations['Name'].dropna():
    # Folder name sanitization: spaces -> "_", slashes -> "-".
    # str() guards against names that pandas parsed as non-string values.
    station_name = str(raw_name).replace(" ", "_").replace("/", "-")
    station_dir = os.path.join(BASE_PATH, station_name)

    # Guard clause: nothing to do when the station has no data folder.
    if not os.path.isdir(station_dir):
        print(f"Station folder missing: {station_name}")
        continue

    print(f"Creating Final Raw Datasets for: {station_name}")

    for m_id, m_name in month_names.items():
        merged_air_weather = os.path.join(station_dir, f"merged_{m_id}_2024.csv")
        traffic_file = os.path.join(station_dir, f"traffic_{m_id}_2024.csv")

        # Both inputs are required for a month; report which ones are absent
        # instead of skipping silently.
        missing_inputs = [
            os.path.basename(path)
            for path in (merged_air_weather, traffic_file)
            if not os.path.exists(path)
        ]
        if missing_inputs:
            print(f"{m_name.capitalize()} skipped: missing {', '.join(missing_inputs)}")
            continue

        # Loading datasets
        df_main = pd.read_csv(merged_air_weather)
        df_traffic = pd.read_csv(traffic_file)

        # Parse both timestamp columns as timezone-aware UTC so that equal
        # instants compare equal during the join.
        df_main['date'] = pd.to_datetime(df_main['date'], utc=True)
        df_traffic['date'] = pd.to_datetime(df_traffic['date'], utc=True)

        # Inner join keeps only timestamps present in both inputs.
        # NOTE(review): duplicate `date` values in either input would multiply
        # rows here - confirm the inputs hold one row per timestamp.
        final_df = pd.merge(df_main, df_traffic, on='date', how='inner')

        # Do not write an empty file, but make the skip visible in the log.
        if final_df.empty:
            print(f"{m_name.capitalize()} skipped: no matching timestamps between inputs.")
            continue

        # Output name format: raw_monthname.csv (e.g., raw_january.csv)
        output_path = os.path.join(station_dir, f"raw_{m_name}.csv")
        final_df.to_csv(output_path, index=False)
        print(f"{m_name.capitalize()} done: {len(final_df)} hourly records created.")

reating Final Raw Datasets for: Maslak
January done: 406 hourly records created.
February done: 672 hourly records created.
March done: 733 hourly records created.
April done: 684 hourly records created.
May done: 695 hourly records created.
June done: 648 hourly records created.
July done: 744 hourly records created.
August done: 721 hourly records created.
September done: 720 hourly records created.
October done: 743 hourly records created.
November done: 720 hourly records created.
December done: 533 hourly records created.
reating Final Raw Datasets for: Esenler
January done: 406 hourly records created.
February done: 672 hourly records created.
March done: 733 hourly records created.
April done: 684 hourly records created.
May done: 695 hourly records created.
June done: 648 hourly records created.
July done: 744 hourly records created.
August done: 721 hourly records created.
September done: 720 hourly records created.
October done: 743 hourly records created.
November done: 720 