In [176]:
import pandas as pd
import glob
import json
from pathlib import Path

# Raw extract files grouped by city. Every city has one file per dataset kind,
# all sharing the timestamp of the extraction run that produced them.
# NOTE(review): `dir` shadows the builtin of the same name; it is kept because
# the rest of the file refers to this listing by that name.
_RAW_FOLDER = Path('data/raw')
_DATASET_KINDS = ('current_weather', 'forecast_weather', 'air_pollution')
_EXTRACT_STAMPS = {
    'Athens': '20250823_184954',
    'Paris': '20250823_184957',
    'London': '20250823_185001',
    'Tokyo': '20250823_185005',
}
dir = {
    city_name: [
        _RAW_FOLDER / f'raw_{kind}_{city_name}_{stamp}.json'
        for kind in _DATASET_KINDS
    ]
    for city_name, stamp in _EXTRACT_STAMPS.items()
}

# Text labels for the integer air-quality index values 1 through 5.
aqi_map = dict(enumerate(
    ["Good", "Fair", "Moderate", "Poor", "Very Poor"],
    start=1,
))

# Columns kept in both the cleaned current-weather and forecast tables.
_WEATHER_COLUMNS = [
    "main_temp",
    "main_feels_like",
    "main_temp_max",
    "main_temp_min",
    "main_humidity",
    "main_pressure",
    "dt",
]

# Sentinel written where a timestamp is missing.
_MISSING_DT = pd.Timestamp("1970-01-01")


def _load_json(path):
    """Read and parse one raw JSON file."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _dedupe_and_fill(df, fill_values):
    """Drop repeated (city, dt) rows, then fill missing values per column."""
    return df.drop_duplicates(subset=["city", "dt"]).fillna(fill_values)


def _clean_air_pollution(data, city):
    """Flatten one air-pollution payload into a cleaned DataFrame."""
    df = pd.json_normalize(data, record_path="list", meta=["coord"], sep="_")
    df["main_aqi_desc"] = df["main_aqi"].map(aqi_map)
    df["dt"] = pd.to_datetime(df["dt"], unit="s")

    # `coord` arrives as one dict per row; expand it into coord_* columns.
    coord_df = pd.json_normalize(df["coord"].tolist()).add_prefix("coord_")
    df = pd.concat([df.drop(columns="coord"), coord_df], axis=1)
    df["city"] = city

    df = _dedupe_and_fill(df, {"main_aqi": -1, "dt": _MISSING_DT})
    # Anything outside 1..5 (including the -1 filler) has no label in aqi_map.
    df.loc[~df["main_aqi"].between(1, 5), "main_aqi_desc"] = "Unknown"
    return df


def _clean_current_weather(data, city):
    """Flatten one current-weather payload into a cleaned DataFrame."""
    df = pd.json_normalize(data, meta=["coord"], sep="_")
    clean = df[_WEATHER_COLUMNS + ["coord_lat", "coord_lon"]].copy()
    clean["city"] = city
    clean["dt"] = pd.to_datetime(clean["dt"], unit="s")
    return _dedupe_and_fill(clean, {"dt": _MISSING_DT})


def _clean_forecast_weather(data, city):
    """Flatten one forecast payload (one row per `list` entry)."""
    df = pd.json_normalize(data, record_path="list", meta=["city"], sep="_")
    # The `city` meta column holds one dict per row; its nested coordinates
    # flatten to coord_lat / coord_lon.
    coord_df = pd.json_normalize(df["city"].tolist(), sep="_")

    clean = df[_WEATHER_COLUMNS].copy()
    clean["coord_lat"] = coord_df["coord_lat"]
    clean["coord_lon"] = coord_df["coord_lon"]
    clean["city"] = city
    clean["dt"] = pd.to_datetime(clean["dt"], unit="s")
    return _dedupe_and_fill(clean, {"dt": _MISSING_DT})


# Maps the tag looked for in a file name to the function that cleans that
# payload; the tag is also the suffix of the output CSV name.
_CLEANERS = {
    "air_pollution": _clean_air_pollution,
    "current_weather": _clean_current_weather,
    "forecast_weather": _clean_forecast_weather,
}


def transform(raw_file, output_dir=Path('data/clean')):
    """Clean the raw JSON extracts and write one CSV per dataset kind.

    Args:
        raw_file: Mapping of city name to an iterable of raw JSON file paths.
            A file is routed by the tag found in its file name
            (``air_pollution``, ``current_weather`` or ``forecast_weather``);
            files matching none of the tags are ignored.
        output_dir: Directory that receives ``clean_air_pollution.csv``,
            ``clean_current_weather.csv`` and ``clean_forecast_weather.csv``.
            It is created if it does not exist.

    Returns:
        None. The results are the CSV files written to ``output_dir``.

    A dataset kind with no input files is skipped instead of raising from
    ``pd.concat``; an existing CSV for that kind is left untouched.
    """
    frames = {kind: [] for kind in _CLEANERS}
    for city, files in raw_file.items():
        for file in files:
            # Match on the file name only so directory names cannot misroute.
            file_name = Path(file).name
            for kind, cleaner in _CLEANERS.items():
                if kind in file_name:
                    frames[kind].append(cleaner(_load_json(file), city))
                    break

    output_dir = Path(output_dir)
    for kind, parts in frames.items():
        if not parts:
            continue
        output_dir.mkdir(parents=True, exist_ok=True)
        combined = pd.concat(parts, ignore_index=True)
        combined.to_csv(output_dir / f'clean_{kind}.csv', index=False)



# Run the cleaning step over the hard-coded raw file listing defined above.
transform(dir)



