In [1]:
import os
import json
import pandas as pd
from datetime import datetime, timezone

RAW_DATA_DIR = "raw_data"


In [7]:
def load_raw_data_to_dataframe(dir:str = RAW_DATA_DIR) -> pd.DataFrame:
    """Load every raw weather file found in ``dir`` into a single DataFrame.

    Each file is parsed as JSON and its first entry under ``data`` is read.
    The result is in long format: one row is produced per element of that
    entry's ``weather`` list, so an observation reporting several weather
    types appears on several rows with the same temperature and wind speed.

    Args:
        dir: Folder containing the raw files. The city name is taken from
            the part of each file name before the first underscore.

    Returns:
        DataFrame with the columns ``city``, ``observation_ts`` (UTC),
        ``temperature``, ``wind_speed``, ``weather_type`` and
        ``weather_type_ds``. If the folder holds no files, an empty frame
        with those same columns is returned.

    Raises:
        FileNotFoundError: if ``dir`` does not exist.
        json.JSONDecodeError: if a file is not valid JSON.
        KeyError, IndexError: if a file lacks one of the expected fields.
    """
    columns = [
        "city",
        "observation_ts",
        "temperature",
        "wind_speed",
        "weather_type",
        "weather_type_ds",
    ]
    data = []

    # Sorted so the row order does not depend on the filesystem's listing order
    for filename in sorted(os.listdir(dir)):
        path = os.path.join(dir, filename)

        # os.listdir also returns sub-directories (e.g. .ipynb_checkpoints),
        # which cannot be opened as files
        if not os.path.isfile(path):
            continue

        # Explicit encoding so parsing does not depend on the platform default
        with open(path, 'r', encoding='utf-8') as file:
            raw_data = json.load(file)

        observation = raw_data['data'][0]

        # The city name is the part of the file name before the first underscore
        city_name = filename.split('_')[0]
        observation_ts = datetime.fromtimestamp(observation['dt'], tz=timezone.utc)
        temperature = observation['temp']
        wind_speed = observation['wind_speed']

        # One row per weather type; an observation can carry more than one
        for weather in observation['weather']:
            data.append({
                "city": city_name,
                "observation_ts": observation_ts,
                "temperature": temperature,
                "wind_speed": wind_speed,
                "weather_type": weather['main'],
                "weather_type_ds": weather['description'],
            })

    # Passing the column list keeps the schema intact even when no rows were read
    df = pd.DataFrame(data, columns=columns)

    # Make sure the timestamp column has a timezone-aware (UTC) datetime dtype
    df['observation_ts'] = pd.to_datetime(df['observation_ts'], utc=True)

    print(df.head())
    return df

# Build the observations frame from the configured raw-data folder
df = load_raw_data_to_dataframe(dir=RAW_DATA_DIR)


      city            observation_ts  temperature  wind_speed weather_type  \
0  Bologna 2024-09-30 23:00:00+00:00        16.40        1.54       Clouds   
1  Bologna 2024-10-01 00:00:00+00:00        16.37        2.06       Clouds   
2  Bologna 2024-10-01 01:00:00+00:00        15.81        1.03       Clouds   
3  Bologna 2024-10-01 02:00:00+00:00        16.02        1.54       Clouds   
4  Bologna 2024-10-01 03:00:00+00:00        16.13        2.57       Clouds   

    weather_type_ds  
0  scattered clouds  
1     broken clouds  
2     broken clouds  
3     broken clouds  
4  scattered clouds  


## How many distinct weather conditions were observed (rain/snow/clear/…) in a certain period?

In [8]:
# Counts the distinct values of the detailed description column ('weather_type_ds');
# the broader category of each observation is stored separately in 'weather_type'.
print(len(df['weather_type_ds'].dropna().unique()))

21


## What are the temperature averages observed in a certain period per city?

In [9]:
# df holds one row per weather type, so an observation reporting several weather
# types appears on several rows with the same temperature. Keep a single row per
# (city, observation_ts) so every observation weighs the same in the average.
print(
    df.drop_duplicates(subset=['city', 'observation_ts'])
      .groupby('city')['temperature']
      .mean()
)

city
Bologna     15.584957
Cagliari    20.687139
Milano      16.444300
Name: temperature, dtype: float64


## Which city had the highest absolute temperature in a certain period of time?

In [10]:
# Highest temperature recorded for each city
print(df['temperature'].groupby(df['city']).max())

city
Bologna     22.12
Cagliari    28.01
Milano      24.19
Name: temperature, dtype: float64


## Which city had the highest daily temperature variation in a certain period of time?

In [12]:
# Add a column with the calendar day of each observation
# (the timestamps are UTC, so days are split at UTC midnight)
df['data'] = df['observation_ts'].dt.date

# Minimum and maximum temperature for every (city, day) pair
daily_variation = df.groupby(['city', 'data'])['temperature'].agg(['min', 'max'])

# Daily variation is the spread between the day's maximum and minimum;
# a day with a single observation therefore shows a variation of 0
daily_variation['variation'] = daily_variation['max'] - daily_variation['min']

# daily_variation already has exactly one row per (city, day), so grouping again
# by the same keys adds nothing: sort the variations directly, largest first
print(daily_variation['variation'].sort_values(ascending=False))


city      data      
Cagliari  2024-10-02    11.44
Milano    2024-10-09    10.62
Cagliari  2024-10-06    10.48
          2024-10-12    10.32
          2024-10-11     9.56
                        ...  
Milano    2024-10-19     1.10
          2024-10-24     0.38
Cagliari  2024-09-30     0.00
Milano    2024-09-30     0.00
Bologna   2024-09-30     0.00
Name: variation, Length: 93, dtype: float64


## Which city had the strongest wind in a certain period of time?

In [13]:
# Strongest wind speed recorded for each city, highest first
print(
    df['wind_speed']
    .groupby(df['city'])
    .max()
    .sort_values(ascending=False)
)

city
Cagliari    11.83
Bologna      7.72
Milano       7.20
Name: wind_speed, dtype: float64
