In [11]:
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry
from datetime import datetime


In [12]:
# Setup cache and retry settings
cache_session = requests_cache.CachedSession(".cache", expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)


In [13]:
# Define parameters for API request with both hourly and daily variables
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": 48.8566,  # Paris latitude
    "longitude": 2.3522,  # Paris longitude
    "start_date": "2020-01-01",
    "end_date": datetime.now().strftime("%Y-%m-%d"),
    "hourly": (
        "temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,"
        "surface_pressure,precipitation,rain,snowfall,cloud_cover,"
        "cloud_cover_low,cloud_cover_mid,cloud_cover_high,shortwave_radiation,"
        "direct_radiation,direct_normal_irradiance,diffuse_radiation,wind_speed_10m,"
        "wind_direction_10m,wind_gusts_10m,et0_fao_evapotranspiration,"
        "weather_code,snow_depth,vapour_pressure_deficit,"
        "soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,"
        "soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,"
        "soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,"
        "soil_moisture_28_to_100cm,soil_moisture_100_to_255cm"
    ),
    "daily": (
        "temperature_2m_min,temperature_2m_max,precipitation_sum,"
        "rain_sum,snowfall_sum,precipitation_hours,sunshine_duration,"
        "daylight_duration,wind_speed_10m_max,wind_gusts_10m_max,"
        "shortwave_radiation_sum,et0_fao_evapotranspiration_sum"
    ),
    "timezone": "auto",
}


In [14]:
# Fetch data from API
responses = openmeteo.weather_api(url, params=params)
response = responses[0]  # First response for Paris

# Display metadata
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")


Coordinates 48.82249450683594°N 2.288135528564453°E
Elevation 36.0 m asl
Timezone b'Europe/Paris' b'CET'
Timezone difference to GMT+0 3600 s


In [15]:
# Process hourly data
hourly = response.Hourly()
hourly_data = {
    "date": pd.date_range(
        start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
        end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=hourly.Interval()),
        inclusive="left",
    )
}


In [16]:
# Organize all variables into a dictionary for DataFrame creation
hourly_data = {
    "date": pd.date_range(
        start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
        end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=hourly.Interval()),
        inclusive="left",
    ),
    "temperature_2m": hourly.Variables(0).ValuesAsNumpy(),
    "relative_humidity_2m": hourly.Variables(1).ValuesAsNumpy(),
    "dew_point_2m": hourly.Variables(2).ValuesAsNumpy(),
    "apparent_temperature": hourly.Variables(3).ValuesAsNumpy(),
    "surface_pressure": hourly.Variables(4).ValuesAsNumpy(),
    "precipitation": hourly.Variables(5).ValuesAsNumpy(),
    "rain": hourly.Variables(6).ValuesAsNumpy(),
    "snowfall": hourly.Variables(7).ValuesAsNumpy(),
    "cloud_cover": hourly.Variables(8).ValuesAsNumpy(),
    "cloud_cover_low": hourly.Variables(9).ValuesAsNumpy(),
    "cloud_cover_mid": hourly.Variables(10).ValuesAsNumpy(),
    "cloud_cover_high": hourly.Variables(11).ValuesAsNumpy(),
    "shortwave_radiation": hourly.Variables(12).ValuesAsNumpy(),
    "direct_radiation": hourly.Variables(13).ValuesAsNumpy(),
    "direct_normal_irradiance": hourly.Variables(14).ValuesAsNumpy(),
    "diffuse_radiation": hourly.Variables(15).ValuesAsNumpy(),
    "wind_speed_10m": hourly.Variables(16).ValuesAsNumpy(),
    "wind_direction_10m": hourly.Variables(17).ValuesAsNumpy(),
    "wind_gusts_10m": hourly.Variables(18).ValuesAsNumpy(),
    "et0_fao_evapotranspiration": hourly.Variables(19).ValuesAsNumpy(),
    "weather_code": hourly.Variables(20).ValuesAsNumpy(),
    "snow_depth": hourly.Variables(21).ValuesAsNumpy(),
    "vapour_pressure_deficit": hourly.Variables(22).ValuesAsNumpy(),
    "soil_temperature_0_to_7cm": hourly.Variables(23).ValuesAsNumpy(),
    "soil_temperature_7_to_28cm": hourly.Variables(24).ValuesAsNumpy(),
    "soil_temperature_28_to_100cm": hourly.Variables(25).ValuesAsNumpy(),
    "soil_temperature_100_to_255cm": hourly.Variables(26).ValuesAsNumpy(),
    "soil_moisture_0_to_7cm": hourly.Variables(27).ValuesAsNumpy(),
    "soil_moisture_7_to_28cm": hourly.Variables(28).ValuesAsNumpy(),
    "soil_moisture_28_to_100cm": hourly.Variables(29).ValuesAsNumpy(),
    "soil_moisture_100_to_255cm": hourly.Variables(30).ValuesAsNumpy(),
}

# Create a DataFrame
hourly_dataframe = pd.DataFrame(data=hourly_data)
print(hourly_dataframe)


                           date  temperature_2m  relative_humidity_2m  \
0     2019-12-31 23:00:00+00:00           -0.14             99.276306   
1     2020-01-01 00:00:00+00:00            2.61             98.239220   
2     2020-01-01 01:00:00+00:00            2.11             99.644173   
3     2020-01-01 02:00:00+00:00            1.91            100.000000   
4     2020-01-01 03:00:00+00:00            2.21             97.883568   
...                         ...             ...                   ...   
42355 2024-10-30 18:00:00+00:00             NaN                   NaN   
42356 2024-10-30 19:00:00+00:00             NaN                   NaN   
42357 2024-10-30 20:00:00+00:00             NaN                   NaN   
42358 2024-10-30 21:00:00+00:00             NaN                   NaN   
42359 2024-10-30 22:00:00+00:00             NaN                   NaN   

       dew_point_2m  apparent_temperature  surface_pressure  precipitation  \
0             -0.24             -2.673424    

In [17]:
hourly_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42360 entries, 0 to 42359
Data columns (total 32 columns):
 #   Column                         Non-Null Count  Dtype              
---  ------                         --------------  -----              
 0   date                           42360 non-null  datetime64[ns, UTC]
 1   temperature_2m                 42314 non-null  float32            
 2   relative_humidity_2m           42314 non-null  float32            
 3   dew_point_2m                   42314 non-null  float32            
 4   apparent_temperature           42314 non-null  float32            
 5   surface_pressure               42314 non-null  float32            
 6   precipitation                  42314 non-null  float32            
 7   rain                           42314 non-null  float32            
 8   snowfall                       42314 non-null  float32            
 9   cloud_cover                    42314 non-null  float32            
 10  cloud_cover_low       

In [18]:
# Process daily data
daily = response.Daily()
daily_data_length = len(daily.Variables(0).ValuesAsNumpy())
dates_daily = pd.date_range(
    start=pd.to_datetime(daily.Time(), unit="s", utc=True),
    periods=daily_data_length,
    freq="D",
)

# Define daily data dictionary
daily_data = {
    "date": dates_daily,
    "temperature_2m_min": daily.Variables(0).ValuesAsNumpy(),
    "temperature_2m_max": daily.Variables(1).ValuesAsNumpy(),
    "precipitation_sum": daily.Variables(2).ValuesAsNumpy(),
    "rain_sum": daily.Variables(3).ValuesAsNumpy(),
    "snowfall_sum": daily.Variables(4).ValuesAsNumpy(),
    "precipitation_hours": daily.Variables(5).ValuesAsNumpy(),
    "sunshine_duration": daily.Variables(6).ValuesAsNumpy(),
    "daylight_duration": daily.Variables(7).ValuesAsNumpy(),
    "wind_speed_10m_max": daily.Variables(8).ValuesAsNumpy(),
    "wind_gusts_10m_max": daily.Variables(9).ValuesAsNumpy(),
    "shortwave_radiation_sum": daily.Variables(10).ValuesAsNumpy(),
    "et0_fao_evapotranspiration_sum": daily.Variables(11).ValuesAsNumpy(),
}


In [19]:
# Create daily DataFrame
daily_dataframe = pd.DataFrame(data=daily_data)

# Check for missing values summary
print("Hourly Data - Missing values per column:\n", hourly_dataframe.isnull().sum())
print("Daily Data - Missing values per column:\n", daily_dataframe.isnull().sum())


Hourly Data - Missing values per column:
 date                               0
temperature_2m                    46
relative_humidity_2m              46
dew_point_2m                      46
apparent_temperature              46
surface_pressure                  46
precipitation                     46
rain                              46
snowfall                          46
cloud_cover                       46
cloud_cover_low                   46
cloud_cover_mid                   46
cloud_cover_high                  46
shortwave_radiation               46
direct_radiation                  46
direct_normal_irradiance          46
diffuse_radiation                 46
wind_speed_10m                    46
wind_direction_10m                46
wind_gusts_10m                    46
et0_fao_evapotranspiration        46
weather_code                      46
snow_depth                       623
vapour_pressure_deficit           46
soil_temperature_0_to_7cm         46
soil_temperature_7_to_28cm       

In [23]:
daily_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1765 entries, 0 to 1764
Data columns (total 13 columns):
 #   Column                          Non-Null Count  Dtype              
---  ------                          --------------  -----              
 0   date                            1765 non-null   datetime64[ns, UTC]
 1   temperature_2m_min              1764 non-null   float32            
 2   temperature_2m_max              1764 non-null   float32            
 3   precipitation_sum               1763 non-null   float32            
 4   rain_sum                        1763 non-null   float32            
 5   snowfall_sum                    1763 non-null   float32            
 6   precipitation_hours             1765 non-null   float32            
 7   sunshine_duration               1763 non-null   float32            
 8   daylight_duration               1765 non-null   float32            
 9   wind_speed_10m_max              1764 non-null   float32            
 10  wind_gusts_1

In [26]:
# Ensure `date` column in `daily_dataframe` is of datetime type and aligns at midnight
daily_dataframe["date"] = pd.to_datetime(daily_dataframe["date"]).dt.floor("D")


In [27]:
# Resample the daily data to hourly by repeating each daily value across all 24 hours
# for each day, then join with the hourly data
daily_expanded = daily_dataframe.set_index("date").resample("H").ffill().reset_index()


  daily_expanded = daily_dataframe.set_index("date").resample("H").ffill().reset_index()


In [28]:
# Merge the expanded daily data with hourly data on the `date` column
merged_dataframe = pd.merge(hourly_dataframe, daily_expanded, on="date", how="left")


In [32]:
merged_dataframe.dropna(inplace=True)

In [34]:
merged_dataframe["date"].min(), merged_dataframe["date"].max()

(Timestamp('2019-12-31 23:00:00+0000', tz='UTC'),
 Timestamp('2024-10-24 23:00:00+0000', tz='UTC'))

In [35]:
merged_dataframe.to_csv("../data/paris_weather_extensive.csv", index=False)

In [36]:
merged_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41737 entries, 0 to 42216
Data columns (total 44 columns):
 #   Column                          Non-Null Count  Dtype              
---  ------                          --------------  -----              
 0   date                            41737 non-null  datetime64[ns, UTC]
 1   temperature_2m                  41737 non-null  float32            
 2   relative_humidity_2m            41737 non-null  float32            
 3   dew_point_2m                    41737 non-null  float32            
 4   apparent_temperature            41737 non-null  float32            
 5   surface_pressure                41737 non-null  float32            
 6   precipitation                   41737 non-null  float32            
 7   rain                            41737 non-null  float32            
 8   snowfall                        41737 non-null  float32            
 9   cloud_cover                     41737 non-null  float32            
 10  cloud_cover_low