# Weather data set creation
This notebook uses the Open-Meteo API to get historical weather data for the days and times of F1 races from 2018 to 2023.<br>
openmeteo historical weather API docs: https://open-meteo.com/en/docs/historical-weather-api

## Imports

In [None]:
try:
  from google.colab import drive
  import pandas as pd
  import numpy as np
  import os
  import seaborn as sns
  import matplotlib.pyplot as plt
  import matplotlib.ticker as plticker
  import openmeteo_requests
  import requests_cache
  from retry_requests import retry
  from pathlib import Path
except ModuleNotFoundError:
  !pip install openmeteo_requests
  !pip install requests_cache
  !pip install retry_requests
  !pip install pathlib
  import openmeteo_requests
  import requests_cache
  from retry_requests import retry
  from pathlib import Path

Collecting openmeteo_requests
  Downloading openmeteo_requests-1.1.0-py3-none-any.whl (5.5 kB)
Collecting openmeteo-sdk>=1.4.0 (from openmeteo_requests)
  Downloading openmeteo_sdk-1.7.0-py3-none-any.whl (12 kB)
Installing collected packages: openmeteo-sdk, openmeteo_requests
Successfully installed openmeteo-sdk-1.7.0 openmeteo_requests-1.1.0
Collecting requests_cache
  Downloading requests_cache-1.1.1-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.3/60.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting cattrs>=22.2 (from requests_cache)
  Downloading cattrs-23.2.3-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.5/57.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting url-normalize>=1.4 (from requests_cache)
  Downloading url_normalize-1.4.3-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: url-normalize, cattrs, requests_cache
Successfully installed cattrs-23.2.3 requests

In [None]:
# Mount Google Drive at /content/drive; later cells read from and copy files to paths under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Lift the row limit so displayed DataFrames are never truncated.
pd.options.display.max_rows = None

## Read in race dataset from parquet file

The race dataset is used to identify the location based on latitude and longitude of the race circuit so we can get the weather conditions at that specific place. We will also use the race time in UTC and capture the weather from the race time up to 4 hours after that for our analysis.

In [None]:
# Load the race dataset from the shared project folder on the mounted Drive.
race_df = pd.read_parquet('/content/drive/My Drive/200_Final_Project/ColabSharedFolder/race_df.parquet')

In [None]:
# Race columns needed to build one weather request per race:
# circuit identity, coordinates, and the race date/time.
cols_for_api = [
    'circuit_name',
    'location',
    'country',
    'lat',
    'lng',
    'race_date',
    'race_time_utc',
]

In [None]:
# Collapse the race dataset to one row per race_id. `first()` keeps the first
# non-null value of each selected column within a race; `reset_index()` turns
# race_id back into an ordinary column.
races_by_id = race_df.groupby('race_id')
first_per_race = races_by_id[cols_for_api].first()
weather_loc_df = first_per_race.reset_index()

In [None]:
# Preview the first rows of the per-race location table.
weather_loc_df.head()

Unnamed: 0,race_id,circuit_name,location,country,lat,lng,race_date,race_time_utc
0,989,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,2018-03-25,05:10:00
1,990,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,2018-04-08,15:10:00
2,991,Shanghai International Circuit,Shanghai,China,31.3389,121.22,2018-04-15,06:10:00
3,992,Baku City Circuit,Baku,Azerbaijan,40.3725,49.8533,2018-04-29,12:10:00
4,993,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,2018-05-13,13:10:00


## Get Data from Weather API
The code below outputs a dataframe weather_df that has the weather variables for the hours of the F1 race.<br>
The weather variables are:<br>
1. Temperature in Fahrenheit<br>
2. Precipitation in Inches<br>
3. Wind Speed in mph<br>
4. Wind Direction<br>

In [None]:
# Setup the Open-Meteo API client with cache and retry on error.
# expire_after = -1 tells requests_cache to keep cached responses indefinitely,
# so re-running this cell does not re-download data for races already fetched.
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

url = "https://archive-api.open-meteo.com/v1/archive"

# Make sure all required weather variables are listed here.
# The same list drives both the request and the column assignment below, so
# the position of each variable in the response always matches its column.
hourly_variables = ["temperature_2m", "precipitation", "wind_speed_10m", "wind_direction_10m"]
weather_columns = ["date"] + hourly_variables + ["race_id", "hour"]

# Parse every race start time once, up front, instead of once per iteration.
race_start_hours = pd.to_datetime(weather_loc_df["race_time_utc"]).dt.hour

# Collect one filtered frame per race and concatenate once at the end;
# concatenating inside the loop copies the accumulated frame on every pass.
race_frames = []
for race, start_hour in zip(weather_loc_df.itertuples(index = False), race_start_hours):
    params = {
        "latitude": race.lat,
        "longitude": race.lng,
        "start_date": race.race_date,
        "end_date": race.race_date,
        "hourly": hourly_variables,
        "temperature_unit": "fahrenheit",
        "wind_speed_unit": "mph",
        "precipitation_unit": "inch",
        "timezone": "GMT"
    }
    responses = openmeteo.weather_api(url, params = params)

    # A single location is requested, so only the first response is used.
    response = responses[0]

    # Process hourly data. Variables come back in the order they were requested.
    hourly = response.Hourly()
    hourly_data = {"date": pd.date_range(
        start = pd.to_datetime(hourly.Time(), unit = "s"),
        end = pd.to_datetime(hourly.TimeEnd(), unit = "s"),
        freq = pd.Timedelta(seconds = hourly.Interval()),
        inclusive = "left"
    )}
    for position, variable in enumerate(hourly_variables):
        hourly_data[variable] = hourly.Variables(position).ValuesAsNumpy()

    hourly_dataframe = pd.DataFrame(data = hourly_data)
    # Add race_id to weather dataframe
    hourly_dataframe['race_id'] = race.race_id

    # Only keep the four hourly rows from the race's start hour onwards
    # (start_hour, start_hour + 1, start_hour + 2, start_hour + 3).
    # NOTE: only the race date itself is requested, so a race starting at
    # 21:00 or later would get fewer than four rows (the window stops at 23:00).
    hourly_dataframe['hour'] = hourly_dataframe['date'].dt.hour
    in_race_window = hourly_dataframe['hour'].between(start_hour, start_hour + 3)
    race_frames.append(hourly_dataframe[in_race_window])

# Combine all races. The original per-race row index is kept on purpose.
# Fall back to an empty, correctly-labelled frame when there are no races,
# because pd.concat raises on an empty list.
if race_frames:
    weather_df = pd.concat(race_frames)
else:
    weather_df = pd.DataFrame(columns = weather_columns)

In [None]:
# Target path for a CSV copy of the weather data; make sure its parent folder exists on Drive.
# NOTE(review): `filepath` is not referenced by any later cell visible here and no CSV is
# written - confirm whether a `weather_df.to_csv(filepath)` call is missing. The folder name
# '200: Final Project' also differs from the '200_Final_Project' folder used by other cells.
filepath = Path('/content/drive/My Drive/200: Final Project/weather_dataset.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)

In [None]:
# Preview the first rows of the hourly weather table.
weather_df.head()

Unnamed: 0,date,temperature_2m,precipitation,wind_speed_10m,wind_direction_10m,race_id,hour
5,2018-03-25 05:00:00,74.225296,0.0,17.609894,295.58847,989,5
6,2018-03-25 06:00:00,73.145302,0.0,16.783464,291.092407,989,6
7,2018-03-25 07:00:00,70.985306,0.0,17.571489,301.464081,989,7
8,2018-03-25 08:00:00,68.555298,0.0,13.24372,322.549347,989,8
15,2018-04-08 15:00:00,81.347,0.0,9.511855,48.814175,990,15


In [None]:
# Replace the per-race row labels left over from filtering with a fresh 0..n-1 index,
# discarding the old labels instead of keeping them as a column.
weather_data = weather_df.reset_index(drop=True)

## Define the criteria for each abnormal weather condition
A wet condition is defined as precipitation greater than 0 inches. A windy condition is defined as a wind speed (measured at 10 m above ground) of more than 25 mph. A cold condition is defined as a temperature below the 10th percentile of all data points, and a hot condition is defined as a temperature above the 80th percentile (i.e. the hottest 20%) of all data points.

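Each hourly observation is first flagged with a 0/1 value per weather condition. The output dataframe then contains, for each race, the number of hourly observations in which each condition occurred. The higher the value, the more persistent that condition was over the 4 hours starting from the race start time.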

In [None]:
# Temperature cut-offs are taken over every hourly observation of every race:
# "cold" is below the 10th percentile, "hot" is above the 80th percentile.
temperature = weather_data['temperature_2m']
cold_cutoff = temperature.quantile(0.1)
hot_cutoff = temperature.quantile(0.8)

# Flag each hourly observation with 1 when the condition holds, 0 otherwise.
weather_data['wet_conditions'] = weather_data['precipitation'].gt(0).astype(int)
weather_data['windy_conditions'] = weather_data['wind_speed_10m'].gt(25).astype(int)
weather_data['cold_conditions'] = temperature.lt(cold_cutoff).astype(int)
weather_data['hot_conditions'] = temperature.gt(hot_cutoff).astype(int)

In [None]:
# Count, per race, how many hourly observations showed each condition.
condition_columns = ['wet_conditions', 'cold_conditions', 'hot_conditions', 'windy_conditions']
hourly_flags_by_race = weather_data.groupby('race_id')[condition_columns]
summarized_weather = hourly_flags_by_race.sum()

In [None]:
# Preview the per-race condition counts.
summarized_weather.head()

Unnamed: 0_level_0,wet_conditions,cold_conditions,hot_conditions,windy_conditions
race_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
989,0,0,0,0
990,0,0,1,0
991,0,0,0,0
992,0,4,0,2
993,3,4,0,0


## Save dataframe as a parquet file to be read in another Colab for analysis

In [None]:
# Write the per-race summary to a Brotli-compressed parquet file in the current working directory.
summarized_weather.to_parquet('summarized_weather.parquet', compression = 'BROTLI')

In [None]:
# Copy the parquet file into the shared project folder on Drive (shell command via IPython `!`).
!cp summarized_weather.parquet /content/drive/MyDrive/200_Final_Project/ColabSharedFolder/