## COMP47670 Assignment 1

In [43]:
import os
import csv
import pandas
import json
import requests
import plotly.express as px
from requests.adapters import HTTPAdapter
from urllib3.util import Retry

### Formula 1 API

The API I have chosen to use for this project is: 
http://ergast.com/api/f1/
The API documentation is located here - https://ergast.com/mrd/

This API provides data about formula 1 since the 1950 season. 
The API does not require any form of authentication.

I also decided to use a json data set from https://openweathermap.org/api.

For historical data for 1 country from 1979 to now the price was $9 so I downloaded data for Monaco. 

For the graphs I am using the plotly library so the user of this notebook may need to pip install plotly in the anaconda python environment.

### Collecting the data

The functions below are:
* get_data_from_api
* read_json_file
* save_raw_data_to_file


They are used later in the notebook to call the Formula 1 API and to read and write the data from that API to files.
There is more information about the individual functions in their respective docstrings. 


In [44]:
def get_data_from_api(api_options='', api_limit='', offset=''):
    """
    This function calls the formula 1 api with
    options, a result limit and an offset passed as parameters.
    It decodes the JSON body of the response and returns it.

    The request url is built by plain concatenation of the base url,
    api_options, api_limit and offset (in that order), so the caller
    must include the '?' / '&' separators in the fragments.

    param: api_options -> string (the specific path on the api)
    param: api_limit -> string (query fragment for the result limit, e.g. '?limit=1000'. Max=1000)
    param: offset -> string (query fragment for paging, e.g. '&offset=1000')
    return: data -> dict, or None when the connection fails, the server
            answers with an error status or the body is not valid json
    """
    api_base_url = "http://ergast.com/api/f1/"
    api_url = api_base_url + api_options + api_limit + offset

    # Retry failed requests up to 3 times with an increasing delay between attempts
    retries = Retry(total=3, backoff_factor=1)

    # The context manager closes the session (and its pooled connections)
    # even when the request raises.
    with requests.Session() as session:
        session.mount(
            prefix='http://',
            adapter=HTTPAdapter(max_retries=retries))

        try:
            # NOTE(review): verify=False disables TLS certificate checks. It has no
            # effect on a plain http url but would apply if the server redirects
            # to https - confirm whether it is still needed.
            response = session.get(url=api_url, verify=False)
        except requests.RequestException:
            print("Could not connect to {}".format(api_url))
            return None

        # Error statuses (4xx / 5xx) do not carry usable data; report them
        # instead of returning None silently.
        if not response.ok:
            print("Request to {0} failed with status code {1}".format(
                api_url, response.status_code))
            return None

        try:
            # json.loads no longer accepts an 'encoding' argument (removed in
            # Python 3.9); response.text is already a decoded string.
            return json.loads(response.text)
        except ValueError as error:
            print("Could not load json due to: {}".format(error))
            return None

In [46]:
def read_json_file(file_path):
    """
    This function reads a UTF-8 encoded json file and returns its content.

    A missing file is reported with a printed message and None is returned.
    Malformed json is not handled here and raises the decoder's error.

    :param file_path -> String
    :return data -> dict (or whatever the json document contains), None if the file does not exist
    """
    try:
        # The encoding belongs on open(); json.load() no longer accepts an
        # 'encoding' argument (removed in Python 3.9).
        with open(file_path, encoding='UTF-8') as file:
            return json.load(file)
    except FileNotFoundError as error:
        print("Can not open file due to: {}".format(error))
        return None

In [45]:
def save_raw_data_to_file(data, destination_file):
    """
    Serialise data as json and write it to destination_file.
    The file is opened in write mode with UTF-8 encoding, so an
    existing file at that path is replaced.

    :param data -> dict (the json-serialisable content to store)
    :param destination_file -> string (path of the file to write)
    """
    output_handle = open(destination_file, 'w', encoding='UTF-8')
    try:
        json.dump(obj=data, fp=output_handle)
    finally:
        # Always release the file handle, even if serialisation fails
        output_handle.close()

### Parsing and cleaning the data

The functions below are:
* get_finishers_data
* scrape_all_finishers_data

These functions are used to parse, clean and transform the data pulled from the api.

In get_finishers_data most statements are wrapped in try except blocks.
If a key or value does not exist then we append None instead. This keeps the lists in race_data aligned (every list gets exactly one entry per result) and lets us filter out rows with null values later.

For scrape_all_finishers_data, the Formula 1 API allows a maximum of 1000 results to be returned from one call. To pull data for all races since 1950, restricted to race finishers, we need to call the API 7 times as there are close to 7000 results. 

There is more information about the individual functions in their respective docstrings.

In [47]:
def _get_nested(record, keys, convert=None):
    """
    Follow keys through nested dictionaries/lists starting at record and
    return the value found, optionally passed through convert.

    Returns None when a key is missing, an intermediate value cannot be
    indexed, or the conversion fails.
    """
    value = record
    try:
        for key in keys:
            value = value[key]
        if convert is not None:
            value = convert(value)
    except (KeyError, IndexError, TypeError, ValueError):
        return None
    return value


def get_finishers_data(data):
    """
    This function parses the raw data pulled from the api
    into relevant race data that can be further analysed.

    Every result of every race contributes exactly one entry to every
    list of the returned dictionary. A value that is missing or cannot be
    converted is stored as None, so all lists always have the same length
    and the dictionary can be passed straight to pandas.DataFrame.

    param: data -> dict (decoded api response, expected to hold the races
                   at data["MRData"]["RaceTable"]["Races"])
    return: race_data -> dict of lists, or None when the list of races
            cannot be found in data (including data being None)
    """
    # creates a dictionary compatible with a pandas dataframe
    race_data = {
        "Date": [],
        "Season": [],
        "Race": [],
        "Laps": [],
        "Constructor": [],
        "Constructor_Nationality": [],
        "Driver": [],
        "Driver_Nationality": [],
        "Position" : [],
        "Status" : [],
        "Time": [],
        "Time_mins": []
    }

    # Race data exists within a list in the json file.
    # TypeError covers data being None, which is what the api helper
    # returns when a request fails.
    try:
        races = data["MRData"]["RaceTable"]["Races"]
    except (KeyError, ValueError, TypeError) as error:
        print("Could not parse race data due to: {}".format(error))
        return None

    # One race is a dictionary of data for an individual race
    for race in races:
        # These values live on the race itself; they are repeated for
        # every result of that race.
        race_date = _get_nested(race, ("date",))
        season = _get_nested(race, ("season",), int)
        race_name = _get_nested(race, ("raceName",))

        # Results for the different drivers are in a list inside the race.
        # A race without results simply contributes no rows.
        for result in _get_nested(race, ("Results",)) or []:
            given_name = _get_nested(result, ("Driver", "givenName"))
            family_name = _get_nested(result, ("Driver", "familyName"))
            if given_name is None or family_name is None:
                driver = None
            else:
                driver = "{0} {1}".format(given_name, family_name)

            race_data["Date"].append(race_date)
            race_data["Season"].append(season)
            race_data["Race"].append(race_name)
            race_data["Laps"].append(_get_nested(result, ("laps",), int))
            race_data["Constructor"].append(
                _get_nested(result, ("Constructor", "name")))
            race_data["Constructor_Nationality"].append(
                _get_nested(result, ("Constructor", "nationality")))
            race_data["Driver"].append(driver)
            race_data["Driver_Nationality"].append(
                _get_nested(result, ("Driver", "nationality")))
            # A missing position is recorded in the Position list itself so
            # the columns stay aligned.
            race_data["Position"].append(_get_nested(result, ("position",)))
            race_data["Status"].append(_get_nested(result, ("status",)))
            race_data["Time"].append(_get_nested(result, ("Time", "time")))
            # The api gives the finish time in milliseconds; store minutes
            race_data["Time_mins"].append(
                _get_nested(result, ("Time", "millis"),
                            lambda millis: int(millis) / 60000))

    # return the race_data after all values have been appended.
    return race_data

In [48]:
def scrape_all_finishers_data(offset_max, offset_min=0, limit=1000):
    """
    This method will make calls to the api
    multiple times to get data on each page
    about races.
    The max results returned by the api in one
    call is 1000.
    This particular call will only show data for
    drivers that finished the races.

    One request is made for every offset from offset_min up to and
    including offset_max, moving forward by limit each time so that
    consecutive pages neither overlap nor leave gaps. Pages that could not
    be downloaded or parsed are reported and skipped.

    param: offset_max -> int (offset of the last page to request, 1000 is the 2nd page, 2000 is the 3rd and so on)
    param: offset_min -> int (offset of the first page to request, default 0 for first page)
    param: limit -> int (results per call, default 1000 as this is the max results the api will return in one call)
    return: finishers_race_df -> Dataframe with one row per result and a
            continuous index over all pages (empty if no page could be loaded)
    raises: ValueError if limit is not positive
    """
    if limit <= 0:
        # A non-positive step would never reach offset_max
        raise ValueError("limit must be a positive integer")

    # Collect one DataFrame per page and join them once at the end.
    # DataFrame.append was removed in pandas 2.0 and re-copied all rows
    # on every iteration.
    page_frames = []

    for offset in range(offset_min, offset_max + 1, limit):
        f1_finishers = get_data_from_api(
            api_options='status/1/results.json',
            api_limit='?limit={0}'.format(limit),
            offset='&offset={0}'.format(offset))

        if f1_finishers is None:
            print("Skipping offset {0}: no data returned by the api".format(offset))
            continue

        # parse the data using the get_finishers_data function
        finishers_race_data = get_finishers_data(f1_finishers)
        if finishers_race_data is None:
            print("Skipping offset {0}: response could not be parsed".format(offset))
            continue

        page_frames.append(pandas.DataFrame(finishers_race_data))

    # pandas.concat raises on an empty list, so handle "nothing loaded" explicitly
    if not page_frames:
        return pandas.DataFrame()

    # ignore_index gives every row a unique label instead of restarting
    # the index at 0 for each page
    return pandas.concat(page_frames, ignore_index=True)



Unnamed: 0,Date,Season,Race,Laps,Constructor,Constructor_Nationality,Driver,Driver_Nationality,Position,Status,Time,Time_mins
0,1950-05-13,1950,British Grand Prix,70,Alfa Romeo,Italian,Nino Farina,Italian,1,Finished,2:13:23.6,133.393333
1,1950-05-13,1950,British Grand Prix,70,Alfa Romeo,Italian,Luigi Fagioli,Italian,2,Finished,+2.6,133.436667
2,1950-05-13,1950,British Grand Prix,70,Alfa Romeo,Italian,Reg Parnell,British,3,Finished,+52.0,134.26
3,1950-05-21,1950,Monaco Grand Prix,100,Alfa Romeo,Italian,Juan Fangio,Argentine,1,Finished,3:13:18.7,193.311667
4,1950-05-30,1950,Indianapolis 500,138,Kurtis Kraft,American,Johnnie Parsons,American,1,Finished,2:46:55.97,166.932833


## Using the functions and analysing the data

In the cells below we use most of the functions defined above to call the api, create DataFrames and create graphs from those DataFrames to show some interesting information about the data. 

In [None]:
# Calling the api here to get a sample of the 1st 1000 results of all races.
# This api call will show data for drivers that finished races only.
sample_data = get_data_from_api(
    api_options='status/1/results.json',
    api_limit='?limit=1000')

# Save the sample data to a file.
# File names are relative to the notebook's working directory so the
# notebook runs on any machine, not only under one user's home folder.
# get_data_from_api returns None when the call fails; in that case there is
# nothing worth saving.
if sample_data is not None:
    save_raw_data_to_file(sample_data, "formula1_sample_finishers_race_data.json")

# Call scrape_all_finishers_data
finishers_race_df = scrape_all_finishers_data(offset_max=6000)

# Write the data to a csv file
finishers_race_df.to_csv("formula1_finishers_race_data.csv")

# Show an example of the dataframe
finishers_race_df.head()

I wanted to see if race times were getting faster over time, so I decided to compare race finishers' times for
single races through the seasons. 

Using the query function of pandas DataFrame we can create another DataFrame from our main finishers_race_df
to show a subset of columns and query for a specific race only. 

Below I have created new DataFrames for the Monaco Grand Prix and the British Grand Prix.
I use plotly express scatter graphs to show the data over time. 

In [49]:
# Keep only the columns needed for the analysis, restrict the rows to the
# Monaco Grand Prix and order them from the longest to the shortest finish time
monaco_finishers = (
    finishers_race_df[
        ["Date", "Season", "Race", "Driver", "Laps", "Position", "Time_mins"]
    ]
    .query("Race == 'Monaco Grand Prix'")
    .sort_values(by="Time_mins", ascending=False)
)

# Preview the first five rows of the new DataFrame
monaco_finishers.head()

Unnamed: 0,Date,Season,Race,Driver,Laps,Position,Time_mins
3,1950-05-21,1950,Monaco Grand Prix,Juan Fangio,100,1,193.311667
212,1957-05-19,1957,Monaco Grand Prix,Tony Brooks,105,2,190.633333
211,1957-05-19,1957,Monaco Grand Prix,Juan Fangio,105,1,190.213333
183,1956-05-13,1956,Monaco Grand Prix,Peter Collins,100,2,180.65
184,1956-05-13,1956,Monaco Grand Prix,Juan Fangio,100,2,180.65


In [61]:
# Keep only the columns needed for the analysis, restrict the rows to the
# British Grand Prix and order them from the longest to the shortest finish time
uk_finishers = (
    finishers_race_df[
        ["Date", "Season", "Race", "Driver", "Laps", "Position", "Time_mins"]
    ]
    .query("Race == 'British Grand Prix'")
    .sort_values(by="Time_mins", ascending=False)
)

# Preview the first five rows of the new DataFrame
uk_finishers.head()

Unnamed: 0,Date,Season,Race,Driver,Laps,Position,Time_mins
174,1955-07-16,1955,British Grand Prix,Karl Kling,90,3,188.55
173,1955-07-16,1955,British Grand Prix,Juan Fangio,90,2,187.356667
172,1955-07-16,1955,British Grand Prix,Stirling Moss,90,1,187.353333
230,1957-07-20,1957,British Grand Prix,Mike Hawthorn,90,3,187.343333
229,1957-07-20,1957,British Grand Prix,Luigi Musso,90,2,187.056667


These plots show Formula1 finishers times at the Monaco Grand Prix and at the British Grand Prix from 1950 to 2020

With plotly we can add hover data so we see the specified columns and values when we hover over
one of the points.
We can also specify a color map based on an interesting column for example below I chose Laps.

We can see that the earlier races had higher finish times, but the yellow color in Monaco's graph shows that they also had more laps, which explains some of the time difference with later races.

The takeaway from these graphs is that, based on the trendline, there is a slight downward trend in race finish times in modern races since about 1990. These races have a similar number of laps, so they are comparable. 
The faster finish times could be due to advances in car technology and aerodynamic science.

In [51]:
# Finish time per season at Monaco, coloured by the number of laps,
# with a lowess trendline drawn in turquoise
fig1 = px.scatter(
    data_frame=monaco_finishers,
    x='Season',
    y='Time_mins',
    color="Laps",
    hover_name='Driver',
    hover_data=["Position"],
    trendline="lowess",
    trendline_color_override="turquoise",
    title="Formula1 finisher's times at the Monaco Grand Prix",
)
fig1.show()

In [62]:
# Finish time per season at the British Grand Prix, coloured by the number
# of laps, with a lowess trendline drawn in turquoise
fig2 = px.scatter(
    data_frame=uk_finishers,
    x='Season',
    y='Time_mins',
    color="Laps",
    hover_name='Driver',
    hover_data=["Position"],
    trendline="lowess",
    trendline_color_override="turquoise",
    title="Formula1 finisher's times at the British Grand Prix",
)
fig2.show()

During the analysis of the above graphs I noticed some strange outliers for example the 1984 Monaco Grand Prix.

After reading up on the race it seems the race was stopped on the 31st lap due to extremely heavy rain. 

This got me wondering how much weather actually affects formula1 races.

As mentioned above, I downloaded json data for historical weather data for the city of Monaco from 1979 to 2018 
from https://openweathermap.org/api

I want to take a further look at the weather data and race data together in a graph for Monaco.

The function below is get_weather_data
We want to be able to merge this data with the race data for monaco using a common key in 2 DataFrames. 
This common key should be Date. 
The weather data holds a date time value at the key "dt_iso"

I chose to check the weather at the start of the race which is 3PM UTC. 
So whatever the value for the key "weather" was at 3PM UTC, that is the value I use. 

In [53]:
def get_weather_data(data, time_of_day="15:00:00"):
    """
    This function picks one weather observation per day out of the raw
    weather records and returns them as a DataFrame.

    Each record is expected to hold a timestamp string at "dt_iso" whose
    first two whitespace separated parts are the date and the time, and the
    weather description at record["weather"][0]["main"]. Only records whose
    time part equals time_of_day are kept. Records that do not have this
    shape are skipped.

    param: data -> list of dict (raw weather records), None is treated as no records
    param: time_of_day -> string (time part to select, default "15:00:00")
    return: DataFrame with the columns "Date" and "Weather"
    """
    # create a dictionary compatible with a pandas DataFrame
    weather_data = {
        "Date" : [],
        "Weather" : []
    }

    # "data or []" lets a failed file read (None) produce an empty DataFrame
    # that still has the expected columns.
    for observation in data or []:
        try:
            date_part, time_part = observation["dt_iso"].split()[:2]
            if time_part != time_of_day:
                continue
            condition = observation["weather"][0]["main"]
        except (KeyError, IndexError, TypeError, ValueError, AttributeError):
            # incomplete or malformed record: nothing usable to add
            continue

        # append both values together so the two lists stay aligned
        weather_data["Date"].append(date_part)
        weather_data["Weather"].append(condition)

    return pandas.DataFrame(weather_data)

# The weather export is read from the notebook's working directory instead
# of an absolute path inside one user's home folder, so the notebook can be
# run on any machine that has the file next to it.
weather_file = read_json_file("weather_format.json")

# Reduce the raw records to one weather value per day
weather_data = get_weather_data(weather_file)

# Show an example of the DataFrame
weather_data.head()

Unnamed: 0,Date,Weather
0,1979-01-01,Clear
1,1979-01-02,Snow
2,1979-01-03,Clouds
3,1979-01-04,Clouds
4,1979-01-05,Clouds


In [54]:
# Join the Monaco race results with the weather on the race date.
# The inner join keeps only races for which weather data exists.
result = (
    pandas.merge(monaco_finishers, weather_data, how='inner', on=['Date'])
    .sort_values(by="Season", ascending=True)
    .reset_index(drop=True)
)

# Written relative to the notebook's working directory rather than to an
# absolute path inside one user's home folder.
result.to_csv("merged_data.csv")

# Show an example of the merged DataFrame
result.head()

Unnamed: 0,Date,Season,Race,Driver,Laps,Position,Time_mins,Weather
0,1979-05-27,1979,Monaco Grand Prix,Clay Regazzoni,76,2,115.382,Clouds
1,1979-05-27,1979,Monaco Grand Prix,Jody Scheckter,76,1,115.374667,Clouds
2,1979-05-27,1979,Monaco Grand Prix,Carlos Reutemann,76,3,115.5175,Clouds
3,1979-05-27,1979,Monaco Grand Prix,John Watson,76,4,116.063167,Clouds
4,1980-05-18,1980,Monaco Grand Prix,Carlos Reutemann,76,1,115.57275,Clouds


In [55]:
# One panel per weather condition. The data holds every finisher of the
# race (not only the winner), so the title says "finisher's".
fig4 = px.scatter(
    result,
    x='Season',
    y='Time_mins',
    title="Formula1 finisher's times at the Monaco Grand Prix",
    hover_name='Driver',
    color="Laps",
    facet_col="Weather",
)
# A grey background keeps the light end of the color scale visible
fig4.update_layout(plot_bgcolor='rgb(185,185,185)')
fig4.show()

In [56]:
# Request the first 1000 results with status 4 from the api
collision_data = get_data_from_api(
    api_options='status/4/results.json',
    api_limit='?limit=1000')

# One relative path, used for both writing and reading, replaces the
# absolute path inside one user's home folder that was repeated twice.
collision_data_path = "formula1_collision_data.json"
save_raw_data_to_file(collision_data, collision_data_path)

# Parse the saved raw data with the same parser used for the finishers
collisions = get_finishers_data(read_json_file(collision_data_path))

collisions_df = pandas.DataFrame(collisions)

# Count the rows per race
race_group_df = collisions_df.groupby("Race").size().reset_index(name='Collisions')

race_group_df.head()

Unnamed: 0,Race,Collisions
0,Abu Dhabi Grand Prix,9
1,Argentine Grand Prix,21
2,Australian Grand Prix,64
3,Austrian Grand Prix,27
4,Azerbaijan Grand Prix,6


In [57]:
# A title lets the figure stand on its own when the notebook is skimmed
fig = px.bar(
    race_group_df,
    x='Race',
    y='Collisions',
    title="Formula1 collisions per Grand Prix",
)
fig.show()

In [58]:
# Keep only the columns needed for the analysis, restrict the rows to the
# Monaco Grand Prix and order them by finish time, longest first
monaco_collisions = (
    collisions_df[
        ["Date", "Season", "Race", "Driver", "Laps", "Position", "Time_mins"]
    ]
    .query("Race == 'Monaco Grand Prix'")
    .sort_values(by="Time_mins", ascending=False)
)

# Attach the weather of the race day; the inner join keeps only rows
# whose date is present in both DataFrames
collisions_weather_data = pandas.merge(
    monaco_collisions,
    weather_data,
    on=['Date'],
    how='inner',
)

collisions_weather_data.head()

Unnamed: 0,Date,Season,Race,Driver,Laps,Position,Time_mins,Weather
0,1981-05-31,1981,Monaco Grand Prix,Michele Alboreto,50,10,,Clouds
1,1981-05-31,1981,Monaco Grand Prix,Bruno Giacomelli,50,11,,Clouds
2,1981-05-31,1981,Monaco Grand Prix,Andrea de Cesaris,0,19,,Clouds
3,1981-05-31,1981,Monaco Grand Prix,Mario Andretti,0,20,,Clouds
4,1982-05-23,1982,Monaco Grand Prix,Keke Rosberg,64,11,,Clouds


In [59]:
# Count the rows per weather condition
weather_group_df = (
    collisions_weather_data
    .groupby("Weather")
    .size()
    .reset_index(name='Collisions')
)

In [60]:
# A title lets the figure stand on its own when the notebook is skimmed
fig6 = px.bar(
    weather_group_df,
    x='Weather',
    y='Collisions',
    title="Formula1 collisions at the Monaco Grand Prix by weather",
)
fig6.show()