In [None]:
import pandas as pd
import json

#### Loading data

In [None]:
# Folder holding the raw CSV exports; the trailing slash is required because
# file names are appended to it directly.
Path = "data/"

# One DataFrame per CSV table.
circuits = pd.read_csv(f"{Path}circuits.csv")

constructor_standings = pd.read_csv(f"{Path}constructor_standings.csv")
constructors = pd.read_csv(f"{Path}constructors.csv")
driver_standings = pd.read_csv(f"{Path}driver_standings.csv")
drivers = pd.read_csv(f"{Path}drivers.csv")
lap_times = pd.read_csv(f"{Path}lap_times.csv")
pit_stops = pd.read_csv(f"{Path}pit_stops.csv")
#qualifying = pd.read_csv(Path + "qualifying.csv")
races = pd.read_csv(f"{Path}races.csv")
results = pd.read_csv(f"{Path}results.csv")
#sprint_results = pd.read_csv(Path + "sprint_results.csv")
status = pd.read_csv(f"{Path}status.csv")

In [None]:
# Keep only the columns that the rest of the notebook reads.
# NOTE: this cell overwrites the loaded frames, so it must be run exactly once
# after the loading cell (re-running it fails because 'forename'/'surname' are gone).
circuits = circuits[["circuitId","name","country","location","lat","lng","alt"]]
constructor_standings = constructor_standings[["raceId","constructorId","points","position","wins"]]
constructors = constructors[["constructorId","name"]]
driver_standings = driver_standings[["raceId","driverId","points","position","wins"]]

# Build the full display name before trimming, since forename/surname are dropped below.
name = drivers['forename'] + ' ' + drivers['surname']
# .copy() makes the trimmed frame independent of the original, so adding a
# column does not trigger pandas' SettingWithCopyWarning.
drivers = drivers[["driverId","number","code"]].copy()
drivers["name"] = name

races = races[["raceId","circuitId","year","date","name"]]
results = results[["raceId","driverId","constructorId","position","points"]]

### Data processing for drivers

#### driver_teammates
Generate a dictionary (JSON file) where every driver has a list of drivers they raced with (raced for the same constructor during the same year).

In [None]:
# Attach the season (year) and the driver name to every race result.
merged = (
    results[['raceId', 'driverId', 'constructorId']]
    .merge(races[['raceId', 'year']], on='raceId', how='inner')
    .merge(drivers[['driverId', 'name']], on='driverId', how='inner')
)
# One row per (driver, constructor, year), then collect the driver names of
# each (constructorId, year) pair into a list: {(constructorId, year): [names]}.
season_lineups = merged[['name', 'constructorId', 'year']].drop_duplicates()
constructor_list = season_lineups.groupby(['constructorId', 'year'])['name'].apply(list).to_dict()

In [None]:
# Create the driver dictionary: {driver name: [names of every teammate]}.
# Two drivers can share a constructor over several years; each teammate is
# recorded only once, in the order first encountered.
driver_list = {}
for names in constructor_list.values():
    for name in names:
        teammates = driver_list.setdefault(name, [])
        for other in names:
            if other != name and other not in teammates:
                teammates.append(other)
driver_list

In [None]:
# Format: turn the {name: teammates} mapping into a list of
# {'id': name, 'data': teammates} records, the layout written to the JSON file.
driver_teammates = [
    {'id': driver_name, 'data': mates}
    for driver_name, mates in driver_list.items()
]

print(driver_teammates)

# Persist the records for the website.
file_path = 'website/src/data/driver_teammates.json'
with open(file_path, 'w') as out_file:
    json.dump(driver_teammates, out_file)

#### data_points
Generate a dictionary (JSON file) giving, for each year, every driver's cumulative points over the course of that year.

In [None]:
# Join results with their race (year, date, race name) and the driver name.
# The race's 'name' column is renamed up front so it cannot clash with the
# driver's 'name' column added by the second join.
race_points = results[['raceId', 'driverId', 'points']].merge(
    races[['raceId', 'year', 'date', 'name']].rename(columns={'name': 'race_name'}),
    on='raceId',
    how='inner',
)
merged = (
    race_points
    .merge(drivers[['driverId', 'name']], on='driverId', how='inner')
    [['year', 'name', 'date', 'race_name', 'points']]
    .set_index('year')
)

#merged[merged['year'] == 2021]
# Spot check: rows of the 2021 Qatar Grand Prix.
is_qatar_gp = merged['race_name'] == 'Qatar Grand Prix'
merged[is_qatar_gp].loc[2021]

In [None]:
# Format: for every year, one entry per driver holding the running total of
# points after each race: [{'year': y, 'year_data': [{'id': name, 'data': [{'x': date, 'y': total}]}]}]
formatted_data = []

# Accumulate in calendar order: the CSV row order is not guaranteed to follow
# the race dates, and a running total is only meaningful chronologically.
ordered = (
    merged
    .assign(_when=pd.to_datetime(merged['date']))
    .sort_values('_when', kind='stable')
)

# Group by a scalar key (not a one-element list): with a list, recent pandas
# versions yield 1-tuples as group keys, which breaks int(year) and would
# serialise the driver id as a JSON array.
for year, group in ordered.groupby('year'):
    year_data = []
    for name, name_data in group.groupby('name'):
        # Keep fractional points instead of truncating each race to an int;
        # rounding only removes floating-point noise from the summation.
        totals = name_data['points'].astype(float).cumsum().round(2).tolist()
        data = [
            {'x': date, 'y': int(total) if total.is_integer() else total}
            for date, total in zip(name_data['date'], totals)
        ]
        year_data.append({'id': name, 'data': data})

    formatted_data.append({'year': int(year), 'year_data': year_data})

print(formatted_data)

file_path = 'website/src/data/data_points.json'
with open(file_path, 'w') as file:
    json.dump(formatted_data, file)

#### data_rank
Generate a dictionary (JSON file) where every driver has their rank for each year

In [None]:
# Merge files to get relevant information; the race date is needed to find
# each driver's last standings entry of the year.
merged = pd.merge(driver_standings[['raceId','driverId','position']], races[['raceId','year','date']], on=["raceId"], how="inner")
merged = pd.merge(merged, drivers[['driverId','name']], on=["driverId"], how="inner")[['name','year','date','position']]
# Keep, for every (name, year), the standings position at the driver's last
# race of that year. Summing the per-race positions instead would favour
# drivers with few standings entries (a single entry at position 5 sums to
# less than a season spent at position 1).
# NOTE(review): assumes driver_standings.position is the championship position
# after the given race - confirm against the dataset description.
merged = (
    merged
    .assign(date=pd.to_datetime(merged['date']))
    .sort_values('date', kind='stable')
    .drop_duplicates(subset=['name', 'year'], keep='last')
    .sort_values(['name', 'year'])
    .reset_index(drop=True)
)
# Rank the drivers within each year by that final position (1 = best);
# ties share the lowest rank.
merged['rank'] = merged.groupby('year')['position'].rank(ascending=True, method='min')
merged = merged[['name','year','rank']].set_index('name')

In [None]:
# Format: one entry per driver with their rank for every year they appear in:
# [{'id': name, 'data': [{'x': year, 'y': rank}, ...]}, ...]
formatted_data = []
# Group by a scalar key (not a one-element list): with a list, recent pandas
# versions yield 1-tuples as group keys, and the driver id would be written
# to the JSON file as an array instead of a string.
for name, group in merged.groupby('name'):
    data = [{'x': int(year), 'y': int(rank)} for year, rank in zip(group['year'], group['rank'])]
    formatted_data.append({'id': name, 'data': data})

# Drivers with the most ranked years come first.
formatted_data.sort(key=lambda x: len(x['data']), reverse=True)
# Preview a single entry; fall back to the whole list when it has fewer than two.
print(formatted_data[1] if len(formatted_data) > 1 else formatted_data)

file_path = 'website/src/data/data_rank.json'
with open(file_path, 'w') as file:
    json.dump(formatted_data, file)

### TODO

For a given race, how the race unfolded (positions of the drivers, pit stops, etc.)

To visualise it, we use SVGs for some of the circuits.

For a chosen year and a chosen location (the user must be able to choose either one first, with the other selection changing depending on what is available) -> output a track with a list of drivers (that you can select or not, with a button to automatically select the top 5 of the race) and a button to start the race
options:
- add pit stops (driver stops for the pit stop duration)
- add a plot which shows the current positions more clearly

In [None]:
# Locations for which a track SVG is available.
locations_svg = [
    "Abu Dhabi", "Melbourne", "Spielberg", "Baku", "Sakhir", "Spa", "São Paulo",
    "Montreal", "Shanghai", "Le Castellet", "Silverstone", "Budapest", "Monza",
    "Suzuka", "Mexico City", "Monte-Carlo", "Zandvoort", "Sochi", "Marina Bay",
    "Montmeló", "Austin",
]
# Keep only the circuits located in one of those places.
has_svg = circuits["location"].isin(locations_svg)
circuits_svg = circuits[has_svg]

In [None]:
# Results joined with their race, restricted to circuits that have an SVG,
# then expanded to one row per (race, driver, lap) via the lap times.
merged = (
    results
    .merge(races, on="raceId", how="inner")
    .merge(circuits_svg["circuitId"], on="circuitId", how="inner")
    .merge(lap_times, on=["raceId", "driverId"], how="inner")
)

# 'position' exists in both results and lap_times, so the last join suffixes
# them: _x is the result position, _y is the position on the given lap.
kept_columns = [
    "raceId", "driverId", "constructorId", "circuitId", "year", "name",
    "position_x", "points", "lap", "position_y", "milliseconds",
]
merged = merged[kept_columns].rename(
    columns={"position_x": "final_position", "position_y": "lap_position"}
)

merged
#merged.to_csv("processed_data/race_svg.csv")
#merged.to_json("processed_data/race_svg.json")

For a given driver, how the driver's performance evolved over time, which teams they raced for, how many races did they win, compare them to other drivers, etc.

By selecting a driver -> output:
- plot for wins in time (+total)
- plot for points in time
- constructor they raced for in time
Options:
- possibility to select up to x drivers to compare them
- short description of each driver ?

In [None]:
# Every race result with its year and the 'wins' value of the driver's
# standings entry for that same race.
merged = (
    results
    .merge(races[["raceId", "year"]], on="raceId", how="inner")
    .merge(
        driver_standings[["raceId", "driverId", "wins"]],
        on=["raceId", "driverId"],
        how="inner",
    )
)

merged
#merged.to_csv("processed_data/driver_results.csv")
#merged.to_json("processed_data/driver_results.json")

For each year, the schedule of the races, their locations on the world map, which teams participated, etc.

To be determined.
An idea: By selecting a year -> output:
- list of races
- world map with locations
- list of constructors
Options:
- show the movement

In [None]:
# One row per (race, driver, constructor) entry, carrying the race's
# circuit, year, date and name.
entries = results[["raceId", "driverId", "constructorId"]]
schedule_columns = ["raceId", "circuitId", "driverId", "constructorId", "year", "date", "name"]
merged = races.merge(entries, on="raceId", how="inner")[schedule_columns]

merged
#merged.to_csv("processed_data/schedule.csv")
#merged.to_json("processed_data/schedule.json")