In [503]:
# === Imports ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [505]:
# === Load Data ===
path = "../data/"

# Read every source table in a single pass. The unpacking targets on the left
# line up one-to-one with the file names listed below, in the same order.
races, results, drivers, constructors, circuits, status = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        "races.csv",
        "results.csv",
        "drivers.csv",
        "constructors.csv",
        "circuits.csv",
        "status.csv",
    )
)

**Combining Data Stage**  
The main dataset is `results`; other relevant data will be merged into it.

In [508]:
# Merge driver details into results.
#
# NOTE: this cell overwrites `drivers` and `results`, so it is not idempotent.
# Re-running it without first re-running the load cell raises a KeyError,
# because forename/surname/driverId no longer exist after the first run.

# Keep only the relevant columns
drivers = drivers[["driverId", "forename", "surname", "dob"]].copy()

# Combine forename and surname into a single display name
drivers["driver_name"] = drivers["forename"] + " " + drivers["surname"]
drivers = drivers.drop(columns=["forename", "surname"])

# Left-merge so every result row is kept. validate="many_to_one" makes pandas
# raise a MergeError if a driverId occurs more than once in `drivers`, which
# would otherwise silently duplicate result rows.
results = results.merge(drivers, on="driverId", how="left", validate="many_to_one")

# Drop driverId (replaced by driver_name)
results = results.drop(columns=["driverId"])

# Rename results' own 'time' to 'finish_time' so it cannot be confused with
# the 'time' column that the races table contributes in the races merge.
results = results.rename(columns={"time": "finish_time"})

In [510]:
# Merge race details into results.

# Keep only the relevant columns
races = races[["raceId", "year", "round", "circuitId", "name", "date", "time"]].copy()

# Left-merge so every result row is kept. validate="many_to_one" makes pandas
# raise a MergeError if a raceId occurs more than once in `races`, which would
# otherwise silently duplicate result rows.
results = results.merge(races, on="raceId", how="left", validate="many_to_one")

# Drop raceId
results = results.drop(columns=["raceId"])

# Compute each driver's age at the race (in years) from the race date and the
# date of birth, then drop dob. `dob` is the column carried over from the
# drivers table by the drivers merge.
results["date"] = pd.to_datetime(results["date"])
results["dob"] = pd.to_datetime(results["dob"])
# Dividing by 365.25 converts days to years, averaging in leap years.
results["age_at_race"] = (results["date"] - results["dob"]).dt.days / 365.25
results = results.drop(columns=["dob"])

# Rename the generic columns contributed by races to avoid confusion:
# 'name' -> 'race_name' and 'time' -> 'race_start_time'.
results = results.rename(columns={"name": "race_name", "time": "race_start_time"})

In [512]:
# Merge constructor names into results.

# Keep only the relevant columns
constructors = constructors[["constructorId", "name"]].copy()

# Left-merge so every result row is kept. validate="many_to_one" makes pandas
# raise a MergeError if a constructorId occurs more than once in
# `constructors`, which would otherwise silently duplicate result rows.
results = results.merge(
    constructors, on="constructorId", how="left", validate="many_to_one"
)

# Drop constructorId
results = results.drop(columns=["constructorId"])

# Rename 'name' to 'constructor_name' to avoid confusion
results = results.rename(columns={"name": "constructor_name"})

In [514]:
# Merge circuit names into results.

# Keep only the relevant columns
circuits = circuits[["circuitId", "name"]].copy()

# Left-merge on the circuitId carried over from the races table, keeping every
# result row. validate="many_to_one" makes pandas raise a MergeError if a
# circuitId occurs more than once in `circuits`, which would otherwise
# silently duplicate result rows.
results = results.merge(circuits, on="circuitId", how="left", validate="many_to_one")

# Drop circuitId
results = results.drop(columns=["circuitId"])

# Rename 'name' to 'circuit' to avoid confusion
results = results.rename(columns={"name": "circuit"})

In [516]:
# Merge status descriptions into results.

# Left-merge so every result row is kept. validate="many_to_one" makes pandas
# raise a MergeError if a statusId occurs more than once in `status`, which
# would otherwise silently duplicate result rows.
results = results.merge(status, on="statusId", how="left", validate="many_to_one")

# Drop statusId
results = results.drop(columns=["statusId"])

# Show the combined column list (only a cell's last expression is displayed).
results.columns

Index(['resultId', 'number', 'grid', 'position', 'positionText',
       'positionOrder', 'points', 'laps', 'finish_time', 'milliseconds',
       'fastestLap', 'rank', 'fastestLapTime', 'fastestLapSpeed',
       'driver_name', 'year', 'round', 'race_name', 'date', 'race_start_time',
       'age_at_race', 'constructor_name', 'circuit', 'status'],
      dtype='object')

In [518]:
# Keep only the columns needed for the analysis, reordered for readability.
#
# Selecting this list already discards everything else (number, position,
# positionText, finish_time, rank, fastestLapSpeed, points, laps,
# milliseconds, fastestLap, fastestLapTime, resultId), so no separate drop
# step is needed.
#
# .copy() matches the other column-subsetting cells. It gives `results` its
# own data, so adding columns later does not trigger pandas'
# SettingWithCopyWarning (on pandas versions without copy-on-write).
results = results[
    [
        "year", "round", "race_name", "circuit", "driver_name", "constructor_name", "grid", "positionOrder",
        "date", "race_start_time", "age_at_race", "status"
    ]
].copy()



Unnamed: 0,year,round,race_name,circuit,driver_name,constructor_name,grid,positionOrder,date,race_start_time,age_at_race,status
0,2008,1,Australian Grand Prix,Albert Park Grand Prix Circuit,Lewis Hamilton,McLaren,1,1,2008-03-16,04:30:00,23.186858,Finished
1,2008,1,Australian Grand Prix,Albert Park Grand Prix Circuit,Nick Heidfeld,BMW Sauber,5,2,2008-03-16,04:30:00,30.850103,Finished
2,2008,1,Australian Grand Prix,Albert Park Grand Prix Circuit,Nico Rosberg,Williams,7,3,2008-03-16,04:30:00,22.718686,Finished
3,2008,1,Australian Grand Prix,Albert Park Grand Prix Circuit,Fernando Alonso,Renault,11,4,2008-03-16,04:30:00,26.631075,Finished
4,2008,1,Australian Grand Prix,Albert Park Grand Prix Circuit,Heikki Kovalainen,McLaren,3,5,2008-03-16,04:30:00,26.406571,Finished
