- Install Selenium library to crawl data

In [None]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.20.0-py3-none-any.whl (9.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.25.0-py3-none-any.whl (467 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m467.2/467.2 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h

- Import library

In [57]:
import pandas as pd
import numpy as np
import requests
import bs4
from bs4 import BeautifulSoup
from selenium import webdriver
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
import math

## Data Crawling

### Data from 1950 to 2023 season

#### Races

In [None]:
# query API

races = {'season': [],
        'round': [],
        'circuit_id': [],
        'lat': [],
        'long': [],
        'country': [],
        'date': [],
        'url': []}

for year in list(range(1950,2024)):

    url = 'https://ergast.com/api/f1/{}.json'
    r = requests.get(url.format(year))
    json = r.json()

    for item in json['MRData']['RaceTable']['Races']:
        try:
            races['season'].append(int(item['season']))
        except:
            races['season'].append(None)

        try:
            races['round'].append(int(item['round']))
        except:
            races['round'].append(None)

        try:
            races['circuit_id'].append(item['Circuit']['circuitId'])
        except:
            races['circuit_id'].append(None)

        try:
            races['lat'].append(float(item['Circuit']['Location']['lat']))
        except:
            races['lat'].append(None)

        try:
            races['long'].append(float(item['Circuit']['Location']['long']))
        except:
            races['long'].append(None)

        try:
            races['country'].append(item['Circuit']['Location']['country'])
        except:
            races['country'].append(None)

        try:
            races['date'].append(item['date'])
        except:
            races['date'].append(None)

        try:
            races['url'].append(item['url'])
        except:
            races['url'].append(None)

races = pd.DataFrame(races)
races.to_csv('races.csv', index=False)

In [25]:
races = pd.read_csv('races.csv')
races

Unnamed: 0,season,round,circuit_id,lat,long,country,date,url
0,1950,1,silverstone,52.0786,-1.01694,UK,1950-05-13,http://en.wikipedia.org/wiki/1950_British_Gran...
1,1950,2,monaco,43.7347,7.42056,Monaco,1950-05-21,http://en.wikipedia.org/wiki/1950_Monaco_Grand...
2,1950,3,indianapolis,39.7950,-86.23470,USA,1950-05-30,http://en.wikipedia.org/wiki/1950_Indianapolis...
3,1950,4,bremgarten,46.9589,7.40194,Switzerland,1950-06-04,http://en.wikipedia.org/wiki/1950_Swiss_Grand_...
4,1950,5,spa,50.4372,5.97139,Belgium,1950-06-18,http://en.wikipedia.org/wiki/1950_Belgian_Gran...
...,...,...,...,...,...,...,...,...
1096,2023,18,americas,30.1328,-97.64110,USA,2023-10-22,https://en.wikipedia.org/wiki/2023_United_Stat...
1097,2023,19,rodriguez,19.4042,-99.09070,Mexico,2023-10-29,https://en.wikipedia.org/wiki/2023_Mexico_City...
1098,2023,20,interlagos,-23.7036,-46.69970,Brazil,2023-11-05,https://en.wikipedia.org/wiki/2023_S%C3%A3o_Pa...
1099,2023,21,vegas,36.1147,-115.17300,United States,2023-11-19,https://en.wikipedia.org/wiki/2023_Las_Vegas_G...


#### Results

In [None]:
# append the number of rounds to each season from the races_df

rounds = []
for year in np.array(races.season.unique()):
    rounds.append([year, list(races[races.season == year]['round'])])


In [None]:
# query API

results = {'season': [],
          'round':[],
           'circuit_id':[],
          'driver': [],
           'date_of_birth': [],
           'nationality': [],
          'constructor': [],
          'grid': [],
          'time': [],
          'status': [],
          'points': [],
          'podium': []}

for n in list(range(len(rounds))):
    for i in rounds[n][1]:

        url = 'http://ergast.com/api/f1/{}/{}/results.json'
        r = requests.get(url.format(rounds[n][0], i))
        json = r.json()

        for item in json['MRData']['RaceTable']['Races'][0]['Results']:
            try:
                results['season'].append(int(json['MRData']['RaceTable']['Races'][0]['season']))
            except:
                results['season'].append(None)

            try:
                results['round'].append(int(json['MRData']['RaceTable']['Races'][0]['round']))
            except:
                results['round'].append(None)

            try:
                results['circuit_id'].append(json['MRData']['RaceTable']['Races'][0]['Circuit']['circuitId'])
            except:
                results['circuit_id'].append(None)

            try:
                results['driver'].append(item['Driver']['driverId'])
            except:
                results['driver'].append(None)

            try:
                results['date_of_birth'].append(item['Driver']['dateOfBirth'])
            except:
                results['date_of_birth'].append(None)

            try:
                results['nationality'].append(item['Driver']['nationality'])
            except:
                results['nationality'].append(None)

            try:
                results['constructor'].append(item['Constructor']['constructorId'])
            except:
                results['constructor'].append(None)

            try:
                results['grid'].append(int(item['grid']))
            except:
                results['grid'].append(None)

            try:
                results['time'].append(int(item['Time']['millis']))
            except:
                results['time'].append(None)

            try:
                results['status'].append(item['status'])
            except:
                results['status'].append(None)

            try:
                results['points'].append(int(item['points']))
            except:
                results['points'].append(None)

            try:
                results['podium'].append(int(item['position']))
            except:
                results['podium'].append(None)


results = pd.DataFrame(results)
results.to_csv('results.csv', index=False)

In [26]:
results = pd.read_csv('results.csv')
results

Unnamed: 0,season,round,circuit_id,driver,date_of_birth,nationality,constructor,grid,time,status,points,podium
0,1950,1,silverstone,farina,1906-10-30,Italian,alfa,1,8003600.0,Finished,9.0,1
1,1950,1,silverstone,fagioli,1898-06-09,Italian,alfa,2,8006200.0,Finished,6.0,2
2,1950,1,silverstone,reg_parnell,1911-07-02,British,alfa,4,8055600.0,Finished,4.0,3
3,1950,1,silverstone,cabantous,1904-10-08,French,lago,6,,+2 Laps,3.0,4
4,1950,1,silverstone,rosier,1905-11-05,French,lago,9,,+2 Laps,2.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...
25822,2023,22,yas_marina,sargeant,2000-12-31,American,williams,20,5310415.0,Finished,0.0,16
25823,2023,22,yas_marina,zhou,1999-05-30,Chinese,alfa,19,5312046.0,Finished,0.0,17
25824,2023,22,yas_marina,sainz,1994-09-01,Spanish,ferrari,16,,Retired,0.0,18
25825,2023,22,yas_marina,bottas,1989-08-28,Finnish,alfa,18,,+1 Lap,0.0,19


#### Driver Standings

In [27]:
# define lookup function to shift points and number of wins from previous rounds

def lookup (df, team, points):
    df['lookup1'] = df.season.astype(str) + df[team] + df['round'].astype(str)
    df['lookup2'] = df.season.astype(str) + df[team] + (df['round']-1).astype(str)
    new_df = df.merge(df[['lookup1', points]], how = 'left', left_on='lookup2',right_on='lookup1')
    new_df.drop(['lookup1_x', 'lookup2', 'lookup1_y'], axis = 1, inplace = True)
    new_df.rename(columns = {points+'_x': points+'_after_race', points+'_y': points}, inplace = True)
    new_df[points].fillna(0, inplace = True)
    return new_df

In [None]:
driver_standings = {'season': [],
                    'round':[],
                    'driver': [],
                    'driver_points': [],
                    'driver_wins': [],
                   'driver_standings_pos': []}

# query API

for n in list(range(len(rounds))):
    for i in rounds[n][1]:    # iterate through rounds of each year

        url = 'https://ergast.com/api/f1/{}/{}/driverStandings.json'
        r = requests.get(url.format(rounds[n][0], i))
        json = r.json()

        for item in json['MRData']['StandingsTable']['StandingsLists'][0]['DriverStandings']:
            try:
                driver_standings['season'].append(int(json['MRData']['StandingsTable']['StandingsLists'][0]['season']))
            except:
                driver_standings['season'].append(None)

            try:
                driver_standings['round'].append(int(json['MRData']['StandingsTable']['StandingsLists'][0]['round']))
            except:
                driver_standings['round'].append(None)

            try:
                driver_standings['driver'].append(item['Driver']['driverId'])
            except:
                driver_standings['driver'].append(None)

            try:
                driver_standings['driver_points'].append(int(item['points']))
            except:
                driver_standings['driver_points'].append(None)

            try:
                driver_standings['driver_wins'].append(int(item['wins']))
            except:
                driver_standings['driver_wins'].append(None)

            try:
                driver_standings['driver_standings_pos'].append(int(item['position']))
            except:
                driver_standings['driver_standings_pos'].append(None)

driver_standings = pd.DataFrame(driver_standings)


driver_standings = lookup(driver_standings, 'driver', 'driver_points')
driver_standings = lookup(driver_standings, 'driver', 'driver_wins')
driver_standings = lookup(driver_standings, 'driver', 'driver_standings_pos')

driver_standings.drop(['driver_points_after_race', 'driver_wins_after_race', 'driver_standings_pos_after_race'],
                      axis = 1, inplace = True)

driver_standings.to_csv('driver_standings.csv', index=False)

In [28]:
driver_standings = pd.read_csv('driver_standings.csv')

driver_standings

Unnamed: 0,season,round,driver,driver_points,driver_wins,driver_standings_pos
0,1950,1,farina,0.0,0.0,0.0
1,1950,1,fagioli,0.0,0.0,0.0
2,1950,1,reg_parnell,0.0,0.0,0.0
3,1950,1,cabantous,0.0,0.0,0.0
4,1950,1,rosier,0.0,0.0,0.0
...,...,...,...,...,...,...
28037,2023,22,zhou,6.0,0.0,18.0
28038,2023,22,kevin_magnussen,3.0,0.0,19.0
28039,2023,22,lawson,2.0,0.0,20.0
28040,2023,22,sargeant,1.0,0.0,21.0


#### Constructor Standings

In [None]:
# start from year 1958

constructor_rounds = rounds[8:]

constructor_standings = {'season': [],
                    'round':[],
                    'constructor': [],
                    'constructor_points': [],
                    'constructor_wins': [],
                   'constructor_standings_pos': []}
# query API

for n in list(range(len(constructor_rounds))):
    for i in constructor_rounds[n][1]:

        url = 'https://ergast.com/api/f1/{}/{}/constructorStandings.json'
        r = requests.get(url.format(constructor_rounds[n][0], i))
        json = r.json()

        for item in json['MRData']['StandingsTable']['StandingsLists'][0]['ConstructorStandings']:
            try:
                constructor_standings['season'].append(int(json['MRData']['StandingsTable']['StandingsLists'][0]['season']))
            except:
                constructor_standings['season'].append(None)

            try:
                constructor_standings['round'].append(int(json['MRData']['StandingsTable']['StandingsLists'][0]['round']))
            except:
                constructor_standings['round'].append(None)

            try:
                constructor_standings['constructor'].append(item['Constructor']['constructorId'])
            except:
                constructor_standings['constructor'].append(None)

            try:
                constructor_standings['constructor_points'].append(int(item['points']))
            except:
                constructor_standings['constructor_points'].append(None)

            try:
                constructor_standings['constructor_wins'].append(int(item['wins']))
            except:
                constructor_standings['constructor_wins'].append(None)

            try:
                constructor_standings['constructor_standings_pos'].append(int(item['position']))
            except:
                constructor_standings['constructor_standings_pos'].append(None)

constructor_standings = pd.DataFrame(constructor_standings)

constructor_standings = lookup(constructor_standings, 'constructor', 'constructor_points')
constructor_standings = lookup(constructor_standings, 'constructor', 'constructor_wins')
constructor_standings = lookup(constructor_standings, 'constructor', 'constructor_standings_pos')

constructor_standings.drop(['constructor_points_after_race', 'constructor_wins_after_race','constructor_standings_pos_after_race' ],
                           axis = 1, inplace = True)

constructor_standings.to_csv('constructor_standings.csv', index=False)

In [29]:
constructor_standings = pd.read_csv('constructor_standings.csv')
constructor_standings

Unnamed: 0,season,round,constructor,constructor_points,constructor_wins,constructor_standings_pos
0,1958,1,cooper,0.0,0.0,0.0
1,1958,1,ferrari,0.0,0.0,0.0
2,1958,1,maserati,0.0,0.0,0.0
3,1958,2,cooper,8.0,1.0,1.0
4,1958,2,ferrari,6.0,0.0,2.0
...,...,...,...,...,...,...
13146,2023,22,alpine,120.0,0.0,6.0
13147,2023,22,williams,28.0,0.0,7.0
13148,2023,22,alphatauri,21.0,0.0,8.0
13149,2023,22,alfa,16.0,0.0,9.0


#### Qualifying

In [None]:
qualifying_results = pd.DataFrame()

# Qualifying times are only available from 1983

for year in list(range(1983,2024)):
    url = 'https://www.formula1.com/en/results.html/{}/races.html'
    r = requests.get(url.format(year))
    soup = BeautifulSoup(r.text, 'html.parser')

    # find links to all circuits for a certain year

    year_links = []
    for page in soup.find_all('a', attrs = {'class':"resultsarchive-filter-item-link FilterTrigger"}):
        link = page.get('href')
        if f'/en/results.html/{year}/races/' in link:
            year_links.append(link)

    # for each circuit, switch to the starting grid page and read table

    year_df = pd.DataFrame()
    new_url = 'https://www.formula1.com{}'
    for n, link in list(enumerate(year_links)):
        link = link.replace('race-result.html', 'starting-grid.html')
        try:
          df = pd.read_html(new_url.format(link))
        except:
          continue
        df = df[0]
        df['season'] = year
        df['round'] = n+1
        for col in df:
            if 'Unnamed' in col:
                df.drop(col, axis = 1, inplace = True)

        year_df = pd.concat([year_df, df], ignore_index=True)

    # concatenate all tables from all years

    qualifying_results = pd.concat([qualifying_results, year_df], ignore_index=True)

# rename columns

qualifying_results.rename(columns = {'Pos': 'grid', 'Driver': 'driver_name', 'Car': 'car',
                                     'Time': 'qualifying_time'}, inplace = True)
# drop driver number column

qualifying_results.drop('No', axis = 1, inplace = True)

qualifying_results.to_csv('qualifying.csv', index=False)

In [30]:
qualifying = pd.read_csv('qualifying.csv')

qualifying

Unnamed: 0,grid,driver_name,car,qualifying_time,season,round
0,1,Keke Rosberg ROS,Williams Honda,1:34.526,1983,1
1,2,Alain Prost PRO,Renault,1:34.672,1983,1
2,3,Patrick Tambay TAM,Ferrari,1:34.758,1983,1
3,4,Nelson Piquet PIQ,Brabham BMW,1:35.114,1983,1
4,5,Derek Warwick WAR,Toleman Hart,1:35.206,1983,1
...,...,...,...,...,...,...
16200,16,Carlos Sainz SAI,Ferrari,1:24.738,2023,23
16201,17,Kevin Magnussen MAG,Haas Ferrari,1:24.764,2023,23
16202,18,Valtteri Bottas BOT,Alfa Romeo Ferrari,1:24.788,2023,23
16203,19,Zhou Guanyu ZHO,Alfa Romeo Ferrari,1:25.159,2023,23


#### Weather

In [31]:
# Define function to clean text in weather data
def clean_text(text):
    puncs = [',', '.']

    for punc in puncs:
        text = text.replace(punc, '')

    words = text.lower().split()
    # remove 'and' word
    words = [word for word in words if word != 'and']
    return words

In [None]:
weather = races.iloc[:,[0,1,2]]

info = []

# read wikipedia tables

for link in races.url:
    try:
        df = pd.read_html(link)[0]
        if 'Weather' in list(df.iloc[:,0]):
            n = list(df.iloc[:,0]).index('Weather')
            info.append(df.iloc[n,1])
        else:
            df = pd.read_html(link)[1]
            if 'Weather' in list(df.iloc[:,0]):
                n = list(df.iloc[:,0]).index('Weather')
                info.append(df.iloc[n,1])
            else:
                df = pd.read_html(link)[2]
                if 'Weather' in list(df.iloc[:,0]):
                    n = list(df.iloc[:,0]).index('Weather')
                    info.append(df.iloc[n,1])
                else:
                    df = pd.read_html(link)[3]
                    if 'Weather' in list(df.iloc[:,0]):
                        n = list(df.iloc[:,0]).index('Weather')
                        info.append(df.iloc[n,1])
                    else:
                        driver = webdriver.Chrome()
                        driver.get(link)

                        # click language button
                        button = driver.find_element_by_link_text('Italiano')
                        button.click()

                        # find weather in italian with selenium

                        clima = driver.find_element_by_xpath('//*[@id="mw-content-text"]/div/table[1]/tbody/tr[9]/td').text
                        info.append(clima)

    except:
        info.append('not found')

# append column with weather information to dataframe

weather['weather'] = info

# set up a dictionary to convert weather information into keywords

weather_dict = {'weather_warm': ['soleggiato', 'clear', 'warm', 'hot', 'sunny', 'fine', 'mild', 'sereno'],
               'weather_cold': ['cold', 'fresh', 'chilly', 'cool'],
               'weather_dry': ['dry', 'asciutto'],
               'weather_wet': ['showers', 'wet', 'rain', 'pioggia', 'damp', 'thunderstorms', 'rainy'],
               'weather_cloudy': ['overcast', 'nuvoloso', 'clouds', 'cloudy', 'grey', 'coperto']}

# map new df according to weather dictionary

weather_df = pd.DataFrame(columns = weather_dict.keys())
for col in weather_df:
    # weather_df[col] = weather['weather'].map(lambda x: 1 if any(i in weather_dict[col] for i in x.lower().split()) else 0)
    weather_df[col] = weather['weather'].map(lambda x: 1 if any(i in weather_dict[col] for i in clean_text(x)) else 0)

weather_info = pd.concat([weather, weather_df], axis = 1)

weather_info.to_csv('weather.csv', index=False)

In [32]:
weather_data = pd.read_csv('weather.csv')

weather_data

Unnamed: 0,season,round,circuit_id,weather,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy
0,1950,1,silverstone,"Sunny, mild, dry.",1,0,1,0,0
1,1950,2,monaco,not found,0,0,0,0,0
2,1950,3,indianapolis,not found,0,0,0,0,0
3,1950,4,bremgarten,"Warm, dry and sunny",1,0,1,0,0
4,1950,5,spa,"Warm, dry and sunny",1,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1096,2023,18,americas,Sunny,1,0,0,0,0
1097,2023,19,rodriguez,Sunny,1,0,0,0,0
1098,2023,20,interlagos,Partly cloudy,0,0,0,0,1
1099,2023,21,vegas,Clear,1,0,0,0,0


### Data from 2024 season

#### Drivers information in 2024 season

In [None]:
url_crawl_info_drivers = 'https://www.formula1.com/en/drivers'

def crawl_info_drivers(url):
  df = pd.DataFrame(columns=['name', 'points', 'name_team', 'number', 'avatar', 'nation'])
  response = requests.get(url)
  if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    all = soup.find_all('div', class_='f1-inner-wrapper')
    # print(all)
    for i, info in enumerate(all[1:22]):
      points = info.find('p', class_='f1-heading-wide').text.strip()
      # print(points)
      content = info.find_all('p', class_='f1-heading')
      name = content[0].text.strip() + ' ' + content[1].text.strip()
      # print(name)
      imgs = info.find_all('img', class_='f1-c-image')
      img_nation = imgs[0].get('src')
      img_number = imgs[1].get('src')
      img_avatar = imgs[2].get('src')
      team = info.find_all('p', class_='f1-heading')[2].text.strip()
      # print(img_nation)
      # print(img_number)
      # print(img_avatar)
      # print(team)
      # print('-------')
      df.loc[i] = [name, points, team, img_number, img_avatar, img_nation]
  return df


drivers_data = crawl_info_drivers(url_crawl_info_drivers)
drivers_data.to_csv('drivers.csv', index=False)



#### Schedule of 2024 season

In [None]:
url_schedule = "https://www.formula1.com/en/racing/2024.html"
def crawl_info_schedule(url):
  df = pd.DataFrame(columns=['name_round', 'start_day', 'end_day', 'month', 'country_flag', 'event_place', 'event_title', 'event_img'])
  response = requests.get(url)
  if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    completed_events = soup.find('div', class_='completed-events')
    completed_events = completed_events.find_all('div', class_='col-12 col-sm-6 col-lg-4 col-xl-3')
    for i, event in enumerate(completed_events):
      title = event.find('legend', class_='card-title').text.strip()
      print(title)
      start = event.find('span', class_='start-date').text.strip()
      end = event.find('span', class_='end-date').text.strip()
      print(start)
      print(end)
      month = event.find('span', class_='month-wrapper').text.strip()
      print(month)
      country_flag = event.find('div', class_='country-flag').find('img', class_='lazy').get('data-src')
      print(country_flag)
      event_place = event.find('div', class_='event-place').text.strip()
      print(event_place)
      event_title = event.find('div', class_='event-title').text.strip()
      print(event_title)
      event_img = event.find('div', class_='event-image').find('img', class_='lazy').get('data-src')
      print(event_img)
      print('--------')
      df.loc[i] = [title, start, end, month, country_flag, event_place, event_title, event_img]
    return df


schedule2024_data = crawl_info_schedule(url_schedule)
schedule2024_data.to_csv('Schedule_2024.csv', index=False)


#### Races in 2024 season

In [33]:
# query API

races2024 = {'season': [],
        'round': [],
        'circuit_id': [],
        'lat': [],
        'long': [],
        'country': [],
        'date': [],
        'url': []}

year = 2024
url = 'https://ergast.com/api/f1/{}.json'
r = requests.get(url.format(year))
json = r.json()

for item in json['MRData']['RaceTable']['Races']:
    try:
        races2024['season'].append(int(item['season']))
    except:
        races2024['season'].append(None)

    try:
        races2024['round'].append(int(item['round']))
    except:
        races2024['round'].append(None)

    try:
        races2024['circuit_id'].append(item['Circuit']['circuitId'])
    except:
        races2024['circuit_id'].append(None)

    try:
        races2024['lat'].append(float(item['Circuit']['Location']['lat']))
    except:
        races2024['lat'].append(None)

    try:
        races2024['long'].append(float(item['Circuit']['Location']['long']))
    except:
        races2024['long'].append(None)

    try:
        races2024['country'].append(item['Circuit']['Location']['country'])
    except:
        races2024['country'].append(None)

    try:
        races2024['date'].append(item['date'])
    except:
        races2024['date'].append(None)

    try:
        races2024['url'].append(item['url'])
    except:
        races2024['url'].append(None)

races2024 = pd.DataFrame(races2024)

In [34]:
races2024

Unnamed: 0,season,round,circuit_id,lat,long,country,date,url
0,2024,1,bahrain,26.0325,50.5106,Bahrain,2024-03-02,https://en.wikipedia.org/wiki/2024_Bahrain_Gra...
1,2024,2,jeddah,21.6319,39.1044,Saudi Arabia,2024-03-09,https://en.wikipedia.org/wiki/2024_Saudi_Arabi...
2,2024,3,albert_park,-37.8497,144.968,Australia,2024-03-24,https://en.wikipedia.org/wiki/2024_Australian_...
3,2024,4,suzuka,34.8431,136.541,Japan,2024-04-07,https://en.wikipedia.org/wiki/2024_Japanese_Gr...
4,2024,5,shanghai,31.3389,121.22,China,2024-04-21,https://en.wikipedia.org/wiki/2024_Chinese_Gra...
5,2024,6,miami,25.9581,-80.2389,USA,2024-05-05,https://en.wikipedia.org/wiki/2024_Miami_Grand...
6,2024,7,imola,44.3439,11.7167,Italy,2024-05-19,https://en.wikipedia.org/wiki/2024_Emilia_Roma...
7,2024,8,monaco,43.7347,7.42056,Monaco,2024-05-26,https://en.wikipedia.org/wiki/2024_Monaco_Gran...
8,2024,9,villeneuve,45.5,-73.5228,Canada,2024-06-09,https://en.wikipedia.org/wiki/2024_Canadian_Gr...
9,2024,10,catalunya,41.57,2.26111,Spain,2024-06-23,https://en.wikipedia.org/wiki/2024_Spanish_Gra...


#### Current results in 2024 season

In [35]:
# append the number of rounds to 2024 season from the races2024 df

rounds2024 = []
for year in np.array(races2024.season.unique()):
    rounds2024.append([year, list(races2024[races2024['season'] == year]['round'])])


In [36]:
# query API

results2024 = {'season': [],
          'round':[],
           'circuit_id':[],
          'driver': [],
           'date_of_birth': [],
           'nationality': [],
          'constructor': [],
          'grid': [],
          'time': [],
          'status': [],
          'points': [],
          'podium': []}

for n in list(range(len(rounds2024))):
    for i in rounds2024[n][1]:

        url = 'http://ergast.com/api/f1/{}/{}/results.json'
        r = requests.get(url.format(rounds2024[n][0], i))
        json = r.json()
        if len(json['MRData']['RaceTable']['Races']) == 0:
            break

        for item in json['MRData']['RaceTable']['Races'][0]['Results']:
            try:
                results2024['season'].append(int(json['MRData']['RaceTable']['Races'][0]['season']))
            except:
                results2024['season'].append(None)

            try:
                results2024['round'].append(int(json['MRData']['RaceTable']['Races'][0]['round']))
            except:
                results2024['round'].append(None)

            try:
                results2024['circuit_id'].append(json['MRData']['RaceTable']['Races'][0]['Circuit']['circuitId'])
            except:
                results2024['circuit_id'].append(None)

            try:
                results2024['driver'].append(item['Driver']['driverId'])
            except:
                results2024['driver'].append(None)

            try:
                results2024['date_of_birth'].append(item['Driver']['dateOfBirth'])
            except:
                results2024['date_of_birth'].append(None)

            try:
                results2024['nationality'].append(item['Driver']['nationality'])
            except:
                results2024['nationality'].append(None)

            try:
                results2024['constructor'].append(item['Constructor']['constructorId'])
            except:
                results2024['constructor'].append(None)

            try:
                results2024['grid'].append(int(item['grid']))
            except:
                results2024['grid'].append(None)

            try:
                results2024['time'].append(int(item['Time']['millis']))
            except:
                results2024['time'].append(None)

            try:
                results2024['status'].append(item['status'])
            except:
                results2024['status'].append(None)

            try:
                results2024['points'].append(int(item['points']))
            except:
                results2024['points'].append(None)

            try:
                results2024['podium'].append(int(item['position']))
            except:
                results2024['podium'].append(None)


results2024 = pd.DataFrame(results2024)


In [37]:
results2024

Unnamed: 0,season,round,circuit_id,driver,date_of_birth,nationality,constructor,grid,time,status,points,podium
0,2024,1,bahrain,max_verstappen,1997-09-30,Dutch,red_bull,1,5504742.0,Finished,26,1
1,2024,1,bahrain,perez,1990-01-26,Mexican,red_bull,5,5527199.0,Finished,18,2
2,2024,1,bahrain,sainz,1994-09-01,Spanish,ferrari,4,5529852.0,Finished,15,3
3,2024,1,bahrain,leclerc,1997-10-16,Monegasque,ferrari,2,5544411.0,Finished,12,4
4,2024,1,bahrain,russell,1998-02-15,British,mercedes,3,5551530.0,Finished,10,5
...,...,...,...,...,...,...,...,...,...,...,...,...
114,2024,6,miami,bottas,1989-08-28,Finnish,sauber,16,5502232.0,Finished,0,16
115,2024,6,miami,stroll,1998-10-29,Canadian,aston_martin,11,5505049.0,Finished,0,17
116,2024,6,miami,albon,1996-03-23,Thai,williams,14,5525967.0,Finished,0,18
117,2024,6,miami,kevin_magnussen,1992-10-05,Danish,haas,18,5534559.0,Finished,0,19


#### Current Driver Standings in 2024 season

In [38]:
driver_standings2024 = {'season': [],
                    'round':[],
                    'driver': [],
                    'driver_points': [],
                    'driver_wins': [],
                   'driver_standings_pos': []}

# query API

for n in list(range(len(rounds2024))):
    for i in rounds2024[n][1]:    # iterate through rounds of each year

        url = 'https://ergast.com/api/f1/{}/{}/driverStandings.json'
        r = requests.get(url.format(rounds2024[n][0], i))
        json = r.json()

        if len(json['MRData']['StandingsTable']['StandingsLists']) == 0:
            break

        for item in json['MRData']['StandingsTable']['StandingsLists'][0]['DriverStandings']:
            try:
                driver_standings2024['season'].append(int(json['MRData']['StandingsTable']['StandingsLists'][0]['season']))
            except:
                driver_standings2024['season'].append(None)

            try:
                driver_standings2024['round'].append(int(json['MRData']['StandingsTable']['StandingsLists'][0]['round']))
            except:
                driver_standings2024['round'].append(None)

            try:
                driver_standings2024['driver'].append(item['Driver']['driverId'])
            except:
                driver_standings2024['driver'].append(None)

            try:
                driver_standings2024['driver_points'].append(int(item['points']))
            except:
                driver_standings2024['driver_points'].append(None)

            try:
                driver_standings2024['driver_wins'].append(int(item['wins']))
            except:
                driver_standings2024['driver_wins'].append(None)

            try:
                driver_standings2024['driver_standings_pos'].append(int(item['position']))
            except:
                driver_standings2024['driver_standings_pos'].append(None)

driver_standings2024 = pd.DataFrame(driver_standings2024)


driver_standings2024 = lookup(driver_standings2024, 'driver', 'driver_points')
driver_standings2024 = lookup(driver_standings2024, 'driver', 'driver_wins')
driver_standings2024 = lookup(driver_standings2024, 'driver', 'driver_standings_pos')

driver_standings2024.drop(['driver_points_after_race', 'driver_wins_after_race', 'driver_standings_pos_after_race'],
                      axis = 1, inplace = True)

In [39]:
driver_standings2024

Unnamed: 0,season,round,driver,driver_points,driver_wins,driver_standings_pos
0,2024,1,max_verstappen,0.0,0.0,0.0
1,2024,1,perez,0.0,0.0,0.0
2,2024,1,sainz,0.0,0.0,0.0
3,2024,1,leclerc,0.0,0.0,0.0
4,2024,1,russell,0.0,0.0,0.0
...,...,...,...,...,...,...
120,2024,6,albon,0.0,0.0,15.0
121,2024,6,zhou,0.0,0.0,17.0
122,2024,6,gasly,0.0,0.0,19.0
123,2024,6,bottas,0.0,0.0,20.0


#### Current Constructor Standings in 2024 season

In [40]:
# constructor standing for 2024 season

constructor_rounds2024 = rounds2024

constructor_standings2024 = {'season': [],
                    'round':[],
                    'constructor': [],
                    'constructor_points': [],
                    'constructor_wins': [],
                   'constructor_standings_pos': []}
# query API

for n in list(range(len(constructor_rounds2024))):
    for i in constructor_rounds2024[n][1]:

        url = 'https://ergast.com/api/f1/{}/{}/constructorStandings.json'
        r = requests.get(url.format(constructor_rounds2024[n][0], i))
        json = r.json()

        if len(json['MRData']['StandingsTable']['StandingsLists']) == 0:
            break

        for item in json['MRData']['StandingsTable']['StandingsLists'][0]['ConstructorStandings']:
            try:
                constructor_standings2024['season'].append(int(json['MRData']['StandingsTable']['StandingsLists'][0]['season']))
            except:
                constructor_standings2024['season'].append(None)

            try:
                constructor_standings2024['round'].append(int(json['MRData']['StandingsTable']['StandingsLists'][0]['round']))
            except:
                constructor_standings2024['round'].append(None)

            try:
                constructor_standings2024['constructor'].append(item['Constructor']['constructorId'])
            except:
                constructor_standings2024['constructor'].append(None)

            try:
                constructor_standings2024['constructor_points'].append(int(item['points']))
            except:
                constructor_standings2024['constructor_points'].append(None)

            try:
                constructor_standings2024['constructor_wins'].append(int(item['wins']))
            except:
                constructor_standings2024['constructor_wins'].append(None)

            try:
                constructor_standings2024['constructor_standings_pos'].append(int(item['position']))
            except:
                constructor_standings2024['constructor_standings_pos'].append(None)

constructor_standings2024 = pd.DataFrame(constructor_standings2024)

constructor_standings2024 = lookup(constructor_standings2024, 'constructor', 'constructor_points')
constructor_standings2024 = lookup(constructor_standings2024, 'constructor', 'constructor_wins')
constructor_standings2024 = lookup(constructor_standings2024, 'constructor', 'constructor_standings_pos')

constructor_standings2024.drop(['constructor_points_after_race', 'constructor_wins_after_race','constructor_standings_pos_after_race' ],
                           axis = 1, inplace = True)

In [41]:
constructor_standings2024

Unnamed: 0,season,round,constructor,constructor_points,constructor_wins,constructor_standings_pos
0,2024,1,red_bull,0.0,0.0,0.0
1,2024,1,ferrari,0.0,0.0,0.0
2,2024,1,mercedes,0.0,0.0,0.0
3,2024,1,mclaren,0.0,0.0,0.0
4,2024,1,aston_martin,0.0,0.0,0.0
5,2024,1,sauber,0.0,0.0,0.0
6,2024,1,haas,0.0,0.0,0.0
7,2024,1,rb,0.0,0.0,0.0
8,2024,1,williams,0.0,0.0,0.0
9,2024,1,alpine,0.0,0.0,0.0


#### Current Qualifying in 2024 season

In [42]:
qualifying_results2024 = pd.DataFrame()

# Qualifying times are only available from 1983

for year in list(range(2024,2025)):
    url = 'https://www.formula1.com/en/results.html/{}/races.html'
    r = requests.get(url.format(year))
    soup = BeautifulSoup(r.text, 'html.parser')

    # find links to all circuits for a certain year

    year_links = []
    for page in soup.find_all('a', attrs = {'class':"resultsarchive-filter-item-link FilterTrigger"}):
        link = page.get('href')
        if f'/en/results.html/{year}/races/' in link:
            year_links.append(link)

    # for each circuit, switch to the starting grid page and read table

    year_df = pd.DataFrame()
    new_url = 'https://www.formula1.com{}'
    for n, link in list(enumerate(year_links)):
        link = link.replace('race-result.html', 'starting-grid.html')
        try:
          df = pd.read_html(new_url.format(link))
        except:
          continue
        df = df[0]
        df['season'] = year
        df['round'] = n+1
        for col in df:
            if 'Unnamed' in col:
                df.drop(col, axis = 1, inplace = True)

        year_df = pd.concat([year_df, df], ignore_index=True)

    # concatenate all tables from all years

    qualifying_results2024 = pd.concat([qualifying_results2024, year_df], ignore_index=True)

# rename columns

qualifying_results2024.rename(columns = {'Pos': 'grid', 'Driver': 'driver_name', 'Car': 'car',
                                     'Time': 'qualifying_time'}, inplace = True)
# drop driver number column

qualifying_results2024.drop('No', axis = 1, inplace = True)


In [43]:
qualifying_results2024

Unnamed: 0,grid,driver_name,car,qualifying_time,season,round
0,1,Max Verstappen VER,Red Bull Racing Honda RBPT,1:29.179,2024,1
1,2,Charles Leclerc LEC,Ferrari,1:29.407,2024,1
2,3,George Russell RUS,Mercedes,1:29.485,2024,1
3,4,Carlos Sainz SAI,Ferrari,1:29.507,2024,1
4,5,Sergio Perez PER,Red Bull Racing Honda RBPT,1:29.537,2024,1
...,...,...,...,...,...,...
114,16,Valtteri Bottas BOT,Kick Sauber Ferrari,1:28.463,2024,6
115,17,Logan Sargeant SAR,Williams Mercedes,1:28.487,2024,6
116,18,Kevin Magnussen MAG,Haas Ferrari,1:28.619,2024,6
117,19,Zhou Guanyu ZHO,Kick Sauber Ferrari,1:28.824,2024,6


#### Weather in 2024 season
Default: not found if do not find

In [44]:
weather2024 = races2024.iloc[:,[0,1,2]]

info2024 = []

# read wikipedia tables

for link in races2024.url:
    try:
        df = pd.read_html(link)[0]
        if 'Weather' in list(df.iloc[:,0]):
            n = list(df.iloc[:,0]).index('Weather')
            info2024.append(df.iloc[n,1])
        else:
            df = pd.read_html(link)[1]
            if 'Weather' in list(df.iloc[:,0]):
                n = list(df.iloc[:,0]).index('Weather')
                info2024.append(df.iloc[n,1])
            else:
                df = pd.read_html(link)[2]
                if 'Weather' in list(df.iloc[:,0]):
                    n = list(df.iloc[:,0]).index('Weather')
                    info2024.append(df.iloc[n,1])
                else:
                    df = pd.read_html(link)[3]
                    if 'Weather' in list(df.iloc[:,0]):
                        n = list(df.iloc[:,0]).index('Weather')
                        info2024.append(df.iloc[n,1])
                    else:
                        driver = webdriver.Chrome()
                        driver.get(link)

                        # click language button
                        button = driver.find_element_by_link_text('Italiano')
                        button.click()

                        # find weather in italian with selenium

                        clima = driver.find_element_by_xpath('//*[@id="mw-content-text"]/div/table[1]/tbody/tr[9]/td').text
                        info.append(clima)

    except:
        info2024.append('not found')

# append column with weather information to dataframe

weather2024['weather'] = info2024

# set up a dictionary to convert weather information into keywords

weather_dict2024 = {'weather_warm': ['soleggiato', 'clear', 'warm', 'hot', 'sunny', 'fine', 'mild', 'sereno'],
               'weather_cold': ['cold', 'fresh', 'chilly', 'cool'],
               'weather_dry': ['dry', 'asciutto'],
               'weather_wet': ['showers', 'wet', 'rain', 'pioggia', 'damp', 'thunderstorms', 'rainy'],
               'weather_cloudy': ['overcast', 'nuvoloso', 'clouds', 'cloudy', 'grey', 'coperto']}

# map new df according to weather dictionary

weather_df2024 = pd.DataFrame(columns = weather_dict2024.keys())
for col in weather_df2024:
    weather_df2024[col] = weather2024['weather'].map(lambda x: 1 if any(i in weather_dict2024[col] for i in clean_text(x)) else 0)

weather_info2024 = pd.concat([weather2024, weather_df2024], axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather2024['weather'] = info2024


In [45]:
weather_info2024

Unnamed: 0,season,round,circuit_id,weather,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy
0,2024,1,bahrain,Clear,1,0,0,0,0
1,2024,2,jeddah,Clear,1,0,0,0,0
2,2024,3,albert_park,Sunny,1,0,0,0,0
3,2024,4,suzuka,Sunny,1,0,0,0,0
4,2024,5,shanghai,Cloudy,0,0,0,0,1
5,2024,6,miami,Partly cloudy,0,0,0,0,1
6,2024,7,imola,not found,0,0,0,0,0
7,2024,8,monaco,not found,0,0,0,0,0
8,2024,9,villeneuve,not found,0,0,0,0,0
9,2024,10,catalunya,not found,0,0,0,0,0


In [None]:
weather_info2024.to_csv('weathers2024.csv', index=False)

## Data Preparation

### Data Preparation for train

In [46]:
#merge df

df1 = pd.merge(races, weather_data, how='inner',
               on=['season', 'round', 'circuit_id']).drop(['lat', 'long','country','weather'],
                                                          axis = 1)
df2 = pd.merge(df1, results, how='inner',
               on=['season', 'round', 'circuit_id']).drop(['url','points', 'status', 'time'],
                                                                 axis = 1)
df3 = pd.merge(df2, driver_standings, how='left',
               on=['season', 'round', 'driver'])
df4 = pd.merge(df3, constructor_standings, how='left',
               on=['season', 'round', 'constructor']) #from 1958

final_df = pd.merge(df4, qualifying, how='inner',
                    on=['season', 'round', 'grid']).drop(['driver_name', 'car'],
                                                         axis = 1) #from 1983


# calculate age of drivers

from dateutil.relativedelta import *
final_df['date'] = pd.to_datetime(final_df.date)
final_df['date_of_birth'] = pd.to_datetime(final_df.date_of_birth)
final_df['driver_age'] = final_df.apply(lambda x:
                                        relativedelta(x['date'], x['date_of_birth']).years, axis=1)
final_df.drop(['date', 'date_of_birth'], axis = 1, inplace = True)


# fill/drop nulls

for col in ['driver_points', 'driver_wins', 'driver_standings_pos', 'constructor_points',
            'constructor_wins' , 'constructor_standings_pos']:
    final_df[col].fillna(0, inplace = True)
    final_df[col] = final_df[col].map(lambda x: int(x))

final_df.dropna(inplace = True )


# convert to boolean to save space

for col in ['weather_warm', 'weather_cold','weather_dry', 'weather_wet', 'weather_cloudy']:
    final_df[col] = final_df[col].map(lambda x: bool(x))

print(final_df)
# calculate difference in qualifying times

final_df['qualifying_time'] = final_df.qualifying_time.map(lambda x: float(x) if ':' not in str(x)
                             else(float(str(x).split(':')[1]) +
                                  (60 * float(str(x).split(':')[0])) if x != 0 else 0))
print(final_df['qualifying_time'])
final_df = final_df[final_df['qualifying_time'] != 0]
final_df.sort_values(['season', 'round', 'grid'], inplace = True)
final_df['qualifying_time_diff'] = final_df.groupby(['season', 'round']).qualifying_time.diff()
final_df['qualifying_time'] = final_df.groupby(['season',
                                                'round']).qualifying_time_diff.cumsum().fillna(0)
final_df.drop('qualifying_time_diff', axis = 1, inplace = True)


# get dummies

df_dum = pd.get_dummies(final_df, columns = ['circuit_id', 'nationality', 'constructor'] )

for col in df_dum.columns:
    if 'nationality' in col and df_dum[col].sum() < 140:
        df_dum.drop(col, axis = 1, inplace = True)

    elif 'constructor' in col and df_dum[col].sum() < 140:
        df_dum.drop(col, axis = 1, inplace = True)

    elif 'circuit_id' in col and df_dum[col].sum() < 70:
        df_dum.drop(col, axis = 1, inplace = True)

    else:
        pass


       season  round   circuit_id  weather_warm  weather_cold  weather_dry  \
0        1983      1  jacarepagua         False         False         True   
1        1983      1  jacarepagua         False         False         True   
2        1983      1  jacarepagua         False         False         True   
3        1983      1  jacarepagua         False         False         True   
4        1983      1  jacarepagua         False         False         True   
...       ...    ...          ...           ...           ...          ...   
16122    2023     22   yas_marina          True         False        False   
16123    2023     22   yas_marina          True         False        False   
16124    2023     22   yas_marina          True         False        False   
16125    2023     22   yas_marina          True         False        False   
16126    2023     22   yas_marina          True         False        False   

       weather_wet  weather_cloudy           driver nationality

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.sort_values(['season', 'round', 'grid'], inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['qualifying_time_diff'] = final_df.groupby(['season', 'round']).qualifying_time.diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['qualifying_time'] = final_df.groupby(['season',
A value is trying to be set on a copy of a

In [47]:
print(final_df)

       season  round   circuit_id  weather_warm  weather_cold  weather_dry  \
14       1983      1  jacarepagua         False         False         True   
5        1983      1  jacarepagua         False         False         True   
3        1983      1  jacarepagua         False         False         True   
0        1983      1  jacarepagua         False         False         True   
6        1983      1  jacarepagua         False         False         True   
...       ...    ...          ...           ...           ...          ...   
16124    2023     22   yas_marina          True         False        False   
16126    2023     22   yas_marina          True         False        False   
16125    2023     22   yas_marina          True         False        False   
16123    2023     22   yas_marina          True         False        False   
16122    2023     22   yas_marina          True         False        False   

       weather_wet  weather_cloudy           driver nationality

### Data Pre-Processing

In [48]:
df = final_df.copy()

In [49]:
# Create LabelEncoder
label_encoder = LabelEncoder()

# Apply LabelEncoder for columns and save the mapping
df['circuit_id_encoded'] = label_encoder.fit_transform(df['circuit_id'])
df['driver_encoded'] = label_encoder.fit_transform(df['driver'])
df['nationality_encoded'] = label_encoder.fit_transform(df['nationality'])
df['constructor_encoded'] = label_encoder.fit_transform(df['constructor'])

# Make mapping from numeric value to string value
circuit_id_mapping = dict(zip(df['circuit_id_encoded'], df['circuit_id']))
driver_mapping = dict(zip(df['driver_encoded'], df['driver']))
nationality_mapping = dict(zip(df['nationality_encoded'], df['nationality']))
constructor_mapping = dict(zip(df['constructor_encoded'], df['constructor']))

# Make mapping from string value to numeric value
circuit_id_reverse_mapping = dict(zip(df['circuit_id'], df['circuit_id_encoded']))
driver_reverse_mapping = dict(zip(df['driver'], df['driver_encoded']))
nationality_reverse_mapping = dict(zip(df['nationality'], df['nationality_encoded']))
constructor_reverse_mapping = dict(zip(df['constructor'], df['constructor_encoded']))


df['circuit_id'] = df['circuit_id_encoded']
df['driver'] = df['driver_encoded']
df['nationality'] = df['nationality_encoded']
df['constructor'] = df['constructor_encoded']

df = df.drop(['circuit_id_encoded', 'driver_encoded', 'nationality_encoded', 'constructor_encoded'], axis=1)

In [50]:
df

Unnamed: 0,season,round,circuit_id,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driver,nationality,...,grid,podium,driver_points,driver_wins,driver_standings_pos,constructor_points,constructor_wins,constructor_standings_pos,qualifying_time,driver_age
14,1983,1,21,False,False,True,False,False,110,14,...,1,15,0,0,0,0,0,0,0.000,34
5,1983,1,21,False,False,True,False,False,178,15,...,2,6,0,0,0,0,0,0,0.146,28
3,1983,1,21,False,False,True,False,False,214,15,...,3,4,0,0,0,0,0,0,0.232,33
0,1983,1,21,False,False,True,False,False,173,5,...,4,1,0,0,0,0,0,0,0.588,30
6,1983,1,21,False,False,True,False,False,225,6,...,5,7,0,0,0,0,0,0,0.680,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16124,2023,22,49,True,False,False,False,False,191,30,...,16,18,200,1,4,388,1,3,2.108,29
16126,2023,22,49,True,False,False,False,False,111,12,...,17,20,3,0,19,12,0,10,2.123,31
16125,2023,22,49,True,False,False,False,False,31,14,...,18,19,10,0,15,16,0,9,2.124,34
16123,2023,22,49,True,False,False,False,False,235,9,...,19,17,6,0,18,16,0,9,1.473,24


In [51]:
# Divide df into data (X) và label (y)
X = df.drop(['podium'], axis=1)
y = df['podium']

# Divide the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [52]:
X_train

Unnamed: 0,season,round,circuit_id,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driver,nationality,constructor,grid,driver_points,driver_wins,driver_standings_pos,constructor_points,constructor_wins,constructor_standings_pos,qualifying_time,driver_age
11199,2011,19,19,True,False,False,False,False,94,6,36,4,227,3,5,482,6,2,0.562,26
5824,1997,10,15,True,False,True,False,False,180,16,25,7,7,0,11,15,0,6,0.625,22
11459,2012,11,16,True,False,True,False,False,185,30,23,23,0,0,24,0,0,12,4.963,41
12722,2015,12,31,False,False,False,False,True,186,16,37,4,199,3,2,426,9,1,0.306,30
9137,2006,14,20,True,False,False,False,False,205,0,60,17,0,0,19,1,0,9,2.251,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5198,1995,13,12,True,False,False,False,False,109,22,62,16,0,0,19,2,0,8,3.750,32
13648,2017,17,2,True,False,False,False,False,9,30,36,8,10,0,16,23,0,9,1.899,36
5397,1996,5,17,True,False,False,False,False,97,6,50,15,0,0,14,0,0,9,2.651,31
861,1985,4,30,False,False,True,False,False,167,21,1,12,0,0,25,0,0,12,1.363,31


In [53]:
X_test

Unnamed: 0,season,round,circuit_id,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driver,nationality,constructor,grid,driver_points,driver_wins,driver_standings_pos,constructor_points,constructor_wins,constructor_standings_pos,qualifying_time,driver_age
7032,2000,16,45,False,False,False,False,False,70,21,8,12,18,0,6,20,0,4,1.654,27
11900,2013,11,44,False,False,False,False,True,186,16,37,4,84,2,6,208,3,2,1.239,28
1859,1987,12,12,False,False,True,False,False,104,31,36,8,20,0,5,51,2,3,2.514,31
10491,2010,9,46,False,False,False,False,False,211,16,19,13,23,0,10,35,0,6,1.264,27
6900,2000,10,37,False,False,True,False,True,180,16,64,19,14,0,6,17,0,4,1.937,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11893,2013,10,16,True,False,False,False,False,186,16,37,4,84,2,6,183,2,2,0.332,28
11741,2013,3,41,False,False,False,False,True,211,16,19,13,6,0,9,10,0,5,1.921,30
13046,2016,9,37,False,False,False,False,True,92,24,21,11,0,0,18,22,0,8,-0.344,24
4969,1995,4,7,True,False,False,False,False,138,21,39,19,0,0,19,0,0,12,5.167,34


In [54]:
y_train

11199    22
5824      5
11459    22
12722    17
9137     13
         ..
5198     23
13648    17
5397     20
861      13
7277     17
Name: podium, Length: 12566, dtype: int64

In [55]:
y_test

7032     14
11900     4
1859      5
10491     6
6900     14
         ..
11893    19
11741    21
13046    11
4969     14
8534      8
Name: podium, Length: 3142, dtype: int64

- Calculate precision score for Random Forest Classification

In [61]:
X_c = X_train.copy()
y_c = y_train.copy()

#cross validation for models
models = [RandomForestClassifier()]
names = ['RandomForestClassifier']
model_dict = dict(zip(models,names))
mean_results_const = []
results_const = []
name = []
for model in models:
    cv = StratifiedKFold(n_splits=10,random_state=None)
    result = cross_val_score(model,X_c,y_c,cv=cv,scoring='accuracy')
    mean_results_const.append(result.mean())
    results_const.append(result)
    name.append(model_dict[model])
    print(f'{model_dict[model]} : {result.mean()}')



RandomForestClassifier : 0.10027141004008129


- Testing model with real data

In [62]:
rf = RandomForestClassifier(n_estimators=1600,min_samples_split=20,min_samples_leaf=1,max_features='sqrt',max_depth=90,bootstrap=True)

rf.fit(X_train, y_train)

new_race_data = {
    'season': [2024],                     # New Season
    'round': [7],                         # New Round
    'circuit_id': [3],                    # New Circuit id
    'weather_warm': [True],               #
    'weather_cold': [False],              #
    'weather_dry': [False],               # Weather condition
    'weather_wet': [False],               #
    'weather_cloudy': [False],            #
    'driver': [141],                      # Driver
    'nationality': [13],                  # Nationality
    'constructor': [47],                  # Constructor
    'grid': [1],                          # Start position
    'driver_points': [136],               # Driver points
    'driver_wins': [4],                   # Number of Driver wins
    'driver_standings_pos': [1],          # Driver standing position
    'constructor_points': [239],          # Constructor points
    'constructor_wins': [4],              # Number of Constructor wins
    'constructor_standings_pos': [1],     # Constructor standing position
    'qualifying_time': [88.194],          # Qualifying time
    'driver_age': [27],                   # Driver age
}

# Preparing data for new race
new_race_df = pd.DataFrame(new_race_data)

# Prediction
predicted_podium = rf.predict(new_race_df)
print(rf)

# Print podium prediction
print("Predicted podium:", predicted_podium)

RandomForestClassifier(max_depth=90, min_samples_split=20, n_estimators=1600)
Predicted podium: [1]


- Testing model with another data

In [63]:
rf = RandomForestClassifier(n_estimators=1600,min_samples_split=20,min_samples_leaf=1,max_features='sqrt',max_depth=90,bootstrap=True)

rf.fit(X_train, y_train)

new_race_data = {
    'season': [2024],                     # New Season
    'round': [7],                         # New Round
    'circuit_id': [3],                    # New Circuit id
    'weather_warm': [True],               #
    'weather_cold': [False],              #
    'weather_dry': [False],               # Weather condition
    'weather_wet': [False],               #
    'weather_cloudy': [False],            #
    'driver': [235],                      # Driver
    'nationality': [9],                   # Nationality
    'constructor': [9],                   # Constructor
    'grid': [17],                         # Start position
    'driver_points': [0],                 # Driver points
    'driver_wins': [0],                   # Number of Driver wins
    'driver_standings_pos': [18],         # Driver standing position
    'constructor_points': [0],            # Constructor points
    'constructor_wins': [0],              # Number of Constructor wins
    'constructor_standings_pos': [10],    # Constructor standing position
    'qualifying_time': [88.194],          # Qualifying time
    'driver_age': [25],                   # Driver age
}

# Preparing data for new race
new_race_df = pd.DataFrame(new_race_data)

# Prediction
predicted_podium = rf.predict(new_race_df)
print(rf)

# Print podium prediction
print("Predicted podium:", predicted_podium)

RandomForestClassifier(max_depth=90, min_samples_split=20, n_estimators=1600)
Predicted podium: [16]


- Save model to .joblib file

In [64]:
from joblib import dump, load

rf = RandomForestClassifier(n_estimators=1600,min_samples_split=20,min_samples_leaf=1,max_features='sqrt',max_depth=90,bootstrap=True)

rf.fit(X_train, y_train)

dump(rf, 'f1_prediction.joblib')

['f1_prediction.joblib']

- Testing with model from .joblib file

In [65]:
loaded_model = load('f1_prediction.joblib')

new_race_data = {
    'season': [2024],                     # New Season
    'round': [7],                         # New Round
    'circuit_id': [3],                    # New Circuit id
    'weather_warm': [True],               #
    'weather_cold': [False],              #
    'weather_dry': [False],               # Weather condition
    'weather_wet': [False],               #
    'weather_cloudy': [False],            #
    'driver': [235],                      # Driver
    'nationality': [9],                   # Nationality
    'constructor': [9],                   # Constructor
    'grid': [17],                         # Start position
    'driver_points': [0],                 # Driver points
    'driver_wins': [0],                   # Number of Driver wins
    'driver_standings_pos': [18],         # Driver standing position
    'constructor_points': [0],            # Constructor points
    'constructor_wins': [0],              # Number of Constructor wins
    'constructor_standings_pos': [10],    # Constructor standing position
    'qualifying_time': [1.126],          # Qualifying time
    'driver_age': [25],                   # Driver age
}

# Preparing data for new race
new_race_df = pd.DataFrame(new_race_data)

# Prediction
predicted_podium = loaded_model.predict(new_race_df)

# Print podium prediction
print("Predicted podium:", predicted_podium[0])

Predicted podium: 15


- Check mapping after encoder

In [66]:
circuit_id_reverse_mapping['marina_bay']

28

In [67]:
driver_reverse_mapping['sargeant']

195

In [68]:
nationality_reverse_mapping['French']

15

In [69]:
constructor_reverse_mapping['alpine']

3

#### Create Dataframe to look up related data for particular driver

In [70]:
print(weather_info2024['circuit_id'][23])

yas_marina


- Create data dict for mapping

In [71]:
#merge df

df1_2024 = pd.merge(races2024, weather_info2024, how='inner',
               on=['season', 'round', 'circuit_id']).drop(['lat', 'long','country','weather'],
                                                          axis = 1)
df2_2024 = pd.merge(df1_2024, results2024, how='inner',
               on=['season', 'round', 'circuit_id']).drop(['url','points', 'status', 'time'],
                                                                 axis = 1)
df3_2024 = pd.merge(df2_2024, driver_standings2024, how='left',
               on=['season', 'round', 'driver'])
df4_2024 = pd.merge(df3_2024, constructor_standings2024, how='left',
               on=['season', 'round', 'constructor']) #from 1958

final_df2024 = pd.merge(df4_2024, qualifying_results2024, how='inner',
                    on=['season', 'round', 'grid']) #from 1983


# calculate age of drivers

from dateutil.relativedelta import *
final_df2024['date'] = pd.to_datetime(final_df2024.date)
final_df2024['date_of_birth'] = pd.to_datetime(final_df2024.date_of_birth)
final_df2024['driver_age'] = final_df2024.apply(lambda x:
                                        relativedelta(x['date'], x['date_of_birth']).years, axis=1)
final_df2024.drop(['date', 'date_of_birth'], axis = 1, inplace = True)


# fill/drop nulls

for col in ['driver_points', 'driver_wins', 'driver_standings_pos', 'constructor_points',
            'constructor_wins' , 'constructor_standings_pos']:
    final_df2024[col].fillna(0, inplace = True)
    final_df2024[col] = final_df2024[col].map(lambda x: int(x))

final_df2024.dropna(inplace = True )


# convert to boolean to save space

for col in ['weather_warm', 'weather_cold','weather_dry', 'weather_wet', 'weather_cloudy']:
    final_df2024[col] = final_df2024[col].map(lambda x: bool(x))

print(final_df2024)
# calculate difference in qualifying times

final_df2024['qualifying_time'] = final_df2024.qualifying_time.map(lambda x: float(x) if ':' not in str(x)
                             else(float(str(x).split(':')[1]) +
                                  (60 * float(str(x).split(':')[0])) if x != 0 else 0))
print(final_df2024['qualifying_time'])
final_df2024 = final_df2024[final_df2024['qualifying_time'] != 0]
final_df2024.sort_values(['season', 'round', 'grid'], inplace = True)
final_df2024['qualifying_time_diff'] = final_df2024.groupby(['season', 'round']).qualifying_time.diff()
final_df2024['qualifying_time'] = final_df2024.groupby(['season',
                                                'round']).qualifying_time_diff.cumsum().fillna(0)
final_df2024.drop('qualifying_time_diff', axis = 1, inplace = True)


# get dummies

df_dum2024 = pd.get_dummies(final_df2024, columns = ['circuit_id', 'nationality', 'constructor'] )

for col in df_dum2024.columns:
    if 'nationality' in col and df_dum2024[col].sum() < 140:
        df_dum2024.drop(col, axis = 1, inplace = True)

    elif 'constructor' in col and df_dum2024[col].sum() < 140:
        df_dum2024.drop(col, axis = 1, inplace = True)

    elif 'circuit_id' in col and df_dum2024[col].sum() < 70:
        df_dum2024.drop(col, axis = 1, inplace = True)

    else:
        pass


     season  round circuit_id  weather_warm  weather_cold  weather_dry  \
0      2024      1    bahrain          True         False        False   
1      2024      1    bahrain          True         False        False   
2      2024      1    bahrain          True         False        False   
3      2024      1    bahrain          True         False        False   
4      2024      1    bahrain          True         False        False   
..      ...    ...        ...           ...           ...          ...   
114    2024      6      miami         False         False        False   
115    2024      6      miami         False         False        False   
116    2024      6      miami         False         False        False   
117    2024      6      miami         False         False        False   
118    2024      6      miami         False         False        False   

     weather_wet  weather_cloudy           driver nationality  ...  \
0          False           False   max_ve

- Remove unnecessary columns in data dict

In [72]:
cleaned_data2024 = final_df2024.copy()

data = cleaned_data2024.drop(['season', 'weather_warm', 'weather_cold', 'weather_dry', 'weather_wet', 'weather_cloudy',
                          'grid', 'podium', 'car', 'driver_points', 'driver_wins', 'driver_standings_pos',
                          'constructor_points', 'constructor_wins', 'constructor_standings_pos', 'round', 'circuit_id'], axis=1)
data = data.drop_duplicates(subset='driver')

# Delete personal name abbreviations
data['driver_name'] = data['driver_name'].apply(lambda x: x.split()[0] + ' ' + x.split()[1])

data

Unnamed: 0,driver,nationality,constructor,driver_name,qualifying_time,driver_age
0,max_verstappen,Dutch,red_bull,Max Verstappen,0.0,26
3,leclerc,Monegasque,ferrari,Charles Leclerc,0.228,26
4,russell,British,mercedes,George Russell,0.306,26
2,sainz,Spanish,ferrari,Carlos Sainz,0.328,29
1,perez,Mexican,red_bull,Sergio Perez,0.358,34
8,alonso,Spanish,aston_martin,Fernando Alonso,0.363,42
5,norris,British,mclaren,Lando Norris,0.435,24
7,piastri,Australian,mclaren,Oscar Piastri,0.504,22
6,hamilton,British,mercedes,Lewis Hamilton,0.531,39
15,hulkenberg,German,haas,Nico Hulkenberg,1.323,36


- Look up any driver

In [73]:
dn = 'Oliver Bearman'
tay_dua = data.loc[data['driver_name'] == dn]
print(tay_dua.at[tay_dua.index[0], 'driver'])
print(tay_dua.at[tay_dua.index[0], 'nationality'])
print(tay_dua.at[tay_dua.index[0], 'constructor'])
print(tay_dua.at[tay_dua.index[0], 'qualifying_time'])
print(tay_dua.at[tay_dua.index[0], 'driver_age'])

bearman
British
ferrari
1.1699999999999875
18


- Define functions to new data pre-processing

In [74]:
data_rounds = pd.read_csv('Schedule_2024.csv')
rounds_2024 = list(data_rounds['name_round'])
rounds_2024 = rounds_2024[1:]
rounds_2024[0]

def get_round(name_round):
    nr = name_round.split(' ')
    return nr[1]

def get_circuit(round):
    info = pd.read_csv('weathers2024.csv')
    return info['circuit_id'][round-1]

def get_or_create_circuit_id(circuit):
  if circuit not in circuit_id_reverse_mapping:
    max_existing_value = max(circuit_id_reverse_mapping.values())

    # Create new value for new label by increase max value to 1
    new_label_value = max_existing_value + 1

    # Add new label into mapping with new value
    circuit_id_reverse_mapping[circuit] = new_label_value
    return new_label_value
  else:
    return circuit_id_reverse_mapping[circuit]

def get_or_create_driver_id(driver):
  if driver not in driver_reverse_mapping:
    max_existing_value = max(driver_reverse_mapping.values())

    # Create new value for new label by increase max value to 1
    new_label_value = max_existing_value + 1

    # Add new label into mapping with new value
    driver_reverse_mapping[driver] = new_label_value
    return new_label_value
  else:
    return driver_reverse_mapping[driver]

def get_or_create_nationality_id(nationality):
  if nationality not in nationality_reverse_mapping:
    max_existing_value = max(nationality_reverse_mapping.values())

    # Create new value for new label by increase max value to 1
    new_label_value = max_existing_value + 1

    # Add new label into mapping with new value
    nationality_reverse_mapping[nationality] = new_label_value
    return new_label_value
  else:
    return nationality_reverse_mapping[nationality]

def get_or_create_constructor_id(constructor):
  if constructor not in constructor_reverse_mapping:
    max_existing_value = max(constructor_reverse_mapping.values())

    # Create new value for new label by increase max value to 1
    new_label_value = max_existing_value + 1

    # Add new label into mapping with new value
    constructor_reverse_mapping[constructor] = new_label_value
    return new_label_value
  else:
    return constructor_reverse_mapping[constructor]



c = get_or_create_circuit_id('bahrain')
print(c)
d = get_or_create_driver_id('bearman')
print(d)
e = get_or_create_nationality_id('Laos')
print(e)
g = get_or_create_constructor_id('rb')
print(g)

3
237
35
66


- Install Gradio Library to run and deploy program

In [75]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.31.0-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==0.16.2 (from gradio)
  Downloading gradio_client-0.16.2-py3-none-any.whl (315 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━

## Main program

In [76]:
import gradio as gr
import pandas as pd

# Podium prediction function
def podium_prediction(round, driver, grid):
    # Get round
    vong_dua = int(get_round(round))

    # Get circuit id
    circuit = get_circuit(vong_dua)
    circuit_id = get_or_create_circuit_id(circuit)

    # Get driver and constructor information
    tay_dua = data.loc[data['driver_name'] == driver]
    driver_id = tay_dua.at[tay_dua.index[0], 'driver']
    nationality_id = tay_dua.at[tay_dua.index[0], 'nationality']
    constructor_id = tay_dua.at[tay_dua.index[0], 'constructor']
    qualifying_time = tay_dua.at[tay_dua.index[0], 'qualifying_time']
    age = tay_dua.at[tay_dua.index[0], 'driver_age']

    loaded_model = load('f1_prediction.joblib')

    driver_id = get_or_create_driver_id(driver_id)
    nationality_id = get_or_create_nationality_id(nationality_id)
    constructor_id = get_or_create_constructor_id(constructor_id)


    new_race_data = {
        'season': [2024],                        # Season
        'round': [vong_dua],                     # New round
        'circuit_id': [circuit_id],              # Circuit id
        'weather_warm': [True],                  #
        'weather_cold': [False],                 #
        'weather_dry': [False],                  # Weather condition (default: weather_warm)
        'weather_wet': [False],                  #
        'weather_cloudy': [False],               #
        'driver': [driver_id],                   # Driver
        'nationality': [nationality_id],         # Nationality
        'constructor': [constructor_id],         # Constructor
        'grid': [grid],                          # Start position
        'driver_points': [0],                    # Driver point
        'driver_wins': [0],                      # Number of Driver wins
        'driver_standings_pos': [1],             # Driver standings position
        'constructor_points': [0],               # Constructor point
        'constructor_wins': [0],                 # Number of Constructor wins
        'constructor_standings_pos': [10],       # Constructor standings position
        'qualifying_time': [qualifying_time],    # Qualifying Times
        'driver_age': [age],                     # Driver age
    }

    # Prepare Dataframe for new round
    new_race_df = pd.DataFrame(new_race_data)

    # Predict and return result
    predicted_podium = loaded_model.predict(new_race_df)

    return predicted_podium[0]


# Read drivers.csv file to get name of all drivers
data_drivers = pd.read_csv('drivers.csv')
drivers = list(data_drivers['name'])

# Read Schedule_2024.csv to get all round in 2024 season
data_rounds = pd.read_csv('Schedule_2024.csv')
rounds = list(data_rounds['name_round'])
rounds = rounds[1:]

pos = [i for i in range(1, 21)]


# Gradio
gr.Interface(
    fn=podium_prediction,
    inputs=[
        gr.Dropdown(rounds, label="Round"),
        gr.Dropdown(drivers, label="Driver"),
        gr.Dropdown(pos, label="Start position")
    ],
    outputs= gr.Textbox(label="Prediction result"),
    title="F1 Prediction Application",
    description="Select round, driver, and start position, then return podium of driver."
).launch(debug=False)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://ff1ccc441aa8e51ecf.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


