# 1.TAKING RAW_DATA FROM ERGAST BY USING API

## 1.1 BASE URL 

In [1]:
import requests # type: ignore
import json
import pandas as pd
import csv
import numpy as np
BASE_URL="http://ergast.com/api/f1"

   


## 1.2 Result F1 match from 2020 to 2024

In [3]:
DATA=[]
for year in range(2020,2025):
    for round in range(17):
        try:
            response=requests.get(f"{BASE_URL}/{year}/{round}/results.json")
            if(response.status_code==200):
                data=response.json() 
            DATA.append(data) 
            with open(f"RAW_DATA/Result_F1_matches.json", 'w', encoding='utf-8') as file:
                json.dump(DATA, file, ensure_ascii=False, indent=4)
        except requests.exceptions.Timeout:
            print("Timeout error")
        except requests.RequestException as e:
            print(f"Request error: {e}")

## 1.3 Profile of F1 players

In [52]:
DATA=[]
for year in range(2020,2025):
    try:
        response=requests.get(f"{BASE_URL}/{year}/drivers.json")
        if(response.status_code==200):
            data=response.json()
        DATA.append(data)  
        with open("RAW_DATA/Profile_F1_players.json", 'w', encoding='utf-8') as file:
                json.dump(DATA, file, ensure_ascii=False, indent=4)
    except requests.exceptions.Timeout:
        print("Timeout error")
    except requests.RequestException as e:
        print(f"Request error: {e}")

## 1.4 Ranking F1 players each year

In [None]:
DATA=[]
for year in range(2020,2025):
    try:
        response=requests.get(f"{BASE_URL}/{year}/driverStandings.json")
        if(response.status_code==200):
            data=response.json()
        DATA.append(data)  
        with open(f"RAW_DATA/DriveStanding_F1_players.json", 'w', encoding='utf-8') as file:
            json.dump(DATA, file, ensure_ascii=False, indent=4)
    except requests.exceptions.Timeout:
        print("Timeout error")
    except requests.RequestException as e:
        print(f"Request error: {e}")

## 1.5 Pitstop information of each F1 player between 2020 and 2024

In [None]:
DATA=[]
for year in range(2020,2025):
    for round in range(17):
        try:
            response=requests.get(f"{BASE_URL}/{year}/{round}/pitstops.json")
            if(response.status_code==200):
                data=response.json() 
            DATA.append(data) 
            with open(f"RAW_DATA/Pitstop.json", 'w', encoding='utf-8') as file:
                json.dump(DATA, file, ensure_ascii=False, indent=4)
        except requests.exceptions.Timeout:
            print("Timeout error")
        except requests.RequestException as e:
            print(f"Request error: {e}")

## 1.6 Qualifying Results of F1 player each season from 2020 to 2024

In [None]:
DATA=[]
for year in range(2020,2025):
    for round in range(17):
        try:
            response=requests.get(f"{BASE_URL}/{year}/{round}/qualifying.json")
            if(response.status_code==200):
                data=response.json() 
            DATA.append(data) 
            with open(f"RAW_DATA/Qualifying_information.json", 'w', encoding='utf-8') as file:
                json.dump(DATA, file, ensure_ascii=False, indent=4)
        except requests.exceptions.Timeout:
            print("Timeout error")
        except requests.RequestException as e:
            print(f"Request error: {e}")

# 2.GETTING RAW DATA

In [2]:
with open(f"RAW_DATA/Result_F1_matches.json", "r") as json_file:
        result_data = json.load(json_file)  

In [3]:
with open("RAW_DATA/Profile_F1_players.json", "r") as json_file:
        player_data = json.load(json_file) 

In [4]:
with open(f"RAW_DATA/DriveStanding_F1_players.json", "r") as json_file:
        driveStanding_data=json.load(json_file)

In [5]:
with open(f"RAW_DATA/Pitstop.json", "r") as json_file:
        pitstop_data = json.load(json_file) 

In [6]:
with open(f"RAW_DATA/Qualifying_information.json", "r") as json_file:
        qualifying_data = json.load(json_file) 

# 3.DATA PREPROCESSING

## 3.1 Some information about drivers and constructors

In [7]:

List_driver=set()
columns=["ID"]
for information in player_data:
    for detail in information["MRData"]["DriverTable"]["Drivers"]:
        List_driver.add(detail["driverId"])
List_driver_table=pd.DataFrame(List_driver,columns=columns)
List_driver_table.to_csv("CLEAN_DATA/List_driver.csv",index=False, encoding='utf-8')
print(List_driver_table)

                   ID
0              aitken
1            hamilton
2              alonso
3                zhou
4              kubica
5             piastri
6              vettel
7               gasly
8              stroll
9          hulkenberg
10              kvyat
11         giovinazzi
12            mazepin
13            tsunoda
14               ocon
15    kevin_magnussen
16              sainz
17  pietro_fittipaldi
18             norris
19             latifi
20            leclerc
21            bearman
22           grosjean
23           sargeant
24    mick_schumacher
25     max_verstappen
26          colapinto
27             bottas
28           de_vries
29              albon
30          raikkonen
31             lawson
32          ricciardo
33            russell
34              perez


In [8]:
List_of_constructor=set()
for information in result_data:
    for detail in information["MRData"]["RaceTable"]["Races"]:
        result=detail["Results"]
        for i in range(len(result)):
            List_of_constructor.add(result[i]["Constructor"]["constructorId"])
print(List_of_constructor)

{'haas', 'alphatauri', 'alfa', 'racing_point', 'mclaren', 'williams', 'aston_martin', 'mercedes', 'alpine', 'renault', 'rb', 'sauber', 'red_bull', 'ferrari'}


In [9]:
Constructor_table=pd.DataFrame(List_of_constructor,columns=["ConstructorId"])
Constructor_table.to_csv("CLEAN_DATA/List_constructor.csv",index=False, encoding='utf-8')
print(Constructor_table)

   ConstructorId
0           haas
1     alphatauri
2           alfa
3   racing_point
4        mclaren
5       williams
6   aston_martin
7       mercedes
8         alpine
9        renault
10            rb
11        sauber
12      red_bull
13       ferrari


## 3.2 Driver Ranking

In [10]:
total_point={}
for information in driveStanding_data:
    for detail in information["MRData"]["StandingsTable"]["StandingsLists"]:
        season=detail["season"]
        for i in range(len(detail["DriverStandings"])):
             total_point[detail["DriverStandings"][i]["Driver"]["driverId"],season]=[detail["DriverStandings"][i]["points"],detail["DriverStandings"][i]["wins"]]
list_total_point=[[],[],[],[],[]]
for key, value in total_point.items():
    list_total_point[int(key[1])-2020].append([key[0],key[1],value[0],value[1]])
column=["DriverID","Year","Score","Win"]
for index in range(len(list_total_point)):
    with open("CLEAN_DATA/Rank_driver.csv", mode="a", encoding="utf-8", newline="") as file:
        writer = csv.writer(file)
        writer.writerows(list_total_point[index]) 

        

In [11]:
Total_point_table_0=pd.DataFrame(list_total_point[0],columns=column)
sum_score_2020=[]
for index in range(len(list_total_point[0])):
    sum_score_2020.append(float(list_total_point[0][index][2]))
median_score_2020=np.median(sum_score_2020)
print(Total_point_table_0)  

             DriverID  Year Score Win
0            hamilton  2020   347  11
1              bottas  2020   223   2
2      max_verstappen  2020   214   2
3               perez  2020   125   1
4           ricciardo  2020   119   0
5               sainz  2020   105   0
6               albon  2020   105   0
7             leclerc  2020    98   0
8              norris  2020    97   0
9               gasly  2020    75   1
10             stroll  2020    75   0
11               ocon  2020    62   0
12             vettel  2020    33   0
13              kvyat  2020    32   0
14         hulkenberg  2020    10   0
15          raikkonen  2020     4   0
16         giovinazzi  2020     4   0
17            russell  2020     3   0
18           grosjean  2020     2   0
19    kevin_magnussen  2020     1   0
20             latifi  2020     0   0
21             aitken  2020     0   0
22  pietro_fittipaldi  2020     0   0


In [12]:
Total_point_table_1=pd.DataFrame(list_total_point[1],columns=column)
sum_score_2021=[]
for index in range(len(list_total_point[1])):
    sum_score_2021.append(float(list_total_point[1][index][2]))
median_score_2021=np.median(sum_score_2021)
print(Total_point_table_1)  

           DriverID  Year  Score Win
0    max_verstappen  2021  395.5  10
1          hamilton  2021  387.5   8
2            bottas  2021    226   1
3             perez  2021    190   1
4             sainz  2021  164.5   0
5            norris  2021    160   0
6           leclerc  2021    159   0
7         ricciardo  2021    115   1
8             gasly  2021    110   0
9            alonso  2021     81   0
10             ocon  2021     74   1
11           vettel  2021     43   0
12           stroll  2021     34   0
13          tsunoda  2021     32   0
14          russell  2021     16   0
15        raikkonen  2021     10   0
16           latifi  2021      7   0
17       giovinazzi  2021      3   0
18  mick_schumacher  2021      0   0
19           kubica  2021      0   0
20          mazepin  2021      0   0


In [13]:

Total_point_table_2=pd.DataFrame(list_total_point[2],columns=column)
sum_score_2022=[]
for index in range(len(list_total_point[2])):
    sum_score_2022.append(float(list_total_point[2][index][2]))
median_score_2022=np.median(sum_score_2022)
print(median_score_2022)
print(Total_point_table_2)  

37.0
           DriverID  Year Score Win
0    max_verstappen  2022   454  15
1           leclerc  2022   308   3
2             perez  2022   305   2
3           russell  2022   275   1
4             sainz  2022   246   1
5          hamilton  2022   240   0
6            norris  2022   122   0
7              ocon  2022    92   0
8            alonso  2022    81   0
9            bottas  2022    49   0
10        ricciardo  2022    37   0
11           vettel  2022    37   0
12  kevin_magnussen  2022    25   0
13            gasly  2022    23   0
14           stroll  2022    18   0
15  mick_schumacher  2022    12   0
16          tsunoda  2022    12   0
17             zhou  2022     6   0
18            albon  2022     4   0
19           latifi  2022     2   0
20         de_vries  2022     2   0
21       hulkenberg  2022     0   0


In [14]:

Total_point_table_3=pd.DataFrame(list_total_point[3],columns=column)
sum_score_2023=[]
for index in range(len(list_total_point[3])):
    sum_score_2023.append(float(list_total_point[3][index][2]))
median_score_2023=np.median(sum_score_2023)
print(median_score_2023)
print(Total_point_table_3)  

60.0
           DriverID  Year Score Win
0    max_verstappen  2023   575  19
1             perez  2023   285   2
2          hamilton  2023   234   0
3            alonso  2023   206   0
4           leclerc  2023   206   0
5            norris  2023   205   0
6             sainz  2023   200   1
7           russell  2023   175   0
8           piastri  2023    97   0
9            stroll  2023    74   0
10            gasly  2023    62   0
11             ocon  2023    58   0
12            albon  2023    27   0
13          tsunoda  2023    17   0
14           bottas  2023    10   0
15       hulkenberg  2023     9   0
16        ricciardo  2023     6   0
17             zhou  2023     6   0
18  kevin_magnussen  2023     3   0
19           lawson  2023     2   0
20         sargeant  2023     1   0
21         de_vries  2023     0   0


In [15]:

Total_point_table_4=pd.DataFrame(list_total_point[4],columns=column)
sum_score_2024=[]
for index in range(len(list_total_point[4])):
    sum_score_2024.append(float(list_total_point[4][index][2]))
median_score_2024=np.median(sum_score_2024)
print(median_score_2024)
print(Total_point_table_4)  

26.0
           DriverID  Year Score Win
0    max_verstappen  2024   403   8
1            norris  2024   340   3
2           leclerc  2024   319   3
3           piastri  2024   268   2
4             sainz  2024   259   2
5           russell  2024   217   2
6          hamilton  2024   208   2
7             perez  2024   152   0
8            alonso  2024    62   0
9        hulkenberg  2024    35   0
10          tsunoda  2024    30   0
11            gasly  2024    26   0
12           stroll  2024    24   0
13             ocon  2024    23   0
14  kevin_magnussen  2024    14   0
15            albon  2024    12   0
16        ricciardo  2024    12   0
17          bearman  2024     7   0
18        colapinto  2024     5   0
19           lawson  2024     4   0
20             zhou  2024     0   0
21         sargeant  2024     0   0
22           bottas  2024     0   0


## 3.3 Score of driver per round in each season

In [16]:
result_per_round={}
for information in result_data:
    for detail in information["MRData"]["RaceTable"]["Races"]:
        round=detail["round"]
        season=detail["season"]
        for result in detail["Results"]:
            point=result["points"]
            constructorID=result["Constructor"]["constructorId"]
            result_per_round[(result["Driver"]["driverId"],round,season)]=[constructorID,point]
list_result_point=[[],[],[],[],[]]
for key, value in result_per_round.items():
    list_result_point[int(key[2])-2020].append([key[0],key[1],key[2],value[0],value[1]])
column=["DriverID","Round","Season","Constructor","Score"]
for index in range(len(list_result_point)):
    with open("CLEAN_DATA/Score_of_driver.csv", mode="a", encoding="utf-8", newline="") as file:
        writer = csv.writer(file)
        writer.writerows(list_result_point[index]) 



In [17]:
Total_result_table=pd.DataFrame(list_result_point[0],columns=column)
print(Total_result_table) 


              DriverID Round Season Constructor Score
0               bottas     1   2020    mercedes    25
1              leclerc     1   2020     ferrari    18
2               norris     1   2020     mclaren    16
3             hamilton     1   2020    mercedes    12
4                sainz     1   2020     mclaren    10
..                 ...   ...    ...         ...   ...
315             aitken    16   2020    williams     0
316  pietro_fittipaldi    16   2020        haas     0
317             latifi    16   2020    williams     0
318     max_verstappen    16   2020    red_bull     0
319            leclerc    16   2020     ferrari     0

[320 rows x 5 columns]


In [18]:
Total_result_table=pd.DataFrame(list_result_point[1],columns=column)
print(Total_result_table) 

            DriverID Round Season   Constructor Score
0           hamilton     1   2021      mercedes    25
1     max_verstappen     1   2021      red_bull    18
2             bottas     1   2021      mercedes    16
3             norris     1   2021       mclaren    12
4              perez     1   2021      red_bull    10
..               ...   ...    ...           ...   ...
315           alonso    16   2021        alpine     0
316           latifi    16   2021      williams     0
317           vettel    16   2021  aston_martin     0
318  mick_schumacher    16   2021          haas     0
319          mazepin    16   2021          haas     0

[320 rows x 5 columns]


In [19]:
Total_result_table=pd.DataFrame(list_result_point[2],columns=column)
print(Total_result_table) 

            DriverID Round Season   Constructor Score
0            leclerc     1   2022       ferrari    26
1              sainz     1   2022       ferrari    18
2           hamilton     1   2022      mercedes    15
3            russell     1   2022      mercedes    12
4    kevin_magnussen     1   2022          haas    10
..               ...   ...    ...           ...   ...
315  kevin_magnussen    16   2022          haas     0
316        ricciardo    16   2022       mclaren     0
317           stroll    16   2022  aston_martin     0
318           alonso    16   2022        alpine     0
319           vettel    16   2022  aston_martin     0

[320 rows x 5 columns]


In [20]:
Total_result_table=pd.DataFrame(list_result_point[3],columns=column)
print(Total_result_table) 

           DriverID Round Season   Constructor Score
0    max_verstappen     1   2023      red_bull    25
1             perez     1   2023      red_bull    18
2            alonso     1   2023  aston_martin    15
3             sainz     1   2023       ferrari    12
4          hamilton     1   2023      mercedes    10
..              ...   ...    ...           ...   ...
315           albon    16   2023      williams     0
316        sargeant    16   2023      williams     0
317          stroll    16   2023  aston_martin     0
318           perez    16   2023      red_bull     0
319          bottas    16   2023          alfa     0

[320 rows x 5 columns]


In [21]:
Total_result_table=pd.DataFrame(list_result_point[4],columns=column)
print(Total_result_table) 

           DriverID Round Season   Constructor Score
0    max_verstappen     1   2024      red_bull    26
1             perez     1   2024      red_bull    18
2             sainz     1   2024       ferrari    15
3           leclerc     1   2024       ferrari    12
4           russell     1   2024      mercedes    10
..              ...   ...    ...           ...   ...
314          bottas    16   2024        sauber     0
315      hulkenberg    16   2024          haas     0
316            zhou    16   2024        sauber     0
317          stroll    16   2024  aston_martin     0
318         tsunoda    16   2024            rb     0

[319 rows x 5 columns]


## 3.4 Pitstop of each player 

In [22]:
pitstop_result={}
for information in pitstop_data:
    for detail in information["MRData"]["RaceTable"]["Races"]:
        season=detail["season"]
        round=detail["round"]
        for i in range(len(detail["PitStops"])):
            stop=detail["PitStops"][i]["stop"]
            time=detail["PitStops"][i]["time"]
            duration=detail["PitStops"][i]["duration"]
            lap=detail["PitStops"][i]["lap"]
            pitstop_result[(detail["PitStops"][i]["driverId"],season,round)]=[stop,lap,time,duration]


In [23]:
list_pitstop_point=[[],[],[],[],[]]
for key, value in pitstop_result.items():
    list_pitstop_point[int(key[1])-2020].append([key[0],key[1],key[2],value[0],value[1],value[2],value[3]])
column=["DriverId","Year","Round","Stop","Lap","Time","Duration"]

for index in range(len(list_pitstop_point)):
    with open("CLEAN_DATA/Pitstop_information.csv", mode="a", encoding="utf-8", newline="") as file:
        writer = csv.writer(file)
        writer.writerows(list_pitstop_point[index]) 

In [24]:
Table_pitstop=pd.DataFrame(list_pitstop_point[0],columns=column)
print(Table_pitstop)

       DriverId  Year Round Stop Lap      Time Duration
0      grosjean  2020     1    1  20  15:37:03   23.280
1     raikkonen  2020     1    2  51  16:15:48   21.569
2    giovinazzi  2020     1    2  55  16:22:14   22.800
3       russell  2020     1    1  25  15:42:53   21.417
4        bottas  2020     1    2  56  16:23:58   16.407
..          ...   ...   ...  ...  ..       ...      ...
288        ocon  2020    16    1  41  20:55:57   24.414
289      stroll  2020    16    1  42  20:56:54   24.684
290     russell  2020    16    1  45  20:59:22   24.708
291       albon  2020    16    1  47  21:01:54   23.874
292      bottas  2020    16    1  49  21:03:18   24.552

[293 rows x 7 columns]


In [25]:
Table_pitstop=pd.DataFrame(list_pitstop_point[1],columns=column)
print(Table_pitstop)

            DriverId  Year Round Stop Lap      Time Duration
0              perez  2021     1    2  19  18:40:35   24.105
1              gasly  2021     1    2  19  18:41:17   24.317
2             alonso  2021     1    2  29  18:57:13   24.775
3             norris  2021     1    1  12  18:29:05   24.899
4            leclerc  2021     1    2  32  19:01:40   24.176
..               ...   ...   ...  ...  ..       ...      ...
277           stroll  2021    16    1  39  16:05:17   35.509
278  mick_schumacher  2021    16    1  39  16:07:05   23.892
279       giovinazzi  2021    16    1  40  16:07:22   23.150
280          leclerc  2021    16    1  47  16:17:27   23.646
281         hamilton  2021    16    1  50  16:22:19   22.684

[282 rows x 7 columns]


In [26]:
Table_pitstop=pd.DataFrame(list_pitstop_point[2],columns=column)
print(Table_pitstop)

            DriverId  Year Round Stop Lap      Time Duration
0           hamilton  2022     1    2  27  18:48:44   25.796
1             alonso  2022     1    2  25  18:45:53   24.909
2    mick_schumacher  2022     1    1  12  18:24:08   25.214
3              albon  2022     1    1  13  18:25:47   24.874
4     max_verstappen  2022     1    2  30  18:53:16   24.937
..               ...   ...   ...  ...  ..       ...      ...
295            sainz  2022    16    2  47  16:12:01   24.525
296         hamilton  2022    16    1  33  15:52:05   24.282
297  mick_schumacher  2022    16    1  33  15:52:21   24.817
298           norris  2022    16    2  47  16:12:41   24.784
299           bottas  2022    16    1  35  15:55:18   24.758

[300 rows x 7 columns]


In [27]:
Table_pitstop=pd.DataFrame(list_pitstop_point[3],columns=column)
print(Table_pitstop)

    DriverId  Year Round Stop Lap      Time Duration
0      gasly  2023     1    2  25  18:45:48   24.879
1     norris  2023     1    3  27  18:49:35   33.467
2    tsunoda  2023     1    2  26  18:47:26   24.372
3     bottas  2023     1    2  29  18:52:16   25.643
4      albon  2023     1    2  26  18:47:25   24.920
..       ...   ...   ...  ...  ..       ...      ...
296   norris  2023    16    1  17  14:36:20   23.523
297  leclerc  2023    16    1  17  14:36:23   23.197
298    sainz  2023    16    1  18  14:38:04   23.075
299    gasly  2023    16    1  18  14:38:20   24.488
300  russell  2023    16    1  24  14:48:05   24.034

[301 rows x 7 columns]


In [28]:
Table_pitstop=pd.DataFrame(list_pitstop_point[4],columns=column)
print(Table_pitstop)

           DriverId  Year Round Stop Lap      Time Duration
0        hulkenberg  2024     1    2  20  18:37:08   24.768
1              zhou  2024     1    2  28  18:49:59   25.094
2            stroll  2024     1    2  27  18:48:25   24.545
3              ocon  2024     1    2  30  18:53:30   25.145
4          sargeant  2024     1    2  28  18:51:33   25.070
..              ...   ...   ...  ...  ..       ...      ...
300          stroll  2024    16    3  50  16:15:41   25.269
301  max_verstappen  2024    16    2  41  16:01:54   24.150
302           perez  2024    16    2  35  15:53:38   24.030
303            ocon  2024    16    1  31  15:48:20   24.146
304          bottas  2024    16    1  33  15:51:24   24.410

[305 rows x 7 columns]


## 3.5 Rate of Constructor

In [29]:
stat_of_constructor={key: [] for key in List_of_constructor}
for information in driveStanding_data:
    for detail in information["MRData"]["StandingsTable"]["StandingsLists"]:
        for i in range(len(detail["DriverStandings"])):
            position=detail["DriverStandings"][i]["position"]
            point=detail["DriverStandings"][i]["points"]
            Id=detail["DriverStandings"][i]["Constructors"][0]["constructorId"]
            stat_of_constructor[Id].append([position,point])
print(stat_of_constructor)

{'haas': [['19', '2'], ['20', '1'], ['23', '0'], ['19', '0'], ['21', '0'], ['13', '25'], ['16', '12'], ['16', '9'], ['19', '3'], ['10', '35'], ['15', '14']], 'alphatauri': [['10', '75'], ['14', '32'], ['9', '110'], ['14', '32'], ['14', '23'], ['17', '12'], ['14', '17'], ['17', '6'], ['20', '2'], ['22', '0']], 'alfa': [['16', '4'], ['17', '4'], ['16', '10'], ['18', '3'], ['20', '0'], ['10', '49'], ['18', '6'], ['15', '10'], ['18', '6']], 'racing_point': [['4', '125'], ['11', '75'], ['15', '10']], 'mclaren': [['6', '105'], ['9', '97'], ['6', '160'], ['8', '115'], ['7', '122'], ['11', '37'], ['6', '205'], ['9', '97'], ['2', '340'], ['4', '268']], 'williams': [['18', '3'], ['21', '0'], ['22', '0'], ['15', '16'], ['17', '7'], ['19', '4'], ['20', '2'], ['21', '2'], ['13', '27'], ['21', '1'], ['16', '12'], ['19', '5'], ['22', '0']], 'aston_martin': [['12', '43'], ['13', '34'], ['12', '37'], ['15', '18'], ['22', '0'], ['4', '206'], ['10', '74'], ['9', '62'], ['13', '24']], 'mercedes': [['1', '

In [30]:
Evaluate_constructor={}
for key,value in stat_of_constructor.items():
    point_mean=0
    position_mean=0
    for i in range(len(value)):
        for j in range(len(value[i])):
            if(j==0):
                position_mean+=float(value[i][j])
            elif(j==1):
                point_mean+=float(value[i][j])
    Evaluate_constructor[key]=[position_mean/len(value), point_mean/len(value)]
print(Evaluate_constructor)



{'haas': [17.363636363636363, 9.181818181818182], 'alphatauri': [15.1, 30.9], 'alfa': [16.444444444444443, 10.222222222222221], 'racing_point': [10.0, 70.0], 'mclaren': [6.8, 154.6], 'williams': [18.76923076923077, 6.076923076923077], 'aston_martin': [12.222222222222221, 55.333333333333336], 'mercedes': [4.2, 253.25], 'alpine': [10.875, 62.125], 'renault': [8.5, 90.5], 'rb': [16.0, 15.333333333333334], 'sauber': [22.0, 0.0], 'red_bull': [3.1, 307.85], 'ferrari': [7.090909090909091, 181.77272727272728]}


In [31]:
List_of_constructor_evaluation=[]
for key,value in Evaluate_constructor.items():
    List_of_constructor_evaluation.append([key,value[0],value[1]])
column=["ConstructorID","Position_Average","Point_Average"]
Table_constructor=pd.DataFrame(List_of_constructor_evaluation,columns=column)
Table_constructor.to_csv("CLEAN_DATA/Constructor_stat.csv",index=False, encoding='utf-8')
print(Table_constructor)

   ConstructorID  Position_Average  Point_Average
0           haas         17.363636       9.181818
1     alphatauri         15.100000      30.900000
2           alfa         16.444444      10.222222
3   racing_point         10.000000      70.000000
4        mclaren          6.800000     154.600000
5       williams         18.769231       6.076923
6   aston_martin         12.222222      55.333333
7       mercedes          4.200000     253.250000
8         alpine         10.875000      62.125000
9        renault          8.500000      90.500000
10            rb         16.000000      15.333333
11        sauber         22.000000       0.000000
12      red_bull          3.100000     307.850000
13       ferrari          7.090909     181.772727


## 3.6 TOTAL DATA

In [32]:
Total_data={key: [] for key in List_driver}
#Score season 2020 của các driverID
required_length = 1
for j in range(len(list_total_point[0])):
    Total_data[list_total_point[0][j][0]].append(float(list_total_point[0][j][2]))
for key, values in Total_data.items():
    if(len(values)<required_length):
        Total_data[key].append(median_score_2020)
print(Total_data)
    



{'aitken': [0.0], 'hamilton': [347.0], 'alonso': [62.0], 'zhou': [62.0], 'kubica': [62.0], 'piastri': [62.0], 'vettel': [33.0], 'gasly': [75.0], 'stroll': [75.0], 'hulkenberg': [10.0], 'kvyat': [32.0], 'giovinazzi': [4.0], 'mazepin': [62.0], 'tsunoda': [62.0], 'ocon': [62.0], 'kevin_magnussen': [1.0], 'sainz': [105.0], 'pietro_fittipaldi': [0.0], 'norris': [97.0], 'latifi': [0.0], 'leclerc': [98.0], 'bearman': [62.0], 'grosjean': [2.0], 'sargeant': [62.0], 'mick_schumacher': [62.0], 'max_verstappen': [214.0], 'colapinto': [62.0], 'bottas': [223.0], 'de_vries': [62.0], 'albon': [105.0], 'raikkonen': [4.0], 'lawson': [62.0], 'ricciardo': [119.0], 'russell': [3.0], 'perez': [125.0]}


In [33]:
required_length = 2
#Score season 2021 của các driverID
for j in range(len(list_total_point[1])):
    Total_data[list_total_point[1][j][0]].append(float(list_total_point[1][j][2]))
for key, values in Total_data.items():
    if(len(values)<required_length):
        Total_data[key].append(median_score_2021)
print(Total_data)

{'aitken': [0.0, 74.0], 'hamilton': [347.0, 387.5], 'alonso': [62.0, 81.0], 'zhou': [62.0, 74.0], 'kubica': [62.0, 0.0], 'piastri': [62.0, 74.0], 'vettel': [33.0, 43.0], 'gasly': [75.0, 110.0], 'stroll': [75.0, 34.0], 'hulkenberg': [10.0, 74.0], 'kvyat': [32.0, 74.0], 'giovinazzi': [4.0, 3.0], 'mazepin': [62.0, 0.0], 'tsunoda': [62.0, 32.0], 'ocon': [62.0, 74.0], 'kevin_magnussen': [1.0, 74.0], 'sainz': [105.0, 164.5], 'pietro_fittipaldi': [0.0, 74.0], 'norris': [97.0, 160.0], 'latifi': [0.0, 7.0], 'leclerc': [98.0, 159.0], 'bearman': [62.0, 74.0], 'grosjean': [2.0, 74.0], 'sargeant': [62.0, 74.0], 'mick_schumacher': [62.0, 0.0], 'max_verstappen': [214.0, 395.5], 'colapinto': [62.0, 74.0], 'bottas': [223.0, 226.0], 'de_vries': [62.0, 74.0], 'albon': [105.0, 74.0], 'raikkonen': [4.0, 10.0], 'lawson': [62.0, 74.0], 'ricciardo': [119.0, 115.0], 'russell': [3.0, 16.0], 'perez': [125.0, 190.0]}


In [34]:
required_length = 3
#Score season 2022 của các driverID
for j in range(len(list_total_point[2])):
    Total_data[list_total_point[2][j][0]].append(float(list_total_point[2][j][2]))
for key, values in Total_data.items():
    if(len(values)<required_length):
        Total_data[key].append(median_score_2022)
print(Total_data)

{'aitken': [0.0, 74.0, 37.0], 'hamilton': [347.0, 387.5, 240.0], 'alonso': [62.0, 81.0, 81.0], 'zhou': [62.0, 74.0, 6.0], 'kubica': [62.0, 0.0, 37.0], 'piastri': [62.0, 74.0, 37.0], 'vettel': [33.0, 43.0, 37.0], 'gasly': [75.0, 110.0, 23.0], 'stroll': [75.0, 34.0, 18.0], 'hulkenberg': [10.0, 74.0, 0.0], 'kvyat': [32.0, 74.0, 37.0], 'giovinazzi': [4.0, 3.0, 37.0], 'mazepin': [62.0, 0.0, 37.0], 'tsunoda': [62.0, 32.0, 12.0], 'ocon': [62.0, 74.0, 92.0], 'kevin_magnussen': [1.0, 74.0, 25.0], 'sainz': [105.0, 164.5, 246.0], 'pietro_fittipaldi': [0.0, 74.0, 37.0], 'norris': [97.0, 160.0, 122.0], 'latifi': [0.0, 7.0, 2.0], 'leclerc': [98.0, 159.0, 308.0], 'bearman': [62.0, 74.0, 37.0], 'grosjean': [2.0, 74.0, 37.0], 'sargeant': [62.0, 74.0, 37.0], 'mick_schumacher': [62.0, 0.0, 12.0], 'max_verstappen': [214.0, 395.5, 454.0], 'colapinto': [62.0, 74.0, 37.0], 'bottas': [223.0, 226.0, 49.0], 'de_vries': [62.0, 74.0, 2.0], 'albon': [105.0, 74.0, 4.0], 'raikkonen': [4.0, 10.0, 37.0], 'lawson': [62

In [35]:
required_length = 4
#Score season 2023 của các driverID
for j in range(len(list_total_point[3])):
    Total_data[list_total_point[3][j][0]].append(float(list_total_point[3][j][2]))
for key, values in Total_data.items():
    if(len(values)<required_length):
        Total_data[key].append(median_score_2023)
print(Total_data)

{'aitken': [0.0, 74.0, 37.0, 60.0], 'hamilton': [347.0, 387.5, 240.0, 234.0], 'alonso': [62.0, 81.0, 81.0, 206.0], 'zhou': [62.0, 74.0, 6.0, 6.0], 'kubica': [62.0, 0.0, 37.0, 60.0], 'piastri': [62.0, 74.0, 37.0, 97.0], 'vettel': [33.0, 43.0, 37.0, 60.0], 'gasly': [75.0, 110.0, 23.0, 62.0], 'stroll': [75.0, 34.0, 18.0, 74.0], 'hulkenberg': [10.0, 74.0, 0.0, 9.0], 'kvyat': [32.0, 74.0, 37.0, 60.0], 'giovinazzi': [4.0, 3.0, 37.0, 60.0], 'mazepin': [62.0, 0.0, 37.0, 60.0], 'tsunoda': [62.0, 32.0, 12.0, 17.0], 'ocon': [62.0, 74.0, 92.0, 58.0], 'kevin_magnussen': [1.0, 74.0, 25.0, 3.0], 'sainz': [105.0, 164.5, 246.0, 200.0], 'pietro_fittipaldi': [0.0, 74.0, 37.0, 60.0], 'norris': [97.0, 160.0, 122.0, 205.0], 'latifi': [0.0, 7.0, 2.0, 60.0], 'leclerc': [98.0, 159.0, 308.0, 206.0], 'bearman': [62.0, 74.0, 37.0, 60.0], 'grosjean': [2.0, 74.0, 37.0, 60.0], 'sargeant': [62.0, 74.0, 37.0, 1.0], 'mick_schumacher': [62.0, 0.0, 12.0, 60.0], 'max_verstappen': [214.0, 395.5, 454.0, 575.0], 'colapinto':

In [36]:
required_length = 5
#Score season 2024 của các driverID
for j in range(len(list_total_point[4])):
    Total_data[list_total_point[4][j][0]].append(float(list_total_point[4][j][2]))
for key, values in Total_data.items():
    if(len(values)<required_length):
        Total_data[key].append(median_score_2024)
print(Total_data)

{'aitken': [0.0, 74.0, 37.0, 60.0, 26.0], 'hamilton': [347.0, 387.5, 240.0, 234.0, 208.0], 'alonso': [62.0, 81.0, 81.0, 206.0, 62.0], 'zhou': [62.0, 74.0, 6.0, 6.0, 0.0], 'kubica': [62.0, 0.0, 37.0, 60.0, 26.0], 'piastri': [62.0, 74.0, 37.0, 97.0, 268.0], 'vettel': [33.0, 43.0, 37.0, 60.0, 26.0], 'gasly': [75.0, 110.0, 23.0, 62.0, 26.0], 'stroll': [75.0, 34.0, 18.0, 74.0, 24.0], 'hulkenberg': [10.0, 74.0, 0.0, 9.0, 35.0], 'kvyat': [32.0, 74.0, 37.0, 60.0, 26.0], 'giovinazzi': [4.0, 3.0, 37.0, 60.0, 26.0], 'mazepin': [62.0, 0.0, 37.0, 60.0, 26.0], 'tsunoda': [62.0, 32.0, 12.0, 17.0, 30.0], 'ocon': [62.0, 74.0, 92.0, 58.0, 23.0], 'kevin_magnussen': [1.0, 74.0, 25.0, 3.0, 14.0], 'sainz': [105.0, 164.5, 246.0, 200.0, 259.0], 'pietro_fittipaldi': [0.0, 74.0, 37.0, 60.0, 26.0], 'norris': [97.0, 160.0, 122.0, 205.0, 340.0], 'latifi': [0.0, 7.0, 2.0, 60.0, 26.0], 'leclerc': [98.0, 159.0, 308.0, 206.0, 319.0], 'bearman': [62.0, 74.0, 37.0, 60.0, 7.0], 'grosjean': [2.0, 74.0, 37.0, 60.0, 26.0], 

In [37]:
List_driver_per_round=set()
for driverId in List_driver:
    for round in range(1,17):
        List_driver_per_round.add((driverId,round))

In [None]:
# Thêm feature score trung bình của mỗi round 1 đến 16
Score_per_round={key: [] for key in List_driver_per_round}
for index in range(len(list_result_point)):
    for j in range(len(list_result_point[index])):
        driverID=list_result_point[index][j][0]
        round=list_result_point[index][j][1]
        score=list_result_point[index][j][4]
        Score_per_round[(driverID,int(round))].append(float(score))
for key,value in Total_data.items():
    for round in range(1,17):
        Total_data[key].append(np.mean(Score_per_round[(key,round)]))
print(Total_data)

{'aitken': [0.0, 74.0, 37.0, 60.0, 26.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 0.0], 'hamilton': [347.0, 387.5, 240.0, 234.0, 208.0, 13.6, 11.4, 16.2, 12.0, 8.8, 11.2, 14.6, 12.0, 14.0, 17.2, 16.4, 17.9, 17.4, 11.6, 16.4, 10.0], 'alonso': [62.0, 81.0, 81.0, 206.0, 62.0, 4.75, 6.5, 5.75, 5.0, 5.5, 7.5, 4.0, 6.5, 5.25, 5.5, 3.75, 5.5, 7.75, 5.0, 4.25, 1.0], 'zhou': [62.0, 74.0, 6.0, 6.0, 0.0, 0.3333333333333333, 0.0, 0.6666666666666666, 0.0, 0.0, 0.0, 0.6666666666666666, 0.0, 1.3333333333333333, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3333333333333333], 'kubica': [62.0, 0.0, 37.0, 60.0, 26.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 0.0, 0.0, nan, nan], 'piastri': [62.0, 74.0, 37.0, 97.0, 268.0, 2.0, 6.0, 8.0, 2.0, 2.0, 0.5, 6.0, 9.0, 5.0, 9.0, 14.0, 6.0, 13.5, 9.0, 9.0, 16.5], 'vettel': [33.0, 43.0, 37.0, 60.0, 26.0, 0.5, 0.0, 2.6666666666666665, 1.6666666666666667, 3.3333333333333335, 8.0, 1.0, 2.6666666666666665, 0.3333333333333333, 0.66

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [None]:
# Đổi giờ ra thành đơn vị giây
def time_to_seconds(time_str):
    if ":" in time_str:
        # Tách phút và giây
        minutes, seconds = time_str.split(":")
        # Chuyển đổi sang tổng số giây
        return float(minutes) * 60 + float(seconds)
    return float(time_str)
    

In [None]:
# Thêm stop_times và duration từ round 1 đến round 16
Stop_times_per_round={key: [] for key in List_driver_per_round}
duration_per_round={key: [] for key in List_driver_per_round}
for index in range(len(list_pitstop_point)):
    for j in range(len(list_pitstop_point[index])):
        driverID=list_pitstop_point[index][j][0]
        round=list_pitstop_point[index][j][2]
        stop_times=list_pitstop_point[index][j][3]
        duration=list_pitstop_point[index][j][6]
        Stop_times_per_round[(driverID,int(round))].append(float(stop_times))
        duration_per_round[(driverID,int(round))].append(time_to_seconds(duration))
for key,value in Total_data.items():
    for round in range(1,17):
        Total_data[key].append(np.mean(Stop_times_per_round[(key,round)]))
        Total_data[key].append(np.mean(duration_per_round[(key,round)]))
print(Total_data)

53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
53
{'aitken': [0.0, 74.0, 37.0, 60.0, 26.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 0.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 2.0, 25.688], 'hamilton': [347.0, 387.5, 240.0, 234.0, 208.0, 13.6, 11.4, 16.2, 12.0, 8.8, 11.2, 14.6, 12.0, 14.0, 17.2, 16.4, 17.9, 17.4, 11.6, 16.4, 10.0, 1.8, 23.2028, 1.2, 24.5974, 1.2, 201.0276, 1.2, 336.92859999999996, 1.4, 23.256400000000003, 1.6, 21.941200000000002, 1.2, 26.038, 11.8, 321.46320000000003, 2.0, 305.63539999999995, 1.0, 1037.525, 1.2, 307.8004, 1.5, 29.352999999999998, 2.0, 22.7412, 1.25, 24.109499999999997, 1.8, 25.5518, 1.25, 23.9905], 'alonso': [62.0, 81.0, 81.0, 206.0, 62.0, 4.75, 6.5, 5.75, 5.0, 5.5, 7.5, 4.0, 6.5, 5.25, 5.5, 3.75, 5.5, 7.75, 5.0, 4.25, 1.0, 1.5, 25.0675, 1.0, 25.35375, 1.2

In [None]:
#Thêm feature constructor
required_length = 54
for j in range(len(list_result_point[4])):
    if(len( Total_data[list_result_point[4][j][0]])!=required_length):
        Total_data[list_result_point[4][j][0]].append(list_result_point[4][j][3])
for key, values in Total_data.items():
    if(len(values)<required_length):
        Total_data[key].append("None")
print(Total_data)  

{'aitken': [0.0, 74.0, 37.0, 60.0, 26.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 0.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 2.0, 25.688, 'None'], 'hamilton': [347.0, 387.5, 240.0, 234.0, 208.0, 13.6, 11.4, 16.2, 12.0, 8.8, 11.2, 14.6, 12.0, 14.0, 17.2, 16.4, 17.9, 17.4, 11.6, 16.4, 10.0, 1.8, 23.2028, 1.2, 24.5974, 1.2, 201.0276, 1.2, 336.92859999999996, 1.4, 23.256400000000003, 1.6, 21.941200000000002, 1.2, 26.038, 11.8, 321.46320000000003, 2.0, 305.63539999999995, 1.0, 1037.525, 1.2, 307.8004, 1.5, 29.352999999999998, 2.0, 22.7412, 1.25, 24.109499999999997, 1.8, 25.5518, 1.25, 23.9905, 'mercedes'], 'alonso': [62.0, 81.0, 81.0, 206.0, 62.0, 4.75, 6.5, 5.75, 5.0, 5.5, 7.5, 4.0, 6.5, 5.25, 5.5, 3.75, 5.5, 7.75, 5.0, 4.25, 1.0, 1.5, 25.0675, 1.0, 25.35375, 1.25, 246.2365, 1.3333333333333333, 22.72966666666667, 1.25, 22.68975, 1.75, 22.54675, 1

In [None]:
#Tạo label cho mỗi feature
column=[]
for season in range(2020,2025):
    column.append(f"Score_season_{season}")
for round in range(1,17):
    column.append(f"Score_round_{round}")
for round in range(1,17):
    column.append(f"Stop_times_round_{round}")
    column.append(f"duration_round_{round}")
column.append("Contructor")

In [47]:
Table_feature=pd.DataFrame.from_dict(Total_data,orient="index",columns=column)
Table_feature.reset_index(inplace=True)
Table_feature.rename(columns={"index": "Driver Name"}, inplace=True)
print(Table_feature)
Table_feature.to_csv("CLEAN_DATA/Features.csv",index=False, encoding='utf-8')


          Driver Name  Score_season_2020  Score_season_2021  \
0              aitken                0.0               74.0   
1            hamilton              347.0              387.5   
2              alonso               62.0               81.0   
3                zhou               62.0               74.0   
4              kubica               62.0                0.0   
5             piastri               62.0               74.0   
6              vettel               33.0               43.0   
7               gasly               75.0              110.0   
8              stroll               75.0               34.0   
9          hulkenberg               10.0               74.0   
10              kvyat               32.0               74.0   
11         giovinazzi                4.0                3.0   
12            mazepin               62.0                0.0   
13            tsunoda               62.0               32.0   
14               ocon               62.0               