### Inspect F1 data

In [3]:
import pandas as pd
import sqlite3
from sklearn import tree

In [4]:
# Connect to the SQLite database
conn = sqlite3.connect('F1_Prediction.db')

# Read the data from the database into pandas dataframes
race_results = pd.read_sql_query("SELECT * FROM RACE_RESULTS", conn)
circuits = pd.read_sql_query("SELECT DISTINCT CIRCUIT_NAME, CIRCUIT_TYPE FROM CIRCUITS", conn)


In [6]:
race_results.shape

(3220, 6)

In [7]:
race_results.head()

Unnamed: 0,PK,DRIVER,TEAM,POSITION,CIRCUIT,YEAR
0,1,M VERSTAPPEN,Red Bull Racing,1.0,Bahrain Grand Prix,2023
1,2,S PEREZ,Red Bull Racing,2.0,Bahrain Grand Prix,2023
2,3,F ALONSO,Aston Martin,3.0,Bahrain Grand Prix,2023
3,4,C SAINZ,Ferrari,4.0,Bahrain Grand Prix,2023
4,5,L HAMILTON,Mercedes,5.0,Bahrain Grand Prix,2023


In [8]:
circuits.head()

Unnamed: 0,CIRCUIT_NAME,CIRCUIT_TYPE
0,Bahrain Grand Prix,Low-Speed
1,Saudi Arabian Grand Prix,Other
2,Australian Grand Prix,Other
3,Azerbaijan Grand Prix,High Speed
4,Miami Grand Prix,Street


In [9]:
circuits.shape

(21, 2)

In [19]:

# Merge the dataframes based on the circuit name
data = pd.merge(race_results, circuits, left_on='CIRCUIT', right_on='CIRCUIT_NAME')
data.head()



Unnamed: 0,PK,DRIVER,TEAM,POSITION,CIRCUIT,YEAR,CIRCUIT_NAME,CIRCUIT_TYPE
0,1,M VERSTAPPEN,Red Bull Racing,1.0,Bahrain Grand Prix,2023,Bahrain Grand Prix,Low-Speed
1,2,S PEREZ,Red Bull Racing,2.0,Bahrain Grand Prix,2023,Bahrain Grand Prix,Low-Speed
2,3,F ALONSO,Aston Martin,3.0,Bahrain Grand Prix,2023,Bahrain Grand Prix,Low-Speed
3,4,C SAINZ,Ferrari,4.0,Bahrain Grand Prix,2023,Bahrain Grand Prix,Low-Speed
4,5,L HAMILTON,Mercedes,5.0,Bahrain Grand Prix,2023,Bahrain Grand Prix,Low-Speed


In [20]:
data.isnull().sum()

PK               0
DRIVER           0
TEAM             0
POSITION        20
CIRCUIT          0
YEAR             0
CIRCUIT_NAME     0
CIRCUIT_TYPE     0
dtype: int64

In [21]:
data[data['POSITION'].isna()]

Unnamed: 0,PK,DRIVER,TEAM,POSITION,CIRCUIT,YEAR,CIRCUIT_NAME,CIRCUIT_TYPE
2780,1941,C LECLERC,Ferrari,,Mexico City Grand Prix,2023,Mexico City Grand Prix,High-Elevation
2781,1942,C SAINZ,Ferrari,,Mexico City Grand Prix,2023,Mexico City Grand Prix,High-Elevation
2782,1943,M VERSTAPPEN,Red Bull Racing,,Mexico City Grand Prix,2023,Mexico City Grand Prix,High-Elevation
2783,1944,D RICCIARDO,AlphaTauri,,Mexico City Grand Prix,2023,Mexico City Grand Prix,High-Elevation
2784,1945,S PEREZ,Red Bull Racing,,Mexico City Grand Prix,2023,Mexico City Grand Prix,High-Elevation
2785,1946,L HAMILTON,Mercedes,,Mexico City Grand Prix,2023,Mexico City Grand Prix,High-Elevation
2786,1947,O PIASTRI,McLaren,,Mexico City Grand Prix,2023,Mexico City Grand Prix,High-Elevation
2787,1948,G RUSSELL,Mercedes,,Mexico City Grand Prix,2023,Mexico City Grand Prix,High-Elevation
2788,1949,V BOTTAS,Alfa Romeo,,Mexico City Grand Prix,2023,Mexico City Grand Prix,High-Elevation
2789,1950,G ZHOU,Alfa Romeo,,Mexico City Grand Prix,2023,Mexico City Grand Prix,High-Elevation


In [22]:
# Positions have not been filled in for Mexico City Grand Prix yet so drop from the dataset
data.dropna(inplace=True)

In [23]:
data.isnull().sum()

PK              0
DRIVER          0
TEAM            0
POSITION        0
CIRCUIT         0
YEAR            0
CIRCUIT_NAME    0
CIRCUIT_TYPE    0
dtype: int64

In [24]:
# Preprocess your data if necessary, encoding categorical variables?
# Have not done any of this yet

# Define features and target variable
X = data[['DRIVER', 'TEAM', 'CIRCUIT_TYPE']]
y = data['POSITION']

# Convert categorical variables into numerical values using one-hot encoding
X = pd.get_dummies(X)

In [25]:
X

Unnamed: 0,DRIVER_A ALBON,DRIVER_A GIOVINAZZI,DRIVER_C LECLERC,DRIVER_C SAINZ,DRIVER_D KVYAT,DRIVER_D RICCIARDO,DRIVER_E OCON,DRIVER_F ALONSO,DRIVER_G RUSSELL,DRIVER_G ZHOU,...,TEAM_Red Bull Racing,TEAM_Renault,TEAM_Williams,TEAM_nan,CIRCUIT_TYPE_High Speed,CIRCUIT_TYPE_High-Elevation,CIRCUIT_TYPE_Intermediate,CIRCUIT_TYPE_Low-Speed,CIRCUIT_TYPE_Other,CIRCUIT_TYPE_Street
0,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3215,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3216,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,True,False,False
3217,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3218,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [26]:

# Change n_jobs to modify how many CPU's are being utilized
model = tree.DecisionTreeClassifier()

# Fit the model to your data
model.fit(X, y)



In [27]:
# mark a 1 for the values we would like to predict, refer to the table to know which circuit type to use. 
# for example Albon = 1, Williams = 1 and Street type circuit = 1 for las vegas. Alex Albon would be projected 14th.
upcoming_race_df = pd.DataFrame({
    'DRIVER_A ALBON': [1], 'DRIVER_A GIOVINAZZI': [0],
    'DRIVER_C LECLERC': [0], 'DRIVER_C SAINZ': [0], 'DRIVER_D KVYAT': [0],
    'DRIVER_D RICCIARDO': [0], 'DRIVER_E OCON': [0], 'DRIVER_F ALONSO': [0],
    'DRIVER_G RUSSELL': [0], 'DRIVER_G ZHOU': [0], 'DRIVER_K MAGNUSSEN': [0],
    'DRIVER_K RAIKKONEN': [0],'DRIVER_L HAMILTON': [0], 'DRIVER_L LAWSON': [0], 'DRIVER_L NORRIS': [0],
    'DRIVER_L SARGEANT': [0], 'DRIVER_L STROLL': [0], 'DRIVER_M SCHUMACHER': [0],
    'DRIVER_M VERSTAPPEN': [0], 'DRIVER_N DE VRIES': [0], 'DRIVER_N HULKENBERG': [0],
    'DRIVER_N LATIFI': [0], 'DRIVER_N MAZEPIN': [0], 'DRIVER_O PIASTRI': [0],
    'DRIVER_P FITTIPALDI': [0], 'DRIVER_P GASLY': [0], 'DRIVER_R GROSJEAN': [0],
    'DRIVER_R KUBICA': [0], 'DRIVER_S PEREZ': [0], 'DRIVER_S VETTEL': [0],
    'DRIVER_V BOTTAS': [0],'DRIVER_Y TSUNODA': [0], 'DRIVER_nan': [0],
    'TEAM_Alfa Romeo': [0], 'TEAM_Alfa Romeo Racing': [0], 'TEAM_AlphaTauri':[1],
    'TEAM_Alpine':[0], 'TEAM_Aston Martin': [0], 'TEAM_Ferrari': [0], 'TEAM_Haas F1 Team': [0],
    'TEAM_McLaren': [0], 'TEAM_Mercedes': [0], 'TEAM_Racing Point': [0], 'TEAM_Red Bull Racing': [1], 
    'TEAM_Renault': [0], 'TEAM_Williams': [1], 'TEAM_nan': [0],
    'CIRCUIT_TYPE_High Speed': [0], 'CIRCUIT_TYPE_High-Elevation': [0],
    'CIRCUIT_TYPE_Intermediate': [0], 'CIRCUIT_TYPE_Low-Speed': [0],
    'CIRCUIT_TYPE_Other': [0], 'CIRCUIT_TYPE_Street': [1]
})

# Make predictions using the trained model
predicted_positions = model.predict(upcoming_race_df)

# Print the predicted positions for the upcoming race
print("Predicted positions for the upcoming race in Las Vegas:")
print(predicted_positions)


Predicted positions for the upcoming race in Las Vegas:
[14.]
