In [43]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

In [44]:
# Read the four pre-cleaned CSV tables (paths are relative to the working
# directory), in the order: races, sprint results, race results, drivers.
races, sprint_results, results, drivers = map(
    pd.read_csv,
    (
        'cleaned_races.csv',
        'cleaned_sprint_results.csv',
        'cleaned_results.csv',
        'cleaned_drivers.csv',
    ),
)

In [45]:
# Keep only the rows of `races` whose `year` equals 2022, and gather their
# `raceId` values into a set for the membership filters applied to other tables.
in_2022 = races['year'].eq(2022)
races_2022 = races.loc[in_2022]
race_ids_2022 = set(races_2022['raceId'])
print(f"Number of races in 2022 season: {len(races_2022)}")

Number of races in 2022 season: 22


In [46]:
# Restrict the sprint results to rows belonging to a 2022 race, then count the
# distinct `raceId` values that remain (one per sprint event).
is_2022_sprint = sprint_results['raceId'].isin(race_ids_2022)
sprints_2022 = sprint_results.loc[is_2022_sprint]
sprint_races_2022 = set(sprints_2022['raceId'])
print(f"Number of sprint races in 2022: {len(sprint_races_2022)}")

Number of sprint races in 2022: 3


In [47]:
# Lookup table mapping each `driverId` to "<forename> <surname>".
full_names = drivers['driver_forename'] + ' ' + drivers['driver_surname']
driver_names = dict(zip(drivers['driverId'], full_names))

In [48]:
# Total sprint points per driver across the 2022 sprint rows, with the driver's
# display name attached via the `driver_names` lookup.
sprint_points = (
    sprints_2022
    .groupby('driverId')['points']
    .sum()
    .reset_index()
    .assign(name=lambda frame: frame['driverId'].map(driver_names))
)

In [49]:
# Binary predictor: 1 when a driver's sprint total is strictly greater than the
# median sprint total, 0 otherwise (a total equal to the median counts as 0).
median_sprint_points = sprint_points['points'].median()
above_median = sprint_points['points'].gt(median_sprint_points)
sprint_points['sprint_success'] = above_median.astype(int)

In [50]:
# Report the median threshold, then list every driver from most to fewest
# sprint points together with the resulting success flag.
print(f"Median sprint points: {median_sprint_points}")
print("\nSprint points by driver:")
sprint_table = sprint_points.sort_values(by='points', ascending=False)
print(sprint_table[['name', 'points', 'sprint_success']])

Median sprint points: 2.5

Sprint points by driver:
                name  points  sprint_success
7     Max Verstappen      21               1
8       Carlos Sainz      18               1
12   Charles Leclerc      17               1
3       Sergio Pérez      14               1
14    George Russell      13               1
0     Lewis Hamilton       7               1
13      Lando Norris       6               1
6    Kevin Magnussen       4               1
9       Esteban Ocon       3               1
4   Daniel Ricciardo       3               1
5    Valtteri Bottas       2               0
1    Fernando Alonso       0               0
2   Sebastian Vettel       0               0
10      Lance Stroll       0               0
11      Pierre Gasly       0               0
15   Alexander Albon       0               0
16   Nicholas Latifi       0               0
17      Yuki Tsunoda       0               0
18   Mick Schumacher       0               0
19       Guanyu Zhou       0               0


In [51]:
# Sum each driver's points over the 2022 rows of `results` and attach the
# driver's display name.
# NOTE(review): only `results` is summed here; points recorded in
# `sprint_results` are not added to this total — confirm that this is the
# intended "championship" figure.
results_2022 = results.loc[results['raceId'].isin(race_ids_2022)]
championship_points = (
    results_2022
    .groupby('driverId')['points']
    .sum()
    .reset_index()
    .assign(name=lambda frame: frame['driverId'].map(driver_names))
)

In [52]:
# Order drivers by total points, highest first, and number them 1..N.
# A stable sort is requested explicitly: the default algorithm does not
# guarantee the relative order of rows with equal points, so tied drivers could
# swap positions between runs and, near the cutoff, flip the success flag that
# is derived from `position`. With kind='stable', tied rows keep the relative
# order they already had, which also makes re-running this cell a no-op.
# NOTE(review): tied drivers still receive distinct consecutive positions; no
# sporting tie-break rule is applied here.
championship_points = championship_points.sort_values(
    by='points', ascending=False, kind='stable'
)
championship_points['position'] = range(1, len(championship_points) + 1)

In [53]:
# Binary outcome: 1 for drivers whose position is in the top half of the
# standings (cutoff rounded up), 0 otherwise.
# NOTE(review): the cutoff uses the length of `championship_points` as it is at
# this point, i.e. before it is merged with the sprint table — confirm that is
# the intended population for "top half".
halfway_position = np.ceil(len(championship_points) / 2)
in_top_half = championship_points['position'].le(halfway_position)
championship_points['championship_success'] = in_top_half.astype(int)

In [54]:
# Show the first ten rows of the standings with the derived position and flag.
print("\nChampionship standings (top 10):")
standings_columns = ['name', 'points', 'position', 'championship_success']
print(championship_points.head(10)[standings_columns])


Championship standings (top 10):
               name  points  position  championship_success
8    Max Verstappen   433.0         1                     1
4      Sergio Pérez   291.0         2                     1
13  Charles Leclerc   291.0         3                     1
15   George Russell   262.0         4                     1
0    Lewis Hamilton   233.0         5                     1
9      Carlos Sainz   228.0         6                     1
14     Lando Norris   116.0         7                     1
10     Esteban Ocon    89.0         8                     1
1   Fernando Alonso    81.0         9                     1
6   Valtteri Bottas    47.0        10                     1


In [55]:
# Join the sprint and championship tables on driver identity. This is an inner
# join, so only drivers present in both tables are kept; the two `points`
# columns are disambiguated with the `_sprint` / `_championship` suffixes.
combined_data = sprint_points.merge(
    championship_points,
    how='inner',
    on=['driverId', 'name'],
    suffixes=('_sprint', '_championship'),
)

In [56]:
# Side-by-side view of both point totals and both binary flags for each driver.
comparison_columns = [
    'name',
    'points_sprint',
    'sprint_success',
    'points_championship',
    'championship_success',
]
print(combined_data[comparison_columns])

                name  points_sprint  sprint_success  points_championship  \
0     Lewis Hamilton              7               1                233.0   
1    Fernando Alonso              0               0                 81.0   
2   Sebastian Vettel              0               0                 37.0   
3       Sergio Pérez             14               1                291.0   
4   Daniel Ricciardo              3               1                 34.0   
5    Valtteri Bottas              2               0                 47.0   
6    Kevin Magnussen              4               1                 21.0   
7     Max Verstappen             21               1                433.0   
8       Carlos Sainz             18               1                228.0   
9       Esteban Ocon              3               1                 89.0   
10      Lance Stroll              0               0                 18.0   
11      Pierre Gasly              0               0                 23.0   
12   Charles

In [62]:
# Regression inputs: the single binary predictor plus an added constant column
# (so the model has an intercept), and the binary outcome.
predictors = combined_data[['sprint_success']]
X = sm.add_constant(predictors)
y = combined_data['championship_success']

In [63]:
# Fit a logistic regression of championship success on sprint success by
# maximum likelihood; the optimizer's convergence message is printed by default.
logit_spec = sm.Logit(endog=y, exog=X)
model = logit_spec.fit()

Optimization terminated successfully.
         Current function value: 0.555633
         Iterations 5


In [71]:
# Full regression table: coefficients, standard errors, z-scores, p-values and
# fit statistics.
summary_table = model.summary()
print(summary_table)

                            Logit Regression Results                            
Dep. Variable:     championship_success   No. Observations:                   20
Model:                            Logit   Df Residuals:                       18
Method:                             MLE   Df Model:                            1
Date:                  Mon, 12 May 2025   Pseudo R-squ.:                  0.1926
Time:                          20:59:13   Log-Likelihood:                -11.113
converged:                         True   LL-Null:                       -13.763
Covariance Type:              nonrobust   LLR p-value:                   0.02132
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.8473      0.690     -1.228      0.220      -2.200       0.505
sprint_success     2.2336      1.049      2.128      0.033       0.177       4.290


In [74]:
# Exponentiate the fitted log-odds coefficients to express them as odds ratios
# (for `const` this is the baseline odds when sprint_success is 0).
log_odds = model.params
odds_ratio = np.exp(log_odds)
print(odds_ratio)

const             0.428571
sprint_success    9.333333
dtype: float64
