### Hypothesis

The ones higher up on the grid (1st 4) are more likely to win.

In [14]:
# importing required libraries 

import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter("ignore")
pd.set_option('display.max_columns', None)

In [15]:
circuit_df = pd.read_csv("data/circuits.csv")
constructor_results_df = pd.read_csv("data/constructor_results.csv")
constructors_df = pd.read_csv("data/constructors.csv")
driver_standings_df = pd.read_csv("data/driver_standings.csv")
drivers_df = pd.read_csv("data/drivers.csv")
lap_times_df = pd.read_csv("data/lap_times.csv")
pit_stops_df = pd.read_csv("data/pit_stops.csv")
qualifying_df = pd.read_csv("data/qualifying.csv")
races_df = pd.read_csv("data/races.csv")
results_df = pd.read_csv("data/results.csv")
seasons_df = pd.read_csv("data/seasons.csv")
sprint_res_df = pd.read_csv("data/sprint_results.csv")
status_df = pd.read_csv("data/status.csv")
tyre_data_df = pd.read_csv("data/tyre_data.csv")
races_with_weather_df = pd.read_csv("data/races_with_weather.csv")
races_with_dummies_df = pd.read_csv("data/races_with_dummies.csv")

To check the hypothesis, we will need tp merge these 4 tables.
- __results_df__ + __drivers_Id__ based on the columns __driver_Id__
- Plus __races_df__ based on __raceId__ column
- And last __circuit_df__ based on __circuitId__ column

In [16]:
merged_qualifying_race_results = pd.merge(drivers_df,results_df, on= 'driverId')

merged_data_with_races = pd.merge(merged_qualifying_race_results, races_df, on='raceId')

final_merged_data = pd.merge(merged_data_with_races, circuit_df, on='circuitId')

print(final_merged_data)

       driverId   driverRef number_x code  forename     surname         dob   
0             1    hamilton       44  HAM     Lewis    Hamilton  1985-01-07  \
1             2    heidfeld       \N  HEI      Nick    Heidfeld  1977-05-10   
2             3     rosberg        6  ROS      Nico     Rosberg  1985-06-27   
3             4      alonso       14  ALO  Fernando      Alonso  1981-07-29   
4             5  kovalainen       \N  KOV    Heikki  Kovalainen  1981-10-19   
...         ...         ...      ...  ...       ...         ...         ...   
25835       697     bonetto       \N   \N    Felice     Bonetto  1903-06-09   
25836       704   cabantous       \N   \N      Yves   Cabantous  1904-10-08   
25837       741   etancelin       \N   \N  Philippe   Étancelin  1896-12-28   
25838       782       jover       \N   \N      Juan       Jover  1903-11-23   
25839       783    grignard       \N   \N   Georges    Grignard  1905-07-25   

      nationality                                  

In [17]:
final_merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25840 entries, 0 to 25839
Data columns (total 51 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   driverId         25840 non-null  int64  
 1   driverRef        25840 non-null  object 
 2   number_x         25840 non-null  object 
 3   code             25840 non-null  object 
 4   forename         25840 non-null  object 
 5   surname          25840 non-null  object 
 6   dob              25840 non-null  object 
 7   nationality      25840 non-null  object 
 8   url_x            25840 non-null  object 
 9   resultId         25840 non-null  int64  
 10  raceId           25840 non-null  int64  
 11  constructorId    25840 non-null  int64  
 12  number_y         25840 non-null  object 
 13  grid             25840 non-null  int64  
 14  position         25840 non-null  object 
 15  positionText     25840 non-null  object 
 16  positionOrder    25840 non-null  int64  
 17  points      

Our __final_merged_data__ has 25840 rows and 51 columns. 

In [18]:
final_merged_data.year.unique()

array([2008, 2007, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017,
       2018, 2019, 2022, 2006, 2005, 2004, 2003, 2002, 2001, 2000, 1999,
       1998, 1997, 1996, 2020, 2021, 1995, 1994, 1993, 1992, 1991, 1990,
       1989, 1988, 1987, 1986, 1985, 1984, 1983, 1982, 1981, 1980, 1979,
       1978, 1977, 1976, 1975, 1974, 1973, 1972, 1971, 1970, 1969, 1968,
       1967, 1964, 1963, 1966, 1965, 1962, 1961, 1960, 1959, 1958, 1957,
       1956, 1955, 1950, 1954, 1953, 1952, 1951])

Our data frame has data from 1950 and 2022.

In [19]:
final_merged_data.isna().sum()

driverId           0
driverRef          0
number_x           0
code               0
forename           0
surname            0
dob                0
nationality        0
url_x              0
resultId           0
raceId             0
constructorId      0
number_y           0
grid               0
position           0
positionText       0
positionOrder      0
points             0
laps               0
time_x             0
milliseconds       0
fastestLap         0
rank               0
fastestLapTime     0
fastestLapSpeed    0
statusId           0
year               0
round              0
circuitId          0
name_x             0
date               0
time_y             0
url_y              0
fp1_date           0
fp1_time           0
fp2_date           0
fp2_time           0
fp3_date           0
fp3_time           0
quali_date         0
quali_time         0
sprint_date        0
sprint_time        0
circuitRef         0
name_y             0
location           0
country            0
lat          

And has no Nan Values

Let's start to have a look on the distribution of wins versus starting position

In [20]:
winning_positions_data = final_merged_data[
    final_merged_data['positionOrder'] == 1
]

# Get the unique grid positions
unique_grid_positions = sorted(winning_positions_data['grid'].unique())

# Calculate the histogram data
hist, bin_edges = np.histogram(winning_positions_data['grid'], bins=len(unique_grid_positions), range=(min(unique_grid_positions) - 0.5, max(unique_grid_positions) + 0.5))

# Create a bar plot
fig = go.Figure(data=[go.Bar(x=unique_grid_positions, y=hist, marker_color='#00A08B')])

fig.update_traces(
    marker=dict(line=dict(width=1, color='black')),  # Add black borders between bars
    texttemplate='%{y:.0f}',  # Display integer values for y-axis
    textposition='outside',  # Display text labels outside the bars
)

fig.update_layout(
    title='Distribution of Wins Based on Starting Position from 1950',
    xaxis_title='Grid Position',
    yaxis_title='Number of Wins',
    title_font=dict(size=24),  # Increase title size
    xaxis=dict(tickvals=unique_grid_positions, ticktext=unique_grid_positions),  # Set custom tick values and labels
    bargap=0.1,  # Adjust gap between bars
    bargroupgap=0.1,  # Adjust gap between groups of bars
    #width=800,  # Set the width of the graph
    #height=600,  # Set the height of the graph
)

fig.show()

With no surprises, drivers who start up on the grid (1st 4) are more likely to win. Our Hypothesis confirms! 

Interestingly, there are more winners who start sixth on the grid than those who start fifth.

Who was the driver that won a Grand Prix starting on 22nd position?

In [21]:
# Filter data for positionOrder equal to 1 (winning position)
winning_positions_data = final_merged_data[
    (final_merged_data['positionOrder'] == 1) &
    (final_merged_data['grid'] == 22)
]

# Get the driver's information
if not winning_positions_data.empty:
    winning_driver = winning_positions_data.iloc[0]
    driver_name = f"{winning_driver['forename']} {winning_driver['surname']}"
    print(f"The driver who won starting from the 22nd grid position is {driver_name}.")
else:
    print("No driver won from the 22nd grid position.")


The driver who won starting from the 22nd grid position is John Watson.


John Watson is a British former racing driver and current commentator from Northern Ireland. He competed in Formula One, winning five Grands Prix and was third in the 1982 championship.
John Watson holds the record, after starting 22nd at the historic 1983 Long Beach Grand Prix in the US.
Find [here](https://www.youtube.com/watch?v=fl0pFVqShas) more information about this event.


#### Fernando Alonso's Numbers

In [22]:
fernando_alonso_wins = final_merged_data[
    (final_merged_data['forename'] == 'Fernando') &
    (final_merged_data['surname'] == 'Alonso') &
    (final_merged_data['positionOrder'] == 1)
]

fig = px.histogram(fernando_alonso_wins, x='grid', nbins=20, color_discrete_sequence=["#00A08B"], text_auto='.0f', title='Distribution of Wins Based on Starting Position for Fernando Alonso')

fig.update_xaxes(title_text='Starting Position', title_font=dict(size=18))  # Increase title size
fig.update_yaxes(title_text='Number of Wins', title_font=dict(size=18))  # Increase title size

x_labels = fernando_alonso_wins['grid'].unique()
fig.update_xaxes(tickmode='array', tickvals=x_labels, ticktext=x_labels)

fig.update_traces(
    marker=dict(line=dict(width=1, color='black')),  # Add black borders between bars
    textangle=0,  # Rotate text labels to be horizontal
)

fig.update_layout(
    title_font=dict(size=24),  # Increase title size
    bargap=0.1,  # Adjust gap between bars
    bargroupgap=0.1,  # Adjust gap between groups of bars
)

fig.show()

Fernando Alonso won a Grand Prix for 32 times. Of these 32 victories, 14 came from pole positions.

#### Lance Stroll's Numbers

In [23]:
lance_stroll_data = final_merged_data[
    (final_merged_data['forename'] == 'Lance') &
    (final_merged_data['surname'] == 'Stroll') &
    (final_merged_data['positionOrder'] >= 1) &
    (final_merged_data['positionOrder'] <= 10)
]

# Create a histogram
fig = px.histogram(
    lance_stroll_data,
    x='positionOrder',  # Ending Position Order on x-axis
    color_discrete_sequence=["#00A08B"],  # Set the color for all bars
    barmode='group',    # Grouped histogram
    title='Distribution of Ending Positions for Lance Stroll ',
    text_auto='.0f',
    labels={'positionOrder': 'Ending Position'},
    nbins=10,  # Set the number of bins
)

fig.update_xaxes(title_text='Ending Position', title_font=dict(size=18))  # Increase title size
fig.update_yaxes(title_text='Number of Races', title_font=dict(size=18))  # Increase title size

# Adjust y-axis range and ticks
fig.update_layout(
    title_font=dict(size=24),  # Increase title size
    bargap=0.1,  # Adjust gap between bars
    bargroupgap=0.1,  # Adjust gap between groups of bars
    yaxis=dict(range=[0,12], tickvals=list(range(12))),  # Set y-axis range and tick values
)

fig.update_traces(
    marker=dict(line=dict(width=1, color='black')),  # Add black borders between bars
)

fig.show()

Lance Stroll, the second Aston Martin driver, never won a Grand Prix and his best position was third place.

#### Monaco's Numbers

Drivers loves the romance, the tradition and the history of the place. They love the idea of driving on the edge, knowing a single mistake can ruin a weekend's work. They love the pressure of knowing that their abilities behind the wheel can make all the difference.

Indeed, there is a lot to admire about Monaco from a driving perspective—but what about those of us who don't have that privilege?

For spectators, the Monte Carlo weekend can rank among the most boring of the season, to the point where it crosses your mind whether F1 is beginning to outgrow its most famous race.

But why is so boring? 

Because there are almost no opportunities to overtaking. So, Pole positions is the key in this circuit.

Let's have a look in the distribution of wins based on starting position.


In [24]:
title='Tyre Type Distribution on Street Circuits 2012-2023<br><sup>Street Circuits: Monaco, Australia, Singapore, Russia, Azerbaijan, Saudi Arabia, Miami and Canada</sup>',

In [25]:
winning_positions_data_monaco = final_merged_data[
    (final_merged_data['positionOrder'] == 1) &
    (final_merged_data['country'] == 'Monaco')
]

fig = px.histogram(winning_positions_data_monaco, x='grid', nbins=20, color_discrete_sequence=["#00A08B"], text_auto='.0f', title="Distribution of Wins Based on Starting Position (Monaco) from 1950")

# Manually set the x-axis tick positions and labels for each bar
x_labels = winning_positions_data_monaco['grid'].unique()
fig.update_xaxes(tickmode='array', tickvals=x_labels, ticktext=x_labels)
fig.update_xaxes(title_text='Starting Position', title_font=dict(size=18))  # Increase title size
fig.update_yaxes(title_text='Number of Wins', title_font=dict(size=18))  # Increase title size

fig.update_traces(
    marker=dict(line=dict(width=1, color='black')),  # Add black borders between bars
    textangle=0,  # Rotate text labels to be vertical
)

fig.update_layout(
    title_font=dict(size=24),  # Increase title size
    bargap=0.1,  # Adjust gap between bars
    bargroupgap=0.1,  # Adjust gap between groups of bars
)

fig.show()

Of Monaco's 68 grand prix, only one driver managed to win, having started on the grid beyond the first 10 qualifiers.

Let's find out who did it.

In [26]:
 #Filter data for positionOrder equal to 1 (winning position)
winning_positions_data = final_merged_data[
    (final_merged_data['positionOrder'] == 1) &
    (final_merged_data['grid'] == 14)
]

# Get the driver's information
if not winning_positions_data.empty:
    winning_driver = winning_positions_data.iloc[0]
    driver_name = f"{winning_driver['forename']} {winning_driver['surname']}"
    print(f"The driver who won starting from the 14th grid position is {driver_name}.")
else:
    print("No driver won from the 22nd grid position.")


The driver who won starting from the 14th grid position is Olivier Panis.


Olivier Jean Denis Marie Panis is a French former racing driver. Panis raced in Formula One for ten seasons, earning his first and only win at the 1996 Monaco Grand Prix for the Ligier team.

For more information about this win, click [here](https://www.youtube.com/watch?v=dBRm5iCqQBk)