### Hypothesis

The ones higher up on the grid (1st 4) are more likely to win.

In [113]:
# importing required libraries 

import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter("ignore")
pd.set_option('display.max_columns', None)

To answer this question, we will use the follow 6 tables:

In [114]:
results_df = pd.read_csv("data/results.csv")
races_df = pd.read_csv("data/races.csv")
circuit_df = pd.read_csv("data/circuits.csv")
constructors_df = pd.read_csv("data/constructors.csv")
drivers_df = pd.read_csv("data/drivers.csv")
pit_stops_df = pd.read_csv("data/pit_stops.csv")

In [115]:
results_df.info()
races_df.info()
circuit_df.info()
constructors_df.info()
drivers_df.info()
pit_stops_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25840 entries, 0 to 25839
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   resultId         25840 non-null  int64  
 1   raceId           25840 non-null  int64  
 2   driverId         25840 non-null  int64  
 3   constructorId    25840 non-null  int64  
 4   number           25840 non-null  object 
 5   grid             25840 non-null  int64  
 6   position         25840 non-null  object 
 7   positionText     25840 non-null  object 
 8   positionOrder    25840 non-null  int64  
 9   points           25840 non-null  float64
 10  laps             25840 non-null  int64  
 11  time             25840 non-null  object 
 12  milliseconds     25840 non-null  object 
 13  fastestLap       25840 non-null  object 
 14  rank             25840 non-null  object 
 15  fastestLapTime   25840 non-null  object 
 16  fastestLapSpeed  25840 non-null  object 
 17  statusId    

Let's check if they need a prior cleaning process:

In [116]:
results_df.isna().sum()
races_df.isna().sum()
circuit_df.isna().sum()
constructors_df.isna().sum()
drivers_df.isna().sum()
pit_stops_df.isna().sum()

raceId          0
driverId        0
stop            0
lap             0
time            0
duration        0
milliseconds    0
dtype: int64

We have no Nan values in our tables. Let's check the duplicate values.

In [117]:
results_df.duplicated().sum()
races_df.duplicated().sum()
circuit_df.duplicated().sum()
constructors_df.duplicated().sum()
drivers_df.duplicated().sum()
pit_stops_df.duplicated().sum()

0

Let's have a look on the columns names, to check in which columns can we base on:

In [118]:
print("Results DataFrame columns:")
print(results_df.columns)

print("\nRaces DataFrame columns:")
print(races_df.columns)

print("\nCircuit DataFrame columns:")
print(circuit_df.columns)

print("\nConstructors DataFrame columns:")
print(constructors_df.columns)

print("\nDrivers DataFrame columns:")
print(drivers_df.columns)




Results DataFrame columns:
Index(['resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid',
       'position', 'positionText', 'positionOrder', 'points', 'laps', 'time',
       'milliseconds', 'fastestLap', 'rank', 'fastestLapTime',
       'fastestLapSpeed', 'statusId'],
      dtype='object')

Races DataFrame columns:
Index(['raceId', 'year', 'round', 'circuitId', 'name', 'date', 'time', 'url',
       'fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time',
       'quali_date', 'quali_time', 'sprint_date', 'sprint_time'],
      dtype='object')

Circuit DataFrame columns:
Index(['circuitId', 'circuitRef', 'name', 'location', 'country', 'lat', 'lng',
       'alt', 'url'],
      dtype='object')

Constructors DataFrame columns:
Index(['constructorId', 'constructorRef', 'name', 'nationality', 'url'], dtype='object')

Drivers DataFrame columns:
Index(['driverId', 'driverRef', 'number', 'code', 'forename', 'surname', 'dob',
       'nationality', 'url'],
      dtype='

Our tables have no duplicate values. 

Let's drop columns to help our merging 

#### Races

In [119]:
# List of columns to keep for our analysis
columns_to_keep = ['raceId', 'year', 'round', 'circuitId', 'name', 'date']

# Filter the DataFrame to keep only the necessary columns
final_race_data = races_df[columns_to_keep]

In [120]:
# Renaming columns
final_constructor_data.rename(columns={
    'name': 'race_name',
    'date': 'race_date',
}, inplace=True)

#### Circuit

In [121]:
# List of columns to keep for our analysis
columns_to_keep = ['circuitId', 'circuitRef', 'name', 'location', 'country']
       
# Filter the DataFrame to keep only the necessary columns
final_circuit_data = circuit_df[columns_to_keep]


In [122]:
# Renaming columns
final_constructor_data.rename(columns={
    'name': 'circuit_name',
}, inplace=True)

#### Constructor

In [123]:
# List of columns to keep for our analysis
columns_to_keep = ['constructorId', 'constructorRef', 'name']
       
# Filter the DataFrame to keep only the necessary columns
final_constructor_data = constructors_df[columns_to_keep]

In [124]:
# Renaming columns
final_constructor_data.rename(columns={
    'name': 'constructor_name',
}, inplace=True)

##### Drivers

In [125]:
# List of columns to keep for our analysis
columns_to_keep = ['driverId', 'driverRef', 'forename', 'surname']

# Filter the DataFrame to keep only the necessary columns
final_drivers_data = drivers_df[columns_to_keep]

In [126]:
# Renaming columns
final_drivers_data.rename(columns={
    'forename': 'driver_forename',
    'surname': 'driver_surname',
}, inplace=True)

#### Final tables to merge


- final_drivers_data
- final_constructor_data
- final_circuit_data
- final_race_data 
- results_df
- pit_stops_df



#### Merging tables

Since we have column names with the same naming, let's change it, to avoid issues after merging.

In [127]:
merged_races_results = pd.merge(final_race_data,results_df, on= 'raceId')

merged_races_results_circuits = pd.merge(merged_races_results, final_circuit_data, on='circuitId')

merged_races_results_circuits_constructors = pd.merge(merged_races_results_circuits, final_constructor_data, on='constructorId')

merged_races_results_circuits_constructors_drivers = pd.merge(merged_races_results_circuits_constructors, final_drivers_data, on='driverId')

In [128]:
merged_races_results_circuits_constructors_drivers_pit_stops = pd.merge(merged_races_results_circuits_constructors_drivers, pit_stops_df, on='raceId')

In [129]:
final_hypothesis_df = merged_races_results_circuits_constructors_drivers_pit_stops

Our 6 tables are merged. Let's check our data

In [130]:
final_hypothesis_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205941 entries, 0 to 205940
Data columns (total 38 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   raceId            205941 non-null  int64  
 1   year              205941 non-null  int64  
 2   round             205941 non-null  int64  
 3   circuitId         205941 non-null  int64  
 4   name_x            205941 non-null  object 
 5   date              205941 non-null  object 
 6   resultId          205941 non-null  int64  
 7   driverId_x        205941 non-null  int64  
 8   constructorId     205941 non-null  int64  
 9   number            205941 non-null  object 
 10  grid              205941 non-null  int64  
 11  position          205941 non-null  object 
 12  positionText      205941 non-null  object 
 13  positionOrder     205941 non-null  int64  
 14  points            205941 non-null  float64
 15  laps              205941 non-null  int64  
 16  time_x            20

In [131]:
final_hypothesis_df.isna().count()

raceId              205941
year                205941
round               205941
circuitId           205941
name_x              205941
date                205941
resultId            205941
driverId_x          205941
constructorId       205941
number              205941
grid                205941
position            205941
positionText        205941
positionOrder       205941
points              205941
laps                205941
time_x              205941
milliseconds_x      205941
fastestLap          205941
rank                205941
fastestLapTime      205941
fastestLapSpeed     205941
statusId            205941
circuitRef          205941
name_y              205941
location            205941
country             205941
constructorRef      205941
constructor_name    205941
driverRef           205941
driver_forename     205941
driver_surname      205941
driverId_y          205941
stop                205941
lap                 205941
time_y              205941
duration            205941
m

In [150]:
missing_values_summary = final_hypothesis_df.isna().sum()
print(missing_values_summary)


raceId                   0
year                     0
round                    0
circuitId                0
grand_prix               0
date                     0
resultId                 0
driverId_x               0
constructorId            0
number                   0
grid                     0
position                 0
positionText             0
positionOrder            0
points                   0
laps                     0
circuit_time             0
circuit_milliseconds     0
fastestLap               0
rank                     0
fastestLapTime           0
fastestLapSpeed          0
statusId                 0
circuitRef               0
circuit_name             0
location                 0
country                  0
constructorRef           0
constructor_name         0
driverRef                0
driver_forename          0
driver_surname           0
driverId_y               0
stop                     0
lap                      0
time_y                   0
pit_stop_duration        0
p

In [151]:
final_hypothesis_df.head()

Unnamed: 0,raceId,year,round,circuitId,grand_prix,date,resultId,driverId_x,constructorId,number,grid,position,positionText,positionOrder,points,laps,circuit_time,circuit_milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId,circuitRef,circuit_name,location,country,constructorRef,constructor_name,driverRef,driver_forename,driver_surname,driverId_y,stop,lap,time_y,pit_stop_duration,pit_stop_milliseconds
0,841,2011,1,1,Australian Grand Prix,2011-03-27,20784,18,1,4,4,6,6,6,8.0,58,54.304,5424563,49,5,1:29.883,212.396,1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,mclaren,McLaren,button,Jenson,Button,153,1,1,17:05:23,26.898,26898
1,841,2011,1,1,Australian Grand Prix,2011-03-27,20784,18,1,4,4,6,6,6,8.0,58,54.304,5424563,49,5,1:29.883,212.396,1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,mclaren,McLaren,button,Jenson,Button,30,1,1,17:05:52,25.021,25021
2,841,2011,1,1,Australian Grand Prix,2011-03-27,20784,18,1,4,4,6,6,6,8.0,58,54.304,5424563,49,5,1:29.883,212.396,1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,mclaren,McLaren,button,Jenson,Button,17,1,11,17:20:48,23.426,23426
3,841,2011,1,1,Australian Grand Prix,2011-03-27,20784,18,1,4,4,6,6,6,8.0,58,54.304,5424563,49,5,1:29.883,212.396,1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,mclaren,McLaren,button,Jenson,Button,4,1,12,17:22:34,23.251,23251
4,841,2011,1,1,Australian Grand Prix,2011-03-27,20784,18,1,4,4,6,6,6,8.0,58,54.304,5424563,49,5,1:29.883,212.396,1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,mclaren,McLaren,button,Jenson,Button,13,1,13,17:24:10,23.842,23842


Since we still have columns with duplicate names, let's renaming.

In [136]:
print(final_hypothesis_df.columns)


Index(['raceId', 'year', 'round', 'circuitId', 'name_x', 'date', 'resultId',
       'driverId_x', 'constructorId', 'number', 'grid', 'position',
       'positionText', 'positionOrder', 'points', 'laps', 'time_x',
       'milliseconds_x', 'fastestLap', 'rank', 'fastestLapTime',
       'fastestLapSpeed', 'statusId', 'circuitRef', 'name_y', 'location',
       'country', 'constructorRef', 'constructor_name', 'driverRef',
       'driver_forename', 'driver_surname', 'driverId_y', 'stop', 'lap',
       'time_y', 'duration', 'milliseconds_y'],
      dtype='object')


In [139]:
# Renaming columns
final_hypothesis_df.rename(columns={
    'number_x': 'driver_number',
    'time_x': 'circuit_time',
    'milliseconds_x': 'circuit_milliseconds',
    'name_y': 'circuit_name',
    #'name': 'constructor',
    'duration': 'pit_stop_duration',
    'milliseconds_y': 'pit_stop_milliseconds',
    'name_x': 'grand_prix' 
}, inplace=True)

In [140]:
final_hypothesis_df.duplicated().sum()

0

In [148]:
final_hypothesis_df.describe()

Unnamed: 0,raceId,year,round,circuitId,resultId,driverId_x,constructorId,grid,positionOrder,points,laps,statusId,driverId_y,stop,lap,pit_stop_milliseconds
count,205941.0,205941.0,205941.0,205941.0,205941.0,205941.0,205941.0,205941.0,205941.0,205941.0,205941.0,205941.0,205941.0,205941.0,205941.0,205941.0
mean,951.308787,2015.811606,10.241875,21.676242,23056.741309,504.451498,68.381294,11.074347,11.25031,4.757484,53.673926,9.573392,502.860965,1.765671,25.277351,70491.43
std,77.119231,3.580305,5.823373,22.538534,1501.854319,392.487682,85.206699,6.311088,6.273243,7.12219,17.39221,19.885795,393.421617,0.913427,14.530574,268109.8
min,841.0,2011.0,1.0,1.0,20779.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,12897.0
25%,881.0,2013.0,5.0,7.0,21747.0,18.0,5.0,6.0,6.0,0.0,52.0,1.0,18.0,1.0,13.0,21870.0
50%,948.0,2016.0,10.0,14.0,22927.0,815.0,10.0,11.0,11.0,0.0,56.0,2.0,815.0,2.0,25.0,23507.0
75%,1019.0,2019.0,15.0,22.0,24399.0,830.0,131.0,16.0,17.0,8.0,66.0,11.0,830.0,2.0,36.0,26066.0
max,1096.0,2022.0,22.0,79.0,25845.0,856.0,214.0,24.0,24.0,50.0,87.0,141.0,856.0,6.0,78.0,3069017.0


In [149]:
final_hypothesis_df.year.unique()

array([2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2022, 2021, 2019,
       2020])

Let's have a look what's going on with our constructor column by saving this "newresults" table in csv file

In [155]:
final_hypothesis_df.to_csv('final_hypothesis_pit_stops_df.csv', index=False)
