File used to generate the first annotations on the pit stops. The rest of the annotations has been enter manually


In [58]:
import pandas as pd

In [59]:
DATASETS_PATH = "./../data/"

pit_stops = pd.read_json(DATASETS_PATH + "pitStops.json")
pit_stops.head()

Unnamed: 0,eventId,driverId,lap,time,timeOfDay
0,1994-2,erik-comas,1,49111.0,14:01:34
1,1994-2,ukyo-katayama,17,28482.0,14:20:46
2,1994-2,mika-hakkinen,18,43745.0,14:22:35
3,1994-2,damon-hill,18,21992.0,14:23:00
4,1994-2,michele-alboreto,19,27693.0,14:24:39


In [60]:
pit_stops.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19532 entries, 0 to 19531
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   eventId     19532 non-null  object 
 1   driverId   19532 non-null  object 
 2   lap        19532 non-null  int64  
 3   time       19529 non-null  float64
 4   timeOfDay  19532 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 763.1+ KB


In [61]:
pit_stops["year"] = pit_stops["eventId"].apply(lambda x: str(x).split("-")[0]).astype("Int64")
pit_stops["pitStopIndex"] = pit_stops.index

pit_stops.groupby(["year"])["time"].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1994,475.0,36744.707368,36955.865947,14014.0,26747.0,30921.0,36381.0,587176.0
1995,615.0,37641.819512,28815.253868,13035.0,28251.5,31522.0,38215.0,417635.0
1996,459.0,36963.287582,50109.741392,19334.0,27229.5,30425.0,34544.5,851244.0
1997,508.0,31920.5,18496.57363,11556.0,26526.75,28763.5,31677.75,315377.0
1998,450.0,38673.035556,69159.445822,11242.0,26616.25,29272.5,31666.75,825304.0
1999,452.0,32760.061947,26030.060454,11371.0,26641.25,29282.5,32169.75,410649.0
2000,556.0,34255.498201,26950.79275,19355.0,27665.75,30108.5,33875.25,496808.0
2001,493.0,39171.075051,62525.680909,13083.0,28534.0,31227.0,34383.0,1084911.0
2002,519.0,37564.757225,39496.465299,14159.0,28653.0,31119.0,35264.5,573508.0
2003,624.0,32406.737179,17891.872895,14699.0,28312.5,30839.5,33624.0,356415.0


Here we realize that some pit stops are too large to be real. More than 1000 secs (more than 15mins). So we continue to see what happen:

In [62]:
pit_stops[(pit_stops["time"] > 1000000) & (pit_stops["year"] == 2016)]

Unnamed: 0,eventId,driverId,lap,time,timeOfDay,year,pitStopIndex
14512,2016-1,seb-vettel,18,1089312.0,16:36:56,2016,14512
14513,2016-1,nico-rosberg,18,1089303.0,16:36:58,2016,14513
14514,2016-1,kimi-raikkonen,18,1089365.0,16:37:00,2016,14514
14515,2016-1,daniel-ricciardo,18,1089822.0,16:37:02,2016,14515
14516,2016-1,max-verstappen,18,1089372.0,16:37:03,2016,14516
14517,2016-1,carlos-sainz-jr,18,1089971.0,16:37:05,2016,14517
14518,2016-1,lewis-hamilton,18,1090087.0,16:37:09,2016,14518
14519,2016-1,felipe-massa,18,1089805.0,16:37:10,2016,14519
14520,2016-1,romain-grosjean,18,1089417.0,16:37:12,2016,14520
14521,2016-1,nico-hulkenberg,18,1089155.0,16:37:14,2016,14521


We can see that red flags are taken into account and count as a Pit Stop. We should fix that

In [63]:
pit_stops[(pit_stops["driverId"] == "daniel-ricciardo") & (pit_stops["eventId"] == "2016-1")]

Unnamed: 0,eventId,driverId,lap,time,timeOfDay,year,pitStopIndex
14499,2016-1,daniel-ricciardo,12,22245.0,16:25:59,2016,14499
14515,2016-1,daniel-ricciardo,18,1089822.0,16:37:02,2016,14515
14536,2016-1,daniel-ricciardo,42,21981.0,17:32:47,2016,14536


We see that also the "red flags pit stops" count for the total number of pits for the driver. By the moment, we just want to remove the rows of this pits, because we want the median time of the pit. We could deal with the pitStopNumber later on.

In [64]:
red_flags = pd.read_json(DATASETS_PATH + "redFlags.json")

red_flags.head()

Unnamed: 0,lap,resumed,incident,excluded,eventId
0,138,N,Rain.,,1950-3
1,64,N,Mist.,,1971-10
2,2,Y,"Crash involving Jody Scheckter, Jean-Pierre Be...","Jody Scheckter, Jean-Pierre Beltoise, George F...",1973-9
3,32,N,Rain.,,1974-2
4,29,N,Crash of Rolf Stommelen which killed five spec...,,1975-4


In [65]:
pit_stops.rename(columns={"lap": "pitStopLap"}, inplace=True)

merged = pd.merge(
    red_flags, pit_stops, on=["eventId"]
)

In [66]:
# Get the data of the laps with pit stops refearing as a red flag
merged = merged[
    (~merged["pitStopLap"].isna())
    & (
        (merged["pitStopLap"] == merged["lap"])
        | ((merged["pitStopLap"] == merged["lap"] - 1) & (merged["time"] > 200))
        | ((merged["pitStopLap"] == merged["lap"] - 2) & (merged["time"] > 300))
    )
]

merged.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 423 entries, 34 to 2003
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   lap           423 non-null    int64  
 1   resumed       423 non-null    object 
 2   incident      423 non-null    object 
 3   excluded      359 non-null    object 
 4   eventId        423 non-null    object 
 5   driverId      423 non-null    object 
 6   pitStopLap    423 non-null    int64  
 7   time          423 non-null    float64
 8   timeOfDay     423 non-null    object 
 9   year          423 non-null    Int64  
 10  pitStopIndex  423 non-null    int64  
dtypes: Int64(1), float64(1), int64(3), object(6)
memory usage: 40.1+ KB


In [67]:
# Add annotations to the pit stops in special cases like this:
pit_stops["annotation"] = ""
pit_stops.loc[merged["pitStopIndex"], "annotation"] = "Red flag"

In [68]:
# Check that annotations has been added correctly
pit_stops[pit_stops["annotation"] == "Red flag"]

Unnamed: 0,eventId,driverId,pitStopLap,time,timeOfDay,year,pitStopIndex,annotation
305,1994-12,olivier-panis,1,50948.0,14:01:45,1994,305,Red flag
426,1994-15,nigel-mansell,13,35665.0,14:29:02,1994,426,Red flag
427,1994-15,mika-hakkinen,14,33630.0,14:31:23,1994,427,Red flag
507,1995-2,eddie-irvine,1,59885.0,14:02:03,1995,507,Red flag
2218,1998-7,eddie-irvine,1,53521.0,13:29:06,1998,2218,Red flag
...,...,...,...,...,...,...,...,...
19370,2022-18,valtteri-bottas,2,407553.0,14:09:11,2022,19370,Red flag
19371,2022-18,nicholas-latifi,2,407673.0,14:09:14,2022,19371,Red flag
19372,2022-18,seb-vettel,2,409647.0,14:09:17,2022,19372,Red flag
19373,2022-18,guanyu-zhou,2,410005.0,14:09:20,2022,19373,Red flag


In [69]:
pit_stops[(pit_stops["time"] > 200000) & (pit_stops["annotation"] == "")]


Unnamed: 0,eventId,driverId,pitStopLap,time,timeOfDay,year,pitStopIndex,annotation
172,1994-7,mark-blundell,33,270350.0,14:45:28,1994,172,
193,1994-8,mark-blundell,19,241559.0,14:29:35,1994,193,
433,1994-16,domenico-schiattarella,9,587176.0,14:12:54,1994,433,
475,1994-16,jean-denis-deletraz,56,384951.0,15:22:08,1994,475,
501,1995-1,taki-inoue,41,326335.0,15:02:45,1995,501,
...,...,...,...,...,...,...,...,...
9885,2010-18,lucas-di-grassi,40,399641.0,14:57:59,2010,9885,
9944,2011-1,timo-glock,19,536552.0,17:36:18,2011,9944,
10674,2011-13,daniel-ricciardo,1,1004718.0,14:08:50,2011,10674,
15124,2016-14,felipe-nasr,4,810454.0,14:11:17,2016,15124,


In [70]:
# Pit stops that took less than 13.75seg
pit_stops[pit_stops["time"] < 13750].head()

Unnamed: 0,eventId,driverId,pitStopLap,time,timeOfDay,year,pitStopIndex,annotation
555,1995-3,johnny-herbert,9,13035.0,14:18:01,1995,555,
1646,1997-4,nicola-larini,34,11778.0,14:51:19,1997,1646,
1656,1997-4,nicola-larini,54,11556.0,15:21:11,1997,1656,
2076,1998-1,mika-hakkinen,36,11242.0,15:00:22,1998,2076,
2532,1999-1,michael-schumacher,37,11371.0,15:17:18,1999,2532,


We can now stop worrying about excessively long stops. There are only a couple of them that are isolated cases and we can manually delete them later. However, we see that there are very short pits. This is in many cases due to drive-throughs, which we must identify

In [71]:
# Preparing for dropping unreal low-time pits. First, calculate the standard deviation per race, and then, get the values with a high variation from that value
std_per_race = (
    pit_stops[pit_stops["annotation"] == ""]
    .groupby(["eventId", "year"])["time"]
    .aggregate(["std", "median"])
)
std_per_race = std_per_race.rename(
    columns={"std": "pitStopSegsRaceVariation", "median": "pitStopSegsRaceMedian"}
)
std_per_race


Unnamed: 0_level_0,Unnamed: 1_level_0,pitStopSegsRaceVariation,pitStopSegsRaceMedian
eventId,year,Unnamed: 2_level_1,Unnamed: 3_level_1
1994-10,1994,23588.713836,31321.0
1994-11,1994,26624.467363,35666.5
1994-12,1994,6358.968675,26166.5
1994-13,1994,5733.954355,29139.0
1994-14,1994,11555.191995,28422.5
...,...,...,...
2022-5,2022,2701.515300,19543.0
2022-6,2022,1792.074567,22399.0
2022-7,2022,3786.932379,25710.0
2022-8,2022,4210.177689,22059.0


In [72]:
std_per_race = pd.merge(pit_stops[pit_stops["annotation"] == ""], std_per_race, on="eventId")
std_per_race.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19128 entries, 0 to 19127
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   eventId                    19128 non-null  object 
 1   driverId                  19128 non-null  object 
 2   pitStopLap                19128 non-null  int64  
 3   time                      19125 non-null  float64
 4   timeOfDay                 19128 non-null  object 
 5   year                      19128 non-null  Int64  
 6   pitStopIndex              19128 non-null  int64  
 7   annotation                19128 non-null  object 
 8   pitStopSegsRaceVariation  19128 non-null  float64
 9   pitStopSegsRaceMedian     19128 non-null  float64
dtypes: Int64(1), float64(3), int64(2), object(4)
memory usage: 1.6+ MB


In [73]:
# Values with a non-normal low pit-stop
std_per_race["deviation"] = (
    std_per_race["time"] - std_per_race["pitStopSegsRaceMedian"]
) / std_per_race["pitStopSegsRaceVariation"]

low_pit_stops = std_per_race[std_per_race["deviation"] < -1.5].sort_values(by="eventId")

low_pit_stops.tail()


Unnamed: 0,eventId,driverId,pitStopLap,time,timeOfDay,year,pitStopIndex,annotation,pitStopSegsRaceVariation,pitStopSegsRaceMedian,deviation
18364,2021-22,charles-leclerc,35,21173.0,17:56:24,2021,18714,,450.826881,21871.0,-1.548266
18349,2021-22,max-verstappen,13,21152.0,17:22:42,2021,18699,,450.826881,21871.0,-1.594847
17925,2021-5,lance-stroll,58,23474.0,16:17:28,2021,18168,,445.015388,24150.0,-1.519049
17965,2021-7,sergio-perez,24,29682.0,15:43:14,2021,18242,,551.046419,30553.0,-1.580629
18449,2022-2,guanyu-zhou,27,15103.0,20:51:18,2022,18799,,3327.020359,21425.0,-1.900199


In [74]:
pit_stops[pit_stops["eventId"] == "2022-2"].sort_values("time")

Unnamed: 0,eventId,driverId,pitStopLap,time,timeOfDay,year,pitStopIndex,annotation
18799,2022-2,guanyu-zhou,27,15103.0,20:51:18,2022,18799,
18797,2022-2,lando-norris,16,20383.0,20:29:54,2022,18797,
18788,2022-2,sergio-perez,15,20481.0,20:27:12,2022,18788,
18785,2022-2,daniel-ricciardo,8,20650.0,20:16:37,2022,18785,
18787,2022-2,alexander-albon,13,20683.0,20:24:54,2022,18787,
18791,2022-2,max-verstappen,16,20731.0,20:28:58,2022,18791,
18790,2022-2,charles-leclerc,16,21038.0,20:28:55,2022,18790,
18796,2022-2,esteban-ocon,16,21141.0,20:29:50,2022,18796,
18793,2022-2,george-russell,16,21415.0,20:29:21,2022,18793,
18802,2022-2,nico-hulkenberg,37,21425.0,21:07:16,2022,18802,


In [75]:
low_pit_stops[low_pit_stops["time"] < 13250]


Unnamed: 0,eventId,driverId,pitStopLap,time,timeOfDay,year,pitStopIndex,annotation,pitStopSegsRaceVariation,pitStopSegsRaceMedian,deviation
551,1995-3,johnny-herbert,9,13035.0,14:18:01,1995,555,,6844.909719,27505.0,-2.11398
1642,1997-4,nicola-larini,34,11778.0,14:51:19,1997,1646,,7780.348089,24184.5,-1.594594
1652,1997-4,nicola-larini,54,11556.0,15:21:11,1997,1656,,7780.348089,24184.5,-1.623128
2527,1999-1,michael-schumacher,37,11371.0,15:17:18,1999,2532,,8391.284208,26695.5,-1.82624
3952,2001-16,jacques-villeneuve,43,13083.0,13:59:21,2001,3957,,7424.264894,25569.0,-1.681783
5592,2004-10,nick-heidfeld,35,11042.0,14:49:57,2004,5597,,2274.429248,21263.0,-4.493875
5612,2004-10,giorgio-pantano,52,11084.0,15:13:27,2004,5617,,2274.429248,21263.0,-4.475409
5637,2004-11,gianmaria-bruni,25,12429.0,13:39:48,2004,5642,,7307.279662,23489.5,-1.513628
5395,2004-5,olivier-panis,29,12377.0,14:43:00,2004,5400,,2558.169338,23902.0,-4.505175
7159,2006-12,tiago-monteiro,43,11068.0,15:01:23,2006,7164,,3245.163317,22437.5,-3.503522


In [76]:
# Now we can get which of this values are because a drive-through

# First, create a df containing all penalties that have the same driver and race that a non-usual low-time pit stop 

penalties = pd.read_json(DATASETS_PATH + "penalties.json")
low_pits_with_penalties = pd.merge(low_pit_stops, penalties, on=["eventId", "driverId"])

print(f"{len(low_pits_with_penalties)} rows with rare low-pit time and a penalty in the same race:\n")
low_pits_with_penalties.sort_values("time").head()

115 rows with rare low-pit time and a penalty in the same race:



Unnamed: 0,eventId,driverId,pitStopLap,time,timeOfDay,year,pitStopIndex,annotation,pitStopSegsRaceVariation,pitStopSegsRaceMedian,deviation,Session,Allegation,Incident involving,Outcome,Reason,Notes
8,2004-10,giorgio-pantano,52,11084.0,15:13:27,2004,5617,,2274.429248,21263.0,-4.475409,R,Speeding in the pit lane,,Drive-through penalty,,
24,2010-10,fernando-alonso,30,12304.0,13:54:12,2010,9654,,4190.300125,20704.5,-2.004749,R,,,Drive-through penalty,,
14,2004-5,olivier-panis,29,12377.0,14:43:00,2004,5400,,2558.169338,23902.0,-4.505175,R,Speeding in the pit lane,,Drive-through penalty,,
19,2007-1,adrian-sutil,23,12672.0,14:40:04,2007,7413,,4005.487776,23625.0,-2.734498,R,Ignoring blue flags,,Drive-through penalty,,
20,2007-1,adrian-sutil,24,12673.0,14:41:54,2007,7417,,4005.487776,23625.0,-2.734249,R,Ignoring blue flags,,Drive-through penalty,,


In [77]:
# Secondly, we filter that df with only the penalties that are drive-through
low_pits_with_penalties = low_pits_with_penalties[
    low_pits_with_penalties["Outcome"].apply(
        lambda x: "Drive-through penalty" in str(x).split(",")
    )
]

print(f"{len(low_pits_with_penalties)} rows with drive-through:\n")
low_pits_with_penalties.sort_values("time").head()


91 rows with drive-through:



Unnamed: 0,eventId,driverId,pitStopLap,time,timeOfDay,year,pitStopIndex,annotation,pitStopSegsRaceVariation,pitStopSegsRaceMedian,deviation,Session,Allegation,Incident involving,Outcome,Reason,Notes
8,2004-10,giorgio-pantano,52,11084.0,15:13:27,2004,5617,,2274.429248,21263.0,-4.475409,R,Speeding in the pit lane,,Drive-through penalty,,
24,2010-10,fernando-alonso,30,12304.0,13:54:12,2010,9654,,4190.300125,20704.5,-2.004749,R,,,Drive-through penalty,,
14,2004-5,olivier-panis,29,12377.0,14:43:00,2004,5400,,2558.169338,23902.0,-4.505175,R,Speeding in the pit lane,,Drive-through penalty,,
19,2007-1,adrian-sutil,23,12672.0,14:40:04,2007,7413,,4005.487776,23625.0,-2.734498,R,Ignoring blue flags,,Drive-through penalty,,
20,2007-1,adrian-sutil,24,12673.0,14:41:54,2007,7417,,4005.487776,23625.0,-2.734249,R,Ignoring blue flags,,Drive-through penalty,,


In [78]:
pit_stops.loc[low_pits_with_penalties["pitStopIndex"].astype("int").to_list(), "annotation"] = "Drive-through"

In [79]:
# Remove pits where cars follow the SC
index_to_note = (
    pit_stops[
        ((pit_stops["eventId"] == "2017-8") & (pit_stops["pitStopLap"] == 17))
        | ((pit_stops["eventId"] == "2021-6") & (pit_stops["pitStopLap"] == 47))
    ]["pitStopIndex"]
    .astype("int")
    .to_list()
)

pit_stops.loc[index_to_note, "annotation"] = "All cars follow the SC through the pit lane"


In [80]:
pit_stops.groupby("annotation").count()

Unnamed: 0_level_0,eventId,driverId,pitStopLap,time,timeOfDay,year,pitStopIndex
annotation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,19021,19021,19021,19018,19021,19021,19021
All cars follow the SC through the pit lane,33,33,33,33,33,33,33
Drive-through,90,90,90,90,90,90,90
Red flag,388,388,388,388,388,388,388


In [85]:
# Same way as before, but with the long pits
long_pit_stops = std_per_race[(std_per_race["deviation"] > 1.5) & (std_per_race["time"] < 45000)].sort_values(by="eventId")

long_pits_with_penalties = pd.merge(
    long_pit_stops, penalties, on=["eventId", "driverId"]
)

# Secondly, we filter that df with only the penalties that are drive-through
long_pits_with_penalties = long_pits_with_penalties[
    long_pits_with_penalties["Outcome"].apply(
        lambda x: "Ten-second stop-go penalty" in str(x).split(",")
        or "Ten-second time penalty" in str(x).split(",")
        or "Five-second time penalty" in str(x).split(",")
    )
]

long_pits_with_penalties.head()


Unnamed: 0,eventId,driverId,pitStopLap,time,timeOfDay,year,pitStopIndex,annotation,pitStopSegsRaceVariation,pitStopSegsRaceMedian,deviation,Session,Allegation,Incident involving,Outcome,Reason,Notes
10,2011-2,sebastien-buemi,20,29843.0,16:39:30,2011,9991,,3166.994011,23221.0,2.090942,R,Pit lane speeding,,Ten-second stop-go penalty,,
15,2012-15,romain-grosjean,1,32329.0,15:05:37,2012,11746,,4451.085302,21308.0,2.476025,R,Caused a collision,Mark Webber,Ten-second stop-go penalty,,
16,2012-15,romain-grosjean,7,29793.0,15:16:52,2012,11749,,4451.085302,21308.0,1.906277,R,Caused a collision,Mark Webber,Ten-second stop-go penalty,,
21,2012-18,sergio-perez,44,26858.0,18:30:24,2012,11890,,3408.013717,20521.0,1.859441,R,Caused a collision,Romain Grosjean,Ten-second stop-go penalty,,
23,2012-18,sergio-perez,38,33431.0,18:17:35,2012,11886,,3408.013717,20521.0,3.78813,R,Caused a collision,Romain Grosjean,Ten-second stop-go penalty,,


In [88]:
pit_stops.loc[
    long_pits_with_penalties["pitStopIndex"].astype("int").to_list(), "annotation"
] = (long_pits_with_penalties["Outcome"].apply(lambda x: str(x).split(",")[0])).values

In [90]:
# ---------------- FINAL MODIFICATIONS ------------------

pit_stops.drop(columns=["year", "pitStopIndex"], inplace=True)
pit_stops.rename(columns={"pitStopLap": "lap"}, inplace=True)

pit_stops.groupby("annotation").count()

Unnamed: 0_level_0,eventId,driverId,pitStopLap,time,timeOfDay
annotation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,18953,18953,18953,18950,18953
All cars follow the SC through the pit lane,33,33,33,33,33
Drive-through,90,90,90,90,90
Five-second time penalty,33,33,33,33,33
Red flag,388,388,388,388,388
Ten-second stop-go penalty,21,21,21,21,21
Ten-second time penalty,14,14,14,14,14


In [91]:
pit_stops.to_json("./pitStops.json", "records")