# Total Laps and Average Lap Time

## Set Up

In [4]:
# Import necessary data libraries.
import pandas as pd
import os 
import csv
import io
import requests
import numpy as np
import matplotlib.pyplot as plt

In [5]:
# Set up URLs.
circuits_url = 'https://raw.githubusercontent.com/georgetown-analytics/Formula1/main/data/circuits.csv'
constructor_results_url = 'https://raw.githubusercontent.com/georgetown-analytics/Formula1/main/data/constructor_results.csv'
constructor_standings_url = 'https://raw.githubusercontent.com/georgetown-analytics/Formula1/main/data/constructor_standings.csv'
constructors_url = 'https://raw.githubusercontent.com/georgetown-analytics/Formula1/main/data/constructors.csv'
driver_standings_url = 'https://raw.githubusercontent.com/georgetown-analytics/Formula1/main/data/driver_standings.csv'
drivers_url = 'https://raw.githubusercontent.com/georgetown-analytics/Formula1/main/data/drivers.csv'
lap_times_url = 'https://raw.githubusercontent.com/georgetown-analytics/Formula1/main/data/lap_times.csv'
pit_stop_url = 'https://raw.githubusercontent.com/georgetown-analytics/Formula1/main/data/pit_stops.csv'
qualifying_url = 'https://raw.githubusercontent.com/georgetown-analytics/Formula1/main/data/qualifying.csv'
races_url = 'https://raw.githubusercontent.com/georgetown-analytics/Formula1/main/data/races.csv'
results_url = 'https://raw.githubusercontent.com/georgetown-analytics/Formula1/main/data/results.csv'
seasons_url = 'https://raw.githubusercontent.com/georgetown-analytics/Formula1/main/data/seasons.csv'
status_url = 'https://raw.githubusercontent.com/georgetown-analytics/Formula1/main/data/status.csv'

In [6]:
# Set up dataframes.
circuits_df = pd.read_csv(circuits_url, sep = ',', encoding = 'latin-1')
constructor_results_df = pd.read_csv(constructor_results_url, sep = ',', engine = 'python')
constructor_standings_df = pd.read_csv(constructor_standings_url, sep = ',', engine = 'python')
constructors_df = pd.read_csv(constructor_standings_url, sep = ',', engine = 'python')
driver_standings_df = pd.read_csv(driver_standings_url, sep = ',', engine = 'python')
lap_times_df = pd.read_csv(lap_times_url, sep = ',', engine = 'python')
pit_stop_df = pd.read_csv(pit_stop_url, sep = ',', engine = 'python')
qualifying_df = pd.read_csv(constructor_standings_url, sep = ',', engine = 'python')
results_df = pd.read_csv(results_url, sep = ',', engine = 'python')
seasons_df = pd.read_csv(seasons_url, sep = ',', engine = 'python')
status_df = pd.read_csv(status_url, sep = ',', engine = 'python')
races_df = pd.read_csv(races_url, sep = ',', engine = 'c')
drivers_df = pd.read_csv(drivers_url, sep = ',', encoding = 'latin-1')

## Determine Total Laps

In [7]:
# View results_df.
results_df.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.3,1
1,2,18,2,2,3,5,2,2,2,8.0,58,+5.478,5696094,41,3,1:27.739,217.586,1
2,3,18,3,3,7,7,3,3,3,6.0,58,+8.163,5698779,41,5,1:28.090,216.719,1
3,4,18,4,4,5,11,4,4,4,5.0,58,+17.181,5707797,58,7,1:28.603,215.464,1
4,5,18,5,1,23,3,5,5,5,4.0,58,+18.014,5708630,43,1,1:27.418,218.385,1


We can see in the above `results_df` dataset that we have variables `laps` and `position`. `laps` marks the number of completed laps, `position` tells us the placement of the cars at the end of each race.

In [8]:
"""
Filter results_df so that position = 1. This only gives us the first place driver, who by taking first must
have finished the race and therefore completed the maximum number of laps.
"""
wanted_position = ['1']
position_fin = results_df[results_df['position'].isin(wanted_position)]
position_fin.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.3,1
22,23,19,8,6,1,2,1,1,1,10.0,56,1:31:18.555,5478555,37,2,1:35.405,209.158,1
44,45,20,13,6,2,2,1,1,1,10.0,57,1:31:06.970,5466970,38,3,1:33.600,208.153,1
66,67,21,8,6,1,1,1,1,1,10.0,66,1:38:19.051,5899051,46,1,1:21.670,205.191,1
88,89,22,13,6,2,1,1,1,1,10.0,58,1:26:49.451,5209451,16,3,1:26.666,221.734,1


In [9]:
# Create the variable TotalLaps. This will directly mirror the variable laps.
position_fin['TotalLaps'] = position_fin['laps']
position_fin.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  position_fin['TotalLaps'] = position_fin['laps']


Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId,TotalLaps
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.3,1,58
22,23,19,8,6,1,2,1,1,1,10.0,56,1:31:18.555,5478555,37,2,1:35.405,209.158,1,56
44,45,20,13,6,2,2,1,1,1,10.0,57,1:31:06.970,5466970,38,3,1:33.600,208.153,1,57
66,67,21,8,6,1,1,1,1,1,10.0,66,1:38:19.051,5899051,46,1,1:21.670,205.191,1,66
88,89,22,13,6,2,1,1,1,1,10.0,58,1:26:49.451,5209451,16,3,1:26.666,221.734,1,58


In [10]:
"""
Drop any unncessary variables. We will be keeping raceId and TotalLaps.
Note that this is in preparation for merging the data with the races_df dataset.
"""
position_fin.drop("resultId", axis = 1, inplace = True)
position_fin.drop("driverId", axis = 1, inplace = True)
position_fin.drop("constructorId", axis = 1, inplace = True)
position_fin.drop("number", axis = 1, inplace = True)
position_fin.drop("grid", axis = 1, inplace = True)
position_fin.drop("position", axis = 1, inplace = True)
position_fin.drop("positionText", axis = 1, inplace = True)
position_fin.drop("positionOrder", axis = 1, inplace = True)
position_fin.drop("points", axis = 1, inplace = True)
position_fin.drop("laps", axis = 1, inplace = True)
position_fin.drop("time", axis = 1, inplace = True)
position_fin.drop("milliseconds", axis = 1, inplace = True)
position_fin.drop("fastestLap", axis = 1, inplace = True)
position_fin.drop("rank", axis = 1, inplace = True)
position_fin.drop("fastestLapTime", axis = 1, inplace = True)
position_fin.drop("fastestLapSpeed", axis = 1, inplace = True)
position_fin.drop("statusId", axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [11]:
# View position_fin.
position_fin.head()

Unnamed: 0,raceId,TotalLaps
0,18,58
22,19,56
44,20,57
66,21,66
88,22,58


## Merge Datasets

In [12]:
# Merge position_fin with races_df by "raceId" to get final_races_df.
final_races_df = pd.merge(position_fin, races_df, on = "raceId")
final_races_df.head()

Unnamed: 0,raceId,TotalLaps,year,round,circuitId,name,date,time,url
0,18,58,2008,1,1,Australian Grand Prix,2008-03-16,04:30:00,http://en.wikipedia.org/wiki/2008_Australian_G...
1,19,56,2008,2,2,Malaysian Grand Prix,2008-03-23,07:00:00,http://en.wikipedia.org/wiki/2008_Malaysian_Gr...
2,20,57,2008,3,3,Bahrain Grand Prix,2008-04-06,11:30:00,http://en.wikipedia.org/wiki/2008_Bahrain_Gran...
3,21,66,2008,4,4,Spanish Grand Prix,2008-04-27,12:00:00,http://en.wikipedia.org/wiki/2008_Spanish_Gran...
4,22,58,2008,5,5,Turkish Grand Prix,2008-05-11,12:00:00,http://en.wikipedia.org/wiki/2008_Turkish_Gran...


In [13]:
# Merge final_races_df with results_df by raceId to get race_results_df.
race_results_df = pd.merge(final_races_df, results_df, on = "raceId")
race_results_df

Unnamed: 0,raceId,TotalLaps,year,round,circuitId,name,date,time_x,url,resultId,...,positionOrder,points,laps,time_y,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,18,58,2008,1,1,Australian Grand Prix,2008-03-16,04:30:00,http://en.wikipedia.org/wiki/2008_Australian_G...,1,...,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.300,1
1,18,58,2008,1,1,Australian Grand Prix,2008-03-16,04:30:00,http://en.wikipedia.org/wiki/2008_Australian_G...,2,...,2,8.0,58,+5.478,5696094,41,3,1:27.739,217.586,1
2,18,58,2008,1,1,Australian Grand Prix,2008-03-16,04:30:00,http://en.wikipedia.org/wiki/2008_Australian_G...,3,...,3,6.0,58,+8.163,5698779,41,5,1:28.090,216.719,1
3,18,58,2008,1,1,Australian Grand Prix,2008-03-16,04:30:00,http://en.wikipedia.org/wiki/2008_Australian_G...,4,...,4,5.0,58,+17.181,5707797,58,7,1:28.603,215.464,1
4,18,58,2008,1,1,Australian Grand Prix,2008-03-16,04:30:00,http://en.wikipedia.org/wiki/2008_Australian_G...,5,...,5,4.0,58,+18.014,5708630,43,1,1:27.418,218.385,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25199,1060,71,2021,9,70,Austrian Grand Prix,2021-07-04,13:00:00,http://en.wikipedia.org/wiki/2021_Austrian_Gra...,25141,...,16,0.0,70,\N,\N,62,11,1:08.520,226.865,11
25200,1060,71,2021,9,70,Austrian Grand Prix,2021-07-04,13:00:00,http://en.wikipedia.org/wiki/2021_Austrian_Gra...,25142,...,17,0.0,69,\N,\N,51,8,1:08.420,227.196,4
25201,1060,71,2021,9,70,Austrian Grand Prix,2021-07-04,13:00:00,http://en.wikipedia.org/wiki/2021_Austrian_Gra...,25143,...,18,0.0,69,\N,\N,56,18,1:09.394,224.007,12
25202,1060,71,2021,9,70,Austrian Grand Prix,2021-07-04,13:00:00,http://en.wikipedia.org/wiki/2021_Austrian_Gra...,25144,...,19,0.0,69,\N,\N,49,19,1:09.757,222.842,12


## Find Total Laps Completed by Individual Drivers

In [14]:
status_df.head()

Unnamed: 0,statusId,status,familyStatus,Completion Status
0,1,Finished,4,1
1,2,Disqualified,3,0
2,3,Accident,1,0
3,4,Collision,2,0
4,5,Engine,6,0


Note: We realized that because the "+n Laps" status still means that these drivers finished the race, they have familyStatus = 4. We can simply assign instances with familyStatus = 4 the total number of laps in a race. We do not have to bother with adding the n number of laps in their status and adding it to their current laps count. That being said, a lot of work was done with that initial idea in mind, and it is shown below. It is not work that we will use, though.

### Initial Work (not to be used)

We know from our `status_df` dataset that there are a lot of drivers with "+n Laps" as their status. Because of this, their laps count is lower than it should be. We can fix this by taking the n number of laps in their status and adding it to their current laps count.

In [15]:
# Find statuses with numbers.
status_df['Contain#'] = status_df['status'].str.contains('\\d')

# Let's take a look at the statuses with numbers.
wanted_num = [True]
status_true_num = status_df[status_df['Contain#'].isin(wanted_num)]
status_true_num

Unnamed: 0,statusId,status,familyStatus,Completion Status,Contain#
10,11,+1 Lap,4,1,True
11,12,+2 Laps,4,1,True
12,13,+3 Laps,4,1,True
13,14,+4 Laps,4,1,True
14,15,+5 Laps,4,1,True
15,16,+6 Laps,4,1,True
16,17,+7 Laps,4,1,True
17,18,+8 Laps,4,1,True
18,19,+9 Laps,4,1,True
44,45,+11 Laps,4,1,True


We can see above that all of the statuses with numbers in them are "+n Laps" statuses with one exception: "107% Rule". The statuses with "+n Laps" are 11-19, 45, 50, 128, 53, 55, 58, 88, 111-120, 122-125, 127, 133-134.

In [16]:
"""
Create a new variable, AddToLaps, that will signify the number of laps to add to a driver's initial lap count.
Do this by making a for loop through the status_true_num dataset that selects just 1:3 of the substring
in the status variable. Put these selections into a new variable, AddToLaps.

https://www.dataquest.io/blog/tutorial-advanced-for-loops-python-pandas/
https://www.guru99.com/learning-python-strings-replace-join-split-reverse.html

"""
for label, row in status_true_num.iterrows():
    status_true_num.loc[label, "AddToLaps"] = row["status"][1:3]
    
status_true_num

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0,statusId,status,familyStatus,Completion Status,Contain#,AddToLaps
10,11,+1 Lap,4,1,True,1
11,12,+2 Laps,4,1,True,2
12,13,+3 Laps,4,1,True,3
13,14,+4 Laps,4,1,True,4
14,15,+5 Laps,4,1,True,5
15,16,+6 Laps,4,1,True,6
16,17,+7 Laps,4,1,True,7
17,18,+8 Laps,4,1,True,8
18,19,+9 Laps,4,1,True,9
44,45,+11 Laps,4,1,True,11


In [17]:
"""
Drop the row with "107% Rule".
https://www.shanelynn.ie/pandas-drop-delete-dataframe-rows-columns/
"""
status_true_num = status_true_num.drop(labels = 76, axis = 0)
status_true_num

Unnamed: 0,statusId,status,familyStatus,Completion Status,Contain#,AddToLaps
10,11,+1 Lap,4,1,True,1
11,12,+2 Laps,4,1,True,2
12,13,+3 Laps,4,1,True,3
13,14,+4 Laps,4,1,True,4
14,15,+5 Laps,4,1,True,5
15,16,+6 Laps,4,1,True,6
16,17,+7 Laps,4,1,True,7
17,18,+8 Laps,4,1,True,8
18,19,+9 Laps,4,1,True,9
44,45,+11 Laps,4,1,True,11


#### Merge Datasets Again

In [18]:
# Merge status_true_num with race_results_df by statusId to get race_lap_results_df.
race_lap_results_df = pd.merge(race_results_df, status_true_num, on = "statusId", how = 'outer')
race_lap_results_df.head()

Unnamed: 0,raceId,TotalLaps,year,round,circuitId,name,date,time_x,url,resultId,...,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId,status,familyStatus,Completion Status,Contain#,AddToLaps
0,18.0,58.0,2008.0,1.0,1.0,Australian Grand Prix,2008-03-16,04:30:00,http://en.wikipedia.org/wiki/2008_Australian_G...,1.0,...,39,2,1:27.452,218.3,1,,,,,
1,18.0,58.0,2008.0,1.0,1.0,Australian Grand Prix,2008-03-16,04:30:00,http://en.wikipedia.org/wiki/2008_Australian_G...,2.0,...,41,3,1:27.739,217.586,1,,,,,
2,18.0,58.0,2008.0,1.0,1.0,Australian Grand Prix,2008-03-16,04:30:00,http://en.wikipedia.org/wiki/2008_Australian_G...,3.0,...,41,5,1:28.090,216.719,1,,,,,
3,18.0,58.0,2008.0,1.0,1.0,Australian Grand Prix,2008-03-16,04:30:00,http://en.wikipedia.org/wiki/2008_Australian_G...,4.0,...,58,7,1:28.603,215.464,1,,,,,
4,18.0,58.0,2008.0,1.0,1.0,Australian Grand Prix,2008-03-16,04:30:00,http://en.wikipedia.org/wiki/2008_Australian_G...,5.0,...,43,1,1:27.418,218.385,1,,,,,


In [19]:
for label, row in race_lap_results_df.iterrows():
    if  race_lap_results_df.loc[label, "AddToLaps"] == 'NaN':
        race_lap_results_df.loc[label, "AddToLaps"].replace("NaN", 0)
    
race_lap_results_df.head()

Unnamed: 0,raceId,TotalLaps,year,round,circuitId,name,date,time_x,url,resultId,...,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId,status,familyStatus,Completion Status,Contain#,AddToLaps
0,18.0,58.0,2008.0,1.0,1.0,Australian Grand Prix,2008-03-16,04:30:00,http://en.wikipedia.org/wiki/2008_Australian_G...,1.0,...,39,2,1:27.452,218.3,1,,,,,
1,18.0,58.0,2008.0,1.0,1.0,Australian Grand Prix,2008-03-16,04:30:00,http://en.wikipedia.org/wiki/2008_Australian_G...,2.0,...,41,3,1:27.739,217.586,1,,,,,
2,18.0,58.0,2008.0,1.0,1.0,Australian Grand Prix,2008-03-16,04:30:00,http://en.wikipedia.org/wiki/2008_Australian_G...,3.0,...,41,5,1:28.090,216.719,1,,,,,
3,18.0,58.0,2008.0,1.0,1.0,Australian Grand Prix,2008-03-16,04:30:00,http://en.wikipedia.org/wiki/2008_Australian_G...,4.0,...,58,7,1:28.603,215.464,1,,,,,
4,18.0,58.0,2008.0,1.0,1.0,Australian Grand Prix,2008-03-16,04:30:00,http://en.wikipedia.org/wiki/2008_Australian_G...,5.0,...,43,1,1:27.418,218.385,1,,,,,


### Work to be Used

Like we noted earlier, the "+n Laps" status means that these drivers finished the race, and they have familyStatus = 4. We can assign instances with this familyStatus the total number of laps in a race. Instances with any other familyStatus will be assigned their completed number of laps.

#### Merge Datasets

In [20]:
# Merge race_results_df with status_df by statusId to get race_status_df.
race_status_df = pd.merge(race_results_df, status_df, on = "statusId")
race_status_df.describe()

Unnamed: 0,raceId,TotalLaps,year,round,circuitId,resultId,driverId,constructorId,grid,positionOrder,points,laps,statusId,familyStatus,Completion Status
count,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0
mean,516.937589,65.885177,1989.289081,8.240914,22.141287,12588.639859,250.35907,47.406721,11.207507,12.933066,1.787873,45.836613,17.76718,4.108792,0.547889
std,289.156434,22.676536,18.9399,4.861397,16.662924,7257.528256,257.164574,58.216417,7.273266,7.747447,4.014859,30.0412,26.14447,1.377368,0.497711
min,1.0,12.0,1950.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
25%,287.0,54.0,1976.0,4.0,9.0,6301.75,56.0,6.0,5.0,6.0,0.0,21.0,1.0,4.0,0.0
50%,502.0,65.0,1990.0,8.0,18.0,12602.5,159.0,25.0,11.0,12.0,0.0,52.0,11.0,4.0,1.0
75%,761.0,75.0,2005.0,12.0,32.0,18883.25,347.0,58.0,17.0,19.0,2.0,66.0,15.0,5.0,1.0
max,1060.0,200.0,2021.0,21.0,76.0,25145.0,854.0,214.0,34.0,39.0,50.0,200.0,139.0,6.0,1.0


#### Assign Laps

In [21]:
"""
Create the variable CompletedLaps.
Remember that familyStatus = 4 means that the driver finished the race and completed every lap.
"""
race_status_df['CompletedLaps'] = np.where(race_status_df.familyStatus == 4, race_status_df.TotalLaps, race_status_df.laps)
race_status_df.describe()

Unnamed: 0,raceId,TotalLaps,year,round,circuitId,resultId,driverId,constructorId,grid,positionOrder,points,laps,statusId,familyStatus,Completion Status,CompletedLaps
count,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0,25204.0
mean,516.937589,65.885177,1989.289081,8.240914,22.141287,12588.639859,250.35907,47.406721,11.207507,12.933066,1.787873,45.836613,17.76718,4.108792,0.547889,46.485955
std,289.156434,22.676536,18.9399,4.861397,16.662924,7257.528256,257.164574,58.216417,7.273266,7.747447,4.014859,30.0412,26.14447,1.377368,0.497711,30.565368
min,1.0,12.0,1950.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
25%,287.0,54.0,1976.0,4.0,9.0,6301.75,56.0,6.0,5.0,6.0,0.0,21.0,1.0,4.0,0.0,21.0
50%,502.0,65.0,1990.0,8.0,18.0,12602.5,159.0,25.0,11.0,12.0,0.0,52.0,11.0,4.0,1.0,53.0
75%,761.0,75.0,2005.0,12.0,32.0,18883.25,347.0,58.0,17.0,19.0,2.0,66.0,15.0,5.0,1.0,67.0
max,1060.0,200.0,2021.0,21.0,76.0,25145.0,854.0,214.0,34.0,39.0,50.0,200.0,139.0,6.0,1.0,200.0


#### Create a CSV file of race_status_df.

In [25]:
race_status_df.to_csv("race_status.csv", index = False)

## Find the Average Speed

The code below is currently unfinished, the syntax was giving me difficulty.

In [22]:
#race_status_df["AvgSpeed"] = int(race_status_df["milliseconds"]) / race_status_df["CompletedLaps"]
#race_status_df.head()

In [23]:
#for label, row in race_status_df.iterrows():
#    race_status_df.loc[label, "Milliseconds"] = int(row["milliseconds"])
#    race_status_df.loc[label, "AvgSpeed"] = row["milliseconds"] / row["CompletedLaps"]
    
#race_status_df.head()

In [24]:
#race_status_df = [int(milliseconds) for milliseconds in race_status_df]
#race_status_df.head()