# Team F1 Data Wrangling Master 

-Goals and Objectives:
1. Mold the Results csv into basic shape
2. Add in all required columns
3. address all N/A's
4. Save as a separate CSV
5. Enact test/train split

In [30]:
import numpy as np
import pandas as pd
from collections import Counter

## Import statements for our CSV's

In [31]:
# Every raw table is read from the same folder of the project's GitHub repository;
# only the file name (and, for the weather table, the interim/ sub-folder) differs.
DATA_ROOT = 'https://raw.githubusercontent.com/georgetown-analytics/Formula1/main/data'

#results
results_url = f'{DATA_ROOT}/results.csv'
results_df = pd.read_csv(results_url, sep=',', engine='python')
results_df.head()  # kept from the original cell

#status
status_url = f'{DATA_ROOT}/status.csv'
status_df = pd.read_csv(status_url, sep=',', engine='python')

#circuits (read as latin-1)
circuits_url = f'{DATA_ROOT}/circuits.csv'
circuits_df = pd.read_csv(circuits_url, sep=',', encoding='latin-1')

#races
races_url = f'{DATA_ROOT}/races.csv'
races_df = pd.read_csv(races_url, sep=',', engine='c')

#Drivers (read as latin-1)
drivers_url = f'{DATA_ROOT}/drivers.csv'
drivers_df = pd.read_csv(drivers_url, sep=',', encoding='latin-1')

#lap Times
lap_times_url = f'{DATA_ROOT}/lap_times.csv'
lap_times_df = pd.read_csv(lap_times_url, sep=',', engine='python')

#MasterData1
MasterData1_url = f'{DATA_ROOT}/MasterData1.csv'
MasterData1_df = pd.read_csv(MasterData1_url, sep=',', engine='python')

#races-v4 (race-level weather columns)
race_weather_url = f'{DATA_ROOT}/interim/races-v4.csv'
race_weather_df = pd.read_csv(race_weather_url, sep=',', engine='python')

## Molding Results.csv into basic shape

### Tasks
1. drop:
    1. resultID
    2. number
    3. points
    4. time
    5. fastestLapTime

In [4]:
# Drop the five columns named in the task list; drop() returns a new frame, so rebind.
results_df = results_df.drop(columns=['resultId', 'number', 'points', 'time', 'fastestLapTime'])
results_df.head()

Unnamed: 0,raceId,driverId,constructorId,grid,position,positionText,positionOrder,laps,milliseconds,fastestLap,rank,fastestLapSpeed,statusId
0,18,1,1,1,1,1,1,58,5690616,39,2,218.3,1
1,18,2,2,5,2,2,2,58,5696094,41,3,217.586,1
2,18,3,3,7,3,3,3,58,5698779,41,5,216.719,1
3,18,4,4,11,4,4,4,58,5707797,58,7,215.464,1
4,18,5,1,3,5,5,5,58,5708630,43,1,218.385,1


## Adding Required Columns

### Tasks:
1. Add Race Outcome (`Completion Status`) from status.csv
2. Add FamilyID (`familyStatus`) from status.csv
3. Add circuitType (`trackType`) from circuits.csv
4. Add isHistoric from circuits.csv
5. Add Driver Place of Origin (`nationality`) from drivers.csv

#### #1 and #2
Adding race outcome and familyID from status.csv

In [5]:
# Attach the status.csv columns to each result row, matching on statusId.
#note - an inner join is enough here because we know for sure that there are no nulls in statusID
merged2_df = results_df.merge(status_df, on='statusId', how='inner')

In [6]:
#get rid of the leftovers on merged2: the join key and the raw status column
merged2_df = merged2_df.drop(columns=['status', 'statusId'])

#### #3 and #4
adding circuitType and isHistoric from circuits.csv

In [7]:
#step 1, trim races_df down ahead of the left merge into merged2_df
races_df = races_df.drop(columns=['round', 'name', 'date', 'time', 'url'])

In [8]:
#step 2, left-merge the trimmed races table onto merged2_df (refined results.csv) by raceId,
#which brings in circuitId to prep the join with circuits
merged3_df = merged2_df.merge(races_df, on='raceId', how='left')

In [9]:
#step 3, left join the relevant columns into merged3_df from circuits.csv using circuitID as the joining column

In [10]:
#dropping the fluff out of circuits ahead of the merge
circuits_df = circuits_df.drop(columns=['circuitRef', 'name', 'location', 'lat', 'lng', 'url'])

In [20]:
circuits_df['trackType'].value_counts()

0    47
2    20
1    10
Name: trackType, dtype: int64

In [21]:
#left-merging the trimmed circuits table onto merged3_df by circuitId
merge4_df = merged3_df.merge(circuits_df, on='circuitId', how='left')

In [29]:
merge4_df['trackType'].value_counts()

0    19363
2     5018
1      759
Name: trackType, dtype: int64

### #5, Adding in Driver Nationality

In [22]:
#did it a different way here: only the join key and nationality are taken from drivers_df,
#so no follow-up drop is needed
merge5_df = merge4_df.merge(
    drivers_df.loc[:, ['driverId', 'nationality']],
    on='driverId',
    how='left',
)

In [38]:
merge5_df['trackType'].value_counts()

0    19363
2     5018
1      759
Name: trackType, dtype: int64

# Data Wrangling / Fixing Nulls Work
## 1 - fixing milliseconds (there are a lot of nulls)

### Plan -

#### Step 1 - Drop the old milliseconds column from merge5_df and add the new milliseconds (from our new df called summed_lapTimes)

#### Step 2 - Take all lap times (in milliseconds) from lap_times_df and sum them for each driver in each race, thus replicating the total time column (this does NOT factor in laps completed, so even if a driver completes just one lap, they will still have a time)


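##### Key Note - Drivers who don't complete a lap at all (vehicle failure at start-line) have no lap times to sum (there are approx. 217 of these); because the merge below is a right join onto the summed lap times, these rows drop out of the data rather than remaining as Nulls.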

In [23]:
# Remove the original milliseconds column; a recomputed total is merged in afterwards.
merge5_df = merge5_df.drop(columns='milliseconds')

In [24]:
# Total lap time per (raceId, driverId): sum of that driver's lap milliseconds in the race.
summed_lapTimes = (
    lap_times_df
    .groupby(['raceId', 'driverId'], as_index=False)
    .agg(milliseconds=('milliseconds', 'sum'))
)

In [32]:
summed_lapTimes.head()

Unnamed: 0,raceId,driverId,milliseconds
0,1,1,5658698
1,1,2,5662869
2,1,3,5661506
3,1,4,5660663
4,1,6,1560978


In [39]:
# Right join: every (raceId, driverId) key of summed_lapTimes is kept, so result rows
# that have no lap-time data do not appear in merge6_df.
#When I attempted this using a left join by accident, we got numerous nulls in a variety of columns.
merge6_df = merge5_df.merge(summed_lapTimes, on=['raceId', 'driverId'], how='right')

In [40]:
#Renaming Columns: the summed lap milliseconds become total_lap_time
merge6_df = merge6_df.rename(columns={'milliseconds': 'total_lap_time'})

In [41]:
merge6_df['trackType'].value_counts()

0    7212
2    2254
Name: trackType, dtype: int64

## Creating Tables of Average Lap Times (milliseconds) and Minimum Lap Times (milliseconds) for each driver in each race

#### Average Laptime

In [44]:
# Mean lap time (milliseconds) per (raceId, driverId), named average_lap_time directly.
average_lapTime = (
    lap_times_df
    .groupby(['raceId', 'driverId'], as_index=False)
    .agg(average_lap_time=('milliseconds', 'mean'))
)

#### Minimum Lap Time

In [45]:
# Fastest single lap (milliseconds) per (raceId, driverId), named minimum_lap_time directly.
min_lapTime = (
    lap_times_df
    .groupby(['raceId', 'driverId'], as_index=False)
    .agg(minimum_lap_time=('milliseconds', 'min'))
)

#### Bringing it all together now

In [46]:
#Bringing in avg lap time (inner join on the race/driver pair)
merge7_df = merge6_df.merge(average_lapTime, on=['raceId', 'driverId'], how='inner')

In [47]:
#bringing in minimum lap time (inner join on the race/driver pair)
merge8_df = merge7_df.merge(min_lapTime, on=['raceId', 'driverId'], how='inner')

In [48]:
#replacing the literal '\N' placeholder with real NaN, probably should have done this sooner
# np.nan is used because the np.NaN alias was removed in NumPy 2.0 (AttributeError there).
merge8_df = merge8_df.replace(r'\N', np.nan)

In [49]:
#bringing in weather data: only raceId plus the four weather columns are taken from race_weather_df
merge9 = merge8_df.merge(
    race_weather_df.loc[:, ['raceId', 'PRCP', 'TAVG', 'TMAX', 'TMIN']],
    on='raceId',
    how='left',
)

In [23]:
#Null check: only position, fastestLap, rank and fastestLapSpeed still contain nulls
merge9.isna().sum()

raceId                  0
driverId                0
constructorId           0
grid                    0
position             2164
positionText            0
positionOrder           0
laps                    0
fastestLap           2745
rank                 2704
fastestLapSpeed      2745
familyStatus            0
Completion Status       0
year                    0
circuitId               0
country                 0
alt                     0
isHistoric              0
trackType               0
nationality             0
total_lap_time          0
average_lap_time        0
minimum_lap_time        0
PRCP                    0
TAVG                    0
TMAX                    0
TMIN                    0
dtype: int64

In [50]:
#dropping unnecessary columns (finishing-position and fastest-lap fields)
merge9 = merge9.drop(
    columns=['position', 'positionText', 'positionOrder', 'fastestLap', 'rank', 'fastestLapSpeed']
)

# Grouping up Circuit IDs by Frequency of races

In [51]:
merge9['circuitId'].value_counts()

9     531
4     528
11    523
14    514
6     512
18    490
7     475
1     469
22    452
13    452
2     395
10    369
3     367
17    337
20    331
8     270
21    267
15    251
24    250
70    200
5     169
69    164
23    142
19    140
71    132
12    110
32    101
73    100
35     93
68     69
25     64
34     58
16     41
75     40
26     22
27     20
76     18
Name: circuitId, dtype: int64

In [52]:
def circuit_binner2(row):
    """Return the frequency-bin label for the circuit referenced by ``row``.

    ``row`` is anything indexable by ``'circuitId'`` (for example a DataFrame
    row passed by ``DataFrame.apply(..., axis=1)``). The label is one of the
    strings "1" through "6"; a circuitId that is not listed in any bin gives
    the string "error".
    """
    # (label, member circuitIds); the trailing comment is the row-count band
    # the bin was built from.
    bins = (
        ("1", (9, 4, 11, 14, 6)),                       # 500s
        ("2", (18, 7, 1, 22, 13)),                      # 400s
        ("3", (2, 10, 3, 17, 20)),                      # 300s
        ("4", (8, 21, 15, 24, 70)),                     # 200s
        ("5", (5, 69, 23, 19, 71, 12, 32, 73)),         # 100s
        ("6", (35, 68, 25, 34, 16, 75, 26, 27, 76)),    # <100s
    )
    circuit = row['circuitId']
    for label, members in bins:
        if circuit in members:
            return label
    return "error"

In [53]:
# Run circuit_binner2 on every row (axis=1) and store the returned label in a new column.
merge9['binned_circuits'] = merge9.apply(circuit_binner2, axis=1)

In [54]:
merge9['binned_circuits'].value_counts()

1    2608
2    2338
3    1799
4    1238
5    1058
6     425
Name: binned_circuits, dtype: int64

In [55]:
merge9.isna().sum()

raceId               0
driverId             0
constructorId        0
grid                 0
laps                 0
familyStatus         0
Completion Status    0
year                 0
circuitId            0
country              0
alt                  0
isHistoric           0
trackType            0
nationality          0
total_lap_time       0
average_lap_time     0
minimum_lap_time     0
PRCP                 0
TAVG                 0
TMAX                 0
TMIN                 0
binned_circuits      0
dtype: int64

# Dropping Noise from Data

## Dropped two phenomena here:

### 1. Dropped all Lap 0 vehicle failures (nothing in our data explains why car failures occur at start line)

### 2. Dropped all DQ's and other erroneous failures not based strictly on car performance

In [56]:
merge9['familyStatus'].value_counts()

4    7108
6    1377
1     435
2     302
3     208
5      36
Name: familyStatus, dtype: int64

In [57]:
merge9['Completion Status'].value_counts(normalize = True)

1    0.750898
0    0.249102
Name: Completion Status, dtype: float64

In [58]:
# Keep every row except those with zero laps AND familyStatus 6
# (written as "laps != 0 OR familyStatus != 6", the De Morgan form of that exclusion).
keep_mask = (merge9['laps'] != 0) | (merge9['familyStatus'] != 6)
filtered_master = merge9.loc[keep_mask, :]

In [59]:
# Remove every row whose familyStatus is 3.
filtered2_master = filtered_master[filtered_master['familyStatus'].ne(3)]

In [60]:
filtered2_master.head()

Unnamed: 0,raceId,driverId,constructorId,grid,laps,familyStatus,Completion Status,year,circuitId,country,...,trackType,nationality,total_lap_time,average_lap_time,minimum_lap_time,PRCP,TAVG,TMAX,TMIN,binned_circuits
1,1,2,2,9,58,4,1,2009,1,Australia,...,2,German,5662869,97635.672414,88283,0.0,72.0,78.0,66.0,2
2,1,3,3,5,58,4,1,2009,1,Australia,...,2,German,5661506,97612.172414,87706,0.0,72.0,78.0,66.0,2
3,1,4,4,10,58,4,1,2009,1,Australia,...,2,Spanish,5660663,97597.637931,88712,0.0,72.0,78.0,66.0,2
4,1,6,3,11,17,1,0,2009,1,Australia,...,2,Japanese,1560978,91822.235294,89923,0.0,72.0,78.0,66.0,2
5,1,7,5,17,58,4,1,2009,1,Australia,...,2,French,5662082,97622.103448,89823,0.0,72.0,78.0,66.0,2


In [34]:
filtered2_master['familyStatus'].value_counts()

4    7108
6    1377
1     435
2     302
5      36
Name: familyStatus, dtype: int64

In [None]:
# Build the final table by removing 'laps'. The other columns this cell used to list
# (positionText, positionOrder, fastestLap, rank, fastestLapSpeed) were already dropped
# from merge9 earlier, and naming columns that no longer exist makes drop() raise KeyError.
FinalData = filtered2_master.drop(columns=['laps'])

In [35]:
# Write the filtered table to disk without the index column.
# NOTE(review): this saves filtered2_master (which still contains 'laps'), not the
# FinalData frame assigned in the previous cell - confirm which one is intended.
filtered2_master.to_csv("./data/processed/MasterData5.csv", index = False)