# Team F1 Data Wrangling Master 

-Goals and Objectives:
1. Mold the Results csv into basic shape
2. Add in all required columns
3. address all N/A's
4. Save as a separate CSV
5. Enact test/train split

In [45]:
import numpy as np
import pandas as pd

%config InlineBackend.figure_format = 'retina'

## Molding the results.csv into basic shape

In [107]:
#results
results_url = 'https://raw.githubusercontent.com/georgetown-analytics/Formula1/main/data/results.csv'
results_df = pd.read_csv(results_url, sep = ',', engine = 'python')
results_df.head()

#status
status_url = 'https://raw.githubusercontent.com/georgetown-analytics/Formula1/main/data/status.csv'
status_df = pd.read_csv(status_url, sep = ',', engine = 'python')

#circuits
circuits_url = 'https://raw.githubusercontent.com/georgetown-analytics/Formula1/main/data/circuits.csv'
circuits_df = pd.read_csv(circuits_url, sep = ',', encoding = 'latin-1')

#races
races_url = 'https://raw.githubusercontent.com/georgetown-analytics/Formula1/main/data/races.csv'
races_df = pd.read_csv(races_url, sep = ',', engine = 'c')

#Drivers
drivers_url = 'https://raw.githubusercontent.com/georgetown-analytics/Formula1/main/data/drivers.csv'
drivers_df = pd.read_csv(drivers_url, sep = ',', encoding = 'latin-1')

### Tasks
1. drop:
    1. resultID
    2. number
    3. points
    4. time
    5. fastestLapTime

In [94]:
#results_df = results_df.drop(['resultId','number','points','time','fastestLapTime'], axis=1, inplace=False)
results_df.head()

Unnamed: 0,raceId,driverId,constructorId,grid,position,positionText,positionOrder,laps,milliseconds,fastestLap,rank,fastestLapSpeed,statusId
0,18,1,1,1,1,1,1,58,5690616,39,2,218.3,1
1,18,2,2,5,2,2,2,58,5696094,41,3,217.586,1
2,18,3,3,7,3,3,3,58,5698779,41,5,216.719,1
3,18,4,4,11,4,4,4,58,5707797,58,7,215.464,1
4,18,5,1,3,5,5,5,58,5708630,43,1,218.385,1


## Adding Required Columns

### Tasks:
1. Add Race Outcome from status.csv
2. Add FamilyID from status.csv
3. Add circuitType from circuits.csv
4. Add isHistoric from circuits.csv
5. Add Driver Place of Origin

#### #1 and #2
Adding race outcome and familyID from status.csv

In [71]:
merged2_df = pd.merge(results_df,status_df,on='statusId')
#note - I did this with just an inner join because we know for sure that there are no nulls in statusID

In [89]:
merged2_df.head()

Unnamed: 0,raceId,driverId,constructorId,grid,position,positionText,positionOrder,laps,milliseconds,fastestLap,rank,fastestLapSpeed,statusId,status,familyStatus,Completion Status
0,18,1,1,1,1,1,1,58,5690616,39,2,218.3,1,Finished,4,1
1,18,2,2,5,2,2,2,58,5696094,41,3,217.586,1,Finished,4,1
2,18,3,3,7,3,3,3,58,5698779,41,5,216.719,1,Finished,4,1
3,18,4,4,11,4,4,4,58,5707797,58,7,215.464,1,Finished,4,1
4,18,5,1,3,5,5,5,58,5708630,43,1,218.385,1,Finished,4,1


In [95]:
#get rid of the leftovers on merged2
merged2_df = merged2_df.drop(columns=['status','statusId'],axis=1,inplace=False)

In [96]:
merged2_df

Unnamed: 0,raceId,driverId,constructorId,grid,position,positionText,positionOrder,laps,milliseconds,fastestLap,rank,fastestLapSpeed,familyStatus,Completion Status
0,18,1,1,1,1,1,1,58,5690616,39,2,218.300,4,1
1,18,2,2,5,2,2,2,58,5696094,41,3,217.586,4,1
2,18,3,3,7,3,3,3,58,5698779,41,5,216.719,4,1
3,18,4,4,11,4,4,4,58,5707797,58,7,215.464,4,1
4,18,5,1,3,5,5,5,58,5708630,43,1,218.385,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25135,942,8,6,18,\N,R,16,25,\N,24,14,186.905,6,0
25136,976,815,10,6,\N,R,15,39,\N,37,8,204.670,6,0
25137,1010,817,4,12,\N,R,19,28,\N,18,19,212.478,1,0
25138,1037,847,3,15,\N,R,19,9,\N,5,18,225.624,1,0


#### #3 and #4
adding circuitType and isHistoric from circuits.csv

In [42]:
#step0 - confirming no nulls in raceID


resultId           0
raceId             0
driverId           0
constructorId      0
number             0
grid               0
position           0
positionText       0
positionOrder      0
points             0
laps               0
time               0
milliseconds       0
fastestLap         0
rank               0
fastestLapTime     0
fastestLapSpeed    0
statusId           0
dtype: int64

In [97]:
#step 1, prep a trimmed version of races_df for left merging into merge1_df
races_df = races_df.drop(['round','name','date','time','url'],axis=1, inplace=False)

In [98]:
races_df

Unnamed: 0,raceId,year,circuitId
0,1,2009,1
1,2,2009,2
2,3,2009,17
3,4,2009,3
4,5,2009,4
...,...,...,...
1053,1069,2021,69
1054,1070,2021,32
1055,1071,2021,18
1056,1072,2021,77


In [99]:
#step 2, bring in circuitID from races.csv INTO merged2_df (refined results.csv) to prep the join
merged3_df = pd.merge(merged2_df, races_df, on='raceId', how='left')

In [100]:
merged3_df.head()

Unnamed: 0,raceId,driverId,constructorId,grid,position,positionText,positionOrder,laps,milliseconds,fastestLap,rank,fastestLapSpeed,familyStatus,Completion Status,year,circuitId
0,18,1,1,1,1,1,1,58,5690616,39,2,218.3,4,1,2008,1
1,18,2,2,5,2,2,2,58,5696094,41,3,217.586,4,1,2008,1
2,18,3,3,7,3,3,3,58,5698779,41,5,216.719,4,1,2008,1
3,18,4,4,11,4,4,4,58,5707797,58,7,215.464,4,1,2008,1
4,18,5,1,3,5,5,5,58,5708630,43,1,218.385,4,1,2008,1


In [None]:
#step 2, left join the relevant columns into merged3_df from circuits.csv using circuitID as the joining column

In [101]:
#dropping the fluff out of circuits ahead of merge
circuits_df = circuits_df.drop(columns=['circuitRef','name','location','lat','lng','url'],axis=1,inplace=False)

In [102]:
circuits_df.head()

Unnamed: 0,circuitId,country,alt
0,1,Australia,10
1,2,Malaysia,18
2,3,Bahrain,7
3,4,Spain,109
4,5,Turkey,130


In [104]:
#merging in circuits to merged3_df on circuitID
merge4_df = pd.merge(merged3_df,circuits_df,on='circuitId',how='left')

In [105]:
merge4_df.head()

Unnamed: 0,raceId,driverId,constructorId,grid,position,positionText,positionOrder,laps,milliseconds,fastestLap,rank,fastestLapSpeed,familyStatus,Completion Status,year,circuitId,country,alt
0,18,1,1,1,1,1,1,58,5690616,39,2,218.3,4,1,2008,1,Australia,10
1,18,2,2,5,2,2,2,58,5696094,41,3,217.586,4,1,2008,1,Australia,10
2,18,3,3,7,3,3,3,58,5698779,41,5,216.719,4,1,2008,1,Australia,10
3,18,4,4,11,4,4,4,58,5707797,58,7,215.464,4,1,2008,1,Australia,10
4,18,5,1,3,5,5,5,58,5708630,43,1,218.385,4,1,2008,1,Australia,10


In [106]:
#doing a null check
merge4_df.isna().sum()

raceId               0
driverId             0
constructorId        0
grid                 0
position             0
positionText         0
positionOrder        0
laps                 0
milliseconds         0
fastestLap           0
rank                 0
fastestLapSpeed      0
familyStatus         0
Completion Status    0
year                 0
circuitId            0
country              0
alt                  0
dtype: int64

### #5, Adding in Driver Nationality

In [110]:
merge5_df = pd.merge(merge4_df,drivers_df[['driverId','nationality']],on='driverId',how='left')
#did it a different way here, merged just some select columns so I didn't need to do any drops

In [112]:
merge5_df.head()

Unnamed: 0,raceId,driverId,constructorId,grid,position,positionText,positionOrder,laps,milliseconds,fastestLap,rank,fastestLapSpeed,familyStatus,Completion Status,year,circuitId,country,alt,nationality
0,18,1,1,1,1,1,1,58,5690616,39,2,218.3,4,1,2008,1,Australia,10,British
1,18,2,2,5,2,2,2,58,5696094,41,3,217.586,4,1,2008,1,Australia,10,German
2,18,3,3,7,3,3,3,58,5698779,41,5,216.719,4,1,2008,1,Australia,10,German
3,18,4,4,11,4,4,4,58,5707797,58,7,215.464,4,1,2008,1,Australia,10,Spanish
4,18,5,1,3,5,5,5,58,5708630,43,1,218.385,4,1,2008,1,Australia,10,Finnish


In [113]:
#here im going to drop our 3 big key's (raceID, driverID, constructorID) to finalize this before saving
#if we ever need these again we can look at merge5_df
final_df = merge5_df.drop(columns=['constructorId','driverId','raceId'],axis=1,inplace=False)

### Final Step for basic setup - save as a CSV

In [117]:
final_df.to_csv('MasterData1.csv',index=False)

In [118]:
print(pd.read_csv('MasterData1.csv'))

       grid position positionText  positionOrder  laps milliseconds  \
0         1        1            1              1    58      5690616   
1         5        2            2              2    58      5696094   
2         7        3            3              3    58      5698779   
3        11        4            4              4    58      5707797   
4         3        5            5              5    58      5708630   
...     ...      ...          ...            ...   ...          ...   
25135    18       \N            R             16    25           \N   
25136     6       \N            R             15    39           \N   
25137    12       \N            R             19    28           \N   
25138    15       \N            R             19     9           \N   
25139    17       \N            R             18    47           \N   

      fastestLap rank fastestLapSpeed  familyStatus  Completion Status  year  \
0             39    2         218.300             4                