In [55]:
import bs4 as bs
import urllib.request
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from functools import wraps
from statistics import mean

In [3]:
def timer(orig_func):
    """
    decorator for logging time of function.
    """
    import time
    
    @wraps(orig_func)
    def wrapper(*args, **kwargs):
        t1 = time.time()
        result = orig_func(*args, *kwargs)
        t2 = time.time() - t1
        print(f"{orig_func.__name__} ran in: {round(t2,3)} seconds")
        return result
    
    return wrapper

In [4]:
source = urllib.request.urlopen(f"https://www.formula1.com/en/results.html/1990/races/64/united-states/race-result.html").read()
soup = bs.BeautifulSoup(source,'lxml')

table = soup.find_all('table')[0] 
df = pd.read_html(str(table), flavor='bs4', header=[0])[0]
df.head()

Unnamed: 0.1,Unnamed: 0,Pos,No,Driver,Car,Laps,Time/Retired,PTS,Unnamed: 8
0,,1,27,Ayrton Senna SEN,McLaren Honda,72,1:52:32.829,9,
1,,2,4,Jean Alesi ALE,Tyrrell Ford,72,+8.685s,6,
2,,3,5,Thierry Boutsen BOU,Williams Renault,72,+54.080s,4,
3,,4,20,Nelson Piquet PIQ,Benetton Ford,72,+68.358s,3,
4,,5,8,Stefano Modena MOD,Brabham Judd,72,+69.503s,2,


In [5]:
HOMEPAGE = 'https://www.formula1.com/'
YEAR = '1990'

In [6]:
@timer
def get_race_urls(year):
    race_urls = []
    source = urllib.request.urlopen(f"https://www.formula1.com/en/results.html/{YEAR}/races.html").read()
    soup = bs.BeautifulSoup(source,'lxml')
    
    for url in soup.find_all('a'):
        if YEAR in str(url.get('href')) and 'race-result' in str(url.get('href')) and url.get('href') not in race_urls:
            race_urls.append(url.get('href'))
    return race_urls

In [97]:
yearly_data = {
    'year': [],
    'lapped_position': [],
    'time_to_second': [],
    'average_delay_CR': []
}
output_df = pd.DataFrame(yearly_data)
output_df

Unnamed: 0,year,lapped_position,time_to_second,average_delay_CR


In [98]:
def get_race_results_by_year (YEAR, output_df):
    race_urls = get_race_urls(YEAR)
    lapped_position=[]
    average_time=[]
    time_to_second=[]

    for n, race in enumerate(race_urls):
        placeholder = [0 for i in range(n)]
        race_name = race.split('/')[6]

        results_page = urllib.request.urlopen(f"{HOMEPAGE}{race}").read()
        race_results = bs.BeautifulSoup(results_page,'lxml')

        table = race_results.find_all('table')[0] 
        df = pd.read_html(str(table), flavor='bs4', header=[0])[0]
        df.drop(["Unnamed: 0","Unnamed: 8"], axis=1, inplace=True)
        df.set_index('No', inplace=True)

        #getting lapped position
        try:
            sub = 'lap'
            df['lapped_position'] = df['Time/Retired'].str.find(sub)
            lapped_racer = df.lapped_position[df.lapped_position >= 0].index.tolist()
            lapped_position.append((int(df.at[lapped_racer[0], 'Pos'])))
            lapped_position_race = (int(df.at[lapped_racer[0], 'Pos']))
        except:
            lapped_position.append(-1)

        #getting time to second
        if lapped_position_race == 2:
            pass
        else:
            try:
                time_to_second.append(float(df.iloc[1, 4].strip('+s')))
            except:
                pass


        #getting average time of completed racers
        times=[]
        for i in range(1, lapped_position_race-1):
            try:
                time = float(df.iloc[i, 4].strip('+s'))
            except:
                pass
            times.append(time)

        if len(times) != 0:
            average_time.append(mean(times))

    #getting output ready
    lapped_position = mean(lapped_position) if len(lapped_position) != 0 else -1
    time_to_second = mean(time_to_second) if len(time_to_second) != 0 else -1
    average_time = mean(average_time) if len(average_time) != 0 else -1


    #adding values to output
    new_row = {'year': YEAR, 'lapped_position': lapped_position, 'time_to_second': time_to_second, 'average_delay_CR': average_time}

    output_df = output_df.append(new_row, ignore_index=True)
    return output_df



In [95]:
YEARS = range(1980, 2021)
YEARS = [str(year) for year in YEARS]


In [90]:
output_df

Unnamed: 0,year,lapped_position,time_to_second,average_delay_CR
0,1980,5.571429,23.705643,50.917952
1,1981,6.533333,15.822533,48.994133


In [99]:
for YEAR in YEARS:
    output_df = get_race_results_by_year(YEAR, output_df)
output_df

get_race_urls ran in: 0.154 seconds


TypeError: object of type 'float' has no len()

In [70]:
output_df