In [1]:
import requests
from bs4 import BeautifulSoup
import json
import html_to_json
from tqdm import tqdm
import pandas as pd
import networkx as nx

In [2]:
# Load the raw results table that the rest of the notebook is derived from.
df_raw = pd.read_csv("data/results.csv")

# "DQ", "NC" and "EX" are the non-numeric codes handled here; they are swapped
# for the sentinel -1 so that the whole `pos` column can be cast to int.
non_numeric_pos = df_raw["pos"].isin(["DQ", "NC", "EX"])
df_raw.loc[non_numeric_pos, "pos"] = -1
df_raw["pos"] = df_raw["pos"].astype(int)

# Largest `pos` value per (year, race).
# The column is selected explicitly before aggregating: GroupBy.max() takes no
# column argument (its first positional parameter is `numeric_only`), so the
# earlier `.max("pos")` handed the string to that flag instead of picking a column.
max_pos = df_raw.groupby(["year", "race"])["pos"].max().reset_index()
# `max_pos` is the slot directly after the largest `pos` seen in that race.
max_pos["max_pos"] = max_pos["pos"] + 1

# Attach each race's `max_pos` to every result row. Both inputs carry a `pos`
# column, so after the merge the row's own value is `pos_x` and the per-race
# maximum is `pos_y`.
tmp = df_raw.merge(max_pos, on=["year", "race"])

# Rows still holding the -1 sentinel take their race's `max_pos` instead.
sentinel_rows = tmp["pos_x"] == -1
tmp.loc[sentinel_rows, "pos_x"] = tmp.loc[sentinel_rows, "max_pos"]

# Keep only the columns used downstream and restore the plain `pos` name.
result_columns = [
    "driver_name",
    "pos_x",
    "car",
    "points",
    "laps",
    "retired",
    "time-retired",
    "race",
    "year",
]
df = tmp[result_columns].rename(columns={"pos_x": "pos"}).copy()

# `key` combines year, race and car; rows sharing a key are later joined to
# each other.
df["key"] = df["year"].apply(str) + " " + df["race"] + " " + df["car"]

# Every distinct driver name in the raw results.
driver_list = df_raw["driver_name"].unique()
df.head()

Unnamed: 0,driver_name,pos,car,points,laps,retired,time-retired,race,year,key
0,Nino Farina FAR,1,Alfa Romeo,9.0,70.0,False,2:13:23.600,Great Britain,1950,1950 Great Britain Alfa Romeo
1,Luigi Fagioli FAG,2,Alfa Romeo,6.0,70.0,False,+2.600s,Great Britain,1950,1950 Great Britain Alfa Romeo
2,Reg Parnell PAR,3,Alfa Romeo,4.0,70.0,False,+52.000s,Great Britain,1950,1950 Great Britain Alfa Romeo
3,Yves Giraud-Cabantous GIR,4,Talbot-Lago,3.0,68.0,False,+2 laps,Great Britain,1950,1950 Great Britain Talbot-Lago
4,Louis Rosier ROS,5,Talbot-Lago,2.0,68.0,False,+2 laps,Great Britain,1950,1950 Great Britain Talbot-Lago


In [15]:
# Self-join on `key` so that every row is paired with every row sharing its
# year / race / car; `_x` and `_y` are the two sides of each pair.
merge_df = pd.merge(df, df, on="key")

# `diff` is 1.0 when the `_x` driver has the larger `pos` than the `_y` driver,
# else 0.0. The comparison is evaluated first and then cast: the earlier
# `1.0*pos_x > pos_y` multiplied before comparing (arithmetic binds tighter than
# `>`), which left `diff` as a boolean column instead of a numeric flag.
x_has_larger_pos = merge_df["pos_x"] > merge_df["pos_y"]
# Guard against a driver being paired with another row of their own under the
# same key, which would otherwise become a self-referencing pair.
# NOTE(review): only matters if a driver can appear more than once per key -
# confirm against the data.
different_drivers = merge_df["driver_name_x"] != merge_df["driver_name_y"]
merge_df["diff"] = (x_has_larger_pos & different_drivers).astype(float)

# Keep only the flagged pairs.
merge_df = merge_df[merge_df["diff"] > 0].copy()

In [16]:
# Collapse merge_df to one row per ordered (driver_name_x, driver_name_y) pair,
# with `diff` summed over all rows of that pair.
pair_columns = ["driver_name_x", "driver_name_y"]
graph_df = merge_df.groupby(pair_columns)["diff"].sum().reset_index()

In [7]:
# Directed graph with one node per driver in `driver_list`, so drivers without
# any edge are still part of the ranking.
G = nx.DiGraph()
G.add_nodes_from(driver_list)

# Each graph_df row (driver_name_x, driver_name_y, diff) becomes an edge
# x -> y whose weight is the summed `diff`.
weighted_edges = [
    (source, target, weight)
    for source, target, weight in graph_df.itertuples(index=False, name=None)
]
G.add_weighted_edges_from(weighted_edges)

# Weighted PageRank over the graph; `pr` maps driver name -> score.
pr = nx.pagerank(G, alpha=0.95, weight="weight", max_iter=100000)

# Tabular form of the scores, one row per driver, in the dict's own order.
pr_df = pd.DataFrame({"driver_name": list(pr.keys()), "rank": list(pr.values())})

In [32]:
# Show the ten rows of pr_df with the highest `rank`, highest first.
pr_df.sort_values("rank", ascending=False).iloc[:10]

Unnamed: 0,driver_name,rank
12,Juan Manuel Fangio FAN,0.01574
689,Fernando Alonso ALO,0.012511
721,Lewis Hamilton HAM,0.012168
228,Jack Brabham BRA,0.011248
282,Bruce McLaren MCL,0.010881
268,Graham Hill HIL,0.010572
80,Stirling Moss MOS,0.009664
110,Jean Behra BEH,0.009547
127,Mike Hawthorn HAW,0.008814
741,Daniel Ricciardo RIC,0.008798


In [44]:
# Distinct driver names that have at least one raw result row with year >= 2011.
from_2011_onward = df_raw["year"] >= 2011
modern_drivers = df_raw.loc[from_2011_onward, "driver_name"].unique()

In [62]:
# Restrict the PageRank table to the names in `modern_drivers` (inner join on
# `driver_name`), then show the 20 highest-ranked of them.
(
    pr_df
    .merge(pd.DataFrame({"driver_name": modern_drivers}), on="driver_name")
    .sort_values(by="rank", ascending=False)
    .head(20)
)

Unnamed: 0,driver_name,rank
7,Fernando Alonso ALO,0.012511
15,Lewis Hamilton HAM,0.012168
31,Daniel Ricciardo RIC,0.008798
5,Jenson Button BUT,0.008753
0,Michael Schumacher MSC,0.008639
18,Sebastian Vettel VET,0.007602
13,Nico Rosberg ROS,0.007528
46,Max Verstappen VER,0.007517
45,Carlos Sainz SAI,0.007373
1,Rubens Barrichello BAR,0.00694
