<a href="https://colab.research.google.com/github/cmmm976/Ballond-OrPrediction/blob/main/src/scraping_and_cleaning_ballondor_rankings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request
from tqdm import tqdm

In [None]:
# Registry of the scraped rankings: maps a "bo_<year>" key to that year's DataFrame.
ballonsdor = dict()

In [None]:
# Scraping the 1995-2019 Ballon d'Or pages from French Wikipedia.
# For each year, the first table carrying the "wikitable" class is stored
# under the key "bo_<year>".
for year in tqdm(range(1995, 2020)):
    page_url = 'https://fr.wikipedia.org/wiki/Ballon_d%27or_' + str(year)
    wikitables = pd.read_html(page_url, attrs={"class": "wikitable"})
    ballonsdor["bo_" + str(year)] = wikitables[0]

100%|██████████| 25/25 [00:09<00:00,  2.74it/s]


In [None]:
# Adding the missing rows for the 2011-2015 Ballons d'Or.
# For 2011-2013 the second "wikitable" of the page is stacked under the frame
# scraped earlier; for 2014-2015 the first "wikitable sortable" is used instead.
#
# DataFrame.append was removed in pandas 2.0, so the rows are stacked with
# pd.concat(..., ignore_index=True), which yields the same renumbered index.
#
# NOTE: this cell is not idempotent - running it twice stacks the extra rows twice.
for year in tqdm(range(2011, 2014)):
    df_name = "bo_" + str(year)
    extra_rows = pd.read_html(
        'https://fr.wikipedia.org/wiki/Ballon_d%27or_' + str(year),
        attrs={"class": "wikitable"},
    )[1]
    ballonsdor[df_name] = pd.concat([ballonsdor[df_name], extra_rows], ignore_index=True)

# Deliberately outside the tqdm loop so the progress bar still counts 3 pages.
for year in (2014, 2015):
    df_name = "bo_" + str(year)
    extra_rows = pd.read_html(
        'https://fr.wikipedia.org/wiki/Ballon_d%27or_' + str(year),
        attrs={"class": "wikitable sortable"},
    )[0]
    ballonsdor[df_name] = pd.concat([ballonsdor[df_name], extra_rows], ignore_index=True)

100%|██████████| 3/3 [00:00<00:00,  5.52it/s]


In [None]:
# Normalizing column names so every yearly frame shares the same headers.

# 1998, 2002 and 2003: strip the "[1]" suffix from the two affected headers.
suffixed_headers = {"Points[1]": "Points", "Rang[1]": "Rang"}
for year in (1998, 2002, 2003):
    ballonsdor["bo_" + str(year)].rename(columns=suffixed_headers, inplace=True)

# 2006-2009: columns are labelled 0..4, so give them their proper names.
positional_headers = {0: "Rang", 1: "Nom", 2: "Club", 3: "Nationalité", 4: "Points"}
for year in range(2006, 2010):
    ballonsdor["bo_" + str(year)].rename(columns=positional_headers, inplace=True)

In [None]:
# Dropping the useless row labelled 0 from the 2006-2009 frames
# (presumably the header line that was read as data - verify against the scraped tables).
for year in range(2006, 2010):
    ballonsdor["bo_" + str(year)].drop(index=0, inplace=True)

In [None]:
# Forcing int type for the "Points" column of every yearly frame.
# Series.astype returns a new Series instead of converting in place, so the
# result has to be assigned back; otherwise the column keeps its original dtype.
# Frames without a "Points" column are reported and left untouched.
for year in tqdm(range(1995, 2020)):
    df_name = "bo_" + str(year)
    try:
        ballonsdor[df_name]["Points"] = ballonsdor[df_name]["Points"].astype(int)
    except KeyError:
        print("Points column not found in " + df_name + ", proceeding")

100%|██████████| 25/25 [00:00<00:00, 3389.17it/s]

Points column not found in bo_2010, proceeding
Points column not found in bo_2011, proceeding
Points column not found in bo_2012, proceeding
Points column not found in bo_2013, proceeding
Points column not found in bo_2014, proceeding





In [None]:
# Normalizing points won for each year: "%" is each row's share of the points
# awarded that year.
# Points are coerced to numbers first, so frames whose "Points" column still
# holds numeric strings get a "%" column instead of being silently skipped.
# Frames without a "Points" column, or with values that cannot be converted,
# are reported and left unchanged.
for df_name, frame in tqdm(ballonsdor.items()):
    try:
        points = pd.to_numeric(frame["Points"])
        frame["%"] = points / points.sum()
    except KeyError:
        print("Points column not found in " + df_name + ", proceeding")
    except (TypeError, ValueError):
        print("Points column is not numeric in " + df_name + ", proceeding")

100%|██████████| 25/25 [00:00<00:00, 1399.44it/s]

Points column not found in bo_2010, proceeding
Points column not found in bo_2011, proceeding
Points column not found in bo_2012, proceeding
Points column not found in bo_2013, proceeding
Points column not found in bo_2014, proceeding





In [None]:
# Fixing the name column of the 2011 ranking: copy "Nom[3]" into a proper "Nom"
# column, overwrite the first three names by hand, then remove the old column
# and any duplicated rows.
bo_2011 = ballonsdor["bo_2011"]
bo_2011["Nom"] = bo_2011["Nom[3]"]
for row_label, player in enumerate(("Lionel Messi", "Cristiano Ronaldo", "Xavi")):
    bo_2011.loc[row_label, "Nom"] = player
bo_2011.drop(columns="Nom[3]", inplace=True)
bo_2011.drop_duplicates(inplace=True)

In [163]:
# Export data: write each yearly frame to "<key>.csv" in the working directory,
# without the index, printing the key and the object type as it goes.
for name, frame in tqdm(ballonsdor.items()):
    print(name, type(frame))
    frame.to_csv(name + ".csv", index=False)

100%|██████████| 25/25 [00:00<00:00, 594.35it/s]

bo_1995 <class 'pandas.core.frame.DataFrame'>
bo_1996 <class 'pandas.core.frame.DataFrame'>
bo_1997 <class 'pandas.core.frame.DataFrame'>
bo_1998 <class 'pandas.core.frame.DataFrame'>
bo_1999 <class 'pandas.core.frame.DataFrame'>
bo_2000 <class 'pandas.core.frame.DataFrame'>
bo_2001 <class 'pandas.core.frame.DataFrame'>
bo_2002 <class 'pandas.core.frame.DataFrame'>
bo_2003 <class 'pandas.core.frame.DataFrame'>
bo_2004 <class 'pandas.core.frame.DataFrame'>
bo_2005 <class 'pandas.core.frame.DataFrame'>
bo_2006 <class 'pandas.core.frame.DataFrame'>
bo_2007 <class 'pandas.core.frame.DataFrame'>
bo_2008 <class 'pandas.core.frame.DataFrame'>
bo_2009 <class 'pandas.core.frame.DataFrame'>
bo_2010 <class 'pandas.core.frame.DataFrame'>
bo_2011 <class 'pandas.core.frame.DataFrame'>
bo_2012 <class 'pandas.core.frame.DataFrame'>
bo_2013 <class 'pandas.core.frame.DataFrame'>
bo_2014 <class 'pandas.core.frame.DataFrame'>
bo_2015 <class 'pandas.core.frame.DataFrame'>
bo_2016 <class 'pandas.core.frame.


