#### Some websites block automated access. Make sure you are able to reach the page before scraping it
* Once the page loads, you can right click + "Inspect" to help navigate its HTML structure


In [1]:
import requests 
import bs4

# This page shows the group-stage fixtures (i.e. the teams that made the
# tournament stage), which is all we need for our purposes.
URL = "https://fbref.com/en/comps/8/2015-2016/schedule/2015-2016-Champions-League-Scores-and-Fixtures"
page = requests.get(URL, timeout=30)
# Fail loudly right here if the site refused the request (e.g. 403/429),
# instead of hitting a confusing AttributeError further down.
page.raise_for_status()
content = page.text
soup = bs4.BeautifulSoup(content, 'lxml')
table = soup.find('table')
if table is None:
    # soup.find returns None when nothing matches; report that explicitly.
    raise ValueError(f"No <table> element found at {URL}")
rows = table.find_all('tr')

In [2]:
# The output is far too long to keep in the GitHub repo, so the call below is left commented out.
#print(soup.prettify())

#### In our case, all the data we need is within the table, inside the "td" tags
* We can just collect the team names in the "Home" column, since each team plays at home at least once (3 times to be exact).
* We can use the `.find` method to accomplish this

In [3]:
# Look up the "home_team" cell of every table row; rows that have no such
# cell yield None and are skipped. The text of each remaining cell is
# collected into a list.
home_cells = (row.find('td', {'data-stat':'home_team'}) for row in rows)
team_list = [cell.text for cell in home_cells if cell]


In [30]:
team_list  # There should be 32 distinct teams.

['Manchester City eng',
 'Benfica pt',
 'Sevilla es',
 'Wolfsburg de',
 'PSV Eindhoven nl',
 'Paris S-G fr',
 'Real Madrid es',
 'Galatasaray tr',
 'Chelsea eng',
 'Gent be',
 'Leverkusen de',
 'Roma it',
 'Dinamo Zagreb hr',
 'Valencia es',
 'Dynamo Kyiv ua',
 'Olympiacos gr',
 '',
 'Porto pt',
 'Arsenal eng',
 'Barcelona es',
 'Bayern Munich de',
 'Lyon fr',
 'Maccabi Tel Aviv il',
 'Zenit ru',
 'BATE Borisov by',
 'Manchester Utd eng',
 "M'Gladbach de",
 'Atlético Madrid es',
 'Malmö se',
 'Juventus it',
 'CSKA Moscow ru',
 'Shakhtar ua',
 'FC Astana kz',
 '',
 'Arsenal eng',
 'Porto pt',
 'Leverkusen de',
 'Dinamo Zagreb hr',
 'Valencia es',
 'BATE Borisov by',
 'Dynamo Kyiv ua',
 'Zenit ru',
 'Manchester City eng',
 'Atlético Madrid es',
 'Paris S-G fr',
 'Juventus it',
 'Malmö se',
 'Wolfsburg de',
 'Galatasaray tr',
 'CSKA Moscow ru',
 '',
 'Benfica pt',
 'Manchester Utd eng',
 'Sevilla es',
 'Real Madrid es',
 "M'Gladbach de",
 'PSV Eindhoven nl',
 'FC Astana kz',
 'Shakhtar ua

First, convert list to dataframe

In [52]:
import pandas as pd
# One row per scraped home-team string (duplicates and blanks still included),
# held in a single column named 'CL_Team'.
cl_teams16 = pd.DataFrame(team_list, columns=['CL_Team'])

Remove duplicates. Right now there are 33 distinct values where there should be 32 teams; the extra one appears to be the blank `''` entry visible in the output above, which still needs to be removed.

Remove duplicates and save to df. 

In [53]:
# Keep only the first occurrence of each scraped team string.
cl_teams16 = cl_teams16.drop_duplicates()

Also remove the row with the 'blank' name.

In [54]:
# Drop the blank entry, order the teams alphabetically and renumber the rows.
has_name = cl_teams16["CL_Team"] != ""
cl_teams16 = (
    cl_teams16[has_name]
    .sort_values(by='CL_Team')
    .reset_index(drop=True)
)

Now we just need to <b>store in a separate column</b> the abbreviated country code found at the end of each element

In [57]:
# The country code is the last space-separated word of each entry; keep this
# information just in case. `n` must be passed by keyword: it is keyword-only
# in pandas >= 2.0, where the positional form raises TypeError.
cl_teams16['country_abbrev'] = cl_teams16.CL_Team.str.rsplit(' ', n=1).str[1]

In [59]:
# Everything to the left of the last word is saved as the team name.
# `n` is passed by keyword because it is keyword-only in pandas >= 2.0.
# NOTE: this overwrites CL_Team, so running the cell a second time would strip
# another word from multi-word names (e.g. "Atlético Madrid" -> "Atlético").
cl_teams16['CL_Team'] = cl_teams16.CL_Team.str.rsplit(' ', n=1).str[0]

In [60]:
# Final table: team name and country code in separate columns.
cl_teams16

Unnamed: 0,CL_Team,country_abbrev
0,Arsenal,eng
1,Atlético Madrid,es
2,BATE Borisov,by
3,Barcelona,es
4,Bayern Munich,de
5,Benfica,pt
6,CSKA Moscow,ru
7,Chelsea,eng
8,Dinamo Zagreb,hr
9,Dynamo Kyiv,ua


##  Good to go. For code reusability, write these steps in a function.

In [138]:
# Two required parameters: the season start year and the season end year (both integers)

def cl_team_extractor(season_start_year, season_end_year):
    """Scrape the distinct home teams from one season's fbref Champions League fixtures page.

    Parameters
    ----------
    season_start_year : int
        Year in which the season starts, e.g. ``2015``.
    season_end_year : int
        Year in which the season ends, e.g. ``2016``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct, non-blank home-team entry, sorted by the scraped
        text, with a fresh 0..n-1 index and two columns:

        * ``CL_Team`` - everything before the last space of the scraped text.
        * ``country_abbrev`` - the last space-separated word of the scraped text
          (NaN when the text contains no space).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (e.g. access was blocked).
    ValueError
        If the downloaded page contains no ``<table>`` element.
    """
    season = f"{season_start_year}-{season_end_year}"
    URL = f"https://fbref.com/en/comps/8/{season}/schedule/{season}-Champions-League-Scores-and-Fixtures"

    page = requests.get(URL, timeout=30)
    # Surface blocked/failed requests here rather than as an AttributeError below.
    page.raise_for_status()

    soup = bs4.BeautifulSoup(page.text, 'lxml')
    table = soup.find('table')
    if table is None:
        raise ValueError(f"No <table> element found at {URL}")

    team_list = []
    for row in table.find_all('tr'):
        home_team = row.find('td', {'data-stat': 'home_team'})
        # Rows without a home-team cell return None and are skipped.
        if home_team is not None:
            team_list.append(home_team.text.strip())

    teams = pd.DataFrame(team_list, columns=['CL_Team'])
    teams = teams.drop_duplicates()
    teams = teams[teams['CL_Team'] != ""].sort_values(by='CL_Team').reset_index(drop=True)

    # Split once on the last space: "<team name> <country code>".
    # `n` is keyword-only in pandas >= 2.0, so it must not be passed positionally.
    name_parts = teams['CL_Team'].str.rsplit(' ', n=1)
    teams['country_abbrev'] = name_parts.str[1]
    teams['CL_Team'] = name_parts.str[0]
    return teams

In [139]:
# Same season as the step-by-step walkthrough above, now via the function.
cl_team_extractor(2015, 2016)

Unnamed: 0,CL_Team,country_abbrev
0,Arsenal,eng
1,Atlético Madrid,es
2,BATE Borisov,by
3,Barcelona,es
4,Bayern Munich,de
5,Benfica,pt
6,CSKA Moscow,ru
7,Chelsea,eng
8,Dinamo Zagreb,hr
9,Dynamo Kyiv,ua


In [140]:
# Try it with a different season
cl_team_extractor(2014, 2015)

Unnamed: 0,CL_Team,country_abbrev
0,APOEL FC,cy
1,Ajax,nl
2,Anderlecht,be
3,Arsenal,eng
4,Athletic Club,es
5,Atlético Madrid,es
6,BATE Borisov,by
7,Barcelona,es
8,Basel,ch
9,Bayern Munich,de
