In [1]:
import getpass
import os
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine


In [2]:
# Interactive alternative: uncomment the two lines below to prompt for the
# league page URL instead of using the hard-coded default.
# print('Enter Web URL to Scrape Team listing (e.g. https://fbref.com/en/comps/9/Premier-League-Stats)')
# league_url = input()

# League overview page whose squad links are scraped in the following cells.
league_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [3]:
# Download the league page. The timeout stops the cell from hanging forever on
# a stalled connection, and raise_for_status() turns HTTP error responses
# (4xx/5xx) into an exception here instead of silently parsing an error page
# and ending up with an empty team list further down.
r = requests.get(league_url, timeout=30)
r.raise_for_status()

# Parse the returned HTML so the following cells can search it.
soup = BeautifulSoup(r.text, "html.parser")

In [4]:
# Collect every element tagged with data-stat="squad"; these are the
# candidates that may wrap a team link.
teams = soup.find_all(attrs={"data-stat": "squad"})

# First <a> inside each candidate (None when the element holds no link).
first_anchors = (squad_cell.find('a') for squad_cell in teams)

# Keep the href of every anchor that actually has one; anchors without an
# href are skipped so the lookup can never raise a KeyError.
team_urls = [
    anchor['href']
    for anchor in first_anchors
    if anchor is not None and 'href' in anchor.attrs
]


In [5]:
# team_urls contains each team link more than once. De-duplicate while keeping
# first-seen order rather than slicing off the first half of the list: the
# slice is only correct when the list is exactly two identical, equally long
# copies, and silently drops or repeats teams otherwise.
# (The name team_urls_half is kept so any later cell that uses it still works.)
team_urls_half = list(dict.fromkeys(team_urls))

# Add the beginning of the URL: the scraped hrefs are site-relative paths.
team_list = ["https://fbref.com" + relative_path for relative_path in team_urls_half]

# team_list

['https://fbref.com/en/squads/822bd0ba/Liverpool',
 'https://fbref.com/en/squads/b8fd03ef/Manchester-City',
 'https://fbref.com/en/squads/a2d435b3/Leicester-City',
 'https://fbref.com/en/squads/cff3d9bb/Chelsea',
 'https://fbref.com/en/squads/19538871/Manchester-United',
 'https://fbref.com/en/squads/361ca564/Tottenham-Hotspur',
 'https://fbref.com/en/squads/8cec06e1/Wolverhampton-Wanderers',
 'https://fbref.com/en/squads/1df6b87e/Sheffield-United',
 'https://fbref.com/en/squads/33c895d4/Southampton',
 'https://fbref.com/en/squads/18bb7c10/Arsenal',
 'https://fbref.com/en/squads/47c64c55/Crystal-Palace',
 'https://fbref.com/en/squads/d3fd31cc/Everton',
 'https://fbref.com/en/squads/943e8050/Burnley',
 'https://fbref.com/en/squads/b2b47a98/Newcastle-United',
 'https://fbref.com/en/squads/d07537b9/Brighton--Hove-Albion',
 'https://fbref.com/en/squads/8602292d/Aston-Villa',
 'https://fbref.com/en/squads/7c21e445/West-Ham-United',
 'https://fbref.com/en/squads/4ba7cbea/Bournemouth',
 'http

In [None]:
# Read the first HTML table of every team page directly into a DataFrame.
# header=1 takes the column names from the second header row of the table.
# NOTE(review): presumably dfs[0] is the squad's standard player-stats table --
# confirm against the live page layout.
team_frames = []
for team in team_list:
    dfs = pd.read_html(team, header=1)
    team_frames.append(dfs[0])

# Concatenate once at the end. DataFrame.append was removed in pandas 2.0, and
# growing a frame inside the loop re-copies all earlier rows on every pass.
# An empty team_list still yields an empty DataFrame, as before.
d_list = pd.concat(team_frames) if team_frames else pd.DataFrame()
d_list.head(100)

In [None]:
# Scraped header -> database-friendly column name (no special characters,
# consistent snake_case with a prefix per statistic group).
column_renames = {
    'Player': 'long_name',
    'Nation': 'nationality',
    'Pos': 'team_position',
    'Age': 'age',
    'MP': 'ptime_matches_played',
    'Starts': 'ptime_starts',
    'Min': 'ptime_min',
    'Gls': 'perf_goals',
    'Ast': 'perf_assists',
    'PK': 'perf_pks',
    'PKatt': 'perf_pkatt',
    'CrdY': 'perf_crd_y',
    'CrdR': 'perf_crd_r',
    'Gls.1': 'per_90_goals',
    'Ast.1': 'per_90_assts',
    'G+A': 'per_90_ga',
    'G-PK': 'per_90_g_pk',
    'G+A-PK': 'per_90_g_a_pk',
    'xG': 'exp_goals',
    'npxG': 'exp_np_exp_goals',
    'xA': 'exp_assists',
    'xG.1': 'per_90_exp_goals',
    'xA.1': 'per_90_exp_assists',
    'xG+xA': 'per_90_exp_goals_exp_assists',
    'npxG.1': 'per_90_exp_np_exp_goals',
    'npxG+xA': 'per_90_exp_np_exp_goals_assists',
}

# Rename on the existing frame; headers missing from the mapping are left as-is.
d_list.rename(columns=column_renames, inplace=True)

In [None]:
# Columns that make up the player-performance table, in the order they should
# appear. This must be a list: indexing a DataFrame with a set raises TypeError
# in pandas 2.0 and, before that, produced an arbitrary column order.
perf_columns = [
    'ptime_matches_played',
    'ptime_starts',
    'ptime_min',
    'perf_goals',
    'perf_assists',
    'perf_pks',
    'perf_pkatt',
    'perf_crd_y',
    'perf_crd_r',
    'per_90_goals',
    'per_90_assts',
    'per_90_ga',
    'per_90_g_pk',
    'per_90_g_a_pk',
    'exp_goals',
    'exp_np_exp_goals',
    'exp_assists',
    'per_90_exp_goals',
    'per_90_exp_assists',
    'per_90_exp_goals_exp_assists',
    'per_90_exp_np_exp_goals',
    'per_90_exp_np_exp_goals_assists',
]

# Select only the performance columns from the renamed frame.
df_player_perf = d_list[perf_columns]
df_player_perf.head()

# Create DB Connection

In [None]:
# Build the PostgreSQL connection without writing the password into the
# notebook: take it from the PGPASSWORD environment variable when set,
# otherwise prompt for it (the typed value is not echoed or stored in the file).
db_password = os.environ.get("PGPASSWORD") or getpass.getpass(
    "PostgreSQL password for user 'postgres': "
)

# quote_plus escapes characters such as '@' or '/' that would otherwise break
# the URL. User, host, port and database name are unchanged -- adjust them here
# if the local setup differs.
connection_string = f"postgres:{quote_plus(db_password)}@localhost:5432/fifa_db"
engine = create_engine(f'postgresql://{connection_string}')

In [None]:
# Persist the performance frame as the player_perf table: an existing table of
# that name is replaced, and the DataFrame index is not written as a column.
df_player_perf.to_sql(
    name='player_perf',
    con=engine,
    index=False,
    if_exists='replace',
)

In [None]:
# Confirm the player_perf table was written by reading a few rows back.
# LIMIT 5 lets the database return only the preview rows instead of sending
# the whole table to pandas just to display its head.
pd.read_sql_query('SELECT * FROM player_perf LIMIT 5', con=engine)