In [277]:
# Import the necessary libraries
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Notebook display settings: cap DataFrame output at 20 rows and print NumPy
# floats with 4 decimals, without scientific notation.
pd.set_option('display.max_rows', 20)
np.set_printoptions(suppress=True, precision=4)

In [278]:
def get_team_name(soup):
    """Return the club name embedded in a club stats page's <title> tag.

    The title is expected to read '<club> Statistics | Premier League'.

    Args:
        soup: Parsed page; only ``soup.find('title')`` is used.

    Returns:
        The text that precedes ' Statistics | Premier League' in the title.

    Raises:
        ValueError: If the page has no title in the expected format.
    """
    title = soup.find('title')
    # The '|' must be escaped: left bare it is regex alternation, which made the
    # ' Premier League</title>' half of the pattern dead and accepted any title
    # that merely started with '<name> Statistics '.
    pattern = r'<title>(.*) Statistics \| Premier League</title>'
    found = re.match(pattern, str(title), flags=re.IGNORECASE)
    if found is None:
        raise ValueError('Unexpected page title: ' + repr(str(title)))
    return found.group(1)

def get_premier_teams(target_teams, i):
    """Return the club name found in the i-th 'clubName' heading.

    Args:
        target_teams: Sequence of <h4 class="clubName"> elements (anything
            whose ``str()`` is the element's HTML).
        i: Position of the heading to read.

    Returns:
        The text captured between the opening tag and the last '<'.
    """
    heading_html = str(target_teams[i])
    found = re.match('<h4 class="clubName">(.*)<', heading_html, flags=re.IGNORECASE)
    return found.group(1)

In [325]:
# Get the 20 Premier League teams (first division)

N_PREMIER_TEAMS = 20

url = "https://www.premierleague.com/clubs?se=210"
res = requests.get(url, timeout=30)
res.raise_for_status()  # stop here rather than parse an HTTP error page

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(res.content, 'html.parser')
target_teams = soup.find_all('h4', class_='clubName')

# Fail with a readable message instead of an IndexError inside the loop.
if len(target_teams) < N_PREMIER_TEAMS:
    raise ValueError(
        'Expected at least %d club headings, found %d'
        % (N_PREMIER_TEAMS, len(target_teams))
    )

premier_teams = [get_premier_teams(target_teams, i) for i in range(N_PREMIER_TEAMS)]

premier_teams

['Arsenal',
 'Aston Villa',
 'AFC Bournemouth',
 'Brighton and Hove Albion',
 'Burnley',
 'Chelsea',
 'Crystal Palace',
 'Everton',
 'Leicester City',
 'Liverpool',
 'Manchester City',
 'Manchester United',
 'Newcastle United',
 'Norwich City',
 'Sheffield United',
 'Southampton',
 'Tottenham Hotspur',
 'Watford',
 'West Ham United',
 'Wolverhampton Wanderers']

In [319]:
import requests
import re
from bs4 import BeautifulSoup

headers = ['Team']
team_stats = []
premier_teams_index = []

# Positions, within the page's 'normalStat' divs, of the stats collected for
# each club (13 columns; 'Clean sheets' through 'Own goals' in the recorded output).
STAT_POSITIONS = range(13, 26)
# Position of 'Goals conceded per match', the only stat parsed as a float.
FLOAT_STAT_POSITION = 15

# Compiled once; both are matched against the HTML of a stat's first child tag.
header_regex = re.compile('<span class="stat">(.*)<(.*)', flags=re.IGNORECASE)
value_regex = re.compile('(.*)\\n( *)(.*)', flags=re.IGNORECASE)

# Iterate through clubs using ids 1-45 in the URL (the range end is exclusive)
# Since the 20 Premier League teams are found in mixed order with second division teams, we have to extract all
for team in range(1, 46):
    url = "https://www.premierleague.com/clubs/" + str(team) + "/club/stats"
    res = requests.get(url, timeout=30)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(res.content, 'html.parser')

    team_name = get_team_name(soup)

    # Strip a trailing ' FC' only. A plain substring test also hits names such
    # as 'AFC Bournemouth' and would chop off their last three characters, so
    # the club would never match premier_teams.
    if team_name.endswith(' FC'):
        team_name = team_name[:-3]

    # Filter out teams that aren't in the Premier League
    if team_name not in premier_teams:
        continue

    # Append the indices (for URL) of the Premier League teams (first division)
    premier_teams_index.append(team)

    # Find all the stats that are enclosed in 'div' classed as normalStat
    all_stats = soup.find_all('div', class_='normalStat')

    # Column names are read from the first Premier League club encountered,
    # whichever id that is, instead of assuming club id 1 passes the filter.
    need_headers = len(headers) == 1

    stats = [team_name]

    for i in STAT_POSITIONS:

        # The first child of each 'div' holds both the stat name and its value
        span_html = str(all_stats[i].findChildren()[0])

        # Only need to extract headers once
        if need_headers:
            headers.append(header_regex.match(span_html).groups()[0].strip())

        # The value is the text on the line after the opening tag; drop
        # thousands separators and percent signs before converting.
        raw_value = value_regex.match(span_html).groups()[2]
        raw_value = raw_value.replace(',', '').replace('%', '')

        # Use float only for 'Goals conceded per match'
        if i == FLOAT_STAT_POSITION:
            stats.append(float(raw_value))
        else:
            stats.append(int(raw_value))

    team_stats.append(stats)


In [320]:
# One row per scraped club, indexed by team name
pd.DataFrame(data=team_stats, columns=headers).set_index(keys='Team')

Unnamed: 0_level_0,Clean sheets,Goals conceded,Goals conceded per match,Saves,Tackles,Tackle success %,Blocked shots,Interceptions,Clearances,Headed Clearance,Aerial Battles/Duels Won,Errors leading to goal,Own goals
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Arsenal,405,1045,0.98,781,10286,73,2097,8004,14027,5179,36603,113,41
Aston Villa,272,1231,1.3,424,8510,75,1188,6226,14401,4564,28783,76,46
Chelsea,425,1032,0.97,602,10213,74,2387,6630,13168,4980,35727,65,36
Crystal Palace,103,591,1.44,756,4810,71,787,3817,7157,3833,20484,52,16
Everton,327,1344,1.27,707,9952,74,1869,7064,17267,6074,36225,85,52
Liverpool,403,1060,1.0,547,11194,74,2237,6670,14167,4993,35968,89,39
Manchester City,284,1002,1.15,509,10140,74,2189,7137,14123,4889,33957,62,38
Manchester United,448,956,0.9,686,10033,73,2098,7181,15473,5866,34378,59,39
Norwich City,70,555,1.64,315,3035,72,516,2253,5444,2059,11097,43,18
Sheffield United,40,190,1.31,65,1317,73,161,479,1783,306,4295,7,4


In [323]:
# Extract the age of defenders
# Reads every HTML table on the Chelsea 2019-2020 squad page and keeps the first.
# NOTE(review): the column/position filtering below is still commented out, so
# `df` currently holds the whole first table — TODO finish or remove.

url = "http://www.footballsquads.co.uk/eng/2019-2020/engprem/chelsea.htm"
tables = pd.read_html(io=url)
df = tables[0]
#df = df.filter([3, 6])
#df = df[df[3] == 'D']
#df
    

In [321]:
# Show the club URL ids whose page title matched a name in premier_teams
print(premier_teams_index)
    

[1, 2, 4, 6, 7, 10, 11, 12, 14, 18, 20, 21, 23, 25, 26, 33, 38, 43]
