In [64]:
# Import necessary libraries
import numpy as np
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup

# Display configuration: limit how many DataFrame rows pandas prints
pd.set_option('display.max_rows', 20)

# Print NumPy floats with 4 digits of precision and without scientific notation
np.set_printoptions(suppress=True, precision=4)

In [65]:
def get_team_name(soup):
    """Extract the club name from a parsed club statistics page.

    The ``<title>`` element is expected to read
    ``<club name> Statistics | Premier League``; the ``<club name>`` part is
    returned exactly as it appears in the title.

    Parameters
    ----------
    soup : object
        Parsed page. Only ``soup.find('title')`` is called, and the result is
        converted with ``str()`` before matching.

    Returns
    -------
    str
        The club name, or an empty string when the page has no title or the
        title does not have the expected form.
    """
    title = soup.find('title')
    # The '|' must be escaped: left bare it is regex alternation, which splits
    # the pattern into two independent halves so that the " | Premier League"
    # suffix is never actually checked. \s* tolerates whitespace around the
    # title text.
    pattern = r'<title>\s*(.*) Statistics \| Premier League\s*</title>'
    found = re.match(pattern, str(title), flags=re.IGNORECASE)
    if found is None:
        # Missing or unrecognised title: report "no club" instead of failing
        # with an AttributeError on None.
        return ''
    return found.group(1)

# Return the filtered club name for the given index 
def get_premier_teams(target_teams, i):
    """Return the club name found in ``target_teams[i]``.

    The element at index ``i`` is converted to its string form and matched
    (case-insensitively) against the ``<h4 class="clubName">`` markup; the
    text captured between the opening tag and the last ``<`` on that line is
    returned.
    """
    club_markup = str(target_teams[i])
    club_match = re.match('<h4 class="clubName">(.*)<', club_markup, flags=re.IGNORECASE)
    return club_match.group(1)

In [66]:
# Get the 20 Premier League teams (first division)

# Number of club names read from the club listing page
NUM_PREMIER_TEAMS = 20

url = "https://www.premierleague.com/clubs?se=210"
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() fails loudly instead of scraping an HTTP error page.
res = requests.get(url, timeout=30)
res.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(res.content, 'html.parser')
# class_ is the documented way to filter on the CSS class.
target_teams = soup.find_all('h4', class_='clubName')

# Fail with a readable message rather than a bare IndexError when the page
# layout changes and fewer club headings are found.
if len(target_teams) < NUM_PREMIER_TEAMS:
    raise ValueError(
        "expected {} club headings on {}, found {}".format(
            NUM_PREMIER_TEAMS, url, len(target_teams)
        )
    )

premier_teams = [get_premier_teams(target_teams, i) for i in range(NUM_PREMIER_TEAMS)]

premier_teams

['Arsenal',
 'Aston Villa',
 'AFC Bournemouth',
 'Brighton and Hove Albion',
 'Burnley',
 'Chelsea',
 'Crystal Palace',
 'Everton',
 'Leicester City',
 'Liverpool',
 'Manchester City',
 'Manchester United',
 'Newcastle United',
 'Norwich City',
 'Sheffield United',
 'Southampton',
 'Tottenham Hotspur',
 'Watford',
 'West Ham United',
 'Wolverhampton Wanderers']

In [None]:
headers = ['Team', 'Wins']
team_stats = []
premier_teams_index = []

# Number of 'normalStat' entries read from every club page
NUM_NORMAL_STATS = 26

# Both patterns are compiled once instead of on every loop iteration.
# Captures the text that follows the opening <span class="stat"> tag (the stat name).
header_regex = re.compile('<span class="stat">(.*)<(.*)', flags=re.IGNORECASE)
# Captures (first line, indentation of second line, second line); the figure is
# taken from the last group.
value_regex = re.compile('(.*)\\n( *)(.*)', flags=re.IGNORECASE)


def _to_number(text):
    """Convert a scraped figure such as '1,234' or '56%' to an int, or a float if it is not all digits."""
    cleaned = text.replace(',', '').replace('%', '')
    return int(cleaned) if cleaned.isdigit() else float(cleaned)


# Iterate through clubs using ids 1-45 in the URL (range() excludes its end value)
# NOTE(review): the earlier comment said "1-46"; use range(1, 47) if id 46 is meant to be included.
# Since the 20 Premier League teams are found in mixed order with second division teams, we have to extract all
for team in range(1, 46):
    url = "https://www.premierleague.com/clubs/" + str(team) + "/club/stats"
    # The timeout stops the loop from hanging on one stalled request.
    res = requests.get(url, timeout=30)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed.
    soup = BeautifulSoup(res.content, 'html.parser')

    team_name = get_team_name(soup)

    # Filter out a trailing " FC" from the football clubs' names.
    # Testing for 'FC' anywhere in the name would also hit "AFC Bournemouth",
    # cut it down to "AFC Bournemo" and drop the club from the results.
    if team_name.endswith(' FC'):
        team_name = team_name[:-3]

    # Filter out teams that aren't in the Premier League
    if team_name not in premier_teams:
        continue

    # Append the indices (for URL) of the Premier League teams (first division)
    premier_teams_index.append(team)

    # Find all the stats that are enclosed in 'div' classed as normalStat
    all_stats = soup.find_all('div', class_='normalStat')

    stats = [team_name]

    # Find number of wins: first child of the second 'topStat' div
    win_stats = soup.find_all('div', class_='topStat')[1].findChildren()[0]
    result = value_regex.match(str(win_stats)).groups()
    stats.append(int(result[-1]))

    # Only need to extract headers once: do it for the first club that is
    # actually kept, rather than tying it to one specific club id.
    need_headers = not team_stats

    for i in range(NUM_NORMAL_STATS):

        # Filter the first child from each 'div' tag which contains the stat name
        span = all_stats[i].findChildren()[0]
        span_markup = str(span)

        if need_headers:
            headers.append(header_regex.match(span_markup).groups()[0].strip())

        # Extract the statistical data (int when all digits, float otherwise)
        result = value_regex.match(span_markup).groups()
        stats.append(_to_number(result[2]))

    team_stats.append(stats)

In [None]:
# Build one row per scraped club, then use the club name as the index
stats = pd.DataFrame(data=team_stats, columns=headers)
stats = stats.set_index('Team')

In [None]:
import matplotlib.pyplot as plt

# Heatmap of the pairwise correlations between the scraped statistics
corr = stats.corr()
fig, ax = plt.subplots()
cax = ax.matshow(corr, cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(cax)
# Take tick positions and labels from the correlation matrix itself so they
# always line up with the plotted cells, even if corr() leaves a column out.
ticks = np.arange(len(corr.columns))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
# Pass the rotation together with the labels instead of through a separate
# pyplot state call made before the labels exist.
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticklabels(corr.index)
plt.show()