## Web Scrape English Premier League Table

Link: https://www.premierleague.com/tables

In [13]:
# Imports

import pandas as pd
from bs4 import BeautifulSoup
import requests
from datetime import date

In [14]:
epl_link = "https://www.premierleague.com/tables"

# Bound the wait: requests applies no timeout unless one is passed, so a stalled
# connection would otherwise hang the kernel indefinitely.
response = requests.get(epl_link, timeout=30)

# Display the HTTP status code of the reply as the cell output.
response.status_code

200

In [15]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=response.content, features='html.parser')

## Get Premier League Table Data - Testing

In [16]:
# Collect every <tr> element on the page; the league table's team rows are among them.
table_rows = soup.find_all(name='tr')

In [17]:
# Inspect the <td class="team"> cell of the row at index 1 (the top-ranked team).
table_rows[1].find_all('td', class_='team')

[<td class="team" scope="row">
 <a href="/clubs/11/Manchester-City/overview">
 <span class="badge badge-image-container" data-size="25" data-widget="club-badge-image">
 <img class="badge-image badge-image--25" src="https://resources.premierleague.com/premierleague/badges/25/t43.png" srcset="https://resources.premierleague.com/premierleague/badges/25/t43@x2.png 2x">
 </img></span>
 <span class="long">Manchester City</span>
 <span class="short">MCI</span>
 </a>
 </td>]

In [18]:
# The full team name is the text of the <span class="long"> nested inside the
# <a> link of the team's <td> cell.
table_rows[1].find_all('span', class_='long')[0].get_text()

'Manchester City'

In [19]:
# The number of games played is the text of the 4th <td> (index 3) in a team's row.
table_rows[1].find_all('td')[3].text

'25'

For each table row, there are 13 td tags associated with it.

The first td tag in a row is for the collapsible drop-down arrow. The second is for the rank, and the third is for the team name.

Fourth td tag with index 3 is for number of games played, fifth is for wins. 

I use indices to obtain Played, Won, Draws, Losses, GF, GA, GD and Points as they do not really have tag names.

## Rank

In [20]:
# League position: text of the first <span class="value"> in every second row,
# starting at index 1 and stopping before index 40.
rank = [
    team_row.find_all('span', class_='value')[0].get_text()
    for team_row in table_rows[1:40:2]
]

## Team Name

In [21]:
# Full team name: text of the first <span class="long"> in every second row,
# starting at index 1 and stopping before index 40.
teams = [
    team_row.find_all('span', class_='long')[0].get_text()
    for team_row in table_rows[1:40:2]
]

In [22]:
# Display the scraped team names to check them by eye.
teams

['Manchester City',
 'Liverpool',
 'Chelsea',
 'West Ham United',
 'Manchester United',
 'Arsenal',
 'Wolverhampton Wanderers',
 'Tottenham Hotspur',
 'Brighton and Hove Albion',
 'Southampton',
 'Leicester City',
 'Aston Villa',
 'Crystal Palace',
 'Brentford',
 'Leeds United',
 'Everton',
 'Newcastle United',
 'Norwich City',
 'Watford',
 'Burnley']

## Played

In [23]:
# Games played: text of the 4th <td> (index 3) of each selected row.
played = [
    team_row.find_all('td')[3].get_text()
    for team_row in table_rows[1:40:2]
]

## Wins

In [24]:
# Wins: text of the 5th <td> (index 4) of each selected row.
wins = [
    team_row.find_all('td')[4].get_text()
    for team_row in table_rows[1:40:2]
]

## Draws

In [25]:
# Draws: text of the 6th <td> (index 5) of each selected row.
draws = [
    team_row.find_all('td')[5].get_text()
    for team_row in table_rows[1:40:2]
]

## Losses

In [26]:
# Losses: text of the 7th <td> (index 6) of each selected row.
losses = [
    team_row.find_all('td')[6].get_text()
    for team_row in table_rows[1:40:2]
]

## Goals For

In [27]:
# Goals scored: text of the 8th <td> (index 7) of each selected row.
goals_for = [
    team_row.find_all('td')[7].get_text()
    for team_row in table_rows[1:40:2]
]

## Goals Against

In [28]:
# Goals conceded: text of the 9th <td> (index 8) of each selected row.
goals_against = [
    team_row.find_all('td')[8].get_text()
    for team_row in table_rows[1:40:2]
]

## Goal Difference

In [29]:
# Goal difference: text of the 10th <td> (index 9) of each selected row,
# with surrounding whitespace stripped.
goal_diff = [
    team_row.find_all('td')[9].get_text().strip()
    for team_row in table_rows[1:40:2]
]

## Points

In [30]:
# Points: text of the 11th <td> (index 10) of each selected row,
# with surrounding whitespace stripped.
points = [
    team_row.find_all('td')[10].get_text().strip()
    for team_row in table_rows[1:40:2]
]

# Make Dataframe

In [31]:
# Assemble the scraped columns into one table. Every scraped value is a string,
# so all columns except the team name are cast to int; otherwise sorting or
# arithmetic on e.g. Points would operate on text rather than numbers.
epl_df = pd.DataFrame({
         'Rank': rank,
         'Team': teams,
         'Played': played,
         'Wins': wins,
         'Draws': draws,
         'Losses': losses,
         'Goals For': goals_for,
         'Goals Against': goals_against,
         'Goal Difference': goal_diff,
         'Points': points
}).astype({
         'Rank': int,
         'Played': int,
         'Wins': int,
         'Draws': int,
         'Losses': int,
         'Goals For': int,
         'Goals Against': int,
         'Goal Difference': int,
         'Points': int
})

In [32]:
# Display the assembled league table as the cell output.
epl_df

Unnamed: 0,Rank,Team,Played,Wins,Draws,Losses,Goals For,Goals Against,Goal Difference,Points
0,1,Manchester City,25,20,3,2,61,14,47,63
1,2,Liverpool,24,16,6,2,61,19,42,54
2,3,Chelsea,24,13,8,3,48,18,30,47
3,4,West Ham United,25,12,5,8,44,33,11,41
4,5,Manchester United,24,11,7,6,38,32,6,40
5,6,Arsenal,22,12,3,7,34,25,9,39
6,7,Wolverhampton Wanderers,23,11,4,8,21,17,4,37
7,8,Tottenham Hotspur,22,11,3,8,28,29,-1,36
8,9,Brighton and Hove Albion,23,7,12,4,25,23,2,33
9,10,Southampton,24,6,11,7,30,37,-7,29


In [33]:
# Write the table to a CSV file whose name carries today's date,
# leaving out the DataFrame index column.
epl_df.to_csv(f'epl_table_{date.today()}.csv', index=False)

In [34]:
# Save to Excel file
# to_excel writes a real workbook; to_csv would put comma-separated text behind an
# .xlsx name, which is not a valid Excel file. pandas needs an Excel writer engine
# (e.g. openpyxl) installed for this call.

epl_df.to_excel('epl_table_' + str(date.today()) + '.xlsx', index = False)