# Scraping Football Data from Understat.com

This code scrapes football data from online data sources; in this case, the source is understat.com.

## 1. Importing the required packages and modules

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

## 2. Creating the URL

In [2]:
base_url = 'https://understat.com/match/'
# input() already returns a str, so no str() cast is needed. Strip surrounding
# whitespace so an accidental space or newline does not end up in the URL.
match = input('Match ID:').strip()
if not match:
    # Without an ID the URL would point at the bare /match/ path, and the
    # failure would only show up later while parsing the page.
    raise ValueError('A match ID is required to build the match URL')
url = base_url + match

Match ID:14606


## In this example, we'll use the data from Aston Villa vs Tottenham Hotspur (Aston Villa at home) on 21 March 2021

In [3]:
# Download the match page. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() surfaces HTTP errors (for
# example an unknown match ID) here instead of as a parsing failure later on.
res = requests.get(url, timeout=30)
res.raise_for_status()

# Parse the HTML and collect every <script> tag; the shots data is embedded in
# one of them as a JavaScript string.
soup = BeautifulSoup(res.content, 'lxml')
scripts = soup.find_all('script')

In [4]:
# Show every <script> tag found on the page so the one holding shotsData can be located.
scripts

[<script>
 			var THEME = localStorage.getItem("theme") || 'DARK';
 			document.body.className = "theme-" + THEME.toLowerCase();
 		</script>,
 <script>
 	var shotsData 	= JSON.parse('\x7B\x22h\x22\x3A\x5B\x7B\x22id\x22\x3A\x22412050\x22,\x22minute\x22\x3A\x2257\x22,\x22result\x22\x3A\x22BlockedShot\x22,\x22X\x22\x3A\x220.889000015258789\x22,\x22Y\x22\x3A\x220.544000015258789\x22,\x22xG\x22\x3A\x220.10845217108726501\x22,\x22player\x22\x3A\x22Tr\x5Cu00e9z\x5Cu00e9guet\x22,\x22h_a\x22\x3A\x22h\x22,\x22player_id\x22\x3A\x227722\x22,\x22situation\x22\x3A\x22OpenPlay\x22,\x22season\x22\x3A\x222020\x22,\x22shotType\x22\x3A\x22LeftFoot\x22,\x22match_id\x22\x3A\x2214606\x22,\x22h_team\x22\x3A\x22Aston\x20Villa\x22,\x22a_team\x22\x3A\x22Tottenham\x22,\x22h_goals\x22\x3A\x220\x22,\x22a_goals\x22\x3A\x222\x22,\x22date\x22\x3A\x222021\x2D03\x2D21\x2019\x3A30\x3A00\x22,\x22player_assisted\x22\x3A\x22Morgan\x20Sanson\x22,\x22lastAction\x22\x3A\x22Pass\x22\x7D,\x7B\x22id\x22\x3A\x22412051\x22,\x22mi

## 3. From scripts, we'll only take the shots data

In [5]:
# Select the <script> whose text contains the shots payload by looking for the
# 'shotsData' variable name, rather than relying on it being the second tag.
# Tags without inline text have .string == None, so those are skipped.
strings = next(
    (tag.string for tag in scripts if tag.string and 'shotsData' in tag.string),
    None,
)
if strings is None:
    raise ValueError("No <script> tag containing 'shotsData' was found on the page")
strings

"\n\tvar shotsData \t= JSON.parse('\\x7B\\x22h\\x22\\x3A\\x5B\\x7B\\x22id\\x22\\x3A\\x22412050\\x22,\\x22minute\\x22\\x3A\\x2257\\x22,\\x22result\\x22\\x3A\\x22BlockedShot\\x22,\\x22X\\x22\\x3A\\x220.889000015258789\\x22,\\x22Y\\x22\\x3A\\x220.544000015258789\\x22,\\x22xG\\x22\\x3A\\x220.10845217108726501\\x22,\\x22player\\x22\\x3A\\x22Tr\\x5Cu00e9z\\x5Cu00e9guet\\x22,\\x22h_a\\x22\\x3A\\x22h\\x22,\\x22player_id\\x22\\x3A\\x227722\\x22,\\x22situation\\x22\\x3A\\x22OpenPlay\\x22,\\x22season\\x22\\x3A\\x222020\\x22,\\x22shotType\\x22\\x3A\\x22LeftFoot\\x22,\\x22match_id\\x22\\x3A\\x2214606\\x22,\\x22h_team\\x22\\x3A\\x22Aston\\x20Villa\\x22,\\x22a_team\\x22\\x3A\\x22Tottenham\\x22,\\x22h_goals\\x22\\x3A\\x220\\x22,\\x22a_goals\\x22\\x3A\\x222\\x22,\\x22date\\x22\\x3A\\x222021\\x2D03\\x2D21\\x2019\\x3A30\\x3A00\\x22,\\x22player_assisted\\x22\\x3A\\x22Morgan\\x20Sanson\\x22,\\x22lastAction\\x22\\x3A\\x22Pass\\x22\\x7D,\\x7B\\x22id\\x22\\x3A\\x22412051\\x22,\\x22minute\\x22\\x3A\\x2258\\x22

In [6]:
# The script text wraps the payload as JSON.parse('...'), so the JSON itself
# lies between the opening "('" and the first closing "')".

opening, closing = "('", "')"
ind_start = strings.index(opening) + len(opening)  # first character after "('"
ind_end = strings.index(closing)                   # position of the closing "')"

## Next, we'll convert strings into JSON data

In [7]:
# Cut the payload out of the script text and turn the \xNN escape sequences
# back into the characters they stand for, leaving plain JSON text.
json_data = bytes(strings[ind_start:ind_end], 'utf8').decode('unicode_escape')

# Parse the JSON text into Python objects.
data = json.loads(json_data)

In [8]:
# Inspect the parsed payload: shots are grouped under the 'h' and 'a' keys.
data

{'h': [{'id': '412050',
   'minute': '57',
   'result': 'BlockedShot',
   'X': '0.889000015258789',
   'Y': '0.544000015258789',
   'xG': '0.10845217108726501',
   'player': 'Trézéguet',
   'h_a': 'h',
   'player_id': '7722',
   'situation': 'OpenPlay',
   'season': '2020',
   'shotType': 'LeftFoot',
   'match_id': '14606',
   'h_team': 'Aston Villa',
   'a_team': 'Tottenham',
   'h_goals': '0',
   'a_goals': '2',
   'date': '2021-03-21 19:30:00',
   'player_assisted': 'Morgan Sanson',
   'lastAction': 'Pass'},
  {'id': '412051',
   'minute': '58',
   'result': 'SavedShot',
   'X': '0.7930000305175782',
   'Y': '0.5879999923706055',
   'xG': '0.03506242111325264',
   'player': 'John McGinn',
   'h_a': 'h',
   'player_id': '7723',
   'situation': 'OpenPlay',
   'season': '2020',
   'shotType': 'RightFoot',
   'match_id': '14606',
   'h_team': 'Aston Villa',
   'a_team': 'Tottenham',
   'h_goals': '0',
   'a_goals': '2',
   'date': '2021-03-21 19:30:00',
   'player_assisted': 'Matt Targe

In [9]:
# Shots are stored per side: 'h' holds the home team's shots, 'a' the away team's.
data_away = data['a']
data_home = data['h']

# Pair every shot with the key that names its own team, home shots first and
# away shots second, so both sides go through one code path instead of two
# copy-pasted loops.
shots = (
    [(shot, 'h_team') for shot in data_home]
    + [(shot, 'a_team') for shot in data_away]
)

# Build the parallel lists with direct key lookups. A shot that lacks one of
# these fields now raises KeyError instead of silently leaving the lists with
# different lengths (which would misalign rows in the DataFrame).
# Values are kept exactly as they appear in the payload.
x = [shot['X'] for shot, _ in shots]
y = [shot['Y'] for shot, _ in shots]
xg = [shot['xG'] for shot, _ in shots]
team = [shot[team_key] for shot, team_key in shots]
time = [shot['minute'] for shot, _ in shots]  # minute of each shot

## Last, we'll create a DataFrame

In [10]:
# One entry per field; the insertion order fixes the order of the frame's rows.
shot_fields = {'x': x, 'y': y, 'time': time, 'xg': xg, 'team': team}
col_names = list(shot_fields)

# Each list becomes one row labelled with its field name (one column per shot).
df = pd.DataFrame(list(shot_fields.values()), index=col_names)

In [11]:
# At this point each row is a field (x, y, time, xg, team) and each column is one shot.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
x,0.889000015258789,0.7930000305175782,0.755,0.81,0.8569999694824219,0.8580000305175781,0.8180000305175781,0.8490000152587891,0.7719999694824219,0.9590000152587891,0.85,0.9390000152587891,0.745999984741211,0.784000015258789,0.885,0.845999984741211,0.9269999694824219
y,0.544000015258789,0.5879999923706055,0.765,0.3659999847412109,0.4520000076293945,0.425,0.7069999694824218,0.38,0.32099998474121094,0.45799999237060546,0.48200000762939454,0.4540000152587891,0.4779999923706055,0.5570000076293945,0.5,0.4420000076293945,0.37099998474121093
time,57,58,60,72,83,83,83,85,4,28,35,41,41,47,67,77,81
xg,0.10845217108726501,0.03506242111325264,0.013734503649175167,0.043730102479457855,0.08297575265169144,0.06148289144039154,0.019062310457229614,0.07518036663532257,0.023315519094467163,0.6499724388122559,0.0706346407532692,0.057559773325920105,0.013745012693107128,0.04467565938830376,0.7611688375473022,0.08070853352546692,0.018329372629523277
team,Aston Villa,Aston Villa,Aston Villa,Aston Villa,Aston Villa,Aston Villa,Aston Villa,Aston Villa,Tottenham,Tottenham,Tottenham,Tottenham,Tottenham,Tottenham,Tottenham,Tottenham,Tottenham


In [12]:
# Rebuild the frame from the source lists before transposing, so re-running
# this cell always yields one row per shot. A bare `df = df.T` flips the frame
# back and forth on every run.
df = pd.DataFrame([x, y, time, xg, team], index=col_names).T
df

Unnamed: 0,x,y,time,xg,team
0,0.889000015258789,0.544000015258789,57,0.108452171087265,Aston Villa
1,0.7930000305175782,0.5879999923706055,58,0.0350624211132526,Aston Villa
2,0.755,0.765,60,0.0137345036491751,Aston Villa
3,0.81,0.3659999847412109,72,0.0437301024794578,Aston Villa
4,0.8569999694824219,0.4520000076293945,83,0.0829757526516914,Aston Villa
5,0.8580000305175781,0.425,83,0.0614828914403915,Aston Villa
6,0.8180000305175781,0.7069999694824218,83,0.0190623104572296,Aston Villa
7,0.8490000152587891,0.38,85,0.0751803666353225,Aston Villa
8,0.7719999694824219,0.3209999847412109,4,0.0233155190944671,Tottenham
9,0.9590000152587892,0.4579999923706054,28,0.6499724388122559,Tottenham


This DataFrame can be exported and I can use it to make an xG timeline or shot maps.

In [13]:
# Write the CSV relative to the current working directory instead of to one
# user's absolute Windows path, so the cell runs on any machine.
# index=False leaves the row numbers out of the file.
output_path = 'Tottenham vs Aston Villa.csv'
df.to_csv(output_path, index=False)

## Credits

This is from McKay Johns' tutorial on <a href="https://www.youtube.com/watch?v=IsR5FrjNmro&ab_channel=McKayJohns">scraping data for football analytics</a>. Check out his other work <a href="https://www.youtube.com/channel/UCmqincDKps3syxvD4hbODSg">here</a>.