In [1]:
import requests
import numpy as np
import pandas as pd
import time
import tweepy
from datetime import datetime
from google.colab import files


In [2]:
def url_to_df(url,key=None,timeout=30):
  """Fetch JSON from `url` with an HTTP GET and load it into a DataFrame.

  Args:
    url: Address to request.
    key: Optional key into the decoded JSON. When given, only `data[key]`
      is turned into the DataFrame; otherwise the whole payload is used.
    timeout: Seconds `requests` waits for the server before raising, so a
      stalled connection cannot hang the notebook indefinitely.

  Returns:
    A pandas DataFrame, or None when the response status is not 200 (the
    status code is printed in that case).
  """
  response = requests.get(url,timeout=timeout)
  if response.status_code != 200:
    print(f"Error: {response.status_code}")
    return None
  data = response.json()
  if key is not None:
    data = data[key]
  return pd.DataFrame(data)

In [3]:
pip install ScraperFC

Collecting ScraperFC
  Downloading ScraperFC-3.1.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botasaurus (from ScraperFC)
  Downloading botasaurus-4.0.75.tar.gz (119 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.1/119.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bs4 (from ScraperFC)
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting cloudscraper (from ScraperFC)
  Downloading cloudscraper-1.2.71-py2.py3-none-any.whl.metadata (19 kB)
Collecting datetime (from ScraperFC)
  Downloading DateTime-5.5-py3-none-any.whl.metadata (33 kB)
Collecting selenium (from ScraperFC)
  Downloading selenium-4.26.1-py3-none-any.whl.metadata (7.1 kB)
Collecting javascript_fixes (from botasaurus->ScraperFC)
  Downloading javascript_fixes-1.1.29.tar.gz (40 kB)


In [4]:
import sys
# Put ./src on the module search path before ScraperFC is imported below.
# NOTE(review): whether ScraperFC is meant to load from ./src or from the pip
# install in the previous cell is not visible here — confirm this line is needed.
sys.path.append('./src')

import ScraperFC as sfc
# Scraper clients shared by the cells below: `fb` for FBref, `sc` for Sofascore.
fb = sfc.FBref()
sc = sfc.Sofascore()

Downloading @request dependencies...


Output()

Scrape Begins

In [29]:
def prepare_sc(match_id):
  """Build a one-row DataFrame of Sofascore data for a single match.

  The row is indexed by the season label (index name 'season') and has the
  columns, in this order: 'GW', 'team H', 'team A', 'Goals H', 'Goals A',
  then '<stat> H' / '<stat> A' for each of xG, Shots, SiB, SoT and BC.

  Statistics are looked up by name among the rows whose period is 'ALL'.
  When a name occurs more than once there, the first occurrence is used.
  Selecting by name (rather than renaming columns by position) means a
  different number or order of statistics cannot mislabel a column.

  Args:
    match_id: Match identifier passed to `sc.get_match_dict` and
      `sc.scrape_team_match_stats`.

  Returns:
    A DataFrame with exactly one row. Values are stored as returned by the
    scraper (no numeric conversion is done here).

  Raises:
    KeyError: If the match dict lacks an expected field, or one of the
      required statistics is absent for the 'ALL' period.
    ValueError: If no team statistics were returned for the match.
  """
  # Sofascore statistic name -> column prefix used in the returned row.
  stat_labels={
      'Expected goals':'xG',
      'Total shots':'Shots',
      'Shots inside box':'SiB',
      'Shots on target':'SoT',
      'Big chances':'BC',
  }
  # Key used by Sofascore for each side -> suffix used in the column names.
  sides=(('home','H'),('away','A'))

  event=sc.get_match_dict(match_id)
  # Read every match-level field before requesting statistics, so a match
  # without a final score fails here without a second request.
  row={'GW':event['roundInfo']['round']}
  for side,suffix in sides:
    row[f'team {suffix}']=event[f'{side}Team']['name']
  for side,suffix in sides:
    row[f'Goals {suffix}']=event[f'{side}Score']['normaltime']
  season=event['season']['year']

  # Assumes one row per statistic with 'name', 'home', 'away' and 'period'
  # columns, which is what the previous transpose-based code relied on.
  raw=sc.scrape_team_match_stats(match_id)
  if raw.empty:
    raise ValueError(f"no team statistics returned for match {match_id}")
  whole_match=(
      raw[raw['period']=='ALL']
      .drop_duplicates(subset='name')  # keep the first occurrence of each name
      .set_index('name')
  )
  missing=[name for name in stat_labels if name not in whole_match.index]
  if missing:
    raise KeyError(f"match {match_id} lacks statistics: {missing}")

  for name,label in stat_labels.items():
    for side,suffix in sides:
      row[f'{label} {suffix}']=whole_match.at[name,side]

  # Build the frame once instead of concatenating one column at a time.
  return pd.DataFrame(row,index=pd.Index([season],name='season'))

In [6]:
def get_teams_dict(year_fb,year_sc,matches_fb=None,stats_sc=None):
  """Map FBref team names to Sofascore team names for one EPL season.

  Both sets of names are sorted alphabetically and paired position by
  position, so the mapping is only correct when the two sites' spellings
  sort into the same order.

  Args:
    year_fb: FBref season label. Only used in error messages here.
    year_sc: Sofascore season label, passed to `sc.get_match_dicts` when
      `stats_sc` is not supplied.
    matches_fb: Optional DataFrame with a 'Home Team' column. When omitted,
      the notebook-level `matches` variable is used, as before.
    stats_sc: Optional DataFrame with a 'team H' column (rows as produced
      by `prepare_sc`). When omitted, the season is scraped from Sofascore.

  Returns:
    dict mapping each FBref home-team name to a Sofascore home-team name.

  Raises:
    ValueError: If no Sofascore match could be prepared, or the two sources
      yield a different number of teams (pairing would be misaligned).
  """
  if stats_sc is None:
    per_match=[]
    skipped=0
    last_error=None
    for event in sc.get_match_dicts(year_sc,'EPL'):
      try:
        per_match.append(prepare_sc(event['id']))
      except Exception as e:
        # Best effort: leave out matches that cannot be prepared, but count them.
        skipped+=1
        last_error=e
    if skipped:
      print(f"{year_sc}: skipped {skipped} Sofascore matches (last error: {last_error!r})")
    if not per_match:
      raise ValueError(f"no usable Sofascore matches for season {year_sc}")
    stats_sc=pd.concat(per_match)
  if matches_fb is None:
    # Backward-compatible fallback to the variable set by the export cell.
    matches_fb=matches

  teams_sc=sorted(set(stats_sc['team H']))
  teams_fb=sorted(set(matches_fb['Home Team']))
  if len(teams_fb)!=len(teams_sc):
    # zip() would silently truncate and shift the pairing.
    raise ValueError(
        f"team count mismatch for {year_fb}/{year_sc}: "
        f"{len(teams_fb)} FBref vs {len(teams_sc)} Sofascore"
    )
  return dict(zip(teams_fb,teams_sc))

In [30]:
def get_xg(game):
  """Return (home, away) values read from a match's player-stats summaries.

  For each side, the value is taken from the last row, column position 18,
  of that side's 'Summary' table.
  NOTE(review): position 18 is presumably the xG column of the summary
  table — confirm against the scraper's output.
  """
  summary_keys=('Home Player Stats','Away Player Stats')
  home_value,away_value=(
      game[side_key]['Summary'].iloc[-1,18] for side_key in summary_keys
  )
  return home_value,away_value

In [32]:
# Each season once, as (FBref label, Sofascore label); the two lists used by
# the cells below are derived from these pairs so they stay aligned.
_season_labels=[
    ('2017-2018','17/18'),
    ('2018-2019','18/19'),
    ('2019-2020','19/20'),
    ('2020-2021','20/21'),
    ('2021-2022','21/22'),
    ('2022-2023','22/23'),
    ('2023-2024','23/24'),
    ('2024-2025','24/25'),
]
years_fb=[fb_label for fb_label,_ in _season_labels]
years_sc=[sc_label for _,sc_label in _season_labels]

In [None]:
# Build one Excel workbook with a sheet per EPL season: Sofascore match rows
# (from prepare_sc) with the xG columns filled from FBref where a fixture matches.
excel_file='EPL_data.xlsx'
with pd.ExcelWriter(excel_file) as writer:
  # Walk the FBref and Sofascore season labels in lockstep.
  for year_fb,year_sc in zip(years_fb,years_sc):
    season_rows=[]
    skipped=0
    last_error=None
    for event in sc.get_match_dicts(year_sc,'EPL'):
      try:
        season_rows.append(prepare_sc(event['id']))
      except Exception as e:
        # Best effort: leave out matches that cannot be prepared, but count them.
        skipped+=1
        last_error=e
    if skipped:
      print(f"{year_sc}: skipped {skipped} Sofascore matches (last error: {last_error!r})")
    if not season_rows:
      # pd.concat([]) raises; skip the season instead of aborting the workbook.
      print(f"{year_sc}: no usable Sofascore matches, season left out of {excel_file}")
      continue
    all_stats=pd.concat(season_rows).sort_values(['GW']).reset_index(drop=True)

    # prepare_sc may already provide 'xG H'/'xG A'; DataFrame.insert raises on
    # an existing column, so only add the columns when they are missing.
    # NOTE(review): when they exist, FBref values below overwrite the Sofascore
    # ones and unmatched fixtures keep the Sofascore value — confirm intent.
    for position,column in ((5,'xG H'),(6,'xG A')):
      if column not in all_stats.columns:
        all_stats.insert(position,column,np.nan)

    matches=fb.scrape_matches(year_fb,'EPL')

    # Same alphabetical pairing as get_teams_dict, built from the frames already
    # in hand so the Sofascore season is not scraped a second time.
    teams_fb=sorted(set(matches['Home Team']))
    teams_sc=sorted(set(all_stats['team H']))
    if len(teams_fb)!=len(teams_sc):
      print(f"{year_fb}: {len(teams_fb)} FBref teams vs {len(teams_sc)} Sofascore teams; name mapping may be misaligned")
    teams_fb_sc=dict(zip(teams_fb,teams_sc))

    for _,fb_match in matches.iterrows():
      team_h=teams_fb_sc.get(fb_match['Home Team'])
      team_a=teams_fb_sc.get(fb_match['Away Team'])
      same_fixture=(all_stats['team H']==team_h) & (all_stats['team A']==team_a)
      if not same_fixture.any():
        continue
      row_label=all_stats.index[same_fixture][0]
      xg_h,xg_a=get_xg(fb_match)
      all_stats.loc[row_label,'xG H']=xg_h
      all_stats.loc[row_label,'xG A']=xg_a

    all_stats=all_stats.astype({'GW':'int64','Goals H':'int64','Goals A':'int64','xG H':'float','xG A':'float','Shots H':'int64','Shots A':'int64','SiB H':'int64','SiB A':'int64','SoT H':'int64','SoT A':'int64','BC H':'int64','BC A':'int64'})
    all_stats.to_excel(writer, sheet_name=year_fb, index=False)
# Download only after the writer has been closed and the file is complete.
files.download(excel_file)

In [None]:
# Write the current `all_stats` frame to CSV, then offer the file for download.
file_path = 'all_stats.csv'
all_stats.to_csv(file_path)
files.download(file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Writes the current `all_stats` frame to EPL_data.xlsx once per season label,
# one sheet per entry of `years_fb`.
# NOTE(review): every sheet receives the *same* frame (whatever `all_stats`
# currently holds in the kernel), and the file name is the one the per-season
# export cell writes, so running this replaces that workbook. It looks like
# leftover scratch code — confirm before running.
excel_file='EPL_data.xlsx'
with pd.ExcelWriter(excel_file) as writer:
  for year in years_fb:
    all_stats.to_excel(writer, sheet_name=year, index=False)