In [1]:
import json
import random
import re
import time
from io import StringIO
from urllib.parse import quote

import lxml
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup

In [2]:
import warnings
from pandas.errors import SettingWithCopyWarning
warnings.simplefilter(action='ignore', category=(SettingWithCopyWarning))
warnings.simplefilter(action='ignore', category=(FutureWarning))

In [3]:
# Input CSVs for this notebook (paths are relative to the working directory).
PLAYERS_CSV = "players_db/fm23/fm23db_processed.csv"
TEAM_RATINGS_CSV = "players_db/fm23/team_ratings.csv"

players_df = pd.read_csv(PLAYERS_CSV)
team_df = pd.read_csv(TEAM_RATINGS_CSV)

In [4]:
# Names of the five leagues this notebook works with, as they appear in
# team_df's Division column.
top5_leagues = [
    "Ligue 1 Uber Eats",
    "English Premier Division",
    "Italian Serie A",
    "Spanish First Division",
    "Bundesliga",
]

In [5]:
# Per-league identifiers keyed by league name. Each entry carries a short code,
# the league name again, the FBref competition id (used in FBref URLs as
# "c<id>") and the league_id used to filter team_df.
_LEAGUE_ROWS = (
    # (name, short, fbref_league_id, league_id)
    ("English Premier Division", "eng", 9, 354),
    ("Italian Serie A", "ita", 11, 710),
    ("Ligue 1 Uber Eats", "fra", 13, 773),
    ("Spanish First Division", "spa", 12, 1215),
    ("Bundesliga", "ger", 20, 185),
)

fbref_league_dict = {
    name: {
        "short": short,
        "name": name,
        "fbref_league_id": fbref_id,
        "league_id": fm_id,
    }
    for name, short, fbref_id, fm_id in _LEAGUE_ROWS
}

<br><br><br><br>
<h1 style="color:blue;">  Matching FBREF Team Names</h1>

In [6]:
from rapidfuzz import process
def find_best_match(name, choices):
    """Return rapidfuzz's single best fuzzy match for ``name`` among ``choices``.

    The result of ``process.extractOne`` is passed through unchanged.
    """
    best = process.extractOne(name, choices)
    return best

In [7]:
# FBref "Big 5 European Leagues" squad shooting page for the 2022-2023 season.
top5_url = "https://fbref.com/en/comps/Big5/2022-2023/shooting/squads/2022-2023-Big-5-European-Leagues-Stats"

In [8]:
# Download top5_url and parse its HTML tables; read_html returns a list of DataFrames.
data = pd.read_html(top5_url)

In [9]:
# Squad names as FBref spells them: first parsed table, two-level column header.
squad_table = data[0]
fbref_teams = squad_table[("Unnamed: 1_level_0", "Squad")].tolist()

# Club names from team_df, restricted to the top-five divisions.
in_top5 = team_df["Division"].isin(top5_leagues)
fm_teams = team_df.loc[in_top5, "Club"].tolist()

In [10]:
# Attach the closest FBref squad name to every top-five-division club.
#
# Each row is addressed by its own index label instead of being looked up again
# by club name. A name lookup over the whole frame returns the first row with
# that name in *any* division, so a namesake club outside the top five could
# receive the FBref name instead of the intended row.
top5_mask = team_df["Division"].isin(top5_leagues)
for where_id, club_name in team_df.loc[top5_mask, "Club"].items():
    match = find_best_match(club_name, fbref_teams)
    if match is None:
        # extractOne can return None (e.g. nothing usable to match against);
        # leave fbref_name unset for this club rather than crash on unpacking.
        continue
    answer, _score, _key = match
    team_df.at[where_id, "fbref_name"] = answer

# NOTE: this overwrites the same CSV that team_df was loaded from.
team_df.to_csv("players_db/fm23/team_ratings.csv", index=False)

<br><br><br><br>
<h1 style="color:blue;">  Writing FBREF Slugs</h1>

In [11]:
def Top5_Leagues_Teams(team_df, season="2022-2023"):
    """Fill ``team_df['fbref_slug']`` from FBref's Big-5 squad table, in place.

    Downloads the Big-5 squad shooting page for ``season``, reads each squad's
    name and the id segment of its link, and writes that id into the first
    ``team_df`` row whose ``fbref_name`` equals the squad name. Squads with no
    matching ``fbref_name`` are ignored.

    Parameters
    ----------
    team_df : pandas.DataFrame
        Frame with an ``fbref_name`` column; modified in place.
    season : str
        Season segment used in the FBref URL, e.g. ``"2022-2023"``.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If FBref answers with an error status (instead of silently parsing an
        error page).
    ValueError
        If the page contains no table with class ``stats_table``.
    """
    url = f"https://fbref.com/en/comps/Big5/{season}/shooting/squads/{season}-Big-5-European-Leagues-Stats"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "lxml")

    table = soup.find("table", {"class": "stats_table"})
    if table is None:
        raise ValueError(f"No stats_table found at {url}")

    # The first <tr> is skipped, as in the original scrape (header row).
    for row in table.find_all("tr")[1:]:
        team_cell = row.find("td", {"data-stat": "team"})
        if team_cell is None or team_cell.a is None:
            # Not a squad row, or no link to take the id from.
            continue

        team_name = team_cell.text.strip()
        # href looks like /en/squads/<id>/...; the id is the 4th "/"-separated part.
        team_id = team_cell.a.get("href").split("/")[3]

        team_row = team_df[team_df['fbref_name'] == team_name]
        if not team_row.empty:
            where_id = team_row.index[0]
            team_df.at[where_id, 'fbref_slug'] = team_id


Top5_Leagues_Teams(team_df, "2022-2023")
team_df.to_csv("players_db/fm23/team_ratings.csv", index=False)

<br><br><br><br>
<h1 style="color:blue;">  Preparing FBREF URLS</h1>

In [13]:
fbref_league_dict

{'English Premier Division': {'short': 'eng',
  'name': 'English Premier Division',
  'fbref_league_id': 9,
  'league_id': 354},
 'Italian Serie A': {'short': 'ita',
  'name': 'Italian Serie A',
  'fbref_league_id': 11,
  'league_id': 710},
 'Ligue 1 Uber Eats': {'short': 'fra',
  'name': 'Ligue 1 Uber Eats',
  'fbref_league_id': 13,
  'league_id': 773},
 'Spanish First Division': {'short': 'spa',
  'name': 'Spanish First Division',
  'fbref_league_id': 12,
  'league_id': 1215},
 'Bundesliga': {'short': 'ger',
  'name': 'Bundesliga',
  'fbref_league_id': 20,
  'league_id': 185}}

In [14]:
def League_Name_to_fm_league_id(league_name):
    """Return the ``league_id`` of the ``fbref_league_dict`` entry named ``league_name``.

    Entries are scanned in dict order and the first match wins. Returns
    ``None`` when no entry has that name.
    """
    name_id_pairs = (
        (entry["name"], entry["league_id"]) for entry in fbref_league_dict.values()
    )
    return next(
        (fm_id for entry_name, fm_id in name_id_pairs if entry_name == league_name),
        None,
    )

In [15]:
def fm_league_id_to_League_Name(fm_league_id):
    """Return the ``name`` of the ``fbref_league_dict`` entry whose ``league_id`` is ``fm_league_id``.

    Entries are scanned in dict order and the first match wins. Returns
    ``None`` when no entry has that id.
    """
    name_id_pairs = (
        (entry["name"], entry["league_id"]) for entry in fbref_league_dict.values()
    )
    return next(
        (entry_name for entry_name, fm_id in name_id_pairs if fm_id == fm_league_id),
        None,
    )

In [16]:
def FBREF_URLS(match_logs_stats_dict, team_df, stat_attribute, fbref_league_id, league_id, season="2022-2023"):
    """Append one FBref match-log URL record per club of a league.

    Selects the ``team_df`` rows whose ``League_id`` equals ``league_id`` and,
    for each distinct (``fbref_slug``, ``Club``, ``Club_id``) row, appends a
    record dict to ``match_logs_stats_dict[stat_attribute]``.

    Slug, club name and club id are read from the *same row*. Building three
    independent de-duplicated lists and zipping them would pair the wrong club
    with the wrong slug as soon as any column contains a duplicate or a missing
    value. Rows without an ``fbref_slug`` are skipped, since no valid URL can
    be built for them.

    Parameters
    ----------
    match_logs_stats_dict : dict[str, list]
        Must already contain the key ``stat_attribute``; mutated in place.
    team_df : pandas.DataFrame
        Needs the columns ``League_id``, ``fbref_slug``, ``Club``, ``Club_id``.
    stat_attribute : str
        FBref match-log page name; used as the dict key and the last URL segment.
    fbref_league_id : int
        FBref competition id, rendered as ``c<id>`` in the URL.
    league_id : int
        Value matched against ``team_df['League_id']``.
    season : str
        Season segment of the URL.

    Returns
    -------
    None
    """
    in_league = team_df["League_id"] == league_id
    clubs = (
        team_df.loc[in_league, ["fbref_slug", "Club", "Club_id"]]
        .dropna(subset=["fbref_slug"])
        .drop_duplicates()
    )
    league_name = fm_league_id_to_League_Name(league_id)

    # .tolist() yields plain Python scalars, so the records hold ordinary ints/strs.
    club_rows = zip(
        clubs["fbref_slug"].tolist(),
        clubs["Club"].tolist(),
        clubs["Club_id"].tolist(),
    )
    for slug, club_name, club_id in club_rows:
        match_logs_url = f"https://fbref.com/en/squads/{slug}/{season}/matchlogs/c{fbref_league_id}/{stat_attribute}"
        match_logs_stats_dict[stat_attribute].append({
            "Club": club_name,
            "Club_id": club_id,
            "League_name": league_name,
            "League_id": league_id,
            "fbref_league_id": fbref_league_id,
            "fbref_slug": slug,
            "stat": stat_attribute,
            "url": match_logs_url,
            "season": season
        })

In [225]:
# FBref match-log pages to collect; each gets its own (initially empty) list
# of URL records in Match_Logs_URL_Dict.
stat_attributes = [
    "shooting",
    "passing",
    "defense",
    "keeper",
    "passing_types",
    "gca",
    "possession",
    "misc",
]
Match_Logs_URL_Dict = {stat_name: [] for stat_name in stat_attributes}

# 784 urls
fbref_league_ids = [league["fbref_league_id"] for league in fbref_league_dict.values()]
fm_league_names = fbref_league_dict.keys()

# For every stat page, walk the leagues in dict order and let FBREF_URLS append
# that league's club records to Match_Logs_URL_Dict[stat].
for stat in stat_attributes:
    for fm_league_name, league_info in fbref_league_dict.items():
        fbref_league_id = league_info["fbref_league_id"]
        league_id = League_Name_to_fm_league_id(fm_league_name)
        FBREF_URLS(Match_Logs_URL_Dict, team_df, stat, fbref_league_id, league_id, "2022-2023")

In [226]:
Match_Logs_URL_Dict["passing"]

[{'Club': 'Manchester City',
  'Club_id': 6827,
  'League_name': 'English Premier Division',
  'League_id': 354,
  'fbref_league_id': 9,
  'fbref_slug': 'b8fd03ef',
  'stat': 'passing',
  'url': 'https://fbref.com/en/squads/b8fd03ef/2022-2023/matchlogs/c9/passing',
  'season': '2022-2023'},
 {'Club': 'Liverpool',
  'Club_id': 6518,
  'League_name': 'English Premier Division',
  'League_id': 354,
  'fbref_league_id': 9,
  'fbref_slug': '822bd0ba',
  'stat': 'passing',
  'url': 'https://fbref.com/en/squads/822bd0ba/2022-2023/matchlogs/c9/passing',
  'season': '2022-2023'},
 {'Club': 'Manchester United',
  'Club_id': 6828,
  'League_name': 'English Premier Division',
  'League_id': 354,
  'fbref_league_id': 9,
  'fbref_slug': '19538871',
  'stat': 'passing',
  'url': 'https://fbref.com/en/squads/19538871/2022-2023/matchlogs/c9/passing',
  'season': '2022-2023'},
 {'Club': 'Tottenham Hotspur',
  'Club_id': 11015,
  'League_name': 'English Premier Division',
  'League_id': 354,
  'fbref_lea

In [19]:
# Scratch check of a single match-log page ("gca" stats, competition c20).
url = "https://fbref.com/en/squads/0cdc4311/2022-2023/matchlogs/c20/gca"
response = requests.get(url)
response.raise_for_status()  # fail loudly instead of parsing an error page
soup = BeautifulSoup(response.content, "lxml")

table = soup.find("table", {"class": "stats_table"})
# Parse the HTML that was already downloaded rather than handing the URL to
# pandas, which would fetch the same page a second time.
# [:-1] drops the last row of the table (presumably a totals row; confirm).
html2 = pd.read_html(StringIO(response.text))[0][:-1]

# Keep only the second level of the two-level column header.
html2.columns = html2.columns.get_level_values(1)
html2['Club'] = "Augsburg"

In [262]:
html2.head()

Unnamed: 0,Date,Time,Round,Day,Venue,Result,GF,GA,Opponent,SCA,...,Def,GCA,PassLive,PassDead,TO,Sh,Fld,Def.1,Match Report,Club
0,2022-08-06,15:30,Matchweek 1,Sat,Home,L,0,4,Freiburg,19,...,0,0,0,0,0,0,0,0,Match Report,Augsburg
1,2022-08-13,15:30,Matchweek 2,Sat,Away,W,2,1,Leverkusen,11,...,1,4,2,1,0,0,1,0,Match Report,Augsburg
2,2022-08-20,15:30,Matchweek 3,Sat,Home,L,1,2,Mainz 05,12,...,0,2,2,0,0,0,0,0,Match Report,Augsburg
3,2022-08-27,15:30,Matchweek 4,Sat,Away,L,0,1,Hoffenheim,21,...,1,0,0,0,0,0,0,0,Match Report,Augsburg
4,2022-09-04,15:30,Matchweek 5,Sun,Home,L,0,2,Hertha BSC,11,...,0,0,0,0,0,0,0,0,Match Report,Augsburg


<br><br><br><br>
<h1 style="color:blue;">  Scraping FBREF Stats</h1>

In [21]:
# Pool of browser User-Agent strings; one request-header dict is built per
# string so a scraper can pick a different one for each request.
_USER_AGENTS = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:85.0) Gecko/20100101 Firefox/85.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/18.17763",
    "Mozilla/5.0 (Linux; Android 10; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.105 Mobile Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1",
)

headers_list = [{"User-Agent": user_agent} for user_agent in _USER_AGENTS]

In [227]:
def Scrape_FBREF_Stats(fbdict):
    """Download one FBref match-log page and return its first table as a DataFrame.

    The page is fetched once, with a randomly chosen entry of ``headers_list``
    as request headers, and the table is parsed from that response. Handing the
    URL to ``pd.read_html`` instead would download the page a second time
    without those headers.

    Parameters
    ----------
    fbdict : dict
        Record with at least the keys ``"url"`` (page to fetch) and ``"Club"``
        (value written into the added ``Club`` column).

    Returns
    -------
    pandas.DataFrame
        First table of the page without its last row, with the second header
        level as column names when the header has two levels, plus a ``Club``
        column.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    url = fbdict["url"]
    headers = random.choice(headers_list)
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    first_table = pd.read_html(StringIO(response.text))[0]
    # Drop the last row (presumably a totals row; confirm). Copy so the column
    # assignment below acts on an independent frame rather than a slice.
    match_log = first_table.iloc[:-1].copy()

    if match_log.columns.nlevels > 1:
        match_log.columns = match_log.columns.get_level_values(1)
    match_log['Club'] = fbdict["Club"]
    return match_log

In [261]:
# season_info = "22-23"

# for stat in stat_attributes:
#     counter = 0    
#     df_logs_array = []
#     for url in Match_Logs_URL_Dict[stat]:
#         time.sleep(np.random.uniform(1,7))
#         new_df = Scrape_FBREF_Stats(url)
#         if "Comp" in new_df.columns:
#              new_df = new_df.drop(["Comp"],axis=1) 
#         df_logs_array.append(new_df)
#         counter+=1        
#         print(stat, counter)
#     df_log = pd.concat(df_logs_array, ignore_index=True)
#     df_log.to_csv(f"match_logs/Big5@{season_info}@{stat}.csv", index=False)   

### df_log Error Workspace

In [242]:
# df2 = [df for df in df_logs_array]

# c = 0
# empty = []
# df2[94] = df2[94].drop(["Comp"],axis=1) 
# for i in range(len(df2)):
#     empty.append(df2[i])
#     test_df = pd.concat(empty)
#     c += 1
#     print(i)

# test_df.to_csv(f"match_logs/Big5@{season_info}@{st333at}.csv", index=False)   