In [192]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json

In [193]:
# Now that we are able to get the teams for any season in any league, let's try to also get other data located in the website, too.
# The json format is very useful. It is optimized and allows for a more complex data storage.
# This meets our requirement for a season based table (or dataframe).
json_data = json.dumps({
    "header": "TransferMarkt data",
    "data": {
        
        "England": {
            
            "2020-2021": {
                "team": [],
                "player_count": [],
                "mean_age": [],
                "foreigner_count": [],
                "mean_player_market_value": [],
                "total_market_value": []
            },
            "2021-2022": {
                "team": [],
                "player_count": [],
                "mean_age": [],
                "foreigner_count": [],
                "mean_player_market_value": [],
                "total_market_value": []
            },
            "2022-2023": {
                "team": [],
                "player_count": [],
                "mean_age": [],
                "foreigner_count": [],
                "mean_player_market_value": [],
                "total_market_value": []
            },
            "2023-2024": {
                "team": [],
                "player_count": [],
                "mean_age": [],
                "foreigner_count": [],
                "mean_player_market_value": [],
                "total_market_value": []
            },
            "2024-2025": {
                "team": [],
                "player_count": [],
                "mean_age": [],
                "foreigner_count": [],
                "mean_player_market_value": [],
                "total_market_value": []
            }
        }
    },
    "description": "This json file contains number of players, mean age, number of foreigners, mean player market value, and total market value of the teams for each season"
}, indent=4)

In [197]:
# To be able to access and index the json data, convert it to a python format
json_data_dict = json.loads(json_data)
# An example indexing to grasp how it works
"""
data ==>
    seasons ==>
        2020-2021 ==>
             -------------
            |team,..      | 
            |Arsenal,     |
            |Chelsea,     |
            |_____________|

        ....
        2024-2025 ==>
             -------------
            |team,..      | 
            |Man Utd,     |
            |Liverpool,   |
            |_____________|
"""
    
print(json_data_dict["data"]["England"]["2021-2022"]["team"])

[]


In [198]:
# Introducing ourselves
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"
}
# get the request based on the url
years = range(2020, 2025)
seasons = [str(years[i]) + "-" + str(years[i] + 1) for i in range(len(years))]
for i, year in enumerate(years):
    url = f"https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1/plus/?saison_id={year}" # url
    response = requests.get(url, headers = headers) # get request from the above url
    print(response.status_code) # 200 means successful
    
    soup = BeautifulSoup(response.content, "html.parser") # response.content is the code html parser boyformats
    body = soup.find_all("tbody")[1] # The table we are interested in is the second table
    
    # Scrape the tables and insert into the json dict
    
    # Utilize Regex
    matches = re.findall(">[^a][a-z A-Z.0-9;&]+</", str(body)) # This !beauty! extracts the information in the table
    cleaned_data_cells = [re.sub(r'[<>/]', '', match) for match in matches] # get rid of unnecessary characters
    
    table = {
        "team": [],
        "player_count": [],
        "mean_age": [],
        "foreigner_count": [],
        "mean_player_market_value": [],
        "total_market_value": []
    }
    
    keys = [k for k in table] * 20
    for k, d in zip(keys,cleaned_data_cells):
        table[k].append(d)
        json_data_dict["data"]["England"][seasons[i]][k].append(d) # add the data cells to the json string
    
    season_info = pd.DataFrame(table)
    # season_info.to_json(f"{seasons[i]}.json", indent=4) # save each seasons's table




200
200
200
200
200


In [199]:
# Save the json string as Squad Information.json
with open("Squad Information.json", "w") as fp:
    json.dump(json_data_dict, fp)

In [200]:
# To use as df, load the content and pd.DataFrame(content)
with open('Squad Information.json', 'r') as json_file:
    j = json.load(json_file)

# Example: pd.DataFrame(j["data"]["England"]["2020-2021"])

In [202]:
json_data_dict

{'header': 'TransferMarkt data',
 'data': {'England': {'2020-2021': {'team': ['Manchester City',
     'Liverpool FC',
     'Chelsea FC',
     'Manchester United',
     'Tottenham Hotspur',
     'Arsenal FC',
     'Everton FC',
     'Leicester City',
     'Wolverhampton Wanderers',
     'Aston Villa',
     'West Ham United',
     'Brighton &amp; Hove Albion',
     'Southampton FC',
     'Newcastle United',
     'Fulham FC',
     'Leeds United',
     'Crystal Palace',
     'Sheffield United',
     'West Bromwich Albion',
     'Burnley FC'],
    'player_count': ['36',
     '43',
     '39',
     '39',
     '41',
     '42',
     '44',
     '40',
     '38',
     '53',
     '39',
     '46',
     '39',
     '33',
     '38',
     '39',
     '36',
     '40',
     '43',
     '32'],
    'mean_age': ['25.3',
     '24.9',
     '25.7',
     '25.4',
     '25.2',
     '24.9',
     '25.2',
     '25.8',
     '24.4',
     '23.9',
     '27.0',
     '24.9',
     '25.0',
     '27.4',
     '27.0',
     '25.1'