In [1]:
from bs4 import BeautifulSoup
import requests
import json
import time
import os  # Optional: for checking file locations
import pandas as pd

In [2]:
# One shared HTTP session: it is handed to the scraping functions so that every
# page request in this notebook goes through the same requests.Session object.
session = requests.Session()

In [3]:
def parse_team_id_from_url(url):
    """Extract the club ID that follows ``verein/`` in a URL.

    The ID is located relative to the ``verein/`` marker rather than by a
    fixed path position, so site-relative links
    (``/club/transfers/verein/449/saison_id/2020``) and absolute URLs
    (``https://host/club/startseite/verein/449``) both work.

    Args:
        url: Link text, typically an ``href`` attribute. May be ``None`` or
            an empty string.

    Returns:
        The path segment directly after the first ``verein/`` as a string, or
        ``None`` when ``url`` is empty/``None``, contains no ``verein/``
        marker, or the segment after the marker is empty.
    """
    if not url or 'verein/' not in url:
        return None

    # Everything after the first marker, cut at the next path, query or
    # fragment delimiter so only the ID itself remains.
    tail = url.split('verein/', 1)[1]
    for delimiter in ('/', '?', '#'):
        tail = tail.split(delimiter, 1)[0]

    return tail or None

In [4]:
def get_league_html(session, league, season, window, timeout=30):
    """Fetch one league transfers page and return it as a parsed HTML tree.

    Args:
        session: Object exposing ``get(url, headers=..., timeout=...)``; the
            notebook passes a ``requests.Session``.
        league: Competition code inserted into the URL path (the notebook
            uses ``"TR1"``).
        season: Value for the ``saison_id`` query parameter.
        window: Value for the ``s_w`` query parameter (the notebook uses
            ``"s"`` and ``"w"``).
        timeout: Seconds passed to ``session.get`` as its ``timeout``.
            Without one, a stalled connection would block the scrape
            indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the response text, or ``None``
        when the request raised a ``requests`` exception or the response
        status code was not 200. A message is printed in both failure cases.
    """
    # NOTE(review): the path slug is hard-coded to "premier-league" regardless
    # of `league`; presumably the site resolves the page from the competition
    # code alone - confirm before relying on it for other leagues.
    URL_LEAGUE = "https://www.transfermarkt.com/premier-league/transfers/wettbewerb/{league}/plus/?saison_id={season}&s_w={window}"

    scrape_url = URL_LEAGUE.format(
        league=league,
        season=season,
        window=window
    )

    # Browser-like headers; the original author added them to avoid 403 responses.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
    }

    try:
        resp = session.get(scrape_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a bad
        # status code so that one failed page does not abort a long run.
        print(f"Error fetching {scrape_url} - {exc}")
        return None

    if resp.status_code != 200:
        print(f"Error fetching {scrape_url} - Status Code: {resp.status_code}")
        return None  # Callers treat None as "page unavailable"

    return BeautifulSoup(resp.text, 'html.parser')


In [5]:
def scrape_league_season_filtered(session, league, season, window, club_name_filter):
    """Collect one season/window of transfers for clubs matching a name filter.

    The league page is loaded through ``get_league_html``. Every element with
    class ``box`` that contains a ``transfer-zusatzinfo-box`` div is treated
    as one club. A club is kept when ``club_name_filter`` occurs
    (case-insensitively) in the ``title`` of the first link inside its
    ``content-box-headline`` heading; clubs without such a link are named
    ``"Unknown Team"``.

    Returns:
        A list of dicts with the keys ``team_name``, ``season``, ``in`` and
        ``left``. ``in`` and ``left`` hold the rows parsed by
        ``scrape_transfer_table`` from the club's first and second table
        (empty list when that table is absent). An empty list is returned
        when the page could not be loaded. Errors raised while handling a
        single club are printed and that club is skipped.
    """
    page = get_league_html(session, league, season, window)
    if not page:
        print(f"Failed to scrape data for {league}, {season}, {window}")
        return []

    # Narrow the generic "box" containers down to the ones describing a club.
    club_boxes = []
    for candidate in page.find_all(class_="box"):
        if candidate.find("div", class_="transfer-zusatzinfo-box"):
            club_boxes.append(candidate)

    matches = []
    for club_box in club_boxes:
        try:
            headline = club_box.find("h2", class_="content-box-headline")
            name_link = headline.find("a") if headline else None
            if name_link:
                team_name = name_link["title"].strip()
            else:
                team_name = "Unknown Team"

            # Clubs outside the filter are ignored entirely.
            if club_name_filter.lower() not in team_name.lower():
                continue

            tables = club_box.find_all("table")
            arrivals = scrape_transfer_table(tables[0]) if len(tables) >= 1 else []
            departures = scrape_transfer_table(tables[1]) if len(tables) >= 2 else []

            matches.append({
                'team_name': team_name,
                'season': season,
                'in': arrivals,
                'left': departures
            })
        except Exception as e:
            print(f"Error processing team data: {e}")

    return matches


In [6]:

def scrape_multiple_seasons_filtered(session, league, seasons, windows, club_name_filter, delay=0.0):
    """Scrape several seasons and transfer windows for one club filter.

    Calls ``scrape_league_season_filtered`` once per (season, window) pair and
    tags every returned club record with the season and a readable window
    label.

    Args:
        session: Session object forwarded to the page fetcher.
        league: Competition code forwarded to the page fetcher.
        seasons: Iterable of season identifiers.
        windows: Iterable of window codes. ``"s"`` is labelled ``"Summer"``
            and ``"w"`` is labelled ``"Winter"`` (case-insensitively); any
            other code is kept as-is in the label instead of being reported
            as "Winter".
        club_name_filter: Club name fragment forwarded to the per-season
            scraper.
        delay: Seconds to sleep between consecutive page requests. The
            default ``0.0`` keeps the previous behaviour (no pause).

    Returns:
        A flat list of club records (dicts) across all seasons and windows,
        each with ``season`` and ``transfer_window`` keys set.
    """
    window_labels = {"s": "Summer", "w": "Winter"}
    all_data = []
    is_first_request = True

    for season in seasons:
        for window in windows:
            # Optional pacing between requests; never before the first one.
            if delay > 0 and not is_first_request:
                time.sleep(delay)
            is_first_request = False

            code = str(window)
            print(f"Scraping season {season}, {code.upper()} window for {club_name_filter}...")

            # The window value is forwarded unchanged; only the label is normalised.
            season_data = scrape_league_season_filtered(session, league, season, window, club_name_filter)

            if season_data:
                label = window_labels.get(code.lower(), code)
                for team_data in season_data:
                    team_data['season'] = season  # Guarantee the key regardless of what the scraper set
                    team_data['transfer_window'] = label
                    all_data.append(team_data)

    return all_data  # One entry per club per season/window


In [7]:
def _cell_text(row, css_class):
    """Return the stripped text of the first matching <td> in ``row``, or "Unknown"."""
    cell = row.find("td", class_=css_class)
    return cell.text.strip() if cell else "Unknown"


def _attr_or_unknown(tag, attr):
    """Return ``tag``'s attribute stripped, or "Unknown" if the tag or attribute is missing."""
    value = tag.get(attr) if tag else None
    return value.strip() if value is not None else "Unknown"


def scrape_transfer_table(transfer_table_soup):
    """Parse one club transfer table into a list of per-player records.

    Every field is looked up once and falls back to ``"Unknown"`` (or an empty
    list for nationalities) when its element or attribute is missing, so a
    single absent tag no longer discards the whole row.

    Args:
        transfer_table_soup: Parsed ``<table>`` element offering ``find`` /
            ``find_all``.

    Returns:
        A list of dicts with the keys ``player_name``, ``player_id``,
        ``player_age``, ``player_pos``, ``market_val``, ``player_nat``,
        ``counter_team_country``, ``counter_team_name``, ``counter_team_id``,
        ``transfer_fee`` and ``transfer_id``. The list is empty when the table
        has no ``<tbody>``, no rows, or a first row with at most one cell.
        Rows without any ``<td>`` are skipped; a row that still raises is
        reported with a printed message and skipped.
    """
    tbody = transfer_table_soup.find("tbody")
    if not tbody:
        return []

    rows = tbody.find_all("tr")

    # A first row with a single cell is treated as "no transfers".
    if not rows or len(rows[0].find_all("td")) <= 1:
        return []

    records = []
    for row in rows:
        try:
            first_cell = row.find("td")
            if first_cell is None:
                continue  # No cells at all: nothing to extract from this row

            # Player name and profile link live in the first <div> of the first cell.
            name_div = first_cell.find("div")
            player_link = name_div.find("a") if name_div else None
            player_href = player_link.get("href") if player_link else None

            rec_data = {
                'player_name': name_div.text.strip() if name_div else "Unknown",
                'player_id': player_href.split("spieler/")[-1] if player_href is not None else "Unknown",
                'player_age': _cell_text(row, "zentriert alter-transfer-cell"),
                'player_pos': _cell_text(row, "kurzpos-transfer-cell zentriert"),
                'market_val': _cell_text(row, "rechts mw-transfer-cell"),
            }

            # Nationalities: one <img title=...> per nationality.
            nat_cell = row.find("td", class_="zentriert nat-transfer-cell")
            rec_data['player_nat'] = [
                img.get("title").strip()
                for img in nat_cell.find_all("img")
                if img.get("title") is not None
            ] if nat_cell else []

            # Counterpart club: image title -> country, link title/href -> name/id.
            counter_cell = row.find("td", class_="verein-flagge-transfer-cell")
            counter_flag = counter_cell.find("img") if counter_cell else None
            counter_link = counter_cell.find("a") if counter_cell else None
            counter_href = counter_link.get("href") if counter_link else None
            rec_data['counter_team_country'] = _attr_or_unknown(counter_flag, "title")
            rec_data['counter_team_name'] = _attr_or_unknown(counter_link, "title")
            rec_data['counter_team_id'] = parse_team_id_from_url(counter_href) if counter_href is not None else "Unknown"

            # Transfer details come from the link in the LAST cell carrying the
            # "rechts" class (the market-value cell carries that class too).
            right_cells = row.find_all("td", class_="rechts")
            fee_link = right_cells[-1].find("a") if right_cells else None
            fee_href = fee_link.get("href") if fee_link else None
            rec_data['transfer_fee'] = fee_link.text.strip() if fee_link else "Unknown"
            rec_data['transfer_id'] = (
                fee_href.split("transfer_id/")[-1]
                if fee_href and "transfer_id/" in fee_href
                else "Unknown"
            )

            records.append(rec_data)

        except Exception as e:
            # Best effort: report the row and continue with the rest of the table.
            print(f"Error parsing transfer data: {e}")

    return records

In [76]:
# Scrape parameters
league = "TR1"  # Turkish Süper Lig
seasons = list(range(1995, 2025))  # 1995 to 2024 seasons
windows = ["s", "w"]  # Scrape both Summer ("s") and Winter ("w")
club_name = "Trabzonspor"  # Change this to your desired club

# Run the scraper for every season and both transfer windows
data = scrape_multiple_seasons_filtered(session, league, seasons, windows, club_name)

# Make an empty result visible instead of silently exporting blank sheets
if not data:
    print(f"Warning: no transfer data was scraped for {club_name}; the export will be empty.")

# Flatten the nested records: one row per transfer, with the club/season/window
# of the parent record repeated on every row
df_in = pd.json_normalize(data, record_path=['in'], meta=['team_name', 'season', 'transfer_window'], errors='ignore')
df_out = pd.json_normalize(data, record_path=['left'], meta=['team_name', 'season', 'transfer_window'], errors='ignore')

# Output file name derived from the club name
xlsx_filename = f"{club_name.replace(' ', '_')}_transfers.xlsx"

# Save to Excel (XLSX). ExcelWriter opens the target in write mode and replaces
# an existing file itself, so the previous export is not deleted beforehand:
# deleting first would lose it if the write below failed.
with pd.ExcelWriter(xlsx_filename) as writer:
    df_in.to_excel(writer, sheet_name="Incoming Transfers", index=False)
    df_out.to_excel(writer, sheet_name="Outgoing Transfers", index=False)

print(f"Data saved to {xlsx_filename}")



Scraping season 1995, S window for Trabzonspor...
Scraping season 1995, W window for Trabzonspor...
Scraping season 1996, S window for Trabzonspor...
Scraping season 1996, W window for Trabzonspor...
Scraping season 1997, S window for Trabzonspor...
Scraping season 1997, W window for Trabzonspor...
Scraping season 1998, S window for Trabzonspor...
Scraping season 1998, W window for Trabzonspor...
Scraping season 1999, S window for Trabzonspor...
Scraping season 1999, W window for Trabzonspor...
Scraping season 2000, S window for Trabzonspor...
Scraping season 2000, W window for Trabzonspor...
Scraping season 2001, S window for Trabzonspor...
Scraping season 2001, W window for Trabzonspor...
Scraping season 2002, S window for Trabzonspor...
Scraping season 2002, W window for Trabzonspor...
Scraping season 2003, S window for Trabzonspor...
Scraping season 2003, W window for Trabzonspor...
Scraping season 2004, S window for Trabzonspor...
Scraping season 2004, W window for Trabzonspor...
