### Matchlogs Scrapper

This notebooks scraps information about matchlogs from FBRef for premiere league teams

In [1]:
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re
from urllib.parse import urljoin, urlparse

# Headers to appear more like a regular browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def get_page(url):
    """Fetch page with error handling and rate limiting"""
    time.sleep(random.uniform(2, 4))  # Be respectful - random delay
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

  from pandas.core import (


In [2]:
def load_teams_from_json(json_filename):
    """
    Load teams data from JSON file
    
    Args:
        json_filename (str): Path to JSON file
    
    Returns:
        dict: Teams data dictionary
    """
    with open(json_filename, 'r', encoding='utf-8') as f:
        teams_data = json.load(f)

    return teams_data

# Load the teams data
all_fixtures = load_teams_from_json('../../data/raw/all_competitions_fixtures_2019_2024.json')

# Now you can use all_teams for wage extraction
print(f"Loaded {len(all_fixtures)} teams from JSON file")

Loaded 26 teams from JSON file


In [3]:
def fixtures_data_to_dataframe(fixtures_data):
    """
    Convert fixtures data dictionary to a pandas DataFrame
    
    Args:
        fixtures_data (dict): Fixtures data from extract_all_team_fixtures()
    
    Returns:
        pd.DataFrame: Flattened DataFrame with one row per match
    """
    all_records = []
    
    for team_id, team_data in fixtures_data.items():
        team_name = team_data['team_name']
        
        for season, season_data in team_data['seasons_data'].items():
            if season_data and season_data.get('matches'):
                
                for match in season_data['matches']:
                    # Create a record for each match
                    record = {
                        'team_id': team_id,
                        'team_name': team_name,
                        'season': season
                    }
                    
                    # Add all match data
                    record.update(match)
                    all_records.append(record)
    
    # Create DataFrame
    df = pd.DataFrame(all_records)
    
    # Reorder columns to match FBRef table structure
    if len(df) > 0:
        # First the team identification columns
        team_columns = ['team_name', 'season', 'team_id']
        
        # Then the actual FBRef fixtures table columns in order
        fixtures_columns = [
            'date', 'time', 'comp','round', 'day', 'venue', 'result', 
            'gf', 'ga', 'opponent', 'xg', 'xga', 'poss', 
            'attendance', 'captain', 'formation', 'formation_opp', 
            'referee', 'match_report', 'notes'
        ]
        
        # Include any additional columns (like href links)
        available_team_cols = [col for col in team_columns if col in df.columns]
        available_fixtures_cols = [col for col in fixtures_columns if col in df.columns]
        other_columns = [col for col in df.columns if col not in team_columns + fixtures_columns]
        
        # Final column order
        final_columns = available_team_cols + available_fixtures_cols + other_columns
        df = df[final_columns]
    
    return df

In [4]:
def load_fixtures_json_to_dataframe(json_filename):
    """
    Load fixtures data from JSON file and convert to DataFrame
    
    Args:
        json_filename (str): Path to JSON file
    
    Returns:
        pd.DataFrame: Fixtures data as DataFrame
    """
    with open(json_filename, 'r', encoding='utf-8') as f:
        fixtures_data = json.load(f)
    
    return fixtures_data_to_dataframe(fixtures_data)

### Fixture information loading

In [5]:
fixtures_df = fixtures_data_to_dataframe(all_fixtures)
print(f"DataFrame shape: {fixtures_df.shape}")
print(f"Columns: {list(fixtures_df.columns)}")
fixtures_df.head()

DataFrame shape: (4776, 29)
Columns: ['team_name', 'season', 'team_id', 'date', 'comp', 'round', 'venue', 'result', 'opponent', 'attendance', 'captain', 'formation', 'referee', 'match_report', 'notes', 'date_href', 'start_time', 'comp_href', 'round_href', 'dayofweek', 'goals_for', 'goals_against', 'opponent_href', 'xg_for', 'xg_against', 'possession', 'captain_href', 'opp_formation', 'match_report_href']


Unnamed: 0,team_name,season,team_id,date,comp,round,venue,result,opponent,attendance,...,dayofweek,goals_for,goals_against,opponent_href,xg_for,xg_against,possession,captain_href,opp_formation,match_report_href
0,Arsenal,2019-2020,18bb7c10,2019-08-11,Premier League,Matchweek 1,Away,W,Newcastle Utd,47635,...,Sun,1,0,/en/squads/b2b47a98/2019-2020/Newcastle-United...,1.1,0.4,62,/en/players/e61b8aee/Granit-Xhaka,3-5-2,/en/matches/1405a610/Newcastle-United-Arsenal-...
1,Arsenal,2019-2020,18bb7c10,2019-08-17,Premier League,Matchweek 2,Home,W,Burnley,60214,...,Sat,2,1,/en/squads/943e8050/2019-2020/Burnley-Stats,0.8,1.5,67,/en/players/d4cb83cc/Nacho-Monreal,4-4-2,/en/matches/ff7eda21/Arsenal-Burnley-August-17...
2,Arsenal,2019-2020,18bb7c10,2019-08-24,Premier League,Matchweek 3,Away,L,Liverpool,53298,...,Sat,1,3,/en/squads/822bd0ba/2019-2020/Liverpool-Stats,1.0,2.5,48,/en/players/e61b8aee/Granit-Xhaka,4-3-3,/en/matches/102b241e/Liverpool-Arsenal-August-...
3,Arsenal,2019-2020,18bb7c10,2019-09-01,Premier League,Matchweek 4,Home,D,Tottenham,60333,...,Sun,2,2,/en/squads/361ca564/2019-2020/Tottenham-Hotspu...,2.4,2.0,55,/en/players/e61b8aee/Granit-Xhaka,4-2-2-2,/en/matches/0b6b8aaf/North-London-Derby-Arsena...
4,Arsenal,2019-2020,18bb7c10,2019-09-15,Premier League,Matchweek 5,Away,D,Watford,21360,...,Sun,2,2,/en/squads/2abfe087/2019-2020/Watford-Stats,0.8,2.7,48,/en/players/e61b8aee/Granit-Xhaka,4-2-3-1,/en/matches/8257eda8/Watford-Arsenal-September...


In [6]:
# Step 1: Add base URL to match report links
fixtures_df['full_match_report_url'] = 'https://fbref.com' + fixtures_df['match_report_href']

# Display first few URLs to verify
print("First 5 complete match report URLs:")
for i in range(5):
    print(f"{i+1}. {fixtures_df.iloc[i]['full_match_report_url']}")

# Step 2: Test fetching HTML from a single match report
# Let's use the first match as a test case
test_url = fixtures_df.iloc[0]['full_match_report_url']
test_team_name = fixtures_df.iloc[0]['team_name']
test_opponent = fixtures_df.iloc[0]['opponent']

print(f"Testing URL: {test_url}")
print(f"Team: {test_team_name} vs Opponent: {test_opponent}")
print("Fetching HTML...")

# Fetch the page
soup = get_page(test_url)

if soup:
    print("✅ Successfully fetched HTML")
    print(f"Page title: {soup.title.string if soup.title else 'No title found'}")
else:
    print("❌ Failed to fetch HTML")

First 5 complete match report URLs:
1. https://fbref.com/en/matches/1405a610/Newcastle-United-Arsenal-August-11-2019-Premier-League
2. https://fbref.com/en/matches/ff7eda21/Arsenal-Burnley-August-17-2019-Premier-League
3. https://fbref.com/en/matches/102b241e/Liverpool-Arsenal-August-24-2019-Premier-League
4. https://fbref.com/en/matches/0b6b8aaf/North-London-Derby-Arsenal-Tottenham-Hotspur-September-1-2019-Premier-League
5. https://fbref.com/en/matches/8257eda8/Watford-Arsenal-September-15-2019-Premier-League
Testing URL: https://fbref.com/en/matches/1405a610/Newcastle-United-Arsenal-August-11-2019-Premier-League
Team: Arsenal vs Opponent: Newcastle Utd
Fetching HTML...
✅ Successfully fetched HTML
Page title: Newcastle United vs. Arsenal Match Report – Sunday August 11, 2019 | FBref.com


In [7]:
# Let's scrape the specific match URL to get Team Stats and Team Stats Extra
url = " https://fbref.com/en/matches/1405a610/Newcastle-United-Arsenal-August-11-2019-Premier-League"

print(f"Scraping: {url}")
soup = get_page(url)

Scraping:  https://fbref.com/en/matches/1405a610/Newcastle-United-Arsenal-August-11-2019-Premier-League


In [8]:
def extract_percentage_or_value(text):
    """
    Extract percentage first, if not found then extract first number
    Prioritizes percentage values over other numbers
    """
    import re
    
    # First try to find percentage
    percentage_match = re.search(r'(\d+(?:\.\d+)?%)', text)
    if percentage_match:
        return percentage_match.group(1)
    
    # If no percentage, try to find any number
    number_match = re.search(r'(\d+(?:\.\d+)?)', text)
    if number_match:
        return number_match.group(1)
    
    # If nothing found, return original text
    return text

def scrape_team_stats(soup):
    """
    Improved version of scrape_team_stats that prioritizes percentage values
    """
    team_stats_div = soup.find('div', {'id': 'team_stats'})
    
    if not team_stats_div:
        print("No team_stats div found")
        return None
    
    table = team_stats_div.find('table')
    if not table:
        print("No table found in team_stats div")
        return None
    
    # Extract team names from header
    header_row = table.find('tr')
    team_cells = header_row.find_all('th')
    team1_name = team_cells[0].get_text(strip=True).split()[0]  # Extract just team name
    team2_name = team_cells[1].get_text(strip=True).split()[-1]  # Extract just team name
    
    print(f"Teams found: {team1_name} vs {team2_name}")
    
    # Parse stats
    stats_data = []
    rows = table.find_all('tr')[1:]  # Skip header
    
    i = 0
    while i < len(rows):
        # Each stat has a header row followed by a data row
        if i + 1 < len(rows):
            header_row = rows[i]
            data_row = rows[i + 1]
            
            # Get stat name
            stat_name = header_row.get_text(strip=True)
            
            if stat_name and stat_name != "Cards":  # Skip cards for now
                # Get values for both teams
                data_cells = data_row.find_all('td')
                if len(data_cells) == 2:
                    team1_value = data_cells[0].get_text(strip=True)
                    team2_value = data_cells[1].get_text(strip=True)
                    
                    # Use improved extraction function
                    team1_clean = extract_percentage_or_value(team1_value)
                    team2_clean = extract_percentage_or_value(team2_value)
                    
                    print(f"{stat_name}: {team1_name}={team1_clean} | {team2_name}={team2_clean}")
                    
                    stats_data.append({
                        'stat_name': stat_name,
                        f'{team1_name}': team1_clean,
                        f'{team2_name}': team2_clean
                    })
        
        i += 2  # Skip to next stat (header + data)
    
    return pd.DataFrame(stats_data), team1_name, team2_name


def scrape_team_stats_extra(soup,team1_name, team2_name):              
    """                               
    Scrape team stats extra from the team_stats_extra div                  
    Returns DataFrame with team namesand extra stats                      
    """                               
    team_stats_extra_div = soup.find('div', {'id': 'team_stats_extra'})                  
                                        
    if not team_stats_extra_div:      
        print("No team_stats_extra div found")                           
        return None                   
                                        
    stats_data = []                   
                                        
    # Find all stat containers        
    stat_containers = team_stats_extra_div.find_all('div', recursive=False)                      
                                        
    for container in stat_containers: 
        divs = container.find_all('div')             
        if len(divs) >= 3:
            # Each row has: team1_value, stat_name, team2_value pattern                               
            for i in range(0, len(divs), 3):
                if i + 2 < len(divs):
                    team1_value = divs[i].get_text(strip=True)
                    stat_name = divs[i + 1].get_text(strip=True)
                    team2_value = divs[i + 2].get_text(strip=True)
                    # Skip headers
                    if not (team1_value.isdigit() and team2_value.isdigit()):
                        continue
                    stats_data.append({     
                        'stat_name': stat_name,
                        f'{team1_name}': team1_value,
                        f'{team2_name}': team2_value})
    return pd.DataFrame(stats_data)

In [9]:
def scrape_and_concatenate_all_team_stats(soup, match_url=None):
    """
    Scrape both team stats and team stats extra, then concatenate the results
    
    Args:
        soup: BeautifulSoup object of the match page
        match_url: URL of the match page (optional, used as match ID)
    
    Returns:
        pd.DataFrame: Concatenated DataFrame with all team stats from both functions
    """
    # First scrape team stats
    team_stats_result = scrape_team_stats(soup)
    
    if team_stats_result is None:
        print("Failed to scrape team stats")
        return None
    
    team_stats_df, team1_name, team2_name = team_stats_result
    
    # Then scrape team stats extra
    team_stats_extra_df = scrape_team_stats_extra(soup, team1_name, team2_name)
    
    # Concatenate the dataframes
    if team_stats_df is not None and team_stats_extra_df is not None:
        # Both dataframes have the same structure with stat_name and team columns
        combined_df = pd.concat([team_stats_df, team_stats_extra_df], ignore_index=True)
        print(f"Combined {len(team_stats_df)} team stats + {len(team_stats_extra_df)} extra stats = {len(combined_df)} total stats")
    elif team_stats_df is not None:
        print("Only team stats available")
        combined_df = team_stats_df
    elif team_stats_extra_df is not None:
        print("Only team stats extra available")
        combined_df = team_stats_extra_df
    else:
        print("No stats available")
        return None
    
    # Add match URL as match ID if provided
    if match_url:
        combined_df['match_id'] = match_url
    
    return combined_df

In [10]:
# Test with the match ID included
match_url = " https://fbref.com/en/matches/1405a610/Newcastle-United-Arsenal-August-11-2019-Premier-League"
stats_df_with_id = scrape_and_concatenate_all_team_stats(soup, match_url)
print("\nDataFrame with match_id:")
print(stats_df_with_id.tail())

Teams found: Newcastle vs Arsenal
Possession: Newcastle=38% | Arsenal=62%
Passing Accuracy: Newcastle=75% | Arsenal=84%
Shots on Target: Newcastle=22% | Arsenal=25%
Saves: Newcastle=50% | Arsenal=100%
Combined 4 team stats + 12 extra stats = 16 total stats

DataFrame with match_id:
     stat_name Newcastle Arsenal  \
11  Clearances        17      22   
12    Offsides         1       3   
13  Goal Kicks         8       8   
14   Throw Ins        23      30   
15  Long Balls        68      57   

                                             match_id  
11   https://fbref.com/en/matches/1405a610/Newcast...  
12   https://fbref.com/en/matches/1405a610/Newcast...  
13   https://fbref.com/en/matches/1405a610/Newcast...  
14   https://fbref.com/en/matches/1405a610/Newcast...  
15   https://fbref.com/en/matches/1405a610/Newcast...  


In [11]:
stats_df_with_id

Unnamed: 0,stat_name,Newcastle,Arsenal,match_id
0,Possession,38%,62%,https://fbref.com/en/matches/1405a610/Newcast...
1,Passing Accuracy,75%,84%,https://fbref.com/en/matches/1405a610/Newcast...
2,Shots on Target,22%,25%,https://fbref.com/en/matches/1405a610/Newcast...
3,Saves,50%,100%,https://fbref.com/en/matches/1405a610/Newcast...
4,Fouls,12,7,https://fbref.com/en/matches/1405a610/Newcast...
5,Corners,5,3,https://fbref.com/en/matches/1405a610/Newcast...
6,Crosses,15,10,https://fbref.com/en/matches/1405a610/Newcast...
7,Touches,548,771,https://fbref.com/en/matches/1405a610/Newcast...
8,Tackles,26,11,https://fbref.com/en/matches/1405a610/Newcast...
9,Interceptions,20,6,https://fbref.com/en/matches/1405a610/Newcast...


In [13]:
stats_df_with_id

Unnamed: 0,stat_name,Newcastle,Arsenal,match_id
0,Possession,38%,62%,https://fbref.com/en/matches/1405a610/Newcast...
1,Passing Accuracy,75%,84%,https://fbref.com/en/matches/1405a610/Newcast...
2,Shots on Target,22%,25%,https://fbref.com/en/matches/1405a610/Newcast...
3,Saves,50%,100%,https://fbref.com/en/matches/1405a610/Newcast...
4,Fouls,12,7,https://fbref.com/en/matches/1405a610/Newcast...
5,Corners,5,3,https://fbref.com/en/matches/1405a610/Newcast...
6,Crosses,15,10,https://fbref.com/en/matches/1405a610/Newcast...
7,Touches,548,771,https://fbref.com/en/matches/1405a610/Newcast...
8,Tackles,26,11,https://fbref.com/en/matches/1405a610/Newcast...
9,Interceptions,20,6,https://fbref.com/en/matches/1405a610/Newcast...
