# Gathering SofaScore Team Statistics

## Data Collection

Scraping team statistics from SofaScore including:
- Team ratings
- Goals scored
- Goals conceded
- Assists

### Sources
- https://www.sofascore.com/
- https://www.selenium.dev/documentation/webdriver/
- https://selenium-python.readthedocs.io/locating-elements.html

*Note: SofaScore pages are JavaScript-heavy, so a real browser driven by Selenium is required to scrape them.*

In [1]:
# Selenium: Python library that automates a web browser; used here for web scraping
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import time
import re

In [None]:
def _find_rating_near_label(lines):
    """Return the first rating-like token found close to a rating label, else None."""
    for i, line in enumerate(lines[:200]):
        lowered = line.lower()
        if 'sofascore rating' not in lowered and 'average rating' not in lowered:
            continue
        # The value may be rendered slightly before or after its label.
        for j in range(max(0, i - 3), min(len(lines), i + 8)):
            candidate = lines[j].strip()
            # One digit, a dot, one or two decimals: this already limits the
            # value to 0.00-9.99, so no separate range check is needed.
            if re.match(r'^\d\.\d{1,2}$', candidate):
                return candidate
    return None


def _first_int_near(lines, index, limit):
    """Return the first all-digit line within two lines of `index` (below `limit`), else None."""
    for j in range(max(0, index - 2), min(limit, index + 3)):
        candidate = lines[j].strip()
        if candidate.isdigit():
            return candidate
    return None


def scrape_team_statistics(driver, team_id, team_name):
    """
    Scrapes team statistics from SofaScore statistics page.
    Extracts rating, goals scored, goals conceded, and assists.

    The page is read as plain text (the text of <body>) and each statistic
    is located heuristically: a label line is searched for, then the nearest
    numeric line around it is taken as the value.

    Args:
        driver: Selenium WebDriver instance
        team_id: SofaScore team ID
        team_name: Team name (e.g., 'Arsenal'); also used to build the URL slug

    Returns:
        dict: Keys 'team_name', 'team_rating', 'goals_scored',
            'goals_conceded' and 'assists'. Scraped values are returned as
            the strings found on the page; a statistic that could not be
            found (or any failure while loading/parsing) leaves it as None.
    """

    results = {
        'team_name': team_name,
        'team_rating': None,
        'goals_scored': None,
        'goals_conceded': None,
        'assists': None
    }

    try:
        # set page load timeout
        driver.set_page_load_timeout(15)

        # load statistics tab
        team_url = f"https://www.sofascore.com/team/football/{team_name.lower().replace(' ', '-')}/{team_id}#tab:statistics"
        print(f"Loading {team_name}...", end=' ', flush=True)

        try:
            driver.get(team_url)
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C / SystemExit
            # still abort a long scraping run.
            print("Timeout")
            return results

        # give the JavaScript-rendered content time to appear
        time.sleep(4)

        # scroll to trigger lazy loading
        for scroll_pos in [300, 600, 900, 1200]:
            driver.execute_script(f"window.scrollTo(0, {scroll_pos});")
            time.sleep(0.5)

        body_text = driver.find_element(By.TAG_NAME, 'body').text
        lines = body_text.split('\n')

        # extract rating, look for "Average Sofascore Rating"
        results['team_rating'] = _find_rating_near_label(lines)

        # if rating not found, use JS to collect every rating-like text node
        # on the page and take the first one (best-effort heuristic)
        if not results['team_rating']:
            rating_js = r"""
            function findRating() {
                var walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT, null, false);
                var ratings = [];
                var node;
                var count = 0;
                
                while((node = walker.nextNode()) && count < 10000) {
                    count++;
                    var text = node.nodeValue.trim();
                    if (/^\d\.\d{1,2}$/.test(text)) {
                        var val = parseFloat(text);
                        if (val >= 0 && val <= 10) {
                            ratings.push(text);
                        }
                    }
                }
                return ratings;
            }
            return findRating();
            """

            try:
                ratings = driver.execute_script(rating_js)
                if ratings:
                    results['team_rating'] = ratings[0]
            except Exception as e:
                # The fallback is optional: report the failure and carry on
                # with the rating left as None.
                print(f"(rating fallback failed: {str(e)[:50]})", end=' ')

        # extract other statistics (only the first 500 lines are inspected)
        max_lines = min(len(lines), 500)

        for i in range(max_lines):
            line_lower = lines[i].lower()

            # goals scored
            if results['goals_scored'] is None and 'goals scored' in line_lower:
                results['goals_scored'] = _first_int_near(lines, i, max_lines)

            # goals conceded
            if results['goals_conceded'] is None and 'goals conceded' in line_lower:
                results['goals_conceded'] = _first_int_near(lines, i, max_lines)

            # assists
            if results['assists'] is None and (
                    line_lower == 'assists' or 'total assists' in line_lower):
                results['assists'] = _first_int_near(lines, i, max_lines)

            # early exit once every statistic this loop can fill is found
            # (the rating was already resolved above)
            if None not in (results['goals_scored'],
                            results['goals_conceded'],
                            results['assists']):
                break

        print(f"Rating: {results['team_rating']}, Goals: {results['goals_scored']}, Conceded: {results['goals_conceded']}, Assists: {results['assists']}")
        return results

    except Exception as e:
        # Any other failure yields whatever was collected so far.
        print(f"Error: {str(e)[:50]}")
        return results

In [4]:
def get_teams_statistics(teams):
    """
    Scrapes statistics for multiple teams from SofaScore.

    One headless Chrome session is started, reused for every team, and
    always closed at the end. Teams are processed in the dictionary's
    iteration order, with a one-second pause between consecutive teams.

    Args:
        teams: Dictionary of team names to team IDs

    Returns:
        all_teams_data: List of dictionaries containing team statistics,
            one per team, with keys 'team_name', 'team_id', 'rating',
            'goals_scored', 'goals_conceded' and 'assists'. Statistics that
            could not be scraped are None. An empty `teams` mapping returns
            an empty list without starting a browser.
    """

    # Nothing to scrape: avoid the cost of launching a browser.
    if not teams:
        return []

    # Setup Chrome options
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # NOTE(review): '--disable-images' may not be a switch Chrome recognizes;
    # confirm against the Chromium switch list before relying on it.
    chrome_options.add_argument('--disable-images')
    chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36')

    driver = webdriver.Chrome(options=chrome_options)
    all_teams_data = []
    total = len(teams)

    try:
        # Scrape each team
        for i, (team_name, team_id) in enumerate(teams.items(), 1):
            print(f"[{i}/{total}] ", end='')

            # Start from an "all missing" row so a failure still yields an
            # entry for the team.
            row = {
                'team_name': team_name,
                'team_id': team_id,
                'rating': None,
                'goals_scored': None,
                'goals_conceded': None,
                'assists': None
            }

            try:
                team_stats = scrape_team_statistics(driver, team_id, team_name)
                row.update(
                    rating=team_stats['team_rating'],
                    goals_scored=team_stats['goals_scored'],
                    goals_conceded=team_stats['goals_conceded'],
                    assists=team_stats['assists'],
                )
            except Exception as e:
                print(f"✗ Failed: {str(e)[:50]}")

            all_teams_data.append(row)

            # Pause between requests; no need to wait after the last team.
            if i < total:
                time.sleep(1)

    finally:
        # Always release the browser, even if scraping was interrupted.
        driver.quit()

    return all_teams_data

In [5]:
def create_teams_df(all_teams_data):
    """
    Builds a DataFrame from the scraped team statistics.

    Args:
        all_teams_data: List of team statistics dictionaries

    Returns:
        teams_df: DataFrame holding one row per team dictionary
    """
    # Each dictionary becomes a Series, and each Series becomes one row.
    rows = [pd.Series(team_stats) for team_stats in all_teams_data]
    return pd.DataFrame(rows)

In [6]:
# Teams to scrape, mapping each team name to its SofaScore team ID.
# The name is also lower-cased and hyphenated to build the team page URL,
# and the dictionary order is the order in which teams are scraped.
teams = {
    'Manchester United': 35,
    'Arsenal': 42,
    'Liverpool': 44,
    'Manchester City': 17,
    'Chelsea': 38
}

In [8]:

# Scrape every team in `teams`; progress is printed as each team is processed.
all_teams_data = get_teams_statistics(teams)

[1/5] Loading Manchester United... ✓ Rating: 6.81, Goals: 9, Conceded: 15, Assists: 14
[2/5] Loading Arsenal... ✓ Rating: 6.99, Goals: 8, Conceded: 15, Assists: 3
[3/5] Loading Liverpool... ✓ Rating: 6.84, Goals: 8, Conceded: 14, Assists: 11
[4/5] Loading Manchester City... ✓ Rating: 7.01, Goals: 8, Conceded: 17, Assists: 6
[5/5] Loading Chelsea... ✓ Rating: 6.87, Goals: 9, Conceded: 17, Assists: 11


In [9]:
# Convert the scraped dictionaries to a DataFrame and display it as the cell output.
teams_df = create_teams_df(all_teams_data)
teams_df

Unnamed: 0,team_name,team_id,rating,goals_scored,goals_conceded,assists
0,Manchester United,35,6.81,9,15,14
1,Arsenal,42,6.99,8,15,3
2,Liverpool,44,6.84,8,14,11
3,Manchester City,17,7.01,8,17,6
4,Chelsea,38,6.87,9,17,11
