# Gathering SofaScore Team Statistics

## Data Collection

Scraping team statistics from SofaScore including:
- Team ratings
- Goals scored
- Goals conceded
- Assists

### Sources
- https://www.sofascore.com/
- https://www.selenium.dev/documentation/webdriver/
- https://selenium-python.readthedocs.io/locating-elements.html

*Note: SofaScore pages are JavaScript-heavy, so Selenium is required for scraping.*

In [13]:
# Selenium: Python library that automates a web browser; used here for web scraping
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import time
import re

In [14]:
def scrape_team_statistics(driver, team_id, team_name, timeout=25):
    """
    Scrapes one team's statistics tab on SofaScore within a time budget.

    The page is loaded, scrolled, and its visible text is split into lines.
    The rating is looked up near a "sofascore rating" label (with a JavaScript
    text-node scan as a fallback), and the remaining statistics are read from
    the lines that follow their exact labels. Progress is printed as it goes.
    The time budget is only checked between steps, so a single slow step can
    still overrun it. Whatever was found by the time the budget is exhausted
    or an error occurs is returned; fields that were not found stay None.

    Args:
        driver: Selenium WebDriver instance
        team_id: SofaScore team ID
        team_name: Team name (lower-cased and hyphenated to build the URL)
        timeout: Maximum seconds to spend on this team

    Returns:
        dict: Keys 'team_name', 'team_rating', 'goals_scored',
            'goals_conceded' and 'assists'. Scraped values are the strings
            read from the page, or None when not found.
    """

    # Exact (lower-cased, stripped) page labels -> result key they populate
    stat_labels = {
        'goals scored': 'goals_scored',
        'goals conceded': 'goals_conceded',
        'assists': 'assists',
    }
    required_keys = ('team_rating', 'goals_scored', 'goals_conceded', 'assists')
    # One digit, a dot, one or two decimals: this can only match 0.0 - 9.99,
    # so no additional range check is needed on a match.
    rating_pattern = re.compile(r'^\d\.\d{1,2}$')

    results = {
        'team_name': team_name,
        'team_rating': None,
        'goals_scored': None,
        'goals_conceded': None,
        'assists': None
    }

    start_time = time.time()

    def out_of_time():
        # True once the per-team budget has been exceeded
        return time.time() - start_time > timeout

    try:
        # Set page load timeout
        driver.set_page_load_timeout(10)

        # Load statistics tab
        team_url = f"https://www.sofascore.com/team/football/{team_name.lower().replace(' ', '-')}/{team_id}#tab:statistics"
        print(f"Loading {team_name}...", end=' ', flush=True)

        try:
            driver.get(team_url)
        except Exception:
            # `except Exception` (not a bare except) so that a kernel
            # interrupt still stops the scrape instead of being swallowed.
            print("✗ Page timeout")
            return results

        # Check if we're over time
        if out_of_time():
            print("✗ Overall timeout")
            return results

        time.sleep(3)

        # Quick scroll
        for scroll_pos in (400, 800):
            if out_of_time():
                break
            driver.execute_script(f"window.scrollTo(0, {scroll_pos});")
            time.sleep(0.3)

        # Get page text
        try:
            body_text = driver.find_element(By.TAG_NAME, 'body').text
            lines = body_text.split('\n')
        except Exception:
            print("✗ Could not get text")
            return results

        # Extract rating: scan the first 150 lines for the label, then look
        # at the lines from 3 before to 5 after it for a rating-shaped value
        for i, line in enumerate(lines[:150]):
            if out_of_time():
                break

            if 'sofascore rating' not in line.lower():
                continue

            for candidate in lines[max(0, i - 3):i + 6]:
                candidate = candidate.strip()
                if rating_pattern.match(candidate):
                    results['team_rating'] = candidate
                    break
            if results['team_rating']:
                break

        # If rating not found, try JavaScript (with timeout)
        if not results['team_rating'] and time.time() - start_time < timeout:
            rating_js = r"""
            function findRating() {
                var walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT, null, false);
                var ratings = [];
                var node;
                var count = 0;
                
                while((node = walker.nextNode()) && count < 5000) {
                    count++;
                    var text = node.nodeValue.trim();
                    if (/^\d\.\d{1,2}$/.test(text)) {
                        var val = parseFloat(text);
                        if (val >= 0 && val <= 10) {
                            ratings.push(text);
                            if (ratings.length >= 3) break;
                        }
                    }
                }
                return ratings;
            }
            return findRating();
            """

            try:
                driver.set_script_timeout(5)
                ratings = driver.execute_script(rating_js)
                if ratings:
                    # First rating-shaped text node on the page
                    results['team_rating'] = ratings[0]
            except Exception:
                # Deliberately best-effort: the fallback failing just leaves
                # the rating as None.
                pass

        # Extract other statistics - EXACT label matches only, within the
        # first 300 lines; the value is the first all-digit line among the
        # (up to) three lines following the label
        max_lines = min(len(lines), 300)

        for i in range(max_lines):
            if out_of_time():
                break

            result_key = stat_labels.get(lines[i].lower().strip())
            if result_key is not None:
                for candidate in lines[i + 1:min(max_lines, i + 4)]:
                    candidate = candidate.strip()
                    if candidate.isdigit():
                        results[result_key] = candidate
                        break

            # Early exit if everything found
            if all(results[key] for key in required_keys):
                break

        elapsed = time.time() - start_time
        print(f"✓ Rating: {results['team_rating']}, Goals: {results['goals_scored']}, Conceded: {results['goals_conceded']}, Assists: {results['assists']} ({elapsed:.1f}s)")
        return results

    except Exception as e:
        print(f"✗ Error: {str(e)[:50]}")
        return results

In [15]:
def get_teams_statistics(teams):
    """
    Scrapes SofaScore statistics for every team in a single browser session.

    A headless Chrome driver is started once, each team is scraped in turn
    (with a one-second pause between teams), and the driver is always shut
    down at the end. A team whose scrape raises still gets a row, with all
    statistics set to None.

    Args:
        teams: Dictionary of team names to team IDs

    Returns:
        all_teams_data: List of dictionaries containing team statistics
    """

    # Command-line switches passed to Chrome
    browser_arguments = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-images',
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    )
    chrome_options = Options()
    for argument in browser_arguments:
        chrome_options.add_argument(argument)

    driver = webdriver.Chrome(options=chrome_options)
    all_teams_data = []

    try:
        for position, (team_name, team_id) in enumerate(teams.items(), 1):
            # Progress prefix; the scraper prints the rest of the line
            print(f"[{position}/{len(teams)}] ", end='')

            record = None
            try:
                team_stats = scrape_team_statistics(driver, team_id, team_name)

                if team_stats:
                    record = {
                        'team_name': team_name,
                        'team_id': team_id,
                        'rating': team_stats['team_rating'],
                        'goals_scored': team_stats['goals_scored'],
                        'goals_conceded': team_stats['goals_conceded'],
                        'assists': team_stats['assists']
                    }
            except Exception as error:
                print(f"Failed: {str(error)[:50]}")
                # Keep a placeholder row so the failed team still appears
                record = {
                    'team_name': team_name,
                    'team_id': team_id,
                    'rating': None,
                    'goals_scored': None,
                    'goals_conceded': None,
                    'assists': None
                }

            if record is not None:
                all_teams_data.append(record)

            # Pause between teams
            time.sleep(1)

    finally:
        # Always release the browser, even if the loop was interrupted
        driver.quit()

    return all_teams_data

In [16]:
def create_teams_df(all_teams_data):
    """
    Builds a DataFrame from the scraped team statistics.

    Each dictionary is wrapped in a pandas Series and the Series are stacked
    as the rows of the result.

    Args:
        all_teams_data: List of team statistics dictionaries

    Returns:
        teams_df: DataFrame with a row for each team
    """
    rows = [pd.Series(team_record) for team_record in all_teams_data]
    return pd.DataFrame(rows)

In [17]:
# Teams to scrape (team_name: team_id)
# Both parts are used to build each team's SofaScore URL: the name is
# lower-cased and hyphenated into the URL slug, and the ID follows it.
teams = {
    'Manchester United': 35,
    'Arsenal': 42,
    'Liverpool': 44,
    'Manchester City': 17,
    'Chelsea': 38
}

In [18]:
# Scrape every team in `teams` with one headless Chrome session;
# a progress line is printed for each team as it is processed
all_teams_data = get_teams_statistics(teams)

[1/5] Loading Manchester United... ✓ Rating: 6.81, Goals: 15, Conceded: 14, Assists: 10 (6.4s)
[2/5] Loading Arsenal... ✓ Rating: 7.00, Goals: 16, Conceded: 3, Assists: 10 (5.6s)
[3/5] Loading Liverpool... ✓ Rating: 6.83, Goals: 16, Conceded: 14, Assists: 11 (5.0s)
[4/5] Loading Manchester City... ✓ Rating: 6.98, Goals: 17, Conceded: 7, Assists: 13 (6.4s)
[5/5] Loading Chelsea... ✓ Rating: 6.87, Goals: 17, Conceded: 11, Assists: 12 (5.5s)


In [19]:
# Build a DataFrame with one row per scraped team, then display it
# (the bare expression on the last line is the cell's output)
teams_df = create_teams_df(all_teams_data)
teams_df

Unnamed: 0,team_name,team_id,rating,goals_scored,goals_conceded,assists
0,Manchester United,35,6.81,15,14,10
1,Arsenal,42,7.0,16,3,10
2,Liverpool,44,6.83,16,14,11
3,Manchester City,17,6.98,17,7,13
4,Chelsea,38,6.87,17,11,12


In [47]:
import plotly.express as px
import pandas as pd

# Coerce the two plotted columns to numbers; unparseable values become NaN
for numeric_column in ('goals_scored', 'rating'):
    teams_df[numeric_column] = pd.to_numeric(teams_df[numeric_column], errors='coerce')

# Scatter plot of each team's rating against its goals scored,
# one colour per team
fig = px.scatter(
    teams_df,
    x='rating',
    y='goals_scored',
    color='team_name',
    hover_name='team_name',
    title='Team Rating vs Goals Scored',
)

# Start both axes at 0 and extend them 5 units past the largest value
highest_rating = teams_df['rating'].max()
highest_goals = teams_df['goals_scored'].max()
rating_axis = {'title': 'Team Rating', 'range': [0, highest_rating + 5]}
goals_axis = {'title': 'Goals Scored', 'range': [0, highest_goals + 5]}

fig.update_layout(xaxis=rating_axis, yaxis=goals_axis, title_font_size=22)

fig.show()
