In [113]:
import json
import time

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from pydantic import BaseModel
from typing import List, Optional

from selenium import webdriver

from supabase import create_client, Client

import os

from dotenv import load_dotenv

In [114]:
# Load environment variables for Supabase credentials
load_dotenv()
supabase_password = os.getenv('supabase_password')
project_url = os.getenv('project_url')
project_api = os.getenv('project_api')

In [115]:
import psycopg2
# Connect to the PostgreSQL database using psycopg2
conn = psycopg2.connect(
    user ="postgres.srnpjbsmliwxrjfuisfr",
    password=supabase_password,
    host="aws-0-us-west-1.pooler.supabase.com",
    port=6543,
    database="postgres"
)

In [116]:
# Define a Pydantic model for match events
class MatchEvent(BaseModel):
    id: int
    event_id: int
    minute: int
    second: Optional[float] = None
    team_id: int
    player_id: int
    x: float
    y: float
    end_x: Optional[float] = None
    end_y: Optional[float] = None
    qualifiers: List[dict]
    is_touch: bool
    blocked_x: Optional[float] = None
    blocked_y: Optional[float] = None
    goal_mouth_z: Optional[float] = None
    goal_mouth_y: Optional[float] = None
    is_shot: bool
    card_type: bool
    is_goal: bool
    type_display_name: str
    outcome_type_display_name: str
    period_display_name: str

In [117]:
# Function to insert match events into Supabase
def insert_match_events(df, supabase):
    events = [
        MatchEvent(**x).dict()
        for x in df.to_dict(orient="records")
    ]
    # Upsert the events into the Supabase table
    execution = supabase.table('match_event').upsert(events).execute()

In [118]:
# Define a Pydantic model for players
class Player(BaseModel):
    player_id: int
    shirt_no: int
    name: str
    age: int
    position: str
    team_id: int

In [119]:
# Function to insert player information into Supabase
def insert_players(team_info, supabase):
    players = []
    
    #Loop over each team and create new dictionary for each player format of the table
    for team in team_info:
        for player in team['players']:
            players.append({
                'player_id': player['playerId'],
                'team_id': team['team_id'],
                'shirt_no': player['shirtNo'],
                'name': player['name'],
                'position': player['position'],
                'age': player['age']
            })
     # Upsert the players into the Supabase table           
    execution = supabase.table('player').upsert(players).execute()

In [120]:
# Create a Supabase client
supabase = create_client(project_url, project_api)

In [121]:
# Initialize the Selenium WebDriver
driver = webdriver.Chrome()

In [122]:
# Function to scrape match events from a given URL
def scrape_match_events(whoscored_url, driver):
    
    # Navigate to the specified Whoscored URL using the provided web driver
    driver.get(whoscored_url)
    
    # Parse the page source using BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    # Find the script tag containing 'matchCentreData' which holds the match data
    element = soup.select_one('script:-soup-contains("matchCentreData")')
    
    # Extract the JSON data from the script tag and load it into a Python dictionary
    try:
        matchdict = json.loads(element.text.split("matchCentreData: ")[1].split(',\n')[0])
    except(AttributeError, IndexError):
        print('No data is found, scraping stopped')
        return  # Exit the function if no data is found

    # Get the list of match events from the dictionary
    match_events = matchdict['events']
    
    # Convert the list of events into a pandas DataFrame
    df = pd.DataFrame(match_events)
    
    # Drop rows where 'playerId' is missing to clean the data
    df.dropna(subset='playerId', inplace=True)
    
    # Replace any remaining NaN values with None
    df = df.where(pd.notnull(df), None)
    
    # Rename the columns to make them more descriptive and consistent
    df = df.rename(
        {
            'eventId': 'event_id',
            'expandedMinute': 'expanded_minute',
            'outcomeType': 'outcome_type',
            'isTouch': 'is_touch',
            'playerId': 'player_id',
            'teamId': 'team_id',
            'endX': 'end_x',
            'endY': 'end_y',
            'blockedX': 'blocked_x',
            'blockedY': 'blocked_y',
            'goalMouthZ': 'goal_mouth_z',
            'goalMouthY': 'goal_mouth_y',
            'isShot': 'is_shot',
            'cardType': 'card_type',
            'isGoal': 'is_goal'
        },
        axis=1
    )
    
    # Extract display names for 'period', 'type', and 'outcome_type' fields
    df['period_display_name'] = df['period'].apply(lambda x: x['displayName'])
    df['type_display_name'] = df['type'].apply(lambda x: x['displayName'])
    df['outcome_type_display_name'] = df['outcome_type'].apply(lambda x: x['displayName'])
    
    # Drop the original complex columns now that their display names are extracted
    df.drop(columns=["period", "type", "outcome_type"], inplace=True)
    
    # Ensure 'is_goal' and 'is_card' columns exist and default them to False if missing
    if 'is_goal' not in df.columns:
        df['is_goal'] = False
        
    if 'is_card' not in df.columns:
        df['is_card'] = False
        df['card_type'] = False
        
    # Remove rows where the event type is "OffsideGiven"
    df = df[~(df['type_display_name'] == "OffsideGiven")]
    
    # Reorder the DataFrame columns for consistency
    df = df[[
        'id', 'event_id', 'minute', 'second', 'team_id', 'player_id', 'x', 'y', 'end_x', 'end_y',
        'qualifiers', 'is_touch', 'blocked_x', 'blocked_y', 'goal_mouth_z', 'goal_mouth_y', 'is_shot',
        'card_type', 'is_goal', 'type_display_name', 'outcome_type_display_name',
        'period_display_name'
    ]]
    
    # Convert specific columns to appropriate data types
    df[['id', 'event_id', 'minute', 'team_id', 'player_id']] = df[['id', 'event_id', 'minute', 'team_id', 'player_id']].astype(np.int64)
    df[['second', 'x', 'y', 'end_x', 'end_y']] = df[['second', 'x', 'y', 'end_x', 'end_y']].astype(float)
    df[['is_shot', 'is_goal', 'card_type']] = df[['is_shot', 'is_goal', 'card_type']].astype(bool)
    
    # Ensure 'is_goal' and 'is_shot' are filled with False where data is missing
    df['is_goal'] = df['is_goal'].fillna(False)
    df['is_shot'] = df['is_shot'].fillna(False)
    
    # Replace NaNs in float columns with None to maintain consistency
    for column in df.columns:
        if df[column].dtype == np.float64 or df[column].dtype == np.float32:
            df[column] = np.where(
                np.isnan(df[column]),
                None,
                df[column]
            )
    
    # Insert the cleaned and processed DataFrame into the match events table in Supabase
    insert_match_events(df, supabase)
    
    # Extract and prepare team information from the match dictionary
    team_info = []
    team_info.append({
        'team_id': matchdict['home']['teamId'],
        'name': matchdict['home']['name'],
        'country_name': matchdict['home']['countryName'],
        'manager_name': matchdict['home']['managerName'],
        'players': matchdict['home']['players'],
    })

    team_info.append({
        'team_id': matchdict['away']['teamId'],
        'name': matchdict['away']['name'],
        'country_name': matchdict['away']['countryName'],
        'manager_name': matchdict['away']['managerName'],
        'players': matchdict['away']['players'],
    })
    
    # Insert the team information into the players table in Supabase
    insert_players(team_info, supabase)
    
    # Return success message
    return print('Success')



In [123]:
# Get the match URLs from the specified Whoscored page
driver.get('https://www.whoscored.com/Teams/37/Show/Germany-Bayern-Munich')
time.sleep(3)

In [124]:
# Parse the page source to find all match URLs
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [125]:
# Filter href to get the unique match URLs
all_urls = soup.select('a[href*="\/Live\/"]')

In [126]:
#Filter href to get the urls
all_urls = list(set([
    'https://www.whoscored.com' + x.attrs['href']
    for x in all_urls
]))

In [127]:
all_urls

['https://www.whoscored.com/Matches/1834261/Live/Germany-Bundesliga-2024-2025-Wolfsburg-Bayern-Munich',
 'https://www.whoscored.com/Matches/1834276/Live/Germany-Bundesliga-2024-2025-Bayern-Munich-Freiburg',
 'https://www.whoscored.com/Matches/1816538/Live/Germany-DFB-Pokal-2024-2025-Ulm-Bayern-Munich',
 'https://www.whoscored.com/Matches/1834286/Live/Germany-Bundesliga-2024-2025-Holstein-Kiel-Bayern-Munich']

In [128]:
# Print all match URLs
for url in all_urls:
    print(url)
    try:
        scrape_match_events(
            whoscored_url=url,
            driver=driver
        )
    except Exception as e:
        print(f'Error scraping {url}: {e}')  # Print the error message for debugging
    time.sleep(2)

https://www.whoscored.com/Matches/1834261/Live/Germany-Bundesliga-2024-2025-Wolfsburg-Bayern-Munich


C:\Users\Duy Tran\AppData\Local\Temp\ipykernel_9836\2641234873.py:4: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  MatchEvent(**x).dict()


Success
https://www.whoscored.com/Matches/1834276/Live/Germany-Bundesliga-2024-2025-Bayern-Munich-Freiburg


C:\Users\Duy Tran\AppData\Local\Temp\ipykernel_9836\2641234873.py:4: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  MatchEvent(**x).dict()


Success
https://www.whoscored.com/Matches/1816538/Live/Germany-DFB-Pokal-2024-2025-Ulm-Bayern-Munich
No data is found, scraping stopped
https://www.whoscored.com/Matches/1834286/Live/Germany-Bundesliga-2024-2025-Holstein-Kiel-Bayern-Munich


C:\Users\Duy Tran\AppData\Local\Temp\ipykernel_9836\2641234873.py:4: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  MatchEvent(**x).dict()


Success
