In [89]:
import json
import time

import numpy as np
import pandas as pd
# Using Pandas Version 1.5.2

from bs4 import BeautifulSoup

from pydantic import BaseModel
from typing import List, Optional

from selenium import webdriver
# Ensure webdriver is downloaded for current chrome version

from supabase import create_client, Client

In [90]:
# Assign BeautifulSoup class to alias "BS"
from bs4 import BeautifulSoup as BS

import bs4
import soupsieve

In [91]:
# Initiate new instance of chrome webdriver using selenium
driver = webdriver.Chrome()

In [92]:
# Match Data: Inter 1-0 Juventus (Feb. 4th, 2024) 
whoscored_url = "https://www.whoscored.com/Matches/1746287/Live/Italy-Serie-A-2023-2024-Inter-Juventus"

In [93]:
# Opens specified URL
driver.get(whoscored_url)

In [94]:
# Retrives HTML source code and parses it
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [95]:
# Select element with the matchCentreData
element = soup.select_one('script:-soup-contains("matchCentreData")')

In [96]:
# Extracts & loads JSON data, loads it into dictionary
matchdict = json.loads(element.text.split("matchCentreData: ")[1].split(',\n')[0])

In [152]:
# Keys in dictionary
matchdict.keys()

dict_keys(['playerIdNameDictionary', 'periodMinuteLimits', 'timeStamp', 'attendance', 'venueName', 'referee', 'weatherCode', 'elapsed', 'startTime', 'startDate', 'score', 'htScore', 'ftScore', 'etScore', 'pkScore', 'statusCode', 'periodCode', 'home', 'away', 'maxMinute', 'minuteExpanded', 'maxPeriod', 'expandedMinutes', 'expandedMaxMinute', 'periodEndMinutes', 'commonEvents', 'events', 'timeoutInSeconds'])

In [153]:
matchdict['events'][55]

{'id': 2642983473.0,
 'eventId': 30,
 'minute': 2,
 'second': 36,
 'teamId': 87,
 'playerId': 105995,
 'x': 2.6,
 'y': 87.5,
 'expandedMinute': 2,
 'period': {'value': 1, 'displayName': 'FirstHalf'},
 'type': {'value': 56, 'displayName': 'ShieldBallOpp'},
 'outcomeType': {'value': 1, 'displayName': 'Successful'},
 'qualifiers': [{'type': {'value': 56, 'displayName': 'Zone'},
   'value': 'Back'}],
 'satisfiedEventsTypes': [],
 'isTouch': False}

In [154]:
# Assign list to variable
match_events = matchdict['events']

In [155]:
# Create DataFrame using 'events' data
df = pd.DataFrame(match_events)

In [156]:
# Return first 5 rows
df.head()

Unnamed: 0,id,eventId,minute,second,teamId,x,y,expandedMinute,period,type,...,relatedEventId,relatedPlayerId,blockedX,blockedY,goalMouthZ,goalMouthY,isShot,cardType,isOwnGoal,isGoal
0,2642982000.0,2,0,0,87,0.0,0.0,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 32, 'displayName': 'Start'}",...,,,,,,,,,,
1,2642982000.0,2,0,0,75,0.0,0.0,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 32, 'displayName': 'Start'}",...,,,,,,,,,,
2,2642982000.0,3,0,0,87,50.1,50.1,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 1, 'displayName': 'Pass'}",...,,,,,,,,,,
3,2642982000.0,4,0,2,87,37.2,44.6,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 1, 'displayName': 'Pass'}",...,,,,,,,,,,
4,2642982000.0,4,0,12,75,33.1,0.0,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 1, 'displayName': 'Pass'}",...,,,,,,,,,,


In [157]:
# Remove rows where 'playerId' has missing values (NaN)
df.dropna(subset='playerId', inplace=True)

nan, None, NaN

In [158]:
# Replace missing vlaues with 'None'
df = df.where(pd.notnull(df), None)

event_id, team_id, outcome_type

In [159]:
# Rename columns
df = df.rename(
    {
        'eventId': 'event_id',
        'expandedMinute': 'expanded_minute',
        'outcomeType': 'outcome_type',
        'isTouch': 'is_touch',
        'playerId': 'player_id',
        'teamId': 'team_id',
        'endX': 'end_x',
        'endY': 'end_y',
        'blockedX': 'blocked_x',
        'blockedY': 'blocked_y',
        'goalMouthZ': 'goal_mouth_z',
        'goalMouthY': 'goal_mouth_y',
        'isShot': 'is_shot',
        'cardType': 'card_type',
        'isGoal': 'is_goal'
    },
    axis=1
)

In [105]:
# Creates new column with 'displayName' values
df['period_display_name'] = df['period'].apply(lambda x: x['displayName'])

# Creates a new column with 'displayName' values
df['type_display_name'] = df['type'].apply(lambda x: x['displayName'])

# Creates a new column with 'displayName' values
df['outcome_type_display_name'] = df['outcome_type'].apply(lambda x: x['displayName'])

In [106]:
# Drops specified columns
df.drop(columns=["period", "type", "outcome_type"], inplace=True)

In [107]:
# Check if 'is_goal' column is missing (for 0-0 matches)
if 'is_goal' not in df.columns:
    print('missing goals')
    df['is_goal'] = False

In [108]:
# Returns the number of rows
len(df.index)

1634

In [109]:
# Removes rows where 'type_display_name' is "OffsideGiven"
# Avoids inconsistencies with URL database
df = df[~(df['type_display_name'] == "OffsideGiven")]

In [110]:
# display columns
df.columns

Index(['id', 'event_id', 'minute', 'second', 'team_id', 'x', 'y',
       'expanded_minute', 'qualifiers', 'satisfiedEventsTypes', 'is_touch',
       'player_id', 'end_x', 'end_y', 'relatedEventId', 'relatedPlayerId',
       'blocked_x', 'blocked_y', 'goal_mouth_z', 'goal_mouth_y', 'is_shot',
       'card_type', 'isOwnGoal', 'is_goal', 'period_display_name',
       'type_display_name', 'outcome_type_display_name'],
      dtype='object')

In [111]:
# Keep these columns
df = df[[
    'id', 'event_id', 'minute', 'second', 'team_id', 'player_id', 'x', 'y', 'end_x', 'end_y',
    'qualifiers', 'is_touch', 'blocked_x', 'blocked_y', 'goal_mouth_z', 'goal_mouth_y', 'is_shot',
    'card_type', 'is_goal', 'type_display_name', 'outcome_type_display_name',
    'period_display_name'
]]

In [112]:
# Display data types
df.dtypes

id                           float64
event_id                       int64
minute                         int64
second                         int64
team_id                        int64
player_id                    float64
x                            float64
y                            float64
end_x                        float64
end_y                        float64
qualifiers                    object
is_touch                        bool
blocked_x                    float64
blocked_y                    float64
goal_mouth_z                 float64
goal_mouth_y                 float64
is_shot                       object
card_type                     object
is_goal                       object
type_display_name             object
outcome_type_display_name     object
period_display_name           object
dtype: object

In [113]:
# Convert to appropriate data types
df[['id', 'event_id', 'minute', 'team_id', 'player_id']] = df[['id', 'event_id', 'minute', 'team_id', 'player_id']].astype(np.int64)
df[['second', 'x', 'y', 'end_x', 'end_y']] = df[['second', 'x', 'y', 'end_x', 'end_y']].astype(float)
df[['is_shot', 'is_goal', 'card_type']] = df[['is_shot', 'is_goal', 'card_type']].astype(bool)

In [114]:
# Insert NaN values in columns with False
df['is_goal'] = df['is_goal'].fillna(False)
df['is_shot'] = df['is_shot'].fillna(False)

In [115]:
# Replace NaN values in columns with float64 or float32 data types with None 
for column in df.columns:
    if df[column].dtype == np.float64 or df[column].dtype == np.float32:
        df[column] = np.where(
            np.isnan(df[column]),
            None,
            df[column]
        )

In [116]:
# Convert first row to a dictionary
df.iloc[0].to_dict()

{'id': 2642982317,
 'event_id': 3,
 'minute': 0,
 'second': 0.0,
 'team_id': 87,
 'player_id': 315369,
 'x': 50.1,
 'y': 50.1,
 'end_x': 36.7,
 'end_y': 44.4,
 'qualifiers': [{'type': {'value': 141, 'displayName': 'PassEndY'},
   'value': '44.4'},
  {'type': {'value': 213, 'displayName': 'Angle'}, 'value': '3.41'},
  {'type': {'value': 212, 'displayName': 'Length'}, 'value': '14.6'},
  {'type': {'value': 140, 'displayName': 'PassEndX'}, 'value': '36.7'},
  {'type': {'value': 178, 'displayName': 'StandingSave'}},
  {'type': {'value': 56, 'displayName': 'Zone'}, 'value': 'Back'}],
 'is_touch': True,
 'blocked_x': None,
 'blocked_y': None,
 'goal_mouth_z': None,
 'goal_mouth_y': None,
 'is_shot': False,
 'card_type': False,
 'is_goal': False,
 'type_display_name': 'Pass',
 'outcome_type_display_name': 'Successful',
 'period_display_name': 'FirstHalf'}

In [117]:
# Define Pydantic model
class MatchEvent(BaseModel):
    
    # Unique identifier for match event
    id: int
    
    # Identifier for specific event
    event_id: int
    
    # The minute when event occurred
    minute: int
        
    # The second within the minute, if applicable
    second: Optional[float] = None
    
    # Identifier for the team associated with the event
    team_id: int
        
    # Identifier for the player associated with the event
    player_id: int
        
    # X-coordinate of  event location on field
    x: float
        
    # Y-coordinate of  event location on field
    y: float
        
    # X-coordinate of the end location, if applicable
    end_x: Optional[float] = None
    
    # Y-coordinate of the end location, if applicable
    end_y: Optional[float] = None
        
    # List of qualifiers providing additional details
    qualifiers: List[dict]
        
    # Indicates whether the event involves a player's touch
    is_touch: bool
        
    # X-coordinate of a blocked event, if applicable
    blocked_x: Optional[float] = None
    
    # Y-coordinate of a blocked event, if applicable
    blocked_y: Optional[float] = None
        
    # Note: "goal_mouth" refers to area in front of goal
    # Z-coordinate of the goal mouth, if applicable
    goal_mouth_z: Optional[float] = None
    
    # Y-coordinate of the goal mouth, if applicable
    goal_mouth_y: Optional[float] = None
        
    # Indicates whether  event is a shot on goal
    is_shot: bool
        
    # Indicates whether  event involves a card being issued
    card_type: bool
        
    # Indicates whether the event resulted in a goal
    is_goal: bool
    
    # Display name indicating type of event
    type_display_name: str
        
    # Display name indicating  outcome type of event
    outcome_type_display_name: str
        
    # Display name indicating period when event occurred
    period_display_name: str

In [118]:
# Create MatchEvent objects
for x in df.to_dict(orient="records"):
    try:
        # Create object and convert it to dictionary
        MatchEvent(**x).model_dump()
    except Exception as e:
        # Print exception
        print(e)
        break

In [119]:
x

{'id': 2643035309,
 'event_id': 851,
 'minute': 94,
 'second': 35.0,
 'team_id': 87,
 'player_id': 425080,
 'x': 87.4,
 'y': 14.4,
 'end_x': 84.2,
 'end_y': 100.0,
 'qualifiers': [{'type': {'value': 178, 'displayName': 'StandingSave'}},
  {'type': {'value': 212, 'displayName': 'Length'}, 'value': '59.7'},
  {'type': {'value': 2, 'displayName': 'Cross'}},
  {'type': {'value': 155, 'displayName': 'Chipped'}},
  {'type': {'value': 140, 'displayName': 'PassEndX'}, 'value': '84.2'},
  {'type': {'value': 56, 'displayName': 'Zone'}, 'value': 'Left'},
  {'type': {'value': 20, 'displayName': 'RightFoot'}},
  {'type': {'value': 141, 'displayName': 'PassEndY'}, 'value': '100.0'},
  {'type': {'value': 213, 'displayName': 'Angle'}, 'value': '1.63'},
  {'type': {'value': 1, 'displayName': 'Longball'}}],
 'is_touch': True,
 'blocked_x': None,
 'blocked_y': None,
 'goal_mouth_z': None,
 'goal_mouth_y': None,
 'is_shot': False,
 'card_type': False,
 'is_goal': False,
 'type_display_name': 'Pass',
 'out

In [120]:
supabase_password = '7eZZSE#$e+U6B.s'

In [121]:
project_url = 'https://vrvkafgbdruqmzszmaln.supabase.co'
api_key = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InZydmthZmdiZHJ1cW16c3ptYWxuIiwicm9sZSI6ImFub24iLCJpYXQiOjE3MDcyMzQ1OTAsImV4cCI6MjAyMjgxMDU5MH0.RnA_99mq99SmSYLTuin-GFSOt5ax4mCThVGXYVFov9A'

In [122]:
# Grouping by "id" and counting occurences to show frequency
df.groupby('id').count().sort_values(by='event_id', ascending=False)

Unnamed: 0_level_0,event_id,minute,second,team_id,player_id,x,y,end_x,end_y,qualifiers,...,blocked_x,blocked_y,goal_mouth_z,goal_mouth_y,is_shot,card_type,is_goal,type_display_name,outcome_type_display_name,period_display_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2642982317,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,1,1,1,1,1,1
2643016461,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,1,1,1,1,1,1
2643016697,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,1,1,1,1,1,1
2643016669,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,1,1,1,1,1,1
2643016661,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2642996273,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,1,1,1,1,1,1
2642996243,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,1,1,1,1,1,1
2642996197,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,1,1,1,1,1,1
2642996155,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,1,1,1,1,1,1


In [123]:
# Insert match events into Supabase tables
def insert_match_events(df, supabase):
    # Create a list of MatchEvent dictionaries
    events = [
        MatchEvent(**x).model_dump()
        for x in df.to_dict(orient='records')
    ]
    # Use Supabase upsert to insert or update records in the 'match_events' table
    execution = supabase.table('match_events').upsert(events).execute()

In [124]:
# Create Supabase client using project URL and API key
supabase = create_client(project_url, api_key)

In [125]:
# Insert match events from DataFrame into  'match_events' table
insert_match_events(df, supabase)

2024-02-07 17:28:21,695:INFO - HTTP Request: POST https://vrvkafgbdruqmzszmaln.supabase.co/rest/v1/match_events "HTTP/1.1 200 OK"


In [126]:
# Create team information list for home team
team_info = []
team_info.append({
    'team_id': matchdict['home']['teamId'],
    'name': matchdict['home']['name'],
    'country_name': matchdict['home']['countryName'],
    'manager_name': matchdict['home']['managerName'],
    'players': matchdict['home']['players'],
})

# Create team information list for away team
team_info.append({
    'team_id': matchdict['away']['teamId'],
    'name': matchdict['away']['name'],
    'country_name': matchdict['away']['countryName'],
    'manager_name': matchdict['away']['managerName'],
    'players': matchdict['away']['players'],
})

In [127]:
matchdict['home']['players']

[{'playerId': 35758,
  'shirtNo': 1,
  'name': 'Yann Sommer',
  'position': 'GK',
  'height': 183,
  'weight': 79,
  'age': 35,
  'isFirstEleven': True,
  'isManOfTheMatch': False,
  'field': 'home',
  'stats': {'totalSaves': {'73': 1.0},
   'collected': {'73': 1.0},
   'possession': {'6': 2.0,
    '8': 1.0,
    '10': 1.0,
    '15': 1.0,
    '33': 1.0,
    '38': 2.0,
    '44': 2.0,
    '47': 3.0,
    '50': 2.0,
    '54': 1.0,
    '56': 1.0,
    '64': 1.0,
    '69': 1.0,
    '73': 1.0,
    '76': 3.0,
    '77': 1.0,
    '81': 1.0,
    '83': 2.0,
    '88': 2.0,
    '90': 1.0,
    '92': 1.0},
   'ratings': {'0': 6.0,
    '6': 6.0,
    '8': 6.0,
    '10': 6.01,
    '15': 6.01,
    '33': 6.01,
    '36': 6.06,
    '38': 6.07,
    '44': 6.06,
    '47': 6.07,
    '50': 6.05,
    '54': 6.06,
    '56': 6.06,
    '64': 6.07,
    '69': 6.07,
    '73': 6.18,
    '76': 6.18,
    '77': 6.18,
    '81': 6.19,
    '83': 6.21,
    '88': 6.21,
    '90': 6.22,
    '92': 6.24,
    '97': 6.57},
   'touches': 

In [128]:
# Pydantic model for play info
class Player(BaseModel):
    
    # Player Identifier
    player_id: int
        
    # Shirt number
    shirt_no: int
    
    # Player's name
    name: str
        
    # Player's age
    age: int
        
    # Player's position
    position: str
        
     # Which team they're on
    team_id: int

In [129]:
# Insert player information into 'players' table
def insert_players(team_info, supabase):
    players = []
    # Extract player details from team info
    for team in team_info:
        for player in team['players']:
            players.append({
                'player_id': player['playerId'],
                'team_id': team['team_id'],
                'shirt_no': player['shirtNo'],
                'name': player['name'],
                'position': player['position'],
                'age': player['age']
            })
    # Use Supabase upsert to insert or update records in 'players' table        
    execution = supabase.table('players').upsert(players).execute()

In [130]:
insert_players(team_info, supabase)

2024-02-07 17:28:21,950:INFO - HTTP Request: POST https://vrvkafgbdruqmzszmaln.supabase.co/rest/v1/players "HTTP/1.1 200 OK"


In [131]:
import psycopg2

conn = psycopg2.connect(
    user="postgres.vrvkafgbdruqmzszmaln",
    password=supabase_password,
    host="aws-0-ca-central-1.pooler.supabase.com",
    port=5432,
    database="postgres"

)

In [132]:
# Create cursor object to interact with database
cursor = conn.cursor()

In [133]:
# Execute SQL query to select all from 'players' table
cursor.execute("""
    SELECT * FROM players;
""")

In [134]:
# Fetch all records and put into 'records'
records = cursor.fetchall()

In [135]:
records

[(35758, 1, 'Yann Sommer', 35, 'GK', 75),
 (259648, 28, 'Benjamin Pavard', 27, 'DC', 75),
 (329665, 95, 'Alessandro Bastoni', 24, 'DC', 75),
 (54968, 15, 'Francesco Acerbi', 35, 'DC', 75),
 (23220, 36, 'Matteo Darmian', 34, 'DMR', 75),
 (255929, 32, 'Federico Dimarco', 26, 'DML', 75),
 (28421, 22, 'Henrikh Mkhitaryan', 35, 'MC', 75),
 (148684, 23, 'Nicolò Barella', 27, 'MC', 75),
 (110373, 20, 'Hakan Çalhanoglu', 29, 'MC', 75),
 (299344, 10, 'Lautaro Martínez', 26, 'FW', 75),
 (296322, 9, 'Marcus Thuram', 26, 'FW', 75),
 (34693, 8, 'Marko Arnautovic', 34, 'Sub', 75),
 (357897, 30, 'Carlos Augusto', 25, 'Sub', 75),
 (322153, 2, 'Denzel Dumfries', 27, 'Sub', 75),
 (108860, 14, 'Davy Klaassen', 30, 'Sub', 75),
 (82399, 6, 'Stefan de Vrij', 32, 'Sub', 75),
 (423450, 21, 'Kristjan Asllani', 21, 'Sub', 75),
 (371027, 17, 'Tajon Buchanan', 24, 'Sub', 75),
 (25244, 70, 'Alexis Sánchez', 35, 'Sub', 75),
 (349126, 31, 'Yann Bisseck', 23, 'Sub', 75),
 (331425, 16, 'Davide Frattesi', 24, 'Sub', 75

In [136]:
# Create DataFrame using fetched records and column names from cursor description
df = pd.DataFrame(records, columns=[desc[0] for desc in cursor.description])

In [137]:
df.head(5)

Unnamed: 0,player_id,shirt_no,name,age,position,team_id
0,35758,1,Yann Sommer,35,GK,75
1,259648,28,Benjamin Pavard,27,DC,75
2,329665,95,Alessandro Bastoni,24,DC,75
3,54968,15,Francesco Acerbi,35,DC,75
4,23220,36,Matteo Darmian,34,DMR,75


# Web Scraping a Team's Entire Season

In [187]:
# Pydantic model for match events
class MatchEvent(BaseModel):
    id: int
    event_id: int
    minute: int
    second: Optional[float] = None
    team_id: int
    player_id: int
    x: float
    y: float
    end_x: Optional[float] = None
    end_y: Optional[float] = None
    qualifiers: List[dict]
    is_touch: bool
    blocked_x: Optional[float] = None
    blocked_y: Optional[float] = None
    goal_mouth_z: Optional[float] = None
    goal_mouth_y: Optional[float] = None
    is_shot: bool
    card_type: bool
    is_goal: bool
    type_display_name: str
    outcome_type_display_name: str
    period_display_name: str

In [188]:
# Insert match events into Supabase tables
def insert_match_events(df, supabase):
    events = [
        MatchEvent(**x).dict()
        for x in df.to_dict(orient='records')
    ]
    execution = supabase.table('match_events').upsert(events).execute()

In [189]:
# Pydantic model for play info
class Player(BaseModel):
    player_id: int
    shirt_no: int
    name: str
    age: int
    position: str
    team_id: int

In [190]:
# Insert player information into 'players' table
def insert_players(team_info, supabase):
    players = []
    for team in team_info:
        for player in team['players']:
            players.append({
                'player_id': player['playerId'],
                'team_id': team['team_id'],
                'shirt_no': player['shirtNo'],
                'name': player['name'],
                'position': player['position'],
                'age': player['age']
            })
            
    execution = supabase.table('players').upsert(players).execute()

In [191]:
# Supabase Info
supabase_password = '7eZZSE#$e+U6B.s'
project_url = 'https://vrvkafgbdruqmzszmaln.supabase.co'
api_key = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InZydmthZmdiZHJ1cW16c3ptYWxuIiwicm9sZSI6ImFub24iLCJpYXQiOjE3MDcyMzQ1OTAsImV4cCI6MjAyMjgxMDU5MH0.RnA_99mq99SmSYLTuin-GFSOt5ax4mCThVGXYVFov9A'
supabase = create_client(project_url, api_key)

In [192]:
# # Initiate chrome webdriver using selenium
driver = webdriver.Chrome()

In [193]:
# Scrapes match events from given Whoscored URL
# Processes the scraped data into a DataFrame
# Maps and transforms DataFrame columns
# Filters and cleans the DataFrame
# Inserts cleaned match events into Supabase table using insert_match_events function
# Extracts team information and inserts it into a Supabase table using the insert_players function
# Returns 'Success' message if the process is completed successfully
def scrape_match_events(whoscored_url, driver):
    
    driver.get(whoscored_url)
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    element = soup.select_one('script:-soup-contains("matchCentreData")')
    matchdict = json.loads(element.text.split("matchCentreData: ")[1].split(',\n')[0])
    
    match_events = matchdict['events']
    
    df = pd.DataFrame(match_events)
    
    df.dropna(subset='playerId', inplace=True)
    
    df = df.where(pd.notnull(df), None)
    
    df = df.rename(
    {
        'eventId': 'event_id',
        'expandedMinute': 'expanded_minute',
        'outcomeType': 'outcome_type',
        'isTouch': 'is_touch',
        'playerId': 'player_id',
        'teamId': 'team_id',
        'endX': 'end_x',
        'endY': 'end_y',
        'blockedX': 'blocked_x',
        'blockedY': 'blocked_y',
        'goalMouthZ': 'goal_mouth_z',
        'goalMouthY': 'goal_mouth_y',
        'isShot': 'is_shot',
        'cardType': 'card_type',
        'isGoal': 'is_goal'
    },
        axis=1
    )
    
    df['period_display_name'] = df['period'].apply(lambda x: x['displayName'])
    df['type_display_name'] = df['type'].apply(lambda x: x['displayName'])
    df['outcome_type_display_name'] = df['outcome_type'].apply(lambda x: x['displayName'])
    
    df.drop(columns=["period", "type", "outcome_type"], inplace=True)
    
    if 'is_goal' not in df.columns:
        df['is_goal'] = False
        
    if 'is_card' not in df.columns:
        df['is_card'] = False
        df['card_type'] = False
        
    df = df[~(df['type_display_name'] == "OffsideGiven")]
    
    df = df[[
        'id', 'event_id', 'minute', 'second', 'team_id', 'player_id', 'x', 'y', 'end_x', 'end_y',
        'qualifiers', 'is_touch', 'blocked_x', 'blocked_y', 'goal_mouth_z', 'goal_mouth_y', 'is_shot',
        'card_type', 'is_goal', 'type_display_name', 'outcome_type_display_name',
        'period_display_name'
    ]]
    
    df[['id', 'event_id', 'minute', 'team_id', 'player_id']] = df[['id', 'event_id', 'minute', 'team_id', 'player_id']].astype(np.int64)
    df[['second', 'x', 'y', 'end_x', 'end_y']] = df[['second', 'x', 'y', 'end_x', 'end_y']].astype(float)
    df[['is_shot', 'is_goal', 'card_type']] = df[['is_shot', 'is_goal', 'card_type']].astype(bool)
    
    df['is_goal'] = df['is_goal'].fillna(False)
    df['is_shot'] = df['is_shot'].fillna(False)
    
    for column in df.columns:
        if df[column].dtype == np.float64 or df[column].dtype == np.float32:
            df[column] = np.where(
                np.isnan(df[column]),
                None,
                df[column]
            )
            
            
    insert_match_events(df, supabase)
    
    
    team_info = []
    team_info.append({
        'team_id': matchdict['home']['teamId'],
        'name': matchdict['home']['name'],
        'country_name': matchdict['home']['countryName'],
        'manager_name': matchdict['home']['managerName'],
        'players': matchdict['home']['players'],
    })

    team_info.append({
        'team_id': matchdict['away']['teamId'],
        'name': matchdict['away']['name'],
        'country_name': matchdict['away']['countryName'],
        'manager_name': matchdict['away']['managerName'],
        'players': matchdict['away']['players'],
    })
    
    insert_players(team_info, supabase)
    
    return print('Success')

    

In [194]:
# Navigates to a team's fixtures page (on Whoscored)
# Waits 3 seconds for page to load before further actions
driver.get('https://www.whoscored.com/Teams/80/Fixtures/Italy-AC-Milan')
time.sleep(3)

In [195]:
# Creates BS object by parsing HTML source code of web page using 'html.parser'
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [196]:
# Extracts all URLs containing "/Live/" in href attribute from
# anchor (a) elements in parsed HTML using BeautifulSoup.
all_urls = soup.select('a[href*="\/Live\/"]')

In [197]:
# Creates list of URLs by combining the base URL "https://www.whoscored.com" 
# with href attribute of each anchor element in the list of URLs 
# Note: Converts the list to a set and back to a list to help remove duplicate URLs
all_urls = list(set([
    'https://www.whoscored.com' + x.attrs['href']
    for x in all_urls
]))

In [198]:
all_urls

['https://www.whoscored.com/Matches/1785206/Live/Italy-Coppa-Italia-2023-2024-AC-Milan-Cagliari',
 'https://www.whoscored.com/Matches/1746090/Live/Italy-Serie-A-2023-2024-Cagliari-AC-Milan',
 'https://www.whoscored.com/Matches/1746113/Live/Italy-Serie-A-2023-2024-Genoa-AC-Milan',
 'https://www.whoscored.com/Matches/1746165/Live/Italy-Serie-A-2023-2024-Lecce-AC-Milan',
 'https://www.whoscored.com/Matches/1746175/Live/Italy-Serie-A-2023-2024-AC-Milan-Frosinone',
 'https://www.whoscored.com/Matches/1746251/Live/Italy-Serie-A-2023-2024-AC-Milan-Roma',
 'https://www.whoscored.com/Matches/1746286/Live/Italy-Serie-A-2023-2024-Frosinone-AC-Milan',
 'https://www.whoscored.com/Matches/1746126/Live/Italy-Serie-A-2023-2024-AC-Milan-Juventus',
 'https://www.whoscored.com/Matches/1794755/Live/Italy-Coppa-Italia-2023-2024-AC-Milan-Atalanta',
 'https://www.whoscored.com/Matches/1746226/Live/Italy-Serie-A-2023-2024-AC-Milan-Sassuolo',
 'https://www.whoscored.com/Matches/1746073/Live/Italy-Serie-A-2023-

In [199]:
# Go through list of URLs and prints then calls 'scrap_match_events' for each
# 2 second delay between iterations
for url in all_urls:
    print(url)
    scrape_match_events(
        whoscored_url=url,
        driver=driver
    )
    time.sleep(2)

https://www.whoscored.com/Matches/1785206/Live/Italy-Coppa-Italia-2023-2024-AC-Milan-Cagliari


AttributeError: 'NoneType' object has no attribute 'text'