In [1]:
import pandas as pd
import sys
import json
import time
import numpy as np
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
from typing import List, Optional
from selenium import webdriver
from supabase import create_client, Client
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib.colors import to_rgba
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.image as mpimg
from mplsoccer import Pitch, FontManager, Sbopen
import urllib
from PIL import Image
from mplsoccer import Pitch, VerticalPitch, FontManager, Sbopen

In [4]:
# Match-centre ("live") page on WhoScored that the scraper below will load.
whoscored_url = 'https://www.whoscored.com/matches/1811539/live/south-america-copa-libertadores-2024-universitario-de-deportes-junior-fc'

In [5]:
def scraping_whoscored(whoscored_url):
    '''Scrape one WhoScored match page into a cleaned events DataFrame.

    The page is loaded in a Selenium-driven Chrome session, the
    ``matchCentreData`` JSON embedded in one of its <script> tags is parsed,
    and the event records are cleaned and joined with the home/away player
    and team details contained in the same JSON.

    Parameters
    ----------
    whoscored_url : str
        URL of the WhoScored match page to scrape.

    Returns
    -------
    pandas.DataFrame
        One row per event that has a player attached ("OffsideGiven" rows
        removed), sorted by ``minute`` then ``second`` with a fresh index.
        Both sides of the final merge carry ``team_id``, so the result
        exposes it twice, as ``team_id_x`` (event) and ``team_id_y`` (player).

    Raises
    ------
    ValueError
        If the page holds no <script> containing ``matchCentreData``.
    '''

    # --------------- Fetching the page -----------------------

    # Always close the browser, even when loading the page fails; otherwise
    # every call leaves a Chrome process behind.
    driver = webdriver.Chrome()
    try:
        driver.get(whoscored_url)
        page_source = driver.page_source
    finally:
        driver.quit()

    # Creating the soup element from the HTML code of the page
    soup = BeautifulSoup(page_source, 'html.parser')

    # Selecting the <script> block where the JSON data lies
    # Right click -> View Source Code -> Look for the MatchCentreData
    element = soup.select_one('script:-soup-contains("matchCentreData")')
    if element is None:
        raise ValueError(
            f'matchCentreData script not found in page: {whoscored_url}'
        )

    # The JSON object sits between "matchCentreData: " and the next ",\n"
    matchdict = json.loads(element.text.split("matchCentreData: ")[1].split(',\n')[0])

    # --------------- Data Cleaning -----------------------

    # Only the events list is turned into the events table
    df = pd.DataFrame(matchdict['events'])

    # Dropping all rows that do not include a player ID
    df = df.dropna(subset=['playerId'])

    # Replacing NaN values with None where pandas allows it
    df = df.where(pd.notnull(df), None)

    # Renaming columns from camelCase to snake_case
    df = df.rename(
        columns={
            'eventId': 'event_id',
            'expandedMinute': 'expanded_minute',
            'outcomeType': 'outcome_type',
            'isTouch': 'is_touch',
            'playerId': 'player_id',
            'teamId': 'team_id',
            'endX': 'end_x',
            'endY': 'end_y',
            'blockedX': 'blocked_x',
            'blockedY': 'blocked_y',
            'goalMouthZ': 'goal_mouth_z',
            'goalMouthY': 'goal_mouth_y',
            'isShot': 'is_shot',
            'cardType': 'card_type',
            'isGoal': 'is_goal'
        }
    )

    # 'period', 'type' and 'outcome_type' hold dictionaries; keep only their
    # 'displayName' entry in a flat column
    for nested_column in ('period', 'type', 'outcome_type'):
        df[f'{nested_column}_display_name'] = df[nested_column].apply(
            lambda record: record['displayName']
        )

    # Games without goals have no 'is_goal' column at all
    if 'is_goal' not in df.columns:
        print('missing goals')
        df['is_goal'] = False

    # Dropping rows that have the offside given
    df = df[df['type_display_name'] != "OffsideGiven"]

    # Keeping only the desired columns, in this order. reindex (instead of
    # plain selection) adds any column a league does not provide as an empty
    # column rather than raising KeyError, and returns a fresh frame so the
    # assignments below never write into a slice. The nested dictionary
    # columns are discarded here as well.
    # -- Variables not used: 'card_type'
    event_columns = [
        'id', 'event_id', 'minute', 'second', 'team_id', 'player_id', 'x', 'y', 'end_x', 'end_y',
        'qualifiers', 'is_touch', 'blocked_x', 'blocked_y', 'goal_mouth_z', 'goal_mouth_y',
        'is_shot', 'is_goal', 'type_display_name', 'outcome_type_display_name',
        'period_display_name'
    ]
    df = df.reindex(columns=event_columns)

    # Defining the types of each variable. int64 is spelled out because the
    # event ids exceed the 32-bit range that plain `int` maps to on some
    # platforms.
    df = df.astype({
        'id': 'int64', 'event_id': 'int64', 'minute': 'int64',
        'team_id': 'int64', 'player_id': 'int64',
        'second': float, 'x': float, 'y': float, 'end_x': float, 'end_y': float,
    })

    # Missing flags mean False. bool(NaN) is True, so missing values are
    # masked out explicitly instead of being cast first and filled afterwards.
    for flag_column in ('is_shot', 'is_goal'):
        df[flag_column] = df[flag_column].notna() & df[flag_column].astype(bool)

    # Pandas keeps NaN in float columns even after the None replacement above,
    # so float columns are rewritten as object columns holding None
    for column in df.columns:
        if df[column].dtype == np.float64 or df[column].dtype == np.float32:
            df[column] = np.where(
                np.isnan(df[column]),
                None,
                df[column]
            )

    # --------------- Collecting Player and Team Data -----------------------

    players = []
    teams = []
    for side in ('home', 'away'):
        team = matchdict[side]
        teams.append({
            'team_id': team['teamId'],
            'team': team['name']
        })
        for player in team['players']:
            players.append({
                'player_id': player['playerId'],
                'team_id': team['teamId'],
                'shirt_no': player['shirtNo'],
                'name': player['name'],
                'position': player['position'],
                'age': player['age'],
                'MOTM': player['isManOfTheMatch']  # Might not exist for every competition (double check later)
            })

    # --------------- Merging Events Data with Player Data -----------------------

    # Attach the team name to every player, then the player to every event.
    # Merging on 'player_id' only is what produces team_id_x / team_id_y.
    players_df = pd.merge(pd.DataFrame(players), pd.DataFrame(teams), on='team_id')
    final_df = pd.merge(df, players_df, on='player_id')

    # Sorting in ascending order of minute and second, then resetting the index
    final_df = final_df.sort_values(by=['minute', 'second'], ascending=True)
    final_df = final_df.reset_index(drop=True)

    # TODO: optionally save final_df as a CSV under Datasets/, named after the
    # trailing part of the URL.

    return final_df

# Run the scraper for the match above (this starts a Chrome session via Selenium).
df = scraping_whoscored(whoscored_url)

In [7]:
# Preview the first rows of the merged events table.
df.head()

Unnamed: 0,id,event_id,minute,second,team_id_x,player_id,x,y,end_x,end_y,...,type_display_name,outcome_type_display_name,period_display_name,team_id_y,shirt_no,name,position,age,MOTM,team
0,2684305553,3,0,0.0,2050,445146,49.7,50.1,51.0,48.1,...,Pass,Successful,FirstHalf,2050,17,Jairo Concha,MC,25,False,Universitario de Deportes
1,2684305555,4,0,2.0,2050,445809,49.1,44.2,51.9,13.5,...,Pass,Successful,FirstHalf,2050,18,Rodrigo Ureña,DMC,32,False,Universitario de Deportes
2,2684305557,5,0,3.0,2050,149542,61.0,4.1,69.8,26.6,...,Pass,Successful,FirstHalf,2050,24,Andy Polo,MR,30,False,Universitario de Deportes
3,2684319789,404,0,5.0,2050,417919,73.5,28.0,62.9,27.5,...,Pass,Successful,FirstHalf,2050,20,Álex Valera,FW,28,False,Universitario de Deportes
4,2684305573,6,0,10.0,2050,445809,57.7,36.6,59.1,89.1,...,Pass,Successful,FirstHalf,2050,18,Rodrigo Ureña,DMC,32,False,Universitario de Deportes


In [10]:
def get_team_interest():
    """Return the list of team names the analysis should be restricted to."""
    # TODO: maybe assign the team colours here as well
    return ['Universitario de Deportes']


# Hex colours intended for the home and away teams.
color_home = '#1CC17D'
color_away = '#CE2939'

# Pass Network
def pass_network(df, color=None):
    '''Prepare the pass-network data for the team(s) of interest.

    Keeps the events of the teams returned by ``get_team_interest()`` up to
    (not including) the minute of their first substitution, labels every
    event with its ``passer`` and ``receiver`` (the player of the team's next
    event), and narrows the table to successful passes. Average player
    locations and passer/receiver pass counts are computed from those passes.

    Parameters
    ----------
    df : pandas.DataFrame
        Events table with at least the columns ``team``, ``type_display_name``,
        ``outcome_type_display_name``, ``minute``, ``name``, ``x``, ``y``
        and ``id``.
    color : str, optional
        Not used by the current body. It defaults to None so that calling
        ``pass_network(df)`` works. NOTE(review): presumably meant for the
        plotting step; confirm.

    Returns
    -------
    pandas.DataFrame
        The successful passes, with the extra columns ``index`` (row label in
        the input table), ``passer`` and ``receiver``.
    '''

    # Filter to keep only records that correspond to team of interest
    team_events = df[df['team'].isin(get_team_interest())]

    # Minute of the first substitution; NaN when the team made none
    is_sub_off = team_events['type_display_name'] == 'SubstitutionOff'
    first_sub = team_events.loc[is_sub_off, 'minute'].min()

    # Keep only records before the first substitution. Without a substitution
    # every record is kept: comparing against NaN would discard all rows.
    if pd.notna(first_sub):
        team_events = team_events[team_events['minute'] < first_sub]
    team_events = team_events.reset_index()

    # The receiver of an event is the player of the team's next event
    team_events['passer'] = team_events['name']
    team_events['receiver'] = team_events['name'].shift(-1)

    # Only interested in successful passes
    is_successful_pass = (
        (team_events['type_display_name'] == 'Pass')
        & (team_events['outcome_type_display_name'] == 'Successful')
    )
    passes = team_events[is_successful_pass]

    # Average location and pass count per player. x/y may arrive as object
    # columns holding None, so they are cast to float on a copy before
    # averaging; the returned table is left untouched.
    coordinates = passes.astype({'x': float, 'y': float})
    avg_locations = coordinates.groupby('passer').agg({'x': ['mean'], 'y': ['mean', 'count']})
    avg_locations.columns = ['x', 'y', 'count']

    # Passes between players (count of associations), with the average
    # location of the passer (x, y) and of the receiver (x_end, y_end)
    pass_between = (
        passes.groupby(['passer', 'receiver'])['id']
        .count()
        .reset_index()
        .rename(columns={'id': 'pass_count'})
    )
    pass_between = pass_between.merge(avg_locations, left_on='passer', right_index=True)
    pass_between = pass_between.merge(
        avg_locations, left_on='receiver', right_index=True, suffixes=('', '_end')
    )
    # TODO: avg_locations and pass_between are not consumed yet; the drawing
    # step of the network still has to be written.

    return passes

# pass_network takes the team colour as its second argument; the team of
# interest is the home side, so its colour is passed explicitly.
pass_network(df, color_home)
    



Unnamed: 0,index,id,event_id,minute,second,team_id_x,player_id,x,y,end_x,...,period_display_name,team_id_y,shirt_no,name,position,age,MOTM,team,passer,receiver
0,0,2684305553,3,0,0.0,2050,445146,49.7,50.1,51.0,...,FirstHalf,2050,17,Jairo Concha,MC,25,False,Universitario de Deportes,Jairo Concha,Rodrigo Ureña
1,1,2684305555,4,0,2.0,2050,445809,49.1,44.2,51.9,...,FirstHalf,2050,18,Rodrigo Ureña,DMC,32,False,Universitario de Deportes,Rodrigo Ureña,Andy Polo
2,2,2684305557,5,0,3.0,2050,149542,61.0,4.1,69.8,...,FirstHalf,2050,24,Andy Polo,MR,30,False,Universitario de Deportes,Andy Polo,Álex Valera
3,3,2684319789,404,0,5.0,2050,417919,73.5,28.0,62.9,...,FirstHalf,2050,20,Álex Valera,FW,28,False,Universitario de Deportes,Álex Valera,Rodrigo Ureña
4,4,2684305573,6,0,10.0,2050,445809,57.7,36.6,59.1,...,FirstHalf,2050,18,Rodrigo Ureña,DMC,32,False,Universitario de Deportes,Rodrigo Ureña,Segundo Portocarrero
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,976,2684323827,555,64,52.0,2050,417919,55.3,24.5,50.9,...,SecondHalf,2050,20,Álex Valera,FW,28,False,Universitario de Deportes,Álex Valera,Jairo Concha
496,977,2684323833,556,64,56.0,2050,445146,44.7,48.4,41.9,...,SecondHalf,2050,17,Jairo Concha,MC,25,False,Universitario de Deportes,Jairo Concha,Aldo Corzo
497,978,2684323849,557,64,57.0,2050,62465,41.9,24.1,54.7,...,SecondHalf,2050,29,Aldo Corzo,DC,35,False,Universitario de Deportes,Aldo Corzo,Martín Pérez Guedes
498,979,2684323857,558,65,1.0,2050,125555,60.7,6.4,64.1,...,SecondHalf,2050,16,Martín Pérez Guedes,MC,33,False,Universitario de Deportes,Martín Pérez Guedes,Jairo Concha
