# 02RG. RG Court Vision Raw Data Scraper

This notebook contains the code and functions for scraping the Court Vision raw data from the Roland Garros website.
The output is archived in JSON format.

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import json
import itertools
import sys
# Work around pandas failing to read files whose path contains accented
# characters (e.g. Alexis Sánchez, Victor Lindelöf).
# The legacy-encoding switch only exists on Windows builds of CPython, so it is
# guarded to keep this cell runnable on Linux/macOS instead of raising
# AttributeError.
if hasattr(sys, "_enablelegacywindowsfsencoding"):
    sys._enablelegacywindowsfsencoding()


# HTTP headers passed to requests.get() by the scraping loops; the User-Agent
# string mimics a desktop Chrome browser.
headers = {'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'} 

from ast import literal_eval

import base64
import cryptography.hazmat.backends
import cryptography.hazmat.primitives.ciphers
import cryptography.hazmat.primitives.ciphers.algorithms
import cryptography.hazmat.primitives.ciphers.modes
import cryptography.hazmat.primitives.padding

import sys
from time import sleep
import datetime

### Decrypting Utilities

In [3]:
def formatDate(t):
    """Build the 16-character secret derived from a ``lastModified`` timestamp.

    The result is used by ``decode`` as the AES key (as-is) and as the CBC IV
    (upper-cased).

    Parameters
    ----------
    t : int
        Epoch timestamp in milliseconds. It must be an integer: its decimal
        digits are re-read as a hexadecimal number below.

    Returns
    -------
    str
        ``"#"`` + exactly 14 lowercase base-36/base-24 characters + ``"$"``.
    """
    # utcfromtimestamp() is deprecated since Python 3.12; an aware UTC datetime
    # yields the same calendar day and year.
    stamp = datetime.datetime.fromtimestamp(t / 1000, tz=datetime.timezone.utc)
    day = stamp.day
    year = stamp.year

    # Day is zero-padded to two digits and reversed (5 -> "05" -> 50);
    # the year's digits are reversed as well (2021 -> 1202).
    day_reversed = int(f"{day:02d}"[::-1])
    year_reversed = int(str(year)[::-1])

    # First part: the timestamp's decimal digits interpreted as hex, in base 36.
    head = np.base_repr(int(str(t), base=16), 36).lower()
    # Second part: a date-derived product, in base 24.
    tail = np.base_repr((year + year_reversed) * (day + day_reversed), 24).lower()

    # Force the body to exactly 14 characters: truncate or right-pad with "0".
    body = (head + tail)[:14].ljust(14, "0")
    return "#" + body + "$"


In [4]:
def decode(data):
    """Decrypt and parse one court-vision API response.

    Parameters
    ----------
    data : dict
        Must provide ``'lastModified'`` (millisecond timestamp handed to
        ``formatDate``) and ``'response'`` (base64-encoded AES-CBC ciphertext).

    Returns
    -------
    object
        The JSON document obtained from the decrypted plaintext.

    Raises
    ------
    KeyError
        If ``data`` lacks ``'lastModified'`` or ``'response'``.
    ValueError
        If the ciphertext length is invalid or the plaintext is not valid
        UTF-8 JSON (``json.JSONDecodeError`` / ``UnicodeDecodeError`` are
        ``ValueError`` subclasses).
    """
    secret = formatDate(data['lastModified'])
    key = secret.encode()
    iv = secret.upper().encode()  # the IV is the upper-cased key string

    cipher = cryptography.hazmat.primitives.ciphers.Cipher(
        cryptography.hazmat.primitives.ciphers.algorithms.AES(key),
        cryptography.hazmat.primitives.ciphers.modes.CBC(iv),
        backend=cryptography.hazmat.backends.default_backend()
    )
    decryptor = cipher.decryptor()
    padded = decryptor.update(base64.b64decode(data['response'])) + decryptor.finalize()

    # Strip the block padding properly instead of deleting every occurrence of
    # the final character from the whole plaintext.
    unpadder = cryptography.hazmat.primitives.padding.PKCS7(128).unpadder()
    try:
        plaintext = unpadder.update(padded) + unpadder.finalize()
    except ValueError:
        # NOTE(review): payload is presumably PKCS7-padded; if it is not, fall
        # back to dropping only the trailing run of the final byte.
        plaintext = padded.rstrip(padded[-1:])

    return json.loads(plaintext.decode("utf-8"))

## Get Match Data and URLs

In [5]:
import glob

In [15]:
# Load the processed 2021 results table; the cells below read its `id`,
# `player1_name`, `player2_name` and `round` columns.
matches = pd.read_csv("../data/RG_results-all_processed_2021.csv") 

In [23]:
# Keep only the matches whose id contains "SM" (presumably the men's singles
# draw — confirm against the id scheme), then preview the first row.
matches_scp = matches[matches.id.str.contains("SM")]
matches_scp.head(1)

In [18]:
# Identifiers interpolated into the court-vision API URL and into the output
# file names by the scraping loops.
eventId = 520
year = 2021

In [24]:
# Loop through matches and scrape one by one.
# NOTE: range(1) restricts this cell to the first row of `matches_scp`.

# Round label as it appears in the results table -> short code used in file names.
ROUND_SHORT_NAMES = {
    "First Round": "R128",
    "Second Round": "R64",
    "Third Round": "R32",
    "Fourth Round": "R16",
    "Quarterfinals": "QF",
    "Quarter-Finals": "QF",
    "Semifinals": "SF",
    "Semi-Finals": "SF",
    "Final": "F",
    "Finals": "F",
}

for i in range(1):
    try:
        row = matches_scp.iloc[i]
        match_id = row["id"]
        player1 = row["player1_name"]
        player2 = row["player2_name"]

        # Resolve the round code first. An unknown label must fail loudly:
        # otherwise the code from a previous iteration would silently be
        # reused and the file written under the wrong round.
        round_n = row["round"]
        if round_n not in ROUND_SHORT_NAMES:
            raise ValueError(f"unrecognised round name: {round_n!r}")
        round_short = ROUND_SHORT_NAMES[round_n]

        link = f"https://itp-rg-sls.infosys-platforms.com/prod/api/court-vision/year/{year}/eventId/{eventId}/matchId/{match_id}/pointId/0_0_0"
        # The endpoint returns JSON, so parse the body directly rather than
        # round-tripping it through an HTML parser. The timeout prevents a
        # stalled connection from hanging the whole run.
        response = requests.get(link, headers=headers, timeout=30)
        response.raise_for_status()
        results_json = response.json()

        # Decode Data
        raw_data = decode(results_json)

        # Decide the player order used in the file name: keep the table's order
        # when player 1's truncated name equals the first player entry of the
        # court-vision data, otherwise swap the two names.
        # The truncated name is "<first initial>.<SECOND WORD UPPER>", e.g.
        # "Rafael Nadal" -> "R.NADAL".
        # NOTE(review): confirm this matches the format stored under 'a85';
        # only the second word of the name is used, so multi-word surnames
        # will not match.
        name_parts = player1.split(" ")
        player1_tname = name_parts[0][0] + "." + name_parts[1].upper()
        player1_cv = raw_data['courtVisionData'][0]['a79']['a83'][0]['a85']

        if player1_tname == player1_cv:
            player1_cvfile, player2_cvfile = player1, player2
        else:
            player1_cvfile, player2_cvfile = player2, player1

        # Formatting: spaces are not wanted in the file name
        player1_cvfile = player1_cvfile.replace(" ", "-")
        player2_cvfile = player2_cvfile.replace(" ", "-")

        # Output the decoded courtvision data into a json file
        with open(f"../data/court-vision/{eventId}_{round_short}_{player1_cvfile}-vs-{player2_cvfile}_{year}_{match_id}_court-vision.json", 'w') as fp:
            json.dump(raw_data, fp)

        # Random pause between successful downloads to avoid hammering the API
        sleep(np.random.uniform(3, 20))

    except Exception as exc:
        # Best-effort scrape: report the failing row and its cause, then move
        # on. Catching Exception (not a bare except) keeps Ctrl-C working.
        print(f"{i} Failed or no Data! ({type(exc).__name__}: {exc})")

In [25]:
# Re-select the matches whose id contains "SD" (presumably the women's singles
# draw — confirm against the id scheme) for the next scraping loop.
matches_scp = matches[matches.id.str.contains("SD")]

In [26]:
# Loop through matches and scrape one by one 
for i in np.arange(0,len(matches_scp)):
    try:
        match_id = matches_scp.id.iloc[i]
        player1 = matches_scp.player1_name.iloc[i]
        player2 = matches_scp.player2_name.iloc[i]

        link = f"https://itp-rg-sls.infosys-platforms.com/prod/api/court-vision/year/{year}/eventId/{eventId}/matchId/{match_id}/pointId/0_0_0"
        # Get request and content from the given link and parse into HTML
        pageTree = requests.get(link, headers=headers)
        pageSoup = BeautifulSoup(pageTree.content, 'html.parser') 

        results_json = json.loads(str(pageSoup))

        # Decode Data
        raw_data = decode(results_json)

        # Match the formatting of player1/2 to that in the court-vision raw data's player data 
        # If player names match their respective indexes in the court-vision raw data, 
        # then we keep the player name order, otherwise we swap 
        # "Truncated Name" for player 1 (e.g. R. NADAL)
        player1_tname = player1.split(" ")[0][0]+"." + player1.split(" ")[1].upper()
        player1_cv = raw_data['courtVisionData'][0]['a79']['a83'][0]['a85']

        if player1_tname == player1_cv:
            player1_cvfile = player1
            player2_cvfile = player2
        else:
            player1_cvfile = player2
            player2_cvfile = player1

        # Formatting
        player1_cvfile = player1_cvfile.replace(" ","-")
        player2_cvfile = player2_cvfile.replace(" ","-")

        # Format the "Round Name" to appear on file path
        round_n = matches_scp["round"].iloc[i]
        if round_n == "First Round":
            round_short = "R128"
        elif round_n == "Second Round":
            round_short = "R64"
        elif round_n == "Third Round":
            round_short = "R32"
        elif round_n == "Fourth Round":
            round_short = "R16"
        elif round_n == "Quarterfinals" or round_n == "Quarter-Finals":
            round_short = "QF"
        elif round_n == "Semifinals" or round_n == "Semi-Finals":
            round_short = "SF"
        elif round_n == "Final" or round_n == "Finals":
            round_short = "F"

        # Output the decoded courtvision data into a json file
        with open(f"../data/court-vision/{eventId}_{round_short}_{player1_cvfile}-vs-{player2_cvfile}_{year}_{match_id}_court-vision.json", 'w') as fp:
            json.dump(raw_data, fp)

        sleeptime = np.random.uniform(3, 20)
        sleep(sleeptime)

    except:
        print(f"{i} Failed or no Data!")
        pass

6 Failed or no Data!
7 Failed or no Data!
8 Failed or no Data!
9 Failed or no Data!
10 Failed or no Data!
11 Failed or no Data!
12 Failed or no Data!
13 Failed or no Data!
14 Failed or no Data!
15 Failed or no Data!
16 Failed or no Data!
17 Failed or no Data!
18 Failed or no Data!
19 Failed or no Data!
26 Failed or no Data!
27 Failed or no Data!
28 Failed or no Data!
29 Failed or no Data!
30 Failed or no Data!
31 Failed or no Data!
32 Failed or no Data!
33 Failed or no Data!
34 Failed or no Data!
35 Failed or no Data!
36 Failed or no Data!
37 Failed or no Data!
38 Failed or no Data!
39 Failed or no Data!
40 Failed or no Data!
41 Failed or no Data!
42 Failed or no Data!
43 Failed or no Data!
50 Failed or no Data!
51 Failed or no Data!
52 Failed or no Data!
53 Failed or no Data!
54 Failed or no Data!
55 Failed or no Data!
56 Failed or no Data!
57 Failed or no Data!
58 Failed or no Data!
59 Failed or no Data!
60 Failed or no Data!
61 Failed or no Data!
62 Failed or no Data!
63 Failed or n