# 02r. ATP Court Vision Raw Data Scraper

This notebook contains the code and functions for scraping the raw Court Vision data from the ATP website.
The output is archived in JSON format.

In [1]:
import itertools
import json
import os
import re
import sys

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Work around pandas failing to read files whose path contains accented
# characters (e.g. player names such as "Sánchez" or "Lindelöf").
# The hook is only available on Windows builds of Python, so guard the call to
# keep this cell runnable on Linux/macOS, where it would raise AttributeError.
if hasattr(sys, "_enablelegacywindowsfsencoding"):
    sys._enablelegacywindowsfsencoding()


# Browser-like User-Agent header passed to requests.get when fetching data.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36',
}

from ast import literal_eval

import base64
import cryptography.hazmat.backends
import cryptography.hazmat.primitives.ciphers
import cryptography.hazmat.primitives.ciphers.algorithms
import cryptography.hazmat.primitives.ciphers.modes
import cryptography.hazmat.primitives.padding

import sys
from time import sleep
import datetime

### Decrypting Utilities

In [2]:
def formatDate(t):
    """Build the 16-character token derived from a millisecond epoch timestamp.

    The token is "#" + 14 characters + "$".  The 14 characters are the
    concatenation of
      * the decimal digits of `t` re-read as a hexadecimal number and written
        in base 36, and
      * (year + reversed year) * (day + reversed two-digit day) written in
        base 24,
    lower-cased, then right-padded with "0" or truncated to exactly 14
    characters.  Year and day are taken from the timestamp in UTC.

    Parameters
    ----------
    t : int
        Timestamp in milliseconds since the Unix epoch (the payload's
        'lastModified' value).  Integral floats/strings are accepted too.

    Returns
    -------
    str
        The 16-character token; `decode` uses it as AES key and, upper-cased,
        as IV.
    """
    millis = int(t)

    # datetime.utcfromtimestamp() is deprecated since Python 3.12; an aware UTC
    # datetime exposes the same .day and .year values.
    stamp = datetime.datetime.fromtimestamp(millis / 1000, tz=datetime.timezone.utc)

    day = stamp.day
    day_reversed = int(f"{day:02d}"[::-1])       # e.g. 7 -> "07" -> "70" -> 70
    year = stamp.year
    year_reversed = int(str(year)[::-1])         # e.g. 2022 -> 2202

    token = (
        np.base_repr(int(str(millis), base=16), 36).lower()
        + np.base_repr((year + year_reversed) * (day + day_reversed), 24).lower()
    )

    # Force the body to exactly 14 characters: pad with "0" or cut the excess.
    token = token.ljust(14, "0")[:14]
    return "#" + token + "$"


In [3]:
def decode(data):
    """Decrypt a court-vision payload and return the parsed JSON content.

    Parameters
    ----------
    data : dict
        Must provide 'lastModified' (timestamp fed to `formatDate`) and
        'response' (base64-encoded AES-CBC ciphertext).

    Returns
    -------
    The object produced by `json.loads` on the decrypted UTF-8 text.

    Raises
    ------
    ValueError
        If the decrypted bytes do not end in valid PKCS7 padding or are not
        valid JSON (json.JSONDecodeError is a ValueError subclass).
    """
    token = formatDate(data['lastModified'])
    key = token.encode()            # 16-byte AES key
    iv = token.upper().encode()     # IV is the same token, upper-cased

    cipher = cryptography.hazmat.primitives.ciphers.Cipher(
        cryptography.hazmat.primitives.ciphers.algorithms.AES(key),
        cryptography.hazmat.primitives.ciphers.modes.CBC(iv),
        backend=cryptography.hazmat.backends.default_backend()
    )
    decryptor = cipher.decryptor()
    padded = decryptor.update(base64.b64decode(data['response'])) + decryptor.finalize()

    # Strip the PKCS7 padding with the unpadder instead of deleting every
    # occurrence of the last character from the whole plaintext, which would
    # corrupt the document whenever that character also appears elsewhere.
    unpadder = cryptography.hazmat.primitives.padding.PKCS7(128).unpadder()
    plaintext = unpadder.update(padded) + unpadder.finalize()

    return json.loads(plaintext.decode("utf-8"))

## Get Match Data and URLs

In [15]:
import glob

In [20]:
# Select the tournament match files to scrape by position.
# glob returns paths in arbitrary, OS-dependent order, so list the folder once
# and sort it before slicing; otherwise the selection is not reproducible.
masters_files = sorted(glob.glob("data/atp-tournament-matches/*Masters*.csv"))
file_list = masters_files[2:6] + masters_files[-1:]

In [24]:
# Display the selected match files to confirm which tournaments will be scraped.
file_list

['data/atp-tournament-matches\\matches_ATP-Masters-1000-Indian-Wells_2022.csv',
 'data/atp-tournament-matches\\matches_ATP-Masters-1000-Madrid_2022.csv',
 'data/atp-tournament-matches\\matches_ATP-Masters-1000-Miami_2022.csv',
 'data/atp-tournament-matches\\matches_ATP-Masters-1000-Monte-Carlo_2022.csv',
 'data/atp-tournament-matches\\matches_ATP-Masters-1000-Rome_2022.csv']

In [25]:
# Scrape and archive the court-vision raw data of every non-qualifying match
# listed in the selected tournament match files (one JSON file per match).

OUTPUT_DIR = "data/court-vision"   # folder receiving the decoded JSON files
REQUEST_TIMEOUT = 30               # seconds before an unresponsive request is abandoned

# Without the output folder every open() below would fail and each match would
# be reported as failed even though its data was downloaded and decoded.
os.makedirs(OUTPUT_DIR, exist_ok=True)


def _round_code(round_name):
    """Return the short round label used in the output file name.

    "Round Of <N>" becomes "R<N>"; any other name containing "Round" is reduced
    to the initials of its words; the quarter-final, semi-final and final
    spellings map to "QF", "SF" and "F".  A name matching none of these is
    returned with spaces replaced by hyphens, so the label can never be left
    undefined or carried over from the previously processed match.
    """
    words = round_name.split(" ")
    if "Round Of" in round_name:
        return words[0][0] + words[-1]
    if "Round" in round_name:
        return "".join([word[0] for word in words])
    if round_name in ("Quarterfinals", "Quarter-Finals"):
        return "QF"
    if round_name in ("Semifinals", "Semi-Finals"):
        return "SF"
    if round_name in ("Final", "Finals"):
        return "F"
    return round_name.replace(" ", "-")


for file in file_list:

    matches = pd.read_csv(file)

    # Subset matches to scrape (exclude qualifying).
    # na=False keeps the mask boolean when a row has no round name.
    matches_scp = matches[~matches.Round.str.contains("Qualifying", na=False)]

    # Loop through matches and scrape one by one; i is the position in matches_scp
    for i, row in enumerate(matches_scp.itertuples(index=False)):

        request_sent = False

        try:
            # The match URL must split into exactly ten '/'-separated parts;
            # the last three are the year, the tournament id and the match id.
            _,_,_,_,_,_,_,year,tourn_id,match_id = row.URL.split('/')

            match_id = match_id.upper()

            player1 = row.Player1
            player2 = row.Player2

            link = f'https://itp-atp-sls.infosys-platforms.com/static/prod/court-vision/{year}/{tourn_id}/{match_id}/data.json'

            # Fetch the payload: bounded by a timeout, rejected on HTTP errors,
            # and parsed directly as JSON (no detour through an HTML parser).
            request_sent = True
            pageTree = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
            pageTree.raise_for_status()
            results_json = pageTree.json()

            # Decode Data
            raw_data = decode(results_json)

            # Match the formatting of player1/2 to that in the court-vision raw data's player data.
            # If player 1's "Truncated Name" (e.g. R. NADAL) equals the first player entry of the
            # court-vision data we keep the player order, otherwise we swap.
            # NOTE(review): only the second word of the name is used as the surname, so names
            # with more than two words presumably never match and are treated as swapped - confirm.
            name_parts = player1.split(" ")
            player1_tname = name_parts[0][0] + "." + " " + name_parts[1].upper()
            player1_cv = raw_data['courtVisionData'][0]['a79']['a83'][0]['a85']

            if player1_tname == player1_cv:
                player1_cvfile, player2_cvfile = player1, player2
            else:
                player1_cvfile, player2_cvfile = player2, player1

            # Formatting
            player1_cvfile = player1_cvfile.replace(" ","-")
            player2_cvfile = player2_cvfile.replace(" ","-")

            # Format the "Round Name" to appear on file path
            round_n = row.Round
            round_short = _round_code(round_n)

            # Output the decoded courtvision data into a json file
            with open(f"{OUTPUT_DIR}/{tourn_id}_{round_short}_{player1_cvfile}-vs-{player2_cvfile}_{year}_{match_id}_court-vision.json", 'w') as fp:
                json.dump(raw_data, fp)

        except Exception as exc:
            # Catch Exception rather than everything so Ctrl-C / kernel interrupt
            # still stops the scrape, and report why the match was skipped.
            print(f"{i} Failed or no Data! ({type(exc).__name__}: {exc})")

        # Pause between requests, including after a failed one, so a run of
        # missing matches does not hit the server back-to-back.
        if request_sent:
            sleep(np.random.uniform(3, 20))

    # Longer pause between tournaments
    sleep(np.random.uniform(25, 60))

10 Failed or no Data!
47 Failed or no Data!
63 Failed or no Data!
64 Failed or no Data!
65 Failed or no Data!
66 Failed or no Data!
67 Failed or no Data!
68 Failed or no Data!
69 Failed or no Data!
70 Failed or no Data!
71 Failed or no Data!
72 Failed or no Data!
73 Failed or no Data!
74 Failed or no Data!
75 Failed or no Data!
76 Failed or no Data!
77 Failed or no Data!
78 Failed or no Data!
79 Failed or no Data!
80 Failed or no Data!
81 Failed or no Data!
82 Failed or no Data!
83 Failed or no Data!
84 Failed or no Data!
85 Failed or no Data!
86 Failed or no Data!
87 Failed or no Data!
88 Failed or no Data!
89 Failed or no Data!
90 Failed or no Data!
91 Failed or no Data!
92 Failed or no Data!
93 Failed or no Data!
94 Failed or no Data!
3 Failed or no Data!
4 Failed or no Data!
5 Failed or no Data!
6 Failed or no Data!
7 Failed or no Data!
8 Failed or no Data!
9 Failed or no Data!
10 Failed or no Data!
11 Failed or no Data!
12 Failed or no Data!
13 Failed or no Data!
14 Failed or no D