##### This script scrapes the data required for model training from vlr.gg

- navigate to the events page
- select an event
- open the event's matches
- scrape the stats table for each map
- repeat the process until all matches are scraped

### Import libraries

In [1]:
# selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

# other libraries needed
import time
import pandas as pd
import csv

# bs4
from bs4 import BeautifulSoup

### Webdriver

In [2]:
# Build the Chrome options first so they are actually handed to the driver;
# options created after the driver has started have no effect on it.
options = Options()
# options.add_argument('--headless=new') # uncomment this to run in headless mode

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)

# Element lookups retry for up to 5 seconds before failing. This is a
# per-lookup timeout, not a fixed delay after page load.
driver.implicitly_wait(5)

# launch the driver
url = "https://www.vlr.gg/events" # directly go to event tab (all results are stored in there)
driver.get(url)
driver.maximize_window()
main_tab_handle = driver.current_window_handle # handle of the first tab (the event list)

### Scraping

#### Function: Locate/direct to match details and scrape

In [3]:
# scrape function
def _open_in_new_tab(element, timeout=5):
    """Ctrl-click *element*, switch to the tab it opens and return that tab's handle."""
    known_handles = set(driver.window_handles)
    # A fresh ActionChains per click avoids replaying actions queued earlier.
    ActionChains(driver).key_down(Keys.CONTROL).click(element).key_up(Keys.CONTROL).perform()
    # The order of window_handles is not guaranteed, so the new tab is
    # identified as "the handle that was not there before the click".
    new_handles = WebDriverWait(driver, timeout).until(
        lambda d: set(d.window_handles) - known_handles
    )
    handle = new_handles.pop()
    driver.switch_to.window(handle)
    return handle


def _scrape_match_page():
    """Scrape every map tab of the match page that currently has focus."""
    maps_nav = driver.find_element(By.CSS_SELECTOR, ".vm-stats-gamesnav.noselect")
    for map_tab in maps_nav.find_elements(By.TAG_NAME, 'div'):
        try:
            match_stat = scrape(map_tab)
            if match_stat is not None:
                arrange(match_stat)
        except Exception:
            # One broken map must not abort the rest of the match.
            print("error occurs")


def _close_extra_tabs():
    """Close every tab except the event list and give the event list focus again."""
    for handle in driver.window_handles:
        if handle != main_tab_handle:
            driver.switch_to.window(handle)
            driver.close()
    time.sleep(0.5)
    driver.switch_to.window(main_tab_handle)


def getMatch(a):
    """Scrape every match of one event and append the results to the CSV.

    Parameters
    ----------
    a : WebElement
        Link of a completed event on the event list page. It must contain an
        element with class ``event-item-title``.

    The event is opened in a new tab, its matches tab is switched to
    'All Stages', and each match link is opened in a further tab and handed
    to ``scrape``/``arrange`` map by map. Whatever happens, all tabs other
    than the event list are closed afterwards and focus returns to the event
    list, so the caller can continue with the next event.

    Raises whatever the title lookup raises when *a* is not an event link;
    any later failure is reported on stdout and does not propagate.
    """
    # get the name of the event
    event_name = a.find_element(By.CLASS_NAME, 'event-item-title').text
    print(f"scrapping data for {event_name}", end="...")

    try:
        # open new tab to show event detail
        event_window = _open_in_new_tab(a)

        # navigate to matches tab: all match records are under the second nav link
        nav_bar = driver.find_element(By.CLASS_NAME, 'wf-nav')
        nav_bar.find_elements(By.TAG_NAME, 'a')[1].click()

        # open the stage filter and pick "All Stages" so every match is listed
        driver.find_element(By.CSS_SELECTOR, '.btn.mod-filter.js-dropdown').click()
        time.sleep(1)
        stage_menu = driver.find_element(By.CSS_SELECTOR, '.wf-dropdown.mod-all')
        stage_menu.find_element(By.LINK_TEXT, 'All Stages').click()
        time.sleep(1)

        # matches are grouped in one card per day
        matches_day = driver.find_elements(By.CLASS_NAME, 'wf-card')

        for day in matches_day[1:]: # first element with class='wf-card' is the event header, skip it
            for match in day.find_elements(By.TAG_NAME, 'a'):
                # open match details in new tab
                _open_in_new_tab(match)
                time.sleep(0.5)

                _scrape_match_page()

                # close the match tab and go back to the event's match list
                driver.close()
                driver.switch_to.window(event_window)

        print("done")
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C / kernel interrupt
        # still stops the run.
        print("fail, continue with next event...\n")
    finally:
        # Works no matter which tab had focus when things went wrong.
        _close_extra_tabs()

#### Function: Scrape the data required

In [4]:
# class template for players
class Players:
    """Stat line of one player on one map, stored exactly as passed in."""

    def __init__(self, agent, R, ACS, KAST, ADR, HS):
        # Same attribute names and assignment order as the constructor arguments.
        names = ("agent", "R", "ACS", "KAST", "ADR", "HS")
        values = (agent, R, ACS, KAST, ADR, HS)
        for name, value in zip(names, values):
            setattr(self, name, value)

In [5]:
class Match:
    """Result of one map: the two player lists, the map name and the winner label."""

    def __init__(self, ct, t, map, winner):
        # Rosters of the two teams, kept as the very objects that were passed in.
        self.ct, self.t = ct, t
        # Map name and winning side label.
        self.map, self.winner = map, winner

In [6]:
def _parse_player_row(row):
    """Build a ``Players`` object from one ``<tr>`` of a stats table.

    Only cells whose class list is exactly ``['mod-stat']`` or
    ``['mod-agents']`` are read. The agent title is placed first, followed by
    the stat values in column order, matching the
    ``Players(agent, R, ACS, KAST, ADR, HS)`` signature. ``Players`` raises
    ``TypeError`` when the row does not yield exactly six values.
    """
    cache = []
    for cell in row.find_all('td'):
        classes = cell.get('class', [])
        if classes == ['mod-stat']:
            cache.append(cell.find('span').find('span').text)
        elif classes == ['mod-agents']:
            # agent name always goes to the front, whatever the column order
            cache.insert(0, cell.find('div').find('span').find('img').get('title'))
    return Players(*cache)


def scrape(map):
    """Scrape one map of the currently open match page.

    Parameters
    ----------
    map : WebElement
        A tab of the map navigation bar. (The name shadows the builtin but is
        kept so existing callers keep working.)

    Returns
    -------
    Match or None
        ``None`` when the tab is disabled or its ``data-game-id`` is "all";
        otherwise a ``Match`` holding both rosters, the map name and the
        winner label. The first team/table on the page is labelled ``'ct'``,
        any later one ``'t'``.

    Raises
    ------
    ValueError
        If no team carries the winning-score marker, so the winner cannot be
        determined.
    """
    is_enabled = map.get_attribute("data-disabled") == "0"
    is_single_game = map.get_attribute("data-game-id") != "all"
    if not (is_enabled and is_single_game):
        time.sleep(0.5)
        return None

    map.click() # switch to respective map

    # The stats block of the selected map is the only one displayed; look it
    # up once and search inside it from here on.
    active_game = driver.find_element(By.CSS_SELECTOR, '.vm-stats-game[style*="display: block;"]')
    header = active_game.find_element(By.CLASS_NAME, 'vm-stats-game-header')

    map_name = (
        header.find_element(By.CLASS_NAME, 'map')
        .find_element(By.TAG_NAME, 'div')
        .find_element(By.CSS_SELECTOR, 'span[style*="position: relative;"]')
        .text
    )

    # determine the winner team; find_elements returns an empty list instead
    # of raising, so no exception handling is needed for the losing side
    winner = None
    for win_index, team in enumerate(header.find_elements(By.CLASS_NAME, 'team')):
        if team.find_elements(By.CSS_SELECTOR, '.score.mod-win'):
            winner = 'ct' if win_index == 0 else 't'
    if winner is None:
        raise ValueError(f"no winning score found for map {map_name!r}")

    # players from both teams
    ct = []
    t = []
    for t_index, table_div in enumerate(active_game.find_elements(By.TAG_NAME, 'table')):
        # Parse a static snapshot with BeautifulSoup rather than issuing one
        # WebDriver call per cell.
        soup = BeautifulSoup(table_div.get_attribute('outerHTML'), 'html.parser')
        roster = ct if t_index == 0 else t
        for row in soup.find('tbody').find_all('tr'):
            roster.append(_parse_player_row(row))

    return Match(ct, t, map_name, winner)

In [7]:
def arrange(match_stat):
    """Append one map result to ``data.csv`` as a single flat row.

    Parameters
    ----------
    match_stat : Match
        Object with ``ct`` and ``t`` (iterables of player objects exposing
        ``agent``, ``R``, ``ACS``, ``KAST``, ``ADR`` and ``HS``), plus ``map``
        and ``winner``.

    Row layout: the six stats of every ``ct`` player, then of every ``t``
    player, then the map name, then the winner label.
    """
    stat_fields = ("agent", "R", "ACS", "KAST", "ADR", "HS")

    row = []
    for player in (*match_stat.ct, *match_stat.t):
        for field in stat_fields:
            row.append(getattr(player, field))
    row.append(match_stat.map)
    row.append(match_stat.winner)

    # newline='' is required by the csv module; without it Windows turns the
    # writer's "\r\n" into "\r\r\n" and every other row comes out blank.
    with open("data.csv", "a", newline="", encoding="utf-8") as csvfile:
        csv.writer(csvfile).writerow(row)
    

#### Locate the events list and start the scraping process

In [8]:
# Walk through the event list page by page. The current page is always
# scraped before looking for a "next" link, so a listing that has only one
# page is still processed.
while True:
    # locate completed event div
    events_div = driver.find_elements(By.CLASS_NAME, 'events-container-col')
    completed_events_div = events_div[1] # completed events always == second div

    # scrape all completed events (every clickable element in that column)
    for a in completed_events_div.find_elements(By.TAG_NAME, 'a'):
        try:
            getMatch(a)
        except Exception as exc:
            # Keep going with the next event, but say why this one was skipped.
            # Catching `Exception` (not a bare except) keeps Ctrl-C working.
            print(f"event skipped ({type(exc).__name__})")
            continue

    # find page div and look for pages after the active one
    page = driver.find_element(By.CLASS_NAME, 'action-container-pages')
    span = page.find_element(By.CSS_SELECTOR, 'span.btn.mod-page.mod-active')
    next_pages = span.find_elements(By.XPATH, 'following-sibling::a')

    # repeat the process until there is no next page
    if not next_pages:
        print('completed for all page')
        break

    print(f'scrapping {span.text}')
    next_pages[0].click()

# repeat the process for all pages


scrapping data for Raidiant Academy 2024: March...fail, continue with next event...

scrapping data for Champions Tour 2024: Americas Kickoff...done
scrapping data for Champions Tour 2024: China Kickoff...done
scrapping data for Champions Tour 2024: EMEA Kickoff...done
scrapping data for Community Gaming SERIES #12...fail, continue with next event...

scrapping data for Champions Tour 2024: Pacific Kickoff...done
scrapping data for Game Changers 2024 LATAM: Open Qualifiers...error occurs
error occurs
done
scrapping data for Project V 2024 Division 1 - Split 1: Stage 1...done
scrapping data for VALORANT Open Tour France 2024: Stage 1...done
scrapping data for VALORANT East: United: Season 3: Split 1 - Weekly Cups...done
scrapping data for Challengers League 2024 North America: Qualifiers...error occurs
error occurs
done
scrapping data for Corsair Championship 2024...fail, continue with next event...

scrapping data for Game Changers 2024: EMEA Kickoff...error occurs
error occurs
error o

#### Store the dataframe as .csv for future use (model training / backup)

In [None]:
# store as csv

### Exit driver, complete scraping

In [None]:
# Quit the driver once scraping is finished; run this cell last, since the
# cells above all use `driver`.
driver.quit()