# Counter Strikle Webscraper

## Notebook Description

__Author:__ Daniël Vermaas

This notebook scrapes [Liquidpedia](https://liquipedia.net/counterstrike/Main_Page), in order to make [Counter-Strikle](https://blast.tv/counter-strikle) puzzle-solving easier. Before using the notebook, please read the [Liquidpedia ToS about API usage](https://liquipedia.net/api-terms-of-use).

## Libraries & Constants

In [1]:
import os
import time
from datetime import datetime
import json
import mwparserfromhell
from ratelimit import limits, sleep_and_retry
import pandas as pd
from tqdm.notebook import tqdm
import requests
import ipywidgets as widgets

In [2]:
BASE_URL = "https://liquipedia.net/counterstrike/api.php?"
HEADERS = {"User-Agent": "Counter-Strikle-Bot (dvermaas@live.com)","Accept-Encoding": "gzip"}
CSV_FILE = "players.csv"
QUERY_COOLDOWN = 2
REGIONS = {
    "Europe" : ["Europe", "CIS"],
    "Americas" : ["North America", "South America"],
    "Asia-Pacific" : ["Oceania", "Asia"],
}
COLUMNS = [
    "PAGE",
    "NAME",
    "REAL NAME",
    "REGION",
    "NATIONALITY",
    "TEAM",
    "AGE",
    "WEAPON",
    "MAJOR APPEARANCES",
    "LAST UPDATED"
]

## MediaWiki API requests

In [3]:
@sleep_and_retry
@limits(calls=1, period=QUERY_COOLDOWN)
def query_page(page, rvsection):
    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "titles": page,
        "rvprop": "content",
        "rvslots": "*",
        "rvsection": rvsection,
    }
    data = requests.get(BASE_URL, params=params, headers=HEADERS).json()
    pages = data["query"]["pages"]
    page_id = next(iter(pages))
    page = pages[page_id]
    return mwparserfromhell.parse(page["revisions"][0]["slots"]["main"]["*"])

In [33]:
@sleep_and_retry
@limits(calls=1, period=QUERY_COOLDOWN)
def search_page(search):
    params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": search,
        "srlimit": 1,
    }
    data = requests.get(BASE_URL, params=params, headers=HEADERS).json()
    try:
        pages = data["query"]["search"]
        return next(iter(pages))["title"]
    except:
        print(f"searc: {search}\n", data)

search_page("Kacper Słoma")

'Kap3r'

## Generate Country to Region Map

In [9]:
def build_region_dict():
    country_region_dict = {}
    with open("regions.json", "r") as file:
        region_dict = json.load(file)
    for region, countries in region_dict.items():
            for country in countries:
                country_region_dict[country] = region
    return country_region_dict

REGION_DICT = build_region_dict()

In [10]:
# Fetches list of all concluded majors (checks hltv awards) 
def fetch_majors():
    wikicode = query_page("Majors", 8)
    links = wikicode.filter_wikilinks()
    links = [link.split('|')[0].replace('[[', '') for link in links]
    return links

MAJOR_LIST = fetch_majors()

In [11]:
# Fetches all player ids
def fetch_players():
    results = []
    for category in ["Players", "Coaches"]:
        url = f"{BASE_URL}action=query&list=categorymembers&cmtitle=Category:{category}&cmlimit=max&format=json"

        while True:
            response = requests.get(url)
            time.sleep(QUERY_COOLDOWN)
            data = json.loads(response.text)
            pages = data["query"]["categorymembers"]
            results.extend(pages)

            if "continue" not in data:
                break

            cont = data["continue"]
            cmcontinue = cont["cmcontinue"]
            url = f"{url}&cmcontinue={cmcontinue}"
    results = [row["title"] for row in results if "Category" not in row["title"]]
    return results

#PLAYER_LIST = fetch_players()

In [37]:
# fetch blast players
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import string

OPTIONS = webdriver.ChromeOptions()
OPTIONS.add_experimental_option("excludeSwitches", ["enable-automation"])
OPTIONS.add_experimental_option('useAutomationExtension', False)
OPTIONS.add_argument('--disable-blink-features=AutomationControlled')

def fetch_blast_players():
    browser = webdriver.Chrome(options=OPTIONS)
    browser.get("https://blast.tv/counter-strikle")

    cookies_button = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, "//button[@class='coi-consent-banner__agree-button']")))
    cookies_button.click()

    rules_button = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, "//button[@class='styled__StyledButton-sc-y8jf4t-0 eccxcB']")))
    rules_button.click()

    blast_players = []
    search_bar = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, "//input[@class='styled__SearchBoxInput-sc-1uwunev-2 uWYFV']")))
    for letter in tqdm(string.ascii_lowercase):
        search_bar.send_keys(letter)
        time.sleep(.2)
        autocomplete_results = browser.find_elements(By.XPATH, "//div[contains(@class, 'styled__SearchResultRow-sc-1uwunev-4')]")
        blast_players.extend([result.text.split(" - ")[1] for result in autocomplete_results])
        search_bar.send_keys(Keys.BACKSPACE)
        time.sleep(.2)
    browser.quit()

    cashed_names = {}
    if os.path.exists(CSV_FILE):
        csv_data = pd.read_csv(CSV_FILE)
        cashed_names = dict(zip(csv_data["REAL NAME"], csv_data["NAME"]))
    return [cashed_names.get(name, search_page(name)) for name in tqdm(blast_players)]


PLAYER_LIST = fetch_blast_players()

  0%|          | 0/26 [00:00<?, ?it/s]

  0%|          | 0/531 [00:00<?, ?it/s]

searc: Valeriy Vakhovskiy
 {'batchcomplete': '', 'query': {'searchinfo': {'totalhits': 0, 'suggestion': 'valorix vakhovskiy', 'suggestionsnippet': '<em>valorix</em> vakhovskiy'}, 'search': []}}
searc: Evgeny Lebedev
 {'batchcomplete': '', 'query': {'searchinfo': {'totalhits': 0, 'suggestion': 'event lebedev', 'suggestionsnippet': '<em>event</em> lebedev'}, 'search': []}}


KeyboardInterrupt: 

In [28]:
cashed_names.get("Dan Madesclaire")

'apEX'

In [25]:
cashed_names = {}
if os.path.exists(CSV_FILE):
    csv_data = pd.read_csv(CSV_FILE)
    cashed_names = dict(zip(csv_data["REAL NAME"], csv_data["NAME"]))
    print(cashed_names)

{'Dan Madesclaire': 'apEX', 'Sergey Aleksandrovich Rykhtorov': 'Ax1Le', 'Timothy Ta': 'autimatic', 'Aleksi Jalli': 'allu', nan: nan, 'François Delaunay': 'AMANEK', 'Aleksi Antti Kaarlo Virolainen': 'Aleksib', 'Philip Aistrup Larsen': 'aizy', 'Andrei Felipe Piovezan Machado': 'arT', 'Jani Jussila': 'Aerial', 'Frederik Gyldstrand': 'acoR', 'Asger Grunnet Larsen': 'AcilioN', 'Aaron Christian Charles Ward': 'AZR', 'Sheng Yuanzhang': 'Attacker', 'Kyrylo Karasjov': 'ANGE1', 'Liang Zhuo': 'advent', 'Almaz Asadullin': 'almazer', 'Liu Zhihong': 'aumaN', 'Bradley Fodor': 'ANDROID', 'Rinaldo Moda Júnior': 'ableJ', 'Zhengwei Bian': 'alex', 'Alistair Johnston': 'aliStair', 'Omar Chakkor Feltrer': 'arki', 'Anna Ananikova': 'Ant1ka', 'Guy Trachtman': 'anarkez', 'Aurélien Drapier': 'afro', 'Kirill Sergeyevich Mikhaylov': 'Boombl4', 'Valerij Jevghenijovych Vakhovsjkyj': 'b1t', 'Paweł Mateusz Bieliński': 'byali', 'Vincent Cayonte': 'Brehze', 'Timur Tulepov': 'buster', 'Ricardo de Souza Prass': 'boltz', 

In [11]:
print(len(PLAYER_LIST))
PLAYER_LIST.get("ScreaM")

527


'Adil Benrlitom'

In [9]:
def fetch_major_players():
    # load majors from json
    file_name = "majors.json"
    major_dict = {}
    if os.path.exists(file_name):
        with open(file_name, "r") as file:
            major_dict = json.load(file)

    # check for new majors
    for major in tqdm(fetch_majors()):
        if major in major_dict:
            continue
        wikicode = query_page(major, "5")
        teams_dict = {}
        for template in wikicode.filter_templates(matches=r"\bTeamCard\b"):
            if template.name.strip() == "TeamCard":
                if not template.has("qualifier"):
                    continue
                if not template.has("p1"):
                    continue
                team = template.get("team").value.strip()
                players = [template.get(f"p{i}link", template.get(f"p{i}")).value.strip() for i in range(1, 6)]
                teams_dict[team] = players
        major_dict[major] = teams_dict

    # save majors to json
    with open(file_name, "w") as file:
        json.dump(major_dict, file, indent=2)

    # invert dict
    major_players = {}
    for major,teams in major_dict.items():
        for team, players in teams.items():
            for player in players:
                if player not in major_players:
                    major_players[player] = 0
                major_players[player] += 1

    return major_players

MAJOR_PLAYERS = fetch_major_players()

  0%|          | 0/19 [00:00<?, ?it/s]

## Player-Specific Information

In [20]:
# fetches all player data and puts it in dict
def fetch_player_dict(player_id):
    infobox_dict = {}
    wikicode = query_page(player_id, 0)
    try:
        infobox = wikicode.filter_templates(matches="Infobox player")[0]
    except IndexError:
        return infobox_dict
    for param in infobox.params:
        value = mwparserfromhell.parse(param.value.strip_code()).strip()
        infobox_dict[param.name.strip()] = str(value)
    return infobox_dict

# filter player data for relevant information
def fetch_player(player_id):
    # parse player information
    info_dict = fetch_player_dict(player_id)

    # add name to dict
    output_dict = dict()
    output_dict["PAGE"] = player_id
    output_dict["NAME"] = info_dict.get("id")
    output_dict["REGION"] = REGION_DICT.get(info_dict.get("country"))
    output_dict["REAL NAME"] = info_dict.get("romanized_name", info_dict.get("name"))
    output_dict["NATIONALITY"] = info_dict.get("country")
    output_dict["TEAM"] = info_dict.get("team")
    output_dict["AGE"] = info_dict.get("birth_date")
    roles = [info_dict.get("role"), info_dict.get("role2")]
    if "awp" in roles:
        output_dict["WEAPON"] = "AWP"
    else:
        output_dict["WEAPON"] = "AK47"
    output_dict["MAJOR APPEARANCES"] = MAJOR_PLAYERS.get(info_dict.get("id"), 0)
    output_dict["LAST UPDATED"] = datetime.now().strftime("%Y-%m-%d")
    return output_dict

#fetch_player("Boomser")
#fetch_player("ScreaM")

## Generate CSV

In [47]:
def build_df():
    # Get dataframe to write to
    if os.path.exists(CSV_FILE):
        playerdata = pd.read_csv(CSV_FILE)
    else:
        playerdata = pd.DataFrame(columns=COLUMNS)
    try:
        for player in tqdm(PLAYER_LIST, leave=False):
            if not (playerdata["PAGE"].eq(player)).any():
                row_data = fetch_player(player)
                playerdata = pd.concat([playerdata, pd.DataFrame.from_records([row_data])], ignore_index=True)
    except Exception as e:
        print("Error:", e)
        print("Player:", player)
    
    playerdata.to_csv(CSV_FILE, index=False)

build_df()

  0%|          | 0/527 [00:00<?, ?it/s]

Error: 'revisions'
Player: kaper


## Post Processing

In [35]:
def age(birthdate):
    birthdate = datetime.strptime(birthdate, "%Y-%m-%d")
    today = datetime.now()
    return today.year - birthdate.year - ((today.month, today.day) < (birthdate.month, birthdate.day))

def playerdata_postprocess(df):
    columns = list(pd.read_csv(CSV_FILE).columns)
    del columns[columns.index("TEAM")]
    df.dropna(inplace = True, subset = columns)
    age_conversions = []
    for date in df["AGE"]:
        try:
            age_conversions.append(age(date))
        except ValueError:
            age_conversions.append(0)
    df["AGE"] = age_conversions
    return df.sort_values(["MAJOR APPEARANCES"], ascending=False)

playerdata_post = playerdata_postprocess(pd.read_csv(CSV_FILE))

Unnamed: 0,PAGE,NAME,REAL NAME,REGION,NATIONALITY,TEAM,AGE,WEAPON,MAJOR APPEARANCES,LAST UPDATED
710,Dupreeh,dupreeh,Peter Rothmann Rasmussen,Europe,Denmark,Team Vitality,30,AWP,17,2023-05-19
3035,Xyp9x,Xyp9x,Andreas Højsleth,Europe,Denmark,Astralis Talent,27,AK47,16,2023-05-19
2497,Shox,shox,Richard Papillon,Europe,France,,30,AK47,16,2023-05-19
126,ApEX,apEX,Dan Madesclaire,Europe,France,Team Vitality,30,AK47,15,2023-05-19
1354,Karrigan,karrigan,Finn Andersen,Europe,Denmark,FaZe Clan,33,AK47,15,2023-05-19
...,...,...,...,...,...,...,...,...,...,...
1298,Juissi,juissi,Youssef Adam,Europe,Finland,ENCE Academy,20,AK47,0,2023-05-19
1299,Juli,Juli,Juliana Tosic,Americas,United States,,23,AK47,0,2023-05-19
1300,Juliano,juliano,Julia Kiran,Europe,Sweden,G2 Oya,29,AK47,0,2023-05-19
1302,Julih,julih,Julia Gomes,Americas,Brazil,B4 Esports Female,24,AK47,0,2023-05-19


## UI Search Engine

In [34]:
def generate_selection(data, description):
    unique_data = list(data.unique())
    return widgets.SelectMultiple(
        options=unique_data,
        value=unique_data,
        rows=len(unique_data),
        description=description,
        disabled=False)


def generate_rangeslider(data, description):
    return widgets.FloatRangeSlider(
        value=[min(data), max(data)],
        min=min(data),
        max=max(data),
        description=description,
        step=1,
        disabled=False,
        continuous_update=False,
        orientation='horizontal',
        readout=True,
        readout_format='1',
)

region_selector = generate_selection(playerdata_post["REGION"], "REGION")
age_slider = generate_rangeslider(playerdata_post["AGE"], "AGE")
weapon_selector = generate_selection(playerdata_post["WEAPON"], "WEAPON")
majors_slider = generate_rangeslider(playerdata_post["MAJOR APPEARANCES"], "MAJOR APPEARANCES")

def ui_selector(sregion, sage, sweapon, smajors):
    return playerdata_post.loc[(playerdata_post["REGION"].isin(sregion)) &
                               (playerdata_post["AGE"] >= sage[0]) &
                               (playerdata_post["AGE"] <= sage[1]) &
                               (playerdata_post["WEAPON"].isin(sweapon)) &
                               (playerdata_post["MAJOR APPEARANCES"] >= smajors[0]) &
                               (playerdata_post["MAJOR APPEARANCES"] <= smajors[1])
                               ]
    
widgets.interact(ui_selector, sregion = region_selector, sage = age_slider, sweapon = weapon_selector, smajors = majors_slider)

interactive(children=(SelectMultiple(description='REGION', index=(0, 1, 2), options=('Europe', 'Americas', 'As…

<function __main__.ui_selector(sregion, sage, sweapon, smajors)>

  0%|          | 0/26 [00:00<?, ?it/s]