# Counter Strikle Webscraper

## Notebook Description

__Author:__ Daniël Vermaas

This notebook scrapes liquidpedia (https://liquipedia.net/counterstrike/Main_Page), in order to make counter strikle (https://blast.tv/counter-strikle) puzzle-solving easier. Before using the notebook, please read the Liquidpedia ToS about API usage: https://liquipedia.net/api-terms-of-use.

## Libraries & Constants

In [52]:
import os
import time
import lxml
from bs4 import BeautifulSoup
from datetime import datetime
import json
import mwparserfromhell

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import requests

import ipywidgets as widgets
from ipywidgets import interact

In [61]:
HEADERS = {"User-Agent": "Counter-Strikle-Bot","Accept-Encoding": "gzip"}
BASE_URL = "https://liquipedia.net/counterstrike/api.php?"
QUERY_COOLDOWN = 4
CSV_FILE = "players.csv"
REGIONS = {
    "Europe" : ["Europe", "CIS"],
    "Americas" : ["North America", "South America"],
    "Asia-Pacific" : ["Oceania", "Asia"],
}
COLUMNS = [
    "NAME",
    "REAL NAME",
    "REGION",
    "NATIONALITY",
    "TEAM",
    "AGE",
    "WEAPON",
    "MAJOR APPEARANCES",
    "EARNINGS",
    "LAST UPDATED"
]
UNWANTED_STRINGS = [
    "Players",
    "Teams",
    "Tournaments",
    "Casters",
    "Countries",
    "Asia",
    "CIS",
]

## MediaWiki API requests

In [64]:
# API call function
def fetch_page(page, cooldown=0):
    url =  f"{BASE_URL}action=parse&format=json&page={page}"
    response = requests.get(url, HEADERS)
    page_html = response.json()['parse']['text']['*']
    soup = BeautifulSoup(page_html,features="lxml")
    time.sleep(cooldown)
    return soup

def query_page(page, rvsection, cooldown=QUERY_COOLDOWN):
    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "titles": page,
        "rvprop": "content",
        "rvslots": "*",
        "rvsection": rvsection,
    }
    response = requests.get(BASE_URL, params=params)
    data = response.json()
    pages = data["query"]["pages"]
    page_id = next(iter(pages))
    page = pages[page_id]
    wikicode = mwparserfromhell.parse(page["revisions"][0]["slots"]["main"]["*"])
    time.sleep(cooldown)
    return wikicode

## Generate Country to Region Map

In [65]:
def build_region_dict():
    country_region_dict = {}
    with open("regions.json", "r") as file:
        region_dict = json.load(file)
    for region, countries in region_dict.items():
            for country in countries:
                country_region_dict[country] = region
    return country_region_dict

REGION_DICT = build_region_dict()
REGION_DICT.get("Turkey")

'Europe'

In [27]:
# Fetches list of all concluded majors (checks hltv awards) 
def fetch_majors():
    wikicode = query_page("Majors", 8)
    links = wikicode.filter_wikilinks()
    links = [link.split('|')[0].replace('[[', '') for link in links]
    return links

MAJOR_LIST = fetch_majors()

In [24]:
# Fetches all player data and puts it in dict
def fetch_player_dict(player_id):
    wikicode = query_page(player_id, 0)
    infobox = wikicode.filter_templates(matches="Infobox player")[0]
    infobox_dict = {}
    for param in infobox.params:
        value = mwparserfromhell.parse(param.value.strip_code()).strip()
        infobox_dict[param.name.strip()] = str(value)
    return infobox_dict

# Fetches all player ids
def fetch_players():
    url = f"{BASE_URL}action=query&list=categorymembers&cmtitle=Category:Players&cmlimit=max&format=json"
    results = []

    while True:
        response = requests.get(url)
        time.sleep(QUERY_COOLDOWN)
        data = json.loads(response.text)
        pages = data["query"]["categorymembers"]
        results.extend(pages)

        if "continue" not in data:
            break

        cont = data["continue"]
        cmcontinue = cont["cmcontinue"]
        url = f"{url}&cmcontinue={cmcontinue}"
    return results

fetch_player_dict("ropz")

{'id': 'ropz',
 'image': 'Ropz at Antwerp Major EU RMR.jpg',
 'name': 'Robin Kool',
 'birth_date': '1999-12-22',
 'country': 'Estonia',
 'status': 'Active',
 'years_active': '2015 – Present',
 'team': 'FaZe Clan',
 'role': 'lurk',
 'csgo': 'y',
 'twitter': 'ropz',
 'facebook': 'ropzicle',
 'instagram': 'ropzicle',
 'youtube': 'c/Ropz',
 'twitch': 'ropz',
 'esea': '1042223',
 'faceit': 'ropz',
 'steam': '76561197991272318',
 'team_history': ''}

In [80]:
def fetch_major_players():
    major_dict = {}
    for major in fetch_majors():
        wikicode = query_page(major, "5")
        teams_dict = {}
        for template in wikicode.filter_templates(matches=r"\bTeamCard\b"):
            if template.name.strip() == "TeamCard":
                # skip showmatches
                if not template.has("qualifier"):
                    continue
                if not template.has("p1"):
                    continue
                team = template.get("team").value.strip()
                players = tuple(template.get(f"p{i}").value.strip() for i in range(1, 6))
                teams_dict[team] = players
        major_dict[major] = teams_dict
    return major_dict

major_players = fetch_major_players()
major_players["DreamHack/2013/Winter"]

{'Ninjas in Pyjamas': ('f0rest', 'GeT_RiGhT', 'Xizt', 'Fifflaren', 'friberg'),
 'n!faculty': ('gla1ve', 'karrigan', 'cajunb', 'Pimp', 'raalz'),
 'SK Gaming': ('pita', 'twist', 'xelos', 'Delpan', 'MODDII'),
 'Copenhagen Wolves': ('FeTiSh', 'dupreeh', 'Xyp9x', 'device', 'Nico'),
 'Universal Soldiers': ('TaZ', 'NEO', 'pashaBiceps', 'Snax', 'byali'),
 'Natus Vincere': ('Zeus', 'starix', 'ceh9', 'seized', 'kibaken'),
 'Astana Dragons': ('ANGE1', 'Dosia', 'AdreN', 'markeloff', 'kUcheR'),
 'compLexity Gaming': ('Hiko', 'seang@res', 'Semphis', 'swag', 'n0thing'),
 'VeryGames': ('Ex6TenZ', 'NBK-', 'SmithZz', 'ScreaM', 'shox'),
 'Clan-Mystik': ('HaRts', 'ioRek', 'kioShiMa', 'KQLY', 'apEX'),
 'Fnatic': ('JW', 'flusha', 'schneider', 'Devilwalk', 'pronax'),
 'Team iBUYPOWER': ('anger', 'Skadoodle', 'adreN', 'AZK', 'DaZeD'),
 'LGB eSports': ('SKYTTEN', 'eksem', 'KRiMZ', 'dennis', 'olofm'),
 'Recursive eSports': ('Happy', 'GMX', 'kennyS', 'Uzzziii', 'Maniac'),
 'Xapso': ('ultra', 'centeks', 'cadiaN',

In [37]:
major_players.get("PGL/2022/Antwerp")

{'Heroic': <generator object fetch_major_players.<locals>.<genexpr> at 0x000001B62B27B4C0>,
 'Copenhagen Flames': <generator object fetch_major_players.<locals>.<genexpr> at 0x000001B62B27B3D0>,
 'BIG': <generator object fetch_major_players.<locals>.<genexpr> at 0x000001B62B27BC40>,
 'Cloud9': <generator object fetch_major_players.<locals>.<genexpr> at 0x000001B62B27A5C0>,
 'FURIA Esports': <generator object fetch_major_players.<locals>.<genexpr> at 0x000001B62B68C7C0>,
 'FaZe Clan': <generator object fetch_major_players.<locals>.<genexpr> at 0x000001B62B68C040>,
 'Ninjas in Pyjamas': <generator object fetch_major_players.<locals>.<genexpr> at 0x000001B62B68C130>,
 'Natus Vincere': <generator object fetch_major_players.<locals>.<genexpr> at 0x000001B62B68C220>}

## Player-Specific Information

In [54]:
def fetch_player(player_id, major_list):
    # parse player information
    info_dict = fetch_player_dict(player_id)
    
    # add name to dict
    output_dict = dict()
    output_dict["NAME"] = info_dict.get("id")
    output_dict["REGION"] = "todo"
    output_dict["REAL NAME"] = info_dict.get("romanized_name", info_dict.get("name"))
    output_dict["NATIONALITY"] = info_dict.get("country")
    output_dict["TEAM"] = info_dict.get("team")
    output_dict["AGE"] = info_dict.get("birth_date")
    roles = [info_dict.get("role"), info_dict.get("role2")]
    if "awp" in roles:
        output_dict["WEAPON"] = "AWP"
    else:
        output_dict["WEAPON"] = "AK47"
            
    # Get major appearances
    soup = fetch_page("S1mple/Results")
    try:
        soup = fetch_page(output_dict["NAME"] + "/Results")
        print(soup)
        event_elements = soup.find_all("tr", {"class": "valvemajor-highlighted"})
        event_name_list = [event.find("td", {"style": "text-align:left"}).find("a")["href"] for event in event_elements]
        event_name_list = [event for event in event_name_list if event in major_list]
        output_dict["MAJOR APPEARANCES"] = len(event_name_list)
    except:
        output_dict["MAJOR APPEARANCES"] = 0
    
    output_dict["LAST UPDATED"] = datetime.now().strftime("%d/%m/%Y")
    return output_dict

#fetch_player("XANTARES", MAJOR_LIST)
fetch_player("S1mple", MAJOR_LIST)

FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?

In [31]:
fetch_player_dict("CadiaN")

{'id': 'cadiaN',
 'image': 'CadiaN at BLAST Paris Major 2023 EU RMR.jpeg',
 'name': 'Casper Møller',
 'birth_date': '1995-06-26',
 'country': 'Denmark',
 'team': 'Heroic',
 'status': 'Active',
 'years_active': '2011 – Present',
 'role': 'igl',
 'role2': 'awp',
 'ids': 'cadiii, Cadian',
 'css': 'y',
 'csgo': 'y',
 'twitch': 'cadian',
 'twitter': 'caspercadiaN',
 'facebook': 'caspercadiaN',
 'steam': '76561198004115516',
 'esea': '388062',
 'faceit': 'cadiaN',
 'instagram': 'cadiancs',
 'team_history': 'Counter-Strike: Source\n\nCounter-Strike: Global Offensive'}

## Generate CSV

In [None]:
def build_df():
    # Get dataframe to werite to
    if os.path.exists(CSV_FILE):
        playerdata = pd.read_csv(CSV_FILE)
    else:
        playerdata = pd.DataFrame(columns=COLUMNS)
        
    # Get list of all mayors
    major_list = fetch_majors()

    try:
        for major_region, sub_regions in tqdm(REGIONS.items()):
            for sub_region in tqdm(sub_regions, leave=False):
                region_ids = fetch_ids(sub_region)[:1]
                for player in tqdm(region_ids, leave=False):
                    if not (playerdata["NAME"].eq(player)).any():
                        row_data = fetch_player(player, major_region, major_list)
                        playerdata = pd.concat([playerdata, pd.DataFrame.from_records([row_data])], ignore_index=True)
    except Exception as e:
        print("Error:", e)
    
    playerdata.to_csv(CSV_FILE, index=False)
    return=(TODAY()-B2)/365

build_df()

## Post Processing

In [None]:
def age(birthdate):
    birthdate = datetime.strptime(birthdate, "%d/%m/%Y")
    today = datetime.now()
    age = today.year - birthdate.year - ((today.month, today.day) < (birthdate.month, birthdate.day))
    return age

def playerdata_postprocess(df):
    columns = list(pd.read_csv(CSV_FILE).columns)
    del columns[columns.index("TEAM")]
    df.dropna(inplace = True, subset = columns)
    df.sort_values(by=["EARNINGS"], ascending = False, inplace = True)
    df["AGE"] = [age(date) for date in df["AGE"]]
    df["REGION"] = np.where(df["NATIONALITY"].isin(MISLABELS), "Europe", df["REGION"])
    return df

playerdata_post = playerdata_postprocess(pd.read_csv(CSV_FILE))

## UI Search Engine

In [None]:
def generate_selection(data, description):
    unique_data = list(data.unique())
    return widgets.SelectMultiple(
        options=unique_data,
        value=unique_data,
        rows=len(unique_data),
        description=description,
        disabled=False)


def generate_rangeslider(data, description):
    return widgets.FloatRangeSlider(
        value=[min(data), max(data)],
        min=min(data),
        max=max(data),
        step=1,
        disabled=False,
        continuous_update=False,
        orientation='horizontal',
        readout=True,
        readout_format='1',
)

region_selector = generate_selection(playerdata_post["REGION"], "REGION")
age_slider = generate_rangeslider(playerdata_post["AGE"], "AGE")
weapon_selector = generate_selection(playerdata_post["WEAPON"], "WEAPON")
majors_slider = generate_rangeslider(playerdata_post["MAJOR APPEARANCES"], "MAJOR APPEARANCES")

def ui_selector(sregion, sage, sweapon, smajors):
    return playerdata_post.loc[(playerdata_post["REGION"].isin(sregion)) &
                               (playerdata_post["AGE"] >= sage[0]) &
                               (playerdata_post["AGE"] <= sage[1]) &
                               (playerdata_post["WEAPON"].isin(sweapon)) &
                               (playerdata_post["MAJOR APPEARANCES"] >= smajors[0]) &
                               (playerdata_post["MAJOR APPEARANCES"] <= smajors[1])
                               ]
    
interact(ui_selector, sregion = region_selector, sage = age_slider, sweapon = weapon_selector, smajors = majors_slider)