# Counter Strikle Webscraper

## Notebook Description

__Author:__ Daniël Vermaas

This notebook scrapes liquidpedia (https://liquipedia.net/counterstrike/Main_Page), in order to make counter strikle (https://blast.tv/counter-strikle) puzzle-solving easier. Before using the notebook, please read the Liquidpedia ToS about API usage: https://liquipedia.net/api-terms-of-use.

## Libraries & Constants

In [1]:
import os
import csv
import time
from time import sleep
from bs4 import BeautifulSoup
from datetime import datetime
import json
import mwparserfromhell

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import requests
#from urllib.request import quote

#!jupyter nbextension enable --py widgetsnbextension
import ipywidgets as widgets
from ipywidgets import interact

In [102]:
HEADERS = {"User-Agent": "Counter-Strikle-Bot","Accept-Encoding": "gzip"}
BASE_URL = "https://liquipedia.net/counterstrike/api.php?"
QUERY_COOLDOWN = 4
CSV_FILE = "players.csv"
REGIONS = {
    "Europe" : ["Europe", "CIS"],
    "Americas" : ["North America", "South America"],
    "Asia-Pacific" : ["Oceania", "Asia"],
}
COLUMNS = [
    "NAME", 
    "REAL NAME", 
    "REGION", 
    "NATIONALITY", 
    "TEAM", 
    "AGE", 
    "WEAPON", 
    "MAJOR APPEARANCES", 
    "EARNINGS", 
    "LAST UPDATED"
]
UNWANTED_STRINGS = [
    "Players", 
    "Teams", 
    "Tournaments", 
    "Casters", 
    "Countries", 
    "Asia", 
    "CIS",
]

## Generate Country to Region Map

In [None]:
# builds country to region dict, and stores it
def build_region_dict():
    # build the region to country dict
    file_name = "region_country_dict.json"
    if not os.path.exists(file_name):
        region_dict = {}
        for region, subregions in REGIONS.items():
            results = []
            for subregion in subregions:
                url = f"{BASE_URL}action=query&list=categorymembers&cmtitle=Category:{subregion}&cmlimit=max&format=json"
                time.sleep(QUERY_COOLDOWN)
                response = requests.get(url)
                data = json.loads(response.text)
                pages = data["query"]["categorymembers"]
                pages = [line.get("title").split(":")[1] for line in pages]
                results.extend(pages)
            results = [result for result in results if not any(substring in result for substring in UNWANTED_STRINGS)]
            region_dict[region] = sorted(list(set(results)))
        with open(file_name, "w") as file:
            json.dump(region_dict, file, indent=2)
    
    # convert to country to region dict  
    with open(file_name, "r") as file:
        region_dict = json.load(file)
    country_region_dict = {}
    for region, countries in region_dict.items():
        for country in countries:
            country_region_dict[country] = region
    
    return country_region_dict

REGION_DICT = build_region_dict()

In [119]:
with open(file_name, "r") as file:
    region_dict = json.load(file)
    
set(region_dict["Europe"]) & set(region_dict["Asia-Pacific"])

{'Azerbaijan',
 'Georgia',
 'Kazakhstan',
 'Kyrgyzstan',
 'Tajikistan',
 'Turkey',
 'Uzbekistan'}

In [125]:
region_dict["Europe"]

['Russia',
 'Germany',
 'Hungary',
 'Albania',
 'Malta',
 'Tajikistan',
 'Switzerland',
 'Sweden',
 'Wales',
 'Greece',
 'England',
 'Portugal',
 'Liechtenstein',
 'Turkey',
 'Kyrgyzstan',
 'Austria',
 'Scotland',
 'Ukraine',
 'United Kingdom',
 'Uzbekistan',
 'Faroe Island',
 'Belarus',
 'Kazakhstan',
 'Belgium',
 'Bosnia and Herzegovina',
 'Ireland',
 'Denmark',
 'Spain',
 'Kosovo',
 'Lithuania',
 'Montenegro',
 'Azerbaijan',
 'Norway',
 'Croatia',
 'Moldova',
 'Italy',
 'France',
 'Bulgaria',
 'Poland',
 'Romania',
 'Armenia',
 'North Macedonia',
 'Georgia',
 'Finland',
 'Estonia']

In [111]:
file_name = "region_country_dict.json"
with open(file_name, "r") as file:
    region_dict = json.load(file)
for region, countries in region_dict.items():
        for country in countries:
            country_region_dict[country] = region
country_region_dict

{'Germany': 'Europe',
 'Hungary': 'Europe',
 'Russia': 'Europe',
 'Albania': 'Europe',
 'Malta': 'Europe',
 'Tajikistan': 'Asia-Pacific',
 'Switzerland': 'Europe',
 'Sweden': 'Europe',
 'Wales': 'Europe',
 'Greece': 'Europe',
 'England': 'Europe',
 'Portugal': 'Europe',
 'Turkey': 'Asia-Pacific',
 'Kyrgyzstan': 'Asia-Pacific',
 'Austria': 'Europe',
 'Scotland': 'Europe',
 'Ukraine': 'Europe',
 'United Kingdom': 'Europe',
 'Uzbekistan': 'Asia-Pacific',
 'Faroe Island': 'Europe',
 'Belarus': 'Europe',
 'Kazakhstan': 'Asia-Pacific',
 'Belgium': 'Europe',
 'Bosnia and Herzegovina': 'Europe',
 'Ireland': 'Europe',
 'Denmark': 'Europe',
 'Spain': 'Europe',
 'Kosovo': 'Europe',
 'Lithuania': 'Europe',
 'Montenegro': 'Europe',
 'Azerbaijan': 'Asia-Pacific',
 'Norway': 'Europe',
 'Croatia': 'Europe',
 'Moldova': 'Europe',
 'Italy': 'Europe',
 'France': 'Europe',
 'Bulgaria': 'Europe',
 'Poland': 'Europe',
 'Romania': 'Europe',
 'Armenia': 'Europe',
 'North Macedonia': 'Europe',
 'Georgia': 'Asi

In [105]:
REGION_DICT

{'A': 'Israel',
 'm': 'Costa Rica',
 'e': 'Estonia',
 'r': 'Estonia',
 'i': 'Israel',
 'c': 'Israel',
 'a': 'Israel',
 's': 'Israel',
 '-': 'Israel',
 'P': 'Israel',
 'f': 'Israel',
 'E': 'Estonia',
 'u': 'Estonia',
 'o': 'Estonia',
 'p': 'Estonia'}

## API Calls

In [13]:
# API call function
def fetch_page(page, cooldown=0):
    url =  f"{BASE_URL}action=parse&format=json&page={page}"
    response = requests.get(url, HEADERS)
    page_html = response.json()['parse']['text']['*']
    soup = BeautifulSoup(page_html,features="lxml")
    time.sleep(cooldown)
    return soup

def query_page(page, rvsection, cooldown=QUERY_COOLDOWN):
    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "titles": page,
        "rvprop": "content",
        "rvslots": "*",
        "rvsection": rvsection,
    }
    response = requests.get(BASE_URL, params=params)
    data = response.json()
    pages = data["query"]["pages"]
    page_id = next(iter(pages))
    page = pages[page_id]
    wikicode = mwparserfromhell.parse(page["revisions"][0]["slots"]["main"]["*"])
    time.sleep(cooldown)
    return wikicode

## API Calls

In [4]:
# Fetches list of all concluded majors (checks hltv awards) 
def fetch_majors():
    wikicode = query_page("Majors", 8)
    links = wikicode.filter_wikilinks()
    links = [link.split('|')[0].replace('[[', '') for link in links]
    return links

#fetch_majors()

'Europe'

In [78]:
country_region_dict = {}
unwanted_strings = ["Players", "Teams", "Tournaments", "Casters", "Countries", "East Asia", "Southeast Asia", "CIS"]
for region, countries in region_dict.items():
    for country in countries:
        if not any(substring in country for substring in unwanted_strings):
            country_region_dict[country] = region

In [79]:
country_region_dict.keys()

dict_keys(['Germany', 'Hungary', 'Russia', 'Albania', 'Malta', 'Tajikistan', 'Switzerland', 'Sweden', 'Wales', 'Greece', 'England', 'Portugal', 'Turkey', 'Kyrgyzstan', 'Austria', 'Scotland', 'Ukraine', 'United Kingdom', 'Uzbekistan', 'Faroe Island', 'Belarus', 'Kazakhstan', 'Belgium', 'Bosnia and Herzegovina', 'Ireland', 'Denmark', 'Spain', 'Kosovo', 'Lithuania', 'Montenegro', 'Azerbaijan', 'Norway', 'Croatia', 'Moldova', 'Italy', 'France', 'Bulgaria', 'Poland', 'Romania', 'Armenia', 'North Macedonia', 'Georgia', 'Finland', 'Estonia', 'Uruguay', 'United States', 'Peru', 'Chile', 'Greenland', 'Venezuela', 'Panama', 'Brazil', 'Guatemala', 'Argentina', 'Ecuador', 'Mexico', 'Paraguay', 'Bolivia', 'Colombia', 'Dominican Republic', 'Curaçao', 'Nicaragua', 'El Salvador', 'Honduras', 'Canada', 'Costa Rica', 'Bahrain', 'Nepal', 'Cyprus', 'Kuwait', 'Jordan', 'Bangladesh', 'India', 'Singapore', 'Saudi Arabia', 'Mongolia', 'Afghanistan', 'Palestine', 'New Zealand', 'Brunei', 'Pakistan', 'Laos', 'T

In [55]:
country_region_dict["Ukraine"]

'Europe'

In [5]:
# Fetches all player data and puts it in dict
def fetch_player_dict(player_id):
    wikicode = query_page(player_id, 0)
    infobox = wikicode.filter_templates(matches="Infobox player")[0]
    infobox_dict = {}
    for param in infobox.params:
        value = mwparserfromhell.parse(param.value.strip_code()).strip()
        infobox_dict[param.name.strip()] = str(value)
    return infobox_dict

# Fetches all player ids
def fetch_players():
    url = f"{BASE_URL}action=query&list=categorymembers&cmtitle=Category:Players&cmlimit=max&format=json"
    results = []

    while True:
        response = requests.get(url)
        time.sleep(QUERY_COOLDOWN)
        data = json.loads(response.text)
        pages = data["query"]["categorymembers"]
        results.extend(pages)

        if "continue" not in data:
            break

        cont = data["continue"]
        cmcontinue = cont["cmcontinue"]
        url = f"{url}&cmcontinue={cmcontinue}"
    return results

fetch_player_dict("ropz")
#fetch_majors()

{'id': 'ropz',
 'image': 'Ropz at Antwerp Major EU RMR.jpg',
 'name': 'Robin Kool',
 'birth_date': '1999-12-22',
 'country': 'Estonia',
 'status': 'Active',
 'years_active': '2015 – Present',
 'team': 'FaZe Clan',
 'role': 'lurk',
 'csgo': 'y',
 'twitter': 'ropz',
 'facebook': 'ropzicle',
 'instagram': 'ropzicle',
 'youtube': 'c/Ropz',
 'twitch': 'ropz',
 'esea': '1042223',
 'faceit': 'ropz',
 'steam': '76561197991272318',
 'team_history': ''}

In [None]:
def fetch_major_players():
    major_dict = {}
    for major in fetch_majors():
        wikicode = query_page(major, 6)
        teams_dict = {}
        for template in wikicode.filter_templates(matches=r"\bTeamCard\b"):
            if template.name.strip() == "TeamCard":
                # skip showmatches
                if not template.has("qualifier"):
                    continue
                if not template.has("p1"):
                    continue
                team = template.get("team").value.strip()
                players = (template.get(f"p{i}").value.strip() for i in range(1, 6))
                teams_dict[team] = players
        major_dict[major] = teams_dict
    return major_dict

major_players = fetch_major_players()

In [396]:
major_players["DreamHack/2013/Winter"]

{'Ninjas in Pyjamas': ['f0rest', 'GeT_RiGhT', 'Xizt', 'Fifflaren', 'friberg'],
 'n!faculty': ['gla1ve', 'karrigan', 'cajunb', 'Pimp', 'raalz'],
 'SK Gaming': ['pita', 'twist', 'xelos', 'Delpan', 'MODDII'],
 'Copenhagen Wolves': ['FeTiSh', 'dupreeh', 'Xyp9x', 'device', 'Nico'],
 'Universal Soldiers': ['TaZ', 'NEO', 'pashaBiceps', 'Snax', 'byali'],
 'Natus Vincere': ['Zeus', 'starix', 'ceh9', 'seized', 'kibaken'],
 'Astana Dragons': ['ANGE1', 'Dosia', 'AdreN', 'markeloff', 'kUcheR'],
 'compLexity Gaming': ['Hiko', 'seang@res', 'Semphis', 'swag', 'n0thing'],
 'VeryGames': ['Ex6TenZ', 'NBK-', 'SmithZz', 'ScreaM', 'shox'],
 'Clan-Mystik': ['HaRts', 'ioRek', 'kioShiMa', 'KQLY', 'apEX'],
 'Fnatic': ['JW', 'flusha', 'schneider', 'Devilwalk', 'pronax'],
 'Team iBUYPOWER': ['anger', 'Skadoodle', 'adreN', 'AZK', 'DaZeD'],
 'LGB eSports': ['SKYTTEN', 'eksem', 'KRiMZ', 'dennis', 'olofm'],
 'Recursive eSports': ['Happy', 'GMX', 'kennyS', 'Uzzziii', 'Maniac'],
 'Xapso': ['ultra', 'centeks', 'cadiaN',

## Player-Specific Information

In [369]:
fetch_player_dict("b1t")

{'id': 'b1t',
 'image': 'B1t at Antwerp Major 2022 EU RMR.jpg',
 'name': 'Валерій Євгенійович Ваховський',
 'romanized_name': 'Valerij Jevghenijovych Vakhovsjkyj',
 'birth_date': '2003-01-05',
 'country': 'Ukraine',
 'status': 'Active',
 'years_active': '2019 – Present',
 'team': 'Natus Vincere',
 'role': 'entry',
 'role2': 'awp',
 'ids': 'B1T, B1t',
 'csgo': 'y',
 'faceit': 'b1t',
 'vk': 'b1tcs',
 'steam': '76561198246607476',
 'instagram': 'b1tcsgo',
 'esea': '2255709',
 'twitter': 'b1tcs',
 'twitch': 'b1tcs',
 'team_history': ''}

In [128]:
def fetch_player(player_id, major_list, debug=True):
    # parse player information
    info_dict = fetch_player_dict(player_id)
    if debug:
        print(info_dict)
    
    # add name to dict
    output_dict = dict()
    output_dict["NAME"] = info_dict.get("id")
    output_dict["REGION"] = "todo"
    output_dict["REAL NAME"] = info_dict.get("romanized_name", info_dict.get("name"))
    output_dict["NATIONALITY"] = info_dict.get("country")
    output_dict["TEAM"] = info_dict.get("team")
    output_dict["AGE"] = info_dict.get("birth_date")
    #output_dict["AGE"] = datetime.strptime(, "%B %d %Y").strftime("%d/%m/%Y")
    
    # Get role
    role_description = None
    if "Role" in info_dict:
        role_description = info_dict["Role"]
    elif "Roles" in info_dict:
        role_description = info_dict["Roles"]
    
    if role_description != None:
        if "AWPer" in role_description:
            output_dict["WEAPON"] = "AWP"
        elif "Rifler" in role_description:
            output_dict["WEAPON"] = "AK47"
            
            
    # Get major appearances
    try:
        soup = fetch_page(player_name + "/Results")
        event_elements = soup.find_all("tr", {"class": "valvemajor-highlighted"})
        event_name_list = [event.find("td", {"style": "text-align:left"}).find("a")["href"] for event in event_elements]
        event_name_list = [event for event in event_name_list if event in major_list]
        output_dict["MAJOR APPEARANCES"] = len(event_name_list)
    except:
        output_dict["MAJOR APPEARANCES"] = 0
    
    if "Approx. Total Winnings" in info_dict:
        output_dict["EARNINGS"] = int(info_dict["Approx. Total Winnings"].replace(",", "").replace("$", ""))
    else:
        output_dict["EARNINGS"] = 0
    
    output_dict["LAST UPDATED"] = datetime.now().strftime("%d/%m/%Y")
    return output_dict

#fetch_player("XANTARES", "Turkey", MAJOR_LIST, debug=False)
fetch_player("S1mple", "Ukraine", MAJOR_LIST, debug=False)

{'NAME': 'S1mple',
 'REGION': 'Ukraine',
 'REAL NAME': 'Oleksandr Oleghovych Kostyljev',
 'NATIONALITY': 'Ukraine',
 'TEAM': 'Natus Vincere',
 'AGE': '02/10/1997',
 'WEAPON': 'AWP',
 'MAJOR APPEARANCES': 0,
 'EARNINGS': 1743186,
 'LAST UPDATED': '09/04/2023'}

## Generate CSV

In [7]:
def build_df():
    # Get dataframe to werite to
    if os.path.exists(CSV_FILE):
        playerdata = pd.read_csv(CSV_FILE)
    else:
        playerdata = pd.DataFrame(columns=COLUMNS)
        
    # Get list of all mayors
    major_list = fetch_majors()

    try:
        for major_region, sub_regions in tqdm(REGIONS.items()):
            for sub_region in tqdm(sub_regions, leave=False):
                region_ids = fetch_ids(sub_region)[:1]
                for player in tqdm(region_ids, leave=False):
                    if not (playerdata["NAME"].eq(player)).any():
                        row_data = fetch_player(player, major_region, major_list)
                        playerdata = pd.concat([playerdata, pd.DataFrame.from_records([row_data])], ignore_index=True)
    except Exception as e:
        print("Error:", e)
    
    playerdata.to_csv(CSV_FILE, index=False)
    return=(TODAY()-B2)/365

build_df()

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Error: list index out of range


PermissionError: [Errno 13] Permission denied: 'players.csv'

## Post Processing

In [98]:
def age(birthdate):
    birthdate = datetime.strptime(birthdate, "%d/%m/%Y")
    today = datetime.now()
    age = today.year - birthdate.year - ((today.month, today.day) < (birthdate.month, birthdate.day))
    return age

def playerdata_postprocess(df):
    columns = list(pd.read_csv(CSV_FILE).columns)
    del columns[columns.index("TEAM")]
    df.dropna(inplace = True, subset = columns)
    df.sort_values(by=["EARNINGS"], ascending = False, inplace = True)
    df["AGE"] = [age(date) for date in df["AGE"]]
    df["REGION"] = np.where(df["NATIONALITY"].isin(MISLABELS), "Europe", df["REGION"])
    return df

playerdata_post = playerdata_postprocess(pd.read_csv(CSV_FILE))

## UI Search Engine

In [99]:
def generate_selection(data, description):
    unique_data = list(data.unique())
    return widgets.SelectMultiple(
        options=unique_data,
        value=unique_data,
        rows=len(unique_data),
        description=description,
        disabled=False)


def generate_rangeslider(data, description):
    return widgets.FloatRangeSlider(
        value=[min(data), max(data)],
        min=min(data),
        max=max(data),
        step=1,
        disabled=False,
        continuous_update=False,
        orientation='horizontal',
        readout=True,
        readout_format='1',
)

region_selector = generate_selection(playerdata_post["REGION"], "REGION")
age_slider = generate_rangeslider(playerdata_post["AGE"], "AGE")
weapon_selector = generate_selection(playerdata_post["WEAPON"], "WEAPON")
majors_slider = generate_rangeslider(playerdata_post["MAJOR APPEARANCES"], "MAJOR APPEARANCES")

def ui_selector(sregion, sage, sweapon, smajors):
    return playerdata_post.loc[(playerdata_post["REGION"].isin(sregion)) &
                               (playerdata_post["AGE"] >= sage[0]) &
                               (playerdata_post["AGE"] <= sage[1]) &
                               (playerdata_post["WEAPON"].isin(sweapon)) &
                               (playerdata_post["MAJOR APPEARANCES"] >= smajors[0]) &
                               (playerdata_post["MAJOR APPEARANCES"] <= smajors[1])
                               ]
    
interact(ui_selector, sregion = region_selector, sage = age_slider, sweapon = weapon_selector, smajors = majors_slider)

interactive(children=(SelectMultiple(description='REGION', index=(0, 1, 2), options=('Europe', 'Americas', 'As…

<function __main__.ui_selector(sregion, sage, sweapon, smajors)>