# Counter Strikle Webscraper

## Notebook Description

__Author:__ Daniël Vermaas

This notebook scrapes liquidpedia (https://liquipedia.net/counterstrike/Main_Page), in order to make counter strikle (https://blast.tv/counter-strikle) puzzle-solving easier. Before using the notebook, please read the Liquidpedia ToS about API usage: https://liquipedia.net/api-terms-of-use.

## Libraries & Constants

In [95]:
import os
import csv
import time
from time import sleep
from bs4 import BeautifulSoup
from datetime import datetime
import json
import mwparserfromhell

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import requests
#from urllib.request import quote

#!jupyter nbextension enable --py widgetsnbextension
import ipywidgets as widgets
from ipywidgets import interact

In [3]:
HEADERS = {"User-Agent": "Counter-Strikle-Bot","Accept-Encoding": "gzip"}
BASE_URL = "https://liquipedia.net/counterstrike/api.php?"
REGIONS = {"Europe" : ["Europe", "CIS"], "Americas" : ["Americas"],
           "Asia-Pacific" : ["Oceania", "Eastern_%26_Southern_Asia"]}
CSV_FILE = "players.csv"
COLUMNS = ["NAME", "REAL NAME", "REGION", "NATIONALITY", "TEAM", "AGE", "WEAPON", 
           "MAJOR APPEARANCES", "EARNINGS", "LAST UPDATED"]
MISLABELS = ["Kazakhstan", "Turkey"]

## API Calls

In [90]:
# API call function
def fetch_page(page, cooldown=0):
    url =  f"{BASE_URL}action=parse&format=json&page={page}"
    response = requests.get(url, HEADERS)
    page_html = response.json()['parse']['text']['*']
    soup = BeautifulSoup(page_html,features="lxml")
    time.sleep(cooldown)
    return soup

def fetch_player_info(page, cooldown=0):
    url = f"{BASE_URL}action=parse&format=json&page={page}&prop=text&section=0"
    response = requests.get(url, HEADERS)
    page_html = response.json()['parse']['text']['*']
    soup = BeautifulSoup(page_html,features="lxml")
    time.sleep(cooldown)
    return soup

# Fetches list of all majors
def fetch_majors():
    soup = fetch_page("Majors")
    event_elements = soup.find_all("div", {"class": "divRow tournament-card-premier"})
    return [event.find("b").find("a")["href"] for event in event_elements]

# Fetches all player ids/subdirectories for a given region
def fetch_ids(region):
    page = fetch_page(f"Portal:Players/{region}")
    elements = [item.find_all("a")[1] for item in page.find_all("td")]
    name_dict = [element["title"].replace(" ","_") for element in elements]
    return name_dict

# Fetches all player ids
def fetch_players():
    url = f'{BASE_URL}action=query&list=categorymembers&cmtitle=Category:Players&cmlimit=max&format=json'
    results = []

    while True:
        response = requests.get(url)
        data = json.loads(response.text)
        pages = data['query']['categorymembers']
        results.extend(pages)

        if 'continue' not in data:
            break

        cont = data['continue']
        cmcontinue = cont['cmcontinue']
        url = f"{url}&cmcontinue={cmcontinue}"
    return results

#MAJOR_LIST = fetch_majors()

In [91]:
results[:5]

[{'pageid': 142086, 'ns': 0, 'title': '-Ace'},
 {'pageid': 326014, 'ns': 0, 'title': '.exe'},
 {'pageid': 30662, 'ns': 0, 'title': '.PhP'},
 {'pageid': 92702, 'ns': 0, 'title': '0i'},
 {'pageid': 237638, 'ns': 0, 'title': '129'}]

In [125]:
# Define API query parameters
params = {
    "action": "query",
    "format": "json",
    "prop": "revisions",
    "titles": "s1mple",
    "rvprop": "content",
    "rvslots": "main",
    "rvsection":0
}

# Make API query
response = requests.get("https://liquipedia.net/counterstrike/api.php", params=params)
data = response.json()

# Parse page content and extract infobox data
pages = data["query"]["pages"]
page_id = next(iter(pages))
page = pages[page_id]

In [136]:
wikicode = mwparserfromhell.parse(page["revisions"][0]["slots"]["main"]["*"])
infobox = wikicode.filter_templates(matches="Infobox player")[0]
name = infobox.get("name").value.strip()
romanized_name = infobox.get("romanized_name").value.strip()
nationality = infobox.get("country").value.strip()
team = infobox.get("team").value.strip()
age = infobox.get("birth_date").value.strip()

# Print results
print("Name:", name)
print("Romanized name:", romanized_name)
print("Nationality:", nationality)
print("Team:", team)
print("Team:", age)

Name: Олександр Олегович Костилєв
Romanized name: Oleksandr Oleghovych Kostyljev
Nationality: Ukraine
Team: Natus Vincere


In [142]:
wikicode = mwparserfromhell.parse(page["revisions"][0]["slots"]["main"]["*"])
infobox = wikicode.filter_templates(matches="Infobox player")[0]

templates = wikicode.filter_templates()


infobox_dict = {}
for template in templates:
    if template.name.matches("Infobox player"):
        for param in template.params:
            value = mwparserfromhell.parse(param.value.strip_code()).strip()
            infobox_dict[param.name.strip()] = str(value)

# Print the infobox dictionary
print(infobox_dict)

{'id': 's1mple', 'image': 'S1mple at Antwerp Major 2022 EU RMR.jpg', 'name': 'Олександр Олегович Костилєв', 'romanized_name': 'Oleksandr Oleghovych Kostyljev', 'birth_date': '1997-10-02', 'country': 'Ukraine', 'status': 'Active', 'years_active': '2013 – Present', 'team': 'Natus Vincere', 'role': 'awp', 'nicknames': 'The Undertaker', 'csgo': 'y', 'twitter': 's1mpleO', 'facebook': 's1mpleon', 'instagram': 's1mpleo', 'reddit': 'reals1mplereal', 'vk': '', 'youtube': '', 'twitch': 's1mple', 'esea': '636916', 'esl': '7574927', 'faceit': 's1mple', 'steam': '76561198034202275', 'team_history': ''}


In [143]:
wikicode = mwparserfromhell.parse(page["revisions"][0]["slots"]["main"]["*"])
infobox = wikicode.filter_templates(matches="Infobox player")[0]

for param in infobox.params:
    value = mwparserfromhell.parse(param.value.strip_code()).strip()
    infobox_dict[param.name.strip()] = str(value)

# Print the infobox dictionary
print(infobox_dict)

{'id': 's1mple', 'image': 'S1mple at Antwerp Major 2022 EU RMR.jpg', 'name': 'Олександр Олегович Костилєв', 'romanized_name': 'Oleksandr Oleghovych Kostyljev', 'birth_date': '1997-10-02', 'country': 'Ukraine', 'status': 'Active', 'years_active': '2013 – Present', 'team': 'Natus Vincere', 'role': 'awp', 'nicknames': 'The Undertaker', 'csgo': 'y', 'twitter': 's1mpleO', 'facebook': 's1mpleon', 'instagram': 's1mpleo', 'reddit': 'reals1mplereal', 'vk': '', 'youtube': '', 'twitch': 's1mple', 'esea': '636916', 'esl': '7574927', 'faceit': 's1mple', 'steam': '76561198034202275', 'team_history': ''}


## Player-Specific Information

In [128]:
def fetch_player(player_name, region, major_list, debug=False):
    # parse player information
    soup = fetch_player_info(player_name)
    info_list = soup.find_all("div", {"class": "infobox-cell-2"})
    info_list = [item.parent.find_all("div") for item in info_list]
    info_dict = {info_name.text[:-1] : info_value.text for info_name, info_value in info_list}
    if debug:
        print(info_dict)
    
    # add name to dict
    output_dict = dict()
    output_dict["NAME"] = player_name
    output_dict["REGION"] = region
    
    # replace name with romanised name if needed
    if "Romanized Name" in info_dict:
        output_dict["REAL NAME"] = info_dict["Romanized Name"]
    else:
        output_dict["REAL NAME"] = info_dict["Name"]
    
    # get nationality
    if "Nationality" in info_dict:
        output_dict["NATIONALITY"] = info_dict["Nationality"][1:].split("\xa0")[0]
    
    # get team
    if "Team"  in info_dict:
        output_dict["TEAM"] = info_dict["Team"]
    
    # Get age
    if "Born" in info_dict:
        try:
            output_dict["AGE"] = datetime.strptime(info_dict["Born"][:-9].replace(",", ""), "%B %d %Y").strftime("%d/%m/%Y")
        except:
            pass
    
    # Get role
    role_description = None
    if "Role" in info_dict:
        role_description = info_dict["Role"]
    elif "Roles" in info_dict:
        role_description = info_dict["Roles"]
    
    if role_description != None:
        if "AWPer" in role_description:
            output_dict["WEAPON"] = "AWP"
        elif "Rifler" in role_description:
            output_dict["WEAPON"] = "AK47"
            
            
    # Get major appearances
    try:
        soup = fetch_page(player_name + "/Results")
        event_elements = soup.find_all("tr", {"class": "valvemajor-highlighted"})
        event_name_list = [event.find("td", {"style": "text-align:left"}).find("a")["href"] for event in event_elements]
        event_name_list = [event for event in event_name_list if event in major_list]
        output_dict["MAJOR APPEARANCES"] = len(event_name_list)
    except:
        output_dict["MAJOR APPEARANCES"] = 0
    
    if "Approx. Total Winnings" in info_dict:
        output_dict["EARNINGS"] = int(info_dict["Approx. Total Winnings"].replace(",", "").replace("$", ""))
    else:
        output_dict["EARNINGS"] = 0
    
    output_dict["LAST UPDATED"] = datetime.now().strftime("%d/%m/%Y")
    return output_dict

#fetch_player("XANTARES", "Turkey", MAJOR_LIST, debug=False)
fetch_player("S1mple", "Ukraine", MAJOR_LIST, debug=False)

{'NAME': 'S1mple',
 'REGION': 'Ukraine',
 'REAL NAME': 'Oleksandr Oleghovych Kostyljev',
 'NATIONALITY': 'Ukraine',
 'TEAM': 'Natus Vincere',
 'AGE': '02/10/1997',
 'WEAPON': 'AWP',
 'MAJOR APPEARANCES': 0,
 'EARNINGS': 1743186,
 'LAST UPDATED': '09/04/2023'}

## Generate CSV

In [7]:
def build_df():
    # Get dataframe to werite to
    if os.path.exists(CSV_FILE):
        playerdata = pd.read_csv(CSV_FILE)
    else:
        playerdata = pd.DataFrame(columns=COLUMNS)
        
    # Get list of all mayors
    major_list = fetch_majors()

    try:
        for major_region, sub_regions in tqdm(REGIONS.items()):
            for sub_region in tqdm(sub_regions, leave=False):
                region_ids = fetch_ids(sub_region)[:1]
                for player in tqdm(region_ids, leave=False):
                    if not (playerdata["NAME"].eq(player)).any():
                        row_data = fetch_player(player, major_region, major_list)
                        playerdata = pd.concat([playerdata, pd.DataFrame.from_records([row_data])], ignore_index=True)
    except Exception as e:
        print("Error:", e)
    
    playerdata.to_csv(CSV_FILE, index=False)
    return

build_df()

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Error: list index out of range


PermissionError: [Errno 13] Permission denied: 'players.csv'

## Post Processing

In [None]:
def age(birthdate):
    birthdate = datetime.strptime(birthdate, "%d/%m/%Y")
    today = datetime.now()
    age = today.year - birthdate.year - ((today.month, today.day) < (birthdate.month, birthdate.day))
    return age

def playerdata_postprocess(df):
    columns = list(pd.read_csv(CSV_FILE).columns)
    del columns[columns.index("TEAM")]
    df.dropna(inplace = True, subset = columns)
    df.sort_values(by=["EARNINGS"], ascending = False, inplace = True)
    df["AGE"] = [age(date) for date in df["AGE"]]
    df["REGION"] = np.where(df["NATIONALITY"].isin(MISLABELS), "Europe", df["REGION"])
    return df

playerdata_post = playerdata_postprocess(pd.read_csv(CSV_FILE))

## UI Search Engine

In [None]:
def generate_selection(data, description):
    unique_data = list(data.unique())
    return widgets.SelectMultiple(
        options=unique_data,
        value=unique_data,
        rows=len(unique_data),
        description=description,
        disabled=False)


def generate_rangeslider(data, description):
    return widgets.FloatRangeSlider(
        value=[min(data), max(data)],
        min=min(data),
        max=max(data),
        step=1,
        disabled=False,
        continuous_update=False,
        orientation='horizontal',
        readout=True,
        readout_format='1',
)

region_selector = generate_selection(playerdata_post["REGION"], "REGION")
age_slider = generate_rangeslider(playerdata_post["AGE"], "AGE")
weapon_selector = generate_selection(playerdata_post["WEAPON"], "WEAPON")
majors_slider = generate_rangeslider(playerdata_post["MAJOR APPEARANCES"], "MAJOR APPEARANCES")

def ui_selector(sregion, sage, sweapon, smajors):
    return playerdata_post.loc[(playerdata_post["REGION"].isin(sregion)) &
                               (playerdata_post["AGE"] >= sage[0]) &
                               (playerdata_post["AGE"] <= sage[1]) &
                               (playerdata_post["WEAPON"].isin(sweapon)) &
                               (playerdata_post["MAJOR APPEARANCES"] >= smajors[0]) &
                               (playerdata_post["MAJOR APPEARANCES"] <= smajors[1])
                               ]
    
interact(ui_selector, sregion = region_selector, sage = age_slider, sweapon = weapon_selector, smajors = majors_slider)