# Counter Strikle Webscraper

> The following code scrapes liquidpedia (https://liquipedia.net/counterstrike/Main_Page), in order to make counter strikle (https://blast.tv/counter-strikle) puzzle-solving easier.

# Libs

In [1]:
import os
import csv
import time
from time import sleep
from bs4 import BeautifulSoup
from datetime import datetime

import numpy as np
import pandas as pd
#pd.set_option("display.max_rows", None)
from tqdm.notebook import tqdm
import requests
from urllib.request import quote

In [2]:
headers = {"User-Agent": "Counter-Strikle-Bot","Accept-Encoding": "gzip"}
base_url = "https://liquipedia.net/counterstrike/api.php?"
REGIONS = {"Europe" : ["Europe", "CIS"], "Americas" : ["Americas"],
           "Asia-Pacific" : ["Oceania", "Eastern_%26_Southern_Asia"]}
CSV_FILE = "players2.csv"
COLUMNS = ["NAME", "REAL NAME", "REGION", "NATIONALITY", "TEAM", "AGE", "WEAPON", 
           "MAJOR APPEARANCES", "EARNINGS", "LAST UPDATED"]

def fetch_page(page):
    url = base_url +"action=parse&format=json&page=" + page
    response = requests.get(url, headers)
    page_html = response.json()['parse']['text']['*']
    soup = BeautifulSoup(page_html,features="lxml")
    time.sleep(30)
    return soup

In [3]:
def fetch_majors():
    soup = fetch_page("Majors")
    event_elements = soup.find_all("div", {"class": "divRow tournament-card-premier"})
    return [event.find("b").find("a")["href"] for event in event_elements]

#fetch_majors()

In [14]:
def fetch_ids(region):
    page = fetch_page("Portal:Players/" + region)
    elements = [item.find_all("a")[1] for item in page.find_all("td")]
    name_dict = [element["title"].replace(" ","_") for element in elements]
    return name_dict

#fetch_ids("Europe")

['Stikle-',
 'ChrissK',
 'Dukiiii',
 'Kakafu',
 'NinoZjE',
 'Orbit',
 'ReacTioNNN',
 'S3NSEY',
 'SesL',
 'TWEEZYCS',
 'XTreMe',
 'AntO_oNNN',
 'Claeys',
 'FARIS',
 'Infernal',
 'Keoz',
 'KllyVe',
 'Kzealos',
 'MetaL',
 'Monu',
 'Nexius',
 'RitchiEE',
 'Simix',
 'Stev0se',
 'To1nou',
 'YNc',
 'HuNter-',
 'NiKo',
 'Sarenii',
 'Blocker',
 'Bubble',
 'CeRq',
 'Dennyslaw',
 'Doublemagic',
 'Dream3r',
 'Duplicate',
 'H4rn',
 'KalubeR',
 'Mar',
 'Niki1',
 'Numb',
 'Oxygen',
 'Patrick_(Bulgarian_player)',
 'PNshr',
 'Poizon',
 'POP0V',
 'Rafftu',
 'Rainwaker',
 'REDSTAR',
 'Rock1nG',
 'ShaiK',
 'SHiPZ',
 'SPELLAN',
 'SpyleadeR',
 'ToH1o',
 'V1c7oR',
 'Zix',
 'AJTT',
 'Beastik',
 'Bfull',
 'Blogg1s',
 'CapseN',
 'Daxen',
 'Desty',
 'DEV7L',
 'Dytor',
 'EYO',
 'Forsyy',
 'Fraged',
 'Fredi',
 'HONES',
 'K1-FiDa',
 'Kinzo',
 'Leckr',
 'Levi',
 'Manguss',
 'MoriiSko',
 'Nbqq',
 'NEOFRAG',
 'Oskar',
 'Pechyn',
 'SHOCK',
 'Stinx',
 'SyncD',
 'Tomkeejs',
 'Twist_(Czech_player)',
 'Valencio',
 'ZEDKO',

In [5]:
def total_players():
    total = 0
    for major_region, sub_regions in REGIONS.items():
        for region in sub_regions:
            total += len(fetch_ids(region))
    return total

#total_players()

In [18]:
def fetch_player(player_name, region, major_list, debug=False):
    # pull player profile
    soup = fetch_page(player_name)
    output_dict = dict()
    
    # parse player information
    info_list = soup.find_all("div", {"class": "infobox-cell-2"})
    info_dict = {info_list[i].text[:-1] : info_list[i+1].text for i in range(0, len(info_list),2)}
    if debug:
        print(info_dict)
    # add name to dict
    output_dict["NAME"] = player_name
    output_dict["REGION"] = region
    
    # replace name with romanised name if needed
    if "Romanized Name" in info_dict:
        output_dict["REAL NAME"] = info_dict["Romanized Name"]
    else:
        output_dict["REAL NAME"] = info_dict["Name"]
    
    # get nationality
    if "Nationality" in info_dict:
        output_dict["NATIONALITY"] = info_dict["Nationality"][1:]
    
    # get team
    if "Team"  in info_dict:
        output_dict["TEAM"] = info_dict["Team"]
    
    # Get age
    if "Born" in info_dict:
        try:
            output_dict["AGE"] = datetime.strptime(info_dict["Born"][:-9].replace(",", ""), "%B %d %Y").strftime("%d/%m/%Y")
        except:
            pass
    
    # Get role
    if "Role" in info_dict:
        if "AWPer" in info_dict["Role"]:
            output_dict["WEAPON"] = "AWP"
        else:
            output_dict["WEAPON"] = "AK47"
    elif "Roles" in info_dict:
        if "AWPer" in info_dict["Roles"]:
            output_dict["WEAPON"] = "AWP"
        else:
            output_dict["WEAPON"] = "AK47"
            
    # Get major appearances
    try:
        soup = fetch_page(player_name + "/Results")
        event_elements = soup.find_all("tr", {"class": "valvemajor-highlighted"})
        event_name_list = [event.find("td", {"style": "text-align:left"}).find("a")["href"] for event in event_elements]
        event_name_list = [event for event in event_name_list if event in major_list]
        output_dict["MAJOR APPEARANCES"] = len(event_name_list)
    except:
        output_dict["MAJOR APPEARANCES"] = 0
    
    if "Approx. Total Winnings" in info_dict:
        output_dict["EARNINGS"] = info_dict["Approx. Total Winnings"]
    else:
        output_dict["EARNINGS"] = "0$"
    
    output_dict["LAST UPDATED"] = datetime.now().strftime("%d/%m/%Y")
    return output_dict
    
#fetch_player("MingSir", "Eastern_%26_Southern_Asia", fetch_majors(), debug=True)
fetch_player("AntO_oNNN", "CIS", fetch_majors(), debug=True)

{'Name': 'Anton Van Gorp', 'Nationality': '\xa0Belgium', 'Born': 'July 18, 1995 (age\xa027)', 'Status': 'Active', 'Years Active (Org)': '2017 – 2020', 'Years Active (Coach)': '2017 – Present', 'Years Active (Analyst)': '2018 – Present', 'Roles': 'Assistant CoachAnalyst', 'Team': 'Complexity Gaming', 'Games': 'Global Offensive'}


{'NAME': 'AntO_oNNN',
 'REGION': 'CIS',
 'REAL NAME': 'Anton Van Gorp',
 'NATIONALITY': 'Belgium',
 'TEAM': 'Complexity Gaming',
 'AGE': '18/07/1995',
 'WEAPON': 'AK47',
 'MAJOR APPEARANCES': 0,
 'EARNINGS': '0$',
 'LAST UPDATED': '14/10/2022'}

In [19]:
def build_df():
    # Get dataframe to werite to
    if os.path.exists(CSV_FILE):
        playerdata = pd.read_csv(CSV_FILE)
    else:
        playerdata = pd.DataFrame(columns=COLUMNS)
        
    # Get list of all mayors
    major_list = fetch_majors()

    try:
        for major_region, sub_regions in tqdm(REGIONS.items()):
            for sub_region in tqdm(sub_regions, leave=False):
                region_ids = fetch_ids(sub_region)
                for player in tqdm(region_ids, leave=False):
                    if not (playerdata["NAME"].eq(player)).any():
                        row_data = fetch_player(player, major_region, major_list)
                        playerdata = pd.concat([playerdata, pd.DataFrame.from_records([row_data])], ignore_index=True)
    except Exception as e:
        print("Error:", e)
    
    playerdata.to_csv(CSV_FILE, index=False)
    return

build_df()

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Error: [Errno Expecting value] <!DOCTYPE HTML><title>Rate Limited</title><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><style type=text/css>body{margin:40px auto;max-width:700px;line-height:1.6;font-size:18px;color:#444;padding:0 10px}h1{line-height:1.2}.reason{display:none}</style><script src=https://www.recaptcha.net/recaptcha/api.js></script><h1>Rate Limited</h1><p>Your IP address has been temporarily blocked from accessing Liquipedia due to excessive or invalid requests. This may happen if you're using a scraper / bot or otherwise accessing pages too quickly. Scrapers and similar tools are not permitted to access Liquipedia. Please see <a href=/api-terms-of-use>the Liquipedia API terms of use</a> for information on how to access to Liquipedia data in a supported manner. Rate limiting may also be triggered by problematic browser extensions, excessive pre-fetching or corporate content scanners. If you still aren't sure why you were rate limited,

In [8]:
playerdata = pd.read_csv(CSV_FILE)

In [9]:
#playerdata