In [1]:
# Show the version of the `python` executable that the shell resolves.
!python --version

Python 3.11.7


In [2]:
!pip install selenium



In [3]:
!pip install beautifulsoup4



In [4]:
!pip install webdriver-manager



In [5]:
!pip install fake_useragent



In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService

from webdriver_manager.chrome import ChromeDriverManager

from bs4 import BeautifulSoup

In [7]:
from datetime import datetime

In [8]:
import re

In [9]:
from fake_useragent import UserAgent

In [10]:
import pandas as pd

# Notebook-wide pandas display settings: cap the number of rows shown at 100
# and lift the limits on column count, total width and per-cell width.
_display_options = {
    'display.max_rows': 100,
    'display.max_columns': None,
    'display.width': None,
    'display.max_colwidth': None,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [11]:
def web_driver(headless=True):
    """Create a Chrome WebDriver configured with this notebook's options.

    Args:
        headless: when truthy, the '--headless' flag is added to the options.

    Returns:
        The ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    # Flags are collected first and applied in this exact order.
    chrome_flags = ['--verbose', '--no-sandbox']
    if headless:
        chrome_flags.append('--headless')
    chrome_flags += [
        '--disable-extensions',
        '--disable-gpu',
        '--disable-software-rasterizer',
        # alternative that is currently disabled: '--window-size=1920, 1200'
        '--start-maximized',
        '--disable-dev-shm-usage',
        '--lang=pt-BR',  # alternative that is currently disabled: '--lang=en-US'
    ]

    options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)

    # A fresh random user agent string is picked for every driver.
    options.add_argument(f'user-agent={UserAgent().random}')

    # The chromedriver binary is resolved through webdriver_manager
    # (alternative that is currently disabled: webdriver.Chrome(options=options)).
    chrome_service = ChromeService(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=options)

def open_page(url, headless=True):
    """Open ``url`` in a new Chrome session and parse the loaded page.

    Args:
        url: address passed to ``driver.get``.
        headless: forwarded to ``web_driver``.

    Returns:
        A ``(driver, soup)`` tuple. The driver is still running; the caller
        must call ``driver.quit()`` when done with it.

    Raises:
        Whatever ``driver.get`` or the parsing step raises. In that case the
        driver is quit before the exception propagates, so a failed page load
        does not leave a browser process behind.
    """
    driver = web_driver(headless)
    try:
        driver.get(url)
        # Reuse the shared helper instead of duplicating the parsing code.
        soup = get_page_source(driver)
    except Exception:
        driver.quit()
        raise
    return driver, soup

def open_browser(url):
    """Start a Chrome session and, when ``url`` is truthy, navigate to it.

    Args:
        url: address to open; a falsy value (``None``, ``''``) skips navigation.

    Returns:
        The running driver, so the caller can use it and eventually call
        ``quit()``. The driver is created with ``web_driver()`` defaults.

    Raises:
        Whatever ``driver.get`` raises; the driver is quit first so the
        browser process is not leaked.
    """
    driver = web_driver()
    if url:
        try:
            driver.get(url)
        except Exception:
            driver.quit()
            raise
    return driver

def get_page_source(driver):
    """Parse the page currently loaded in ``driver``.

    The page source is encoded as UTF-8 and handed to BeautifulSoup with the
    'html.parser' backend; the resulting soup object is returned.
    """
    html_bytes = driver.page_source.encode('utf-8')
    return BeautifulSoup(html_bytes, 'html.parser')

## Players

In [12]:
# Load the Majors player-ranking page once, then close the browser right away;
# only the parsed `soup` is needed afterwards.
PLAYERS_LISTING_URL = 'https://www.hltv.org/stats/players?matchType=Majors'

driver, soup = open_page(PLAYERS_LISTING_URL)
driver.quit()

In [13]:
def parse_player_row(row):
    """Convert one ``<tr>`` of the player ratings table into a dict.

    Columns are read by position: 0 = player link/name, 1 = team logos,
    2 = maps, 3 = rounds, 4 = K-D diff, 5 = K/D, 6 = rating.
    """
    cols = row.find_all('td')
    # The href feeds both the numeric id and the absolute link; read it once.
    player_href = cols[0].find('a')['href'].strip()

    return {
        'player_id': int(re.search(r'/stats/players/(\d+)/', player_href).group(1)),
        'player_name': cols[0].text.strip(),
        'player_link': 'https://www.hltv.org' + player_href,
        # dict.fromkeys removes repeated team names while keeping first-seen order.
        'player_teams': list(dict.fromkeys(img['alt'] for img in cols[1].find_all('img'))),
        'player_maps': int(cols[2].text.strip()),
        'player_rounds': int(cols[3].text.strip()),
        'player_kd_diff': int(cols[4].text.strip()),
        'player_kd': float(cols[5].text.strip()),
        'player_rating': float(cols[6].text.strip()),
    }


tbl_player_stats = soup.find('table', attrs={'class': 'stats-table player-ratings-table'})
if tbl_player_stats is None:
    # Fail with an explicit message instead of an AttributeError on None.
    raise RuntimeError('player ratings table not found in the fetched page')
rows_player_stats = tbl_player_stats.find('tbody').find_all('tr')

# One dict per table row; rebuilt from scratch each time the cell runs.
player_stats = [parse_player_row(row) for row in rows_player_stats]

In [14]:
# Debugging aid: uncomment to limit the run to the first 5 players.
# player_stats = player_stats[:5]

In [15]:
def get_stat(stats, stat_filter):
    """Return the value text of the first row whose label equals ``stat_filter``.

    Each item of ``stats`` must offer ``find('span')`` and ``find_all('span')``.
    The first span's stripped text is the label and the second span's stripped
    text is the value.

    Args:
        stats: iterable of row elements.
        stat_filter: exact label to look for (compared after stripping).

    Returns:
        The stripped text of the matching row's second span.

    Raises:
        IndexError: no row carries the label (the message names it), or the
            matching row has fewer than two spans.
    """
    for row in stats:
        label = row.find('span')
        # Rows without any span cannot match; skip them instead of crashing.
        if label is None or label.text.strip() != stat_filter:
            continue
        return row.find_all('span')[1].text.strip()
    # IndexError is kept as the exception type for a missing label.
    raise IndexError(f'stat {stat_filter!r} not found in stats rows')

In [16]:
from concurrent.futures import ThreadPoolExecutor

def process_player(player):
    """Fetch a player's profile page and add its details to ``player`` in place.

    ``player`` must contain 'player_link'. The link is printed, the page is
    opened and parsed, and these keys are added to the dict: player_realname,
    player_flag, player_age, player_yob, stat_hs_pct, stat_adr, stat_kills,
    stat_deaths, stat_kills_round, stat_assists_round, stat_deaths_round.
    Nothing is returned.
    """
    profile_url = player['player_link']
    print(profile_url)

    # Only the parsed page is needed, so the browser is closed immediately.
    driver, soup = open_page(profile_url)
    driver.quit()

    summary = soup.find('div', attrs={'class': 'summaryInfoContainer'})
    realname = summary.find('div', attrs={'class': 'summaryRealname'}).find('div').text.strip()
    flag_title = summary.find('img', attrs={'class': 'flag'})['title']
    age_text = summary.find('div', attrs={'class': 'summaryPlayerAge'}).text.strip()
    age = int(age_text.replace(' years', ''))
    # Year of birth is derived from the current year minus the displayed age.
    birth_year = datetime.now().year - age

    player.update(
        player_realname=realname,
        player_flag=flag_title,
        player_age=age,
        player_yob=birth_year,
    )

    stat_rows = soup.find('div', attrs={'class': 'statistics'}).find_all('div', attrs={'class': 'stats-row'})

    # (key stored on the player, label shown on the page), in storage order.
    stat_labels = (
        ('stat_hs_pct', 'Headshot %'),
        ('stat_adr', 'Damage / Round'),
        ('stat_kills', 'Total kills'),
        ('stat_deaths', 'Total deaths'),
        ('stat_kills_round', 'Kills / round'),
        ('stat_assists_round', 'Assists / round'),
        ('stat_deaths_round', 'Deaths / round'),
    )
    # Collect everything first so the player dict is only touched when all
    # lookups succeeded.
    scraped = {key: get_stat(stat_rows, label) for key, label in stat_labels}
    scraped['stat_hs_pct'] = scraped['stat_hs_pct'].replace('%', '')
    player.update(scraped)

In [17]:
import os
# Display the CPU count reported by the OS
# (presumably a reference for sizing the thread pools below; confirm).
os.cpu_count()

20

In [18]:
num_threads = 10

# submit() is used instead of a discarded executor.map(): the iterator that
# map() returns only re-raises worker exceptions when it is consumed, so
# failures were previously lost without any trace.
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    player_futures = {
        executor.submit(process_player, player): player
        for player in player_stats
    }

# Leaving the `with` block waits for all workers, so every future is finished.
failed_players = []
for future, player in player_futures.items():
    error = future.exception()
    if error is not None:
        failed_players.append((player['player_link'], error))
        print(f"FAILED {player['player_link']}: {error!r}")

https://www.hltv.org/stats/players/11893/zywoo?matchType=Majors
https://www.hltv.org/stats/players/9216/coldzera?matchType=Majors
https://www.hltv.org/stats/players/7998/s1mple?matchType=Majors
https://www.hltv.org/stats/players/7592/device?matchType=Majors
https://www.hltv.org/stats/players/15631/kscerato?matchType=Majors
https://www.hltv.org/stats/players/3741/niko?matchType=Majors
https://www.hltv.org/stats/players/2553/snax?matchType=Majors
https://www.hltv.org/stats/players/3055/flusha?matchType=Majors
https://www.hltv.org/stats/players/18053/broky?matchType=Majors
https://www.hltv.org/stats/players/10994/stavn?matchType=Majors
https://www.hltv.org/stats/players/7594/flamie?matchType=Majors
https://www.hltv.org/stats/players/18221/spinx?matchType=Majors
https://www.hltv.org/stats/players/9960/frozen?matchType=Majors
https://www.hltv.org/stats/players/8918/electronic?matchType=Majors
https://www.hltv.org/stats/players/11816/ropz?matchType=Majors
https://www.hltv.org/stats/players/8

In [1]:
# Build the players frame and order it by rating (best first), breaking ties
# alphabetically by name; the index is renumbered after sorting.
df_player_stats = pd.DataFrame(player_stats).sort_values(
    by=['player_rating', 'player_name'],
    ascending=[False, True],
    ignore_index=True,
)

df_player_stats.head(3)

NameError: name 'pd' is not defined

In [20]:
import pickle

# Persist the players frame as a pickle file in the working directory.
players_pickle_path = 'df_player_stats.pkl'
with open(players_pickle_path, mode='wb') as out_file:
    pickle.dump(df_player_stats, out_file)

## Majors

In [21]:
# Load the Majors events listing once, then close the browser right away;
# note that `soup` now refers to the events page.
EVENTS_LISTING_URL = 'https://www.hltv.org/stats/events?matchType=Majors'

driver, soup = open_page(EVENTS_LISTING_URL)
driver.quit()

In [22]:
def parse_event_row(row):
    """Convert one ``<tr>`` of the events table into a dict.

    Returns ``None`` when the row has no winner image (no 'winner-col' cell or
    no ``<img>`` inside it), so such rows can be skipped by the caller.
    """
    winner_cell = row.find('td', attrs={'class': 'winner-col'})
    winner_img = winner_cell.find('img') if winner_cell is not None else None
    if winner_img is None:
        return None

    name_anchor = row.find('td', attrs={'class': 'name-col'}).find('a')
    event_link = name_anchor['href'].strip()

    return {
        # Accept `event=` both as the first (?event=) and a later (&event=) parameter.
        'event_id': int(re.search(r'[?&]event=(\d+)', event_link).group(1)),
        'event_name': name_anchor.text.strip(),
        'event_link': 'https://www.hltv.org' + event_link,
        'event_winner': winner_img['title'].strip(),
    }


tbl_events_stats = soup.find('table', attrs={'class': 'stats-table events-table'})
if tbl_events_stats is None:
    # Fail with an explicit message instead of an AttributeError on None.
    raise RuntimeError('events table not found in the fetched page')
rows_events_stats = tbl_events_stats.find('tbody').find_all('tr')

# One dict per row that has a winner; rebuilt from scratch on every run.
events_stats = [
    event_stats
    for event_stats in map(parse_event_row, rows_events_stats)
    if event_stats is not None
]

In [23]:
def get_event_metadata(metadata, metadata_filter):
    """Look up one value in the event metadata table.

    Finds the ``<th>`` whose string equals ``metadata_filter`` and returns the
    stripped text of the element that ``find_next()`` yields for it. Returns an
    empty string when no such header exists.
    """
    header_cell = metadata.find('th', string=metadata_filter)
    if not header_cell:
        return ''
    return header_cell.find_next().text.strip()

def process_event(event):
    """Fetch an event's detail page and add its metadata to ``event`` in place.

    ``event`` must contain 'event_link'. The link is printed, the stats page
    is opened to find the link to the event's detail page, and that page's
    metadata table fills the keys start_date, end_date, teams, prize_pool and
    location. Nothing is returned.
    """
    stats_url = event['event_link']
    print(stats_url)

    # NOTE(review): both loads use headless=False (visible browser windows),
    # unlike the player scraping; confirm this is intentional.
    driver, soup = open_page(stats_url, headless=False)
    driver.quit()

    hub_anchor = soup.find('div', attrs={'class': 'event-hub-bottom'}).find('a')
    event_link_details = 'https://www.hltv.org' + hub_anchor['href']

    driver, soup = open_page(event_link_details, headless=False)
    driver.quit()

    meta_table = soup.find('table', attrs={'class': 'table eventMeta'})

    # (key stored on the event, header text in the metadata table).
    metadata_fields = (
        ('start_date', 'Start date'),
        ('end_date', 'End date'),
        ('teams', 'Teams'),
        ('prize_pool', 'Prize pool'),
        ('location', 'Location'),
    )
    for key, header_text in metadata_fields:
        event[key] = get_event_metadata(meta_table, header_text)

In [24]:
num_threads = 5

# submit() is used instead of a discarded executor.map(): the iterator that
# map() returns only re-raises worker exceptions when it is consumed, so
# failures were previously lost without any trace.
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    event_futures = {
        executor.submit(process_event, event): event
        for event in events_stats
    }

# Leaving the `with` block waits for all workers, so every future is finished.
failed_events = []
for future, event in event_futures.items():
    error = future.exception()
    if error is not None:
        failed_events.append((event['event_link'], error))
        print(f"FAILED {event['event_link']}: {error!r}")

https://www.hltv.org/stats?matchType=Majors&event=7148
https://www.hltv.org/stats?matchType=Majors&event=6793
https://www.hltv.org/stats?matchType=Majors&event=6586
https://www.hltv.org/stats?matchType=Majors&event=6372
https://www.hltv.org/stats?matchType=Majors&event=4866
https://www.hltv.org/stats?matchType=Majors&event=4443
https://www.hltv.org/stats?matchType=Majors&event=3883
https://www.hltv.org/stats?matchType=Majors&event=3564
https://www.hltv.org/stats?matchType=Majors&event=3247
https://www.hltv.org/stats?matchType=Majors&event=2720
https://www.hltv.org/stats?matchType=Majors&event=2471
https://www.hltv.org/stats?matchType=Majors&event=2062
https://www.hltv.org/stats?matchType=Majors&event=2027
https://www.hltv.org/stats?matchType=Majors&event=1617
https://www.hltv.org/stats?matchType=Majors&event=1666
https://www.hltv.org/stats?matchType=Majors&event=1611
https://www.hltv.org/stats?matchType=Majors&event=1553
https://www.hltv.org/stats?matchType=Majors&event=1444
https://ww

In [25]:
# One row per event dict collected above; the frame is displayed in full.
df_events_stats = pd.DataFrame(events_stats)
df_events_stats

Unnamed: 0,event_id,event_name,event_link,event_winner,start_date,end_date,teams,prize_pool,location
0,7148,PGL CS2 Major Copenhagen 2024,https://www.hltv.org/stats?matchType=Majors&event=7148,Natus Vincere,Mar 21st 2024,Mar 31st 2024,16.0,"$1,250,000","Copenhagen, Denmark"
1,6793,BLAST.tv Paris Major 2023,https://www.hltv.org/stats?matchType=Majors&event=6793,Vitality,May 13th 2023,May 21st 2023,16.0,"$1,250,000","Paris, France"
2,6586,IEM Rio Major 2022,https://www.hltv.org/stats?matchType=Majors&event=6586,Outsiders,Nov 5th 2022,Nov 13th 2022,16.0,"$1,250,000","Rio de Janeiro, Brazil"
3,6372,PGL Major Antwerp 2022,https://www.hltv.org/stats?matchType=Majors&event=6372,FaZe,May 14th 2022,May 22nd 2022,16.0,"$1,000,000","Antwerp, Belgium"
4,4866,PGL Major Stockholm 2021,https://www.hltv.org/stats?matchType=Majors&event=4866,Natus Vincere,Oct 30th 2021,Nov 7th 2021,16.0,"$2,000,000","Stockholm, Sweden"
5,4443,StarLadder Major Berlin 2019,https://www.hltv.org/stats?matchType=Majors&event=4443,Astralis,Aug 28th 2019,Sep 8th 2019,16.0,"$1,000,000","Berlin, Germany"
6,3883,IEM Katowice 2019,https://www.hltv.org/stats?matchType=Majors&event=3883,Astralis,Feb 20th 2019,Mar 3rd 2019,16.0,"$1,000,000","Katowice, Poland"
7,3564,FACEIT Major 2018,https://www.hltv.org/stats?matchType=Majors&event=3564,Astralis,Sep 12th 2018,Sep 23rd 2018,16.0,"$1,000,000","London, United Kingdom"
8,3247,ELEAGUE Major 2018,https://www.hltv.org/stats?matchType=Majors&event=3247,Cloud9,Jan 19th 2018,Jan 28th 2018,16.0,"$1,000,000","Atlanta & Boston, United States"
9,2720,PGL Major Krakow 2017,https://www.hltv.org/stats?matchType=Majors&event=2720,Gambit,Jul 16th 2017,Jul 23rd 2017,16.0,"$1,000,000","Krakow, Poland"


In [26]:
import pickle

# Persist the events frame as a pickle file in the working directory.
events_pickle_path = 'df_events_stats.pkl'
with open(events_pickle_path, mode='wb') as out_file:
    pickle.dump(df_events_stats, out_file)