In [2]:
# Imports

from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import numpy as np

In [3]:
# Using the HLTV stats page we will crawl and scrape economy data for each match

# URL for HLTV event page
url = 'https://www.hltv.org/results?event=6586'

response = requests.get(url)
soup = bs(response.text, 'html.parser')

table = soup.find('div', {'class': 'results-all'})
rows = table.find_all('div', {'class': 'result-con'})

# For each element in rows, get the href attribute
links = [row.find('a')['href'] for row in rows]

# Extract match codes for future pandas table
match_codes = [link.split('/')[2] for link in links]

# Create a list of full links for each match
base_url = 'https://www.hltv.org/'
full_links = [base_url + link for link in links]

In [4]:
# General Stats
def stats_link(link: str) -> str:
    '''Finds the detailed stats page for a given match link'''

    response = requests.get(link)
    soup = bs(response.text, 'html.parser')
    
    # Stats match codes are different than on the event page.
    detailed_stats_div = soup.find('div', {'class': 'small-padding stats-detailed-stats'})
    stats_href = detailed_stats_div.find('a')['href']

    stats_link = base_url + stats_href

    return stats_link


# Helper functions for "scrape_table"
def get_teams(soup: bs) -> tuple:
    '''Gets the left and right team from a stats page.'''

    team_left = soup.find('div', {'class': 'team-left'}).find('a').text.lower()
    team_right = soup.find('div', {'class': 'team-right'}).find('a').text.lower()

    return team_left, team_right

def get_stats(soup: bs) -> list:
    '''Gets the left and right team summary stats from a stats page.'''

    stats = []

    results = soup.find_all('div', {'class': 'right'})

    for result in results:
        info = result.text.split(':')
        info = [text.strip() for text in info]
        stats.append(info)

    # We get 2 empty results at the beginning of the list so we remove them.
    stats = stats[2:]

    # Single map matches have 4 stats so we remove the first one.
    if len(stats) > 3:
        stats = stats[1:]

    return stats

def get_winner(soup: bs) -> str:
    '''Gets the winner of a match from a stats page.'''

    winner_soup = soup.find('div', {'class': 'stats-match-map-winner-logo-con'})

    try:
        # Get the title attribute of winner.
        winner = winner_soup.find('img')['title'].lower()
    except:
        winner = float('nan')

    return winner

def get_date(soup: bs) -> str:
    '''Gets the date of a match from a stats page.'''

    date = soup.find('div', {'class': 'small-text'}).text.split()[0]

    return date


def scrape_table(link: str) -> list:
    '''Scrapes the stats table for a given match link and returns data about the match.'''

    response = requests.get(link)
    soup = bs(response.text, 'html.parser')

    team_left, team_right = get_teams(soup)
    stats = get_stats(soup)
    winner = get_winner(soup)
    date = get_date(soup)

    # stats always returns categories in the same order.
    print(stats)
    ratings, openings, clutches = stats

    # Schema for column ordering:
    # date, winner, team_left, team_right, team_left_rating, team_right_rating, team_left_openings, team_right_openings, team_left_clutches, team_right_clutches
    data = [date, winner, team_left, team_right, float(ratings[0]), float(ratings[1]), int(openings[0]), int(openings[1]), int(clutches[0]), int(clutches[1])]

    return data

In [34]:
# Economic Stats
def get_econ_link(stat_link: str) -> str:
    '''Gets the link for the economy page for a given match.'''

    # To get to the economy page, we need to add "economy" between matches and the match code.
    econ_link = stat_link.replace('matches', 'matches/economy')

    return econ_link


def get_econ_map_links(econ_link: str) -> str:
    '''Gets the links for the economy page for each map.'''

    response = requests.get(econ_link)
    soup = bs(response.text, 'html.parser')

    map_soup = soup.find_all('a', {'class': 'col stats-match-map standard-box a-reset inactive'})
    map_links = []

    for ms in map_soup:
        map_link = ms['href']

        # Get the full link.
        map_link = base_url + map_link
        map_links.append(map_link)

    return map_links


# Helper functions for "scrape_map_econ"
def scrape_rows(rows: list) -> list:
    '''
    Goes through each row and scrapes the data. This is data for the
    amount of rounds played under each economic cateogory and the amount
    of each category won.
    '''

    stats = []

    for row in rows:
        # Find span with title = 'Played'
        played_won = row.find('span', {'title': 'Played'}).text
        played, won = played_won.split()

        # Remove the parentheses from the won string.
        won = won[1:-1]

        stats.append([played, won])

    return stats


def scrape_map_econ(map_link: str) -> dict:
    '''Scrapes the economy data for a given map link.'''

    response = requests.get(map_link)
    soup = bs(response.text, 'html.parser')

    # Will grab the two columns of data.
    stats_cols = soup.find_all('div', {'class': 'col standard-box stats-rows'})
    
    team_econ_stats = {}

    for col in stats_cols:
        team_name = col.find('span', {'class': 'label-and-text'}).text.lower()
        rows = col.find_all('div', {'class': 'stats-row team-economy-stat'})

        stats = scrape_rows(rows)

        team_econ_stats[team_name] = stats

    return team_econ_stats


# Helper function for "scrape_maps"
def aggr_econ_one_team(econ_data: list, team_name: str) -> np.array:
    '''Aggregates the economy data for a team.'''
    data = []

    for econ_dict in econ_data:
        map_data = econ_dict[team_name]
        data.append(np.array(map_data).astype(int))

    aggr_data = np.sum(data, axis=0)

    return aggr_data

def aggr_econ_data(econ_data: list):
    '''Aggregates the economy data for a match.'''

    team_one, team_two = econ_data[0].keys()

    # Get and aggregate the data for each team.
    team_one_data = aggr_econ_one_team(econ_data, team_one)
    team_two_data = aggr_econ_one_team(econ_data, team_two)

    return (team_one, team_one_data), (team_two, team_two_data)


def scrape_maps(econ_map_links: list) -> tuple:
    '''Scrapes the economy data for each map in a given match.'''

    econ_data = []

    for ml in econ_map_links:
        data = scrape_map_econ(ml)
        econ_data.append(data)

    team_one_dat, team_two_dat = aggr_econ_data(econ_data)
        
    return team_one_dat, team_two_dat, len(econ_map_links)

In [39]:
# Combining Scraped Data
def scrape_match_data(link: str):
    '''Scrapes the data for a given match.'''

    # Get the link for the stats page of the match.
    # This can be used directly with scrape_table.
    # Must be converted to the econ_link to get
    # economy data.
    match_link = stats_link(link)

    # Get the data for each map.
    # Schema: # date, winner, team_left, team_right, team_left_rating, team_right_rating, team_left_openings, team_right_openings, team_left_clutches, team_right_clutches
    stats_data = scrape_table(match_link)

    # Get team_left, and team_right for organization with economy data.
    team_left, team_right = stats_data[1], stats_data[2]

    # Get the proper links for scrapping economy data.
    econ_link = get_econ_link(match_link)
    econ_map_links = get_econ_map_links(econ_link)

    # Get the aggregated economy data for the match.
    # Schema: (team_one, team_one_data), (team_two, team_two_data)
    team_one, team_two, maps_played = scrape_maps(econ_map_links)

    # To fit with the schema of stats_data, we match team_left and team_right with team_one and team_two.
    # Then the schema added onto the match data will be:
    # team_left_eco, team_left_eco_won, team_left_semi_eco, team_left_semi_econ_won,
    # team_left_semi_buy, team_left_semi_buy_won, team_left_full_buy, team_left_full_buy_won,
    # team_right_eco, team_right_eco_won, team_right_semi_eco, team_right_semi_econ_won,
    # team_right_semi_buy, team_right_semi_buy_won, team_right_full_buy, team_right_full_buy_won
    if team_left == team_one[0]:
        team_left_econ = team_one[1]
        team_right_econ = team_two[1]
    else:
        team_left_econ = team_two[1]
        team_right_econ = team_one[1]

    # Flatten the economy data.
    team_left_econ = team_left_econ.flatten()
    team_right_econ = team_right_econ.flatten()

    # Add the economy data to the match data.
    stats_data.extend(team_left_econ)
    stats_data.extend(team_right_econ)

    # Add the number of maps played to the match data.
    stats_data.append(maps_played)

    return stats_data

In [40]:
def scrape_all_data(links: list):
    '''Scrapes the data for all matches in the given list of links.'''

    data = []
    counter = 0

    for link in links:
        try:
            data.append(scrape_match_data(link))
        except:
            pass
    
    return data

COLUMNS = ['date', 'winner', 't1', 't2', 't1_rating', 't2_rating', 't1_opening_duels', 't2_opening_duels', 't1_clutches', 't2_clutches',
           't1_eco', 't1_eco_won', 't1_semi_eco', 't1_semi_eco_won', 't1_semi_buy', 't1_semi_buy_won', 't1_full_buy', 't1_full_buy_won',
           't2_eco', 't2_eco_won', 't2_semi_eco', 't2_semi_eco_won', 't2_semi_buy', 't2_semi_buy_won', 't2_full_buy', 't2_full_buy_won', 'maps_played']

def create_df(links: list, cols: list=COLUMNS):
    '''Creates a dataframe from the scraped data.'''

    data = scrape_all_data(links)
    df = pd.DataFrame(data, columns=cols)
    
    return df

In [42]:
rio_data = create_df(full_links)

[['1.25', '0.85'], ['26', '23'], ['4', '0']]
[['1.05', '1.00'], ['44', '35'], ['6', '3']]
[['1.17', '0.85'], ['47', '31'], ['6', '3']]
[['1.13', '0.98'], ['40', '41'], ['6', '6']]
[['0.99', '1.10'], ['26', '28'], ['4', '7']]
[['1.08', '1.00'], ['42', '38'], ['3', '3']]
[['1.19', '0.88'], ['26', '25'], ['5', '1']]
[['1.01', '1.04'], ['37', '46'], ['4', '2']]
[['0.89', '1.12'], ['24', '31'], ['1', '1']]
[['1.14', '0.93'], ['37', '33'], ['4', '5']]
[['1.08', '0.95'], ['34', '38'], ['2', '0']]
[['1.32', '0.82'], ['31', '18'], ['5', '5']]
[['1.14', '0.92'], ['39', '23'], ['4', '1']]
[['0.88', '1.26'], ['21', '32'], ['4', '4']]
[['0.76', '1.24'], ['21', '31'], ['1', '1']]
[['1.45', '0.63'], ['28', '16'], ['1', '6']]
[['1.11', '0.97'], ['32', '25'], ['4', '5']]
[['1.29', '0.77'], ['26', '21'], ['1', '2']]
[['1.09', '0.97'], ['55', '46'], ['4', '6']]
[['0.82', '1.34'], ['19', '31'], ['5', '7']]
[['1.02', '1.01'], ['13', '17'], ['0', '1']]
[['0.71', '1.40'], ['9', '14'], ['3', '1']]
[['0.98', '

In [43]:
rio_data

Unnamed: 0,date,winner,t1,t2,t1_rating,t2_rating,t1_opening_duels,t2_opening_duels,t1_clutches,t2_clutches,...,t1_full_buy_won,t2_eco,t2_eco_won,t2_semi_eco,t2_semi_eco_won,t2_semi_buy,t2_semi_buy_won,t2_full_buy,t2_full_buy_won,maps_played
0,2022-11-13,outsiders,outsiders,heroic,1.25,0.85,26,23,4,0,...,26,5,0,5,0,9,2,26,12,2
1,2022-11-12,heroic,heroic,furia,1.05,1.0,44,35,6,3,...,25,4,0,4,0,12,5,47,30,3
2,2022-11-12,outsiders,outsiders,mouz,1.17,0.85,47,31,6,3,...,36,4,0,6,0,13,4,50,25,3
3,2022-11-11,furia,furia,natus vincere,1.13,0.98,40,41,6,6,...,37,6,0,8,3,9,5,53,26,3
4,2022-11-11,heroic,spirit,heroic,0.99,1.1,26,28,4,7,...,28,2,0,6,0,7,3,35,15,2
5,2022-11-10,mouz,mouz,cloud9,1.08,1.0,42,38,3,3,...,35,4,0,6,1,14,4,51,30,3
6,2022-11-10,outsiders,outsiders,fnatic,1.19,0.88,26,25,5,1,...,26,3,0,6,0,4,1,34,16,2
7,2022-11-08,spirit,liquid,spirit,1.01,1.04,37,46,4,2,...,34,5,0,4,0,10,7,58,30,3
8,2022-11-08,natus vincere,big,natus vincere,0.89,1.12,24,31,1,1,...,28,2,0,7,0,3,2,39,20,2
9,2022-11-08,mouz,mouz,ence,1.14,0.93,37,33,4,5,...,34,5,0,2,0,13,5,44,23,3


In [46]:
challenger_url = 'https://www.hltv.org/results?event=6588'

response = requests.get(challenger_url)
soup = bs(response.text, 'html.parser')

table = soup.find('div', {'class': 'results-all'})
rows = table.find_all('div', {'class': 'result-con'})

# For each element in rows, get the href attribute
challenger_links = [row.find('a')['href'] for row in rows]

# Extract match codes for future pandas table
challenger_match_codes = [link.split('/')[2] for link in challenger_links]

# Create a list of full links for each match
challenger_full_links = [base_url + link for link in challenger_links]

challenger_data = create_df(challenger_full_links)

[['1.10', '0.96'], ['34', '23'], ['3', '3']]
[['0.95', '1.13'], ['29', '29'], ['1', '4']]
[['1.04', '1.03'], ['39', '43'], ['4', '8']]
[['0.83', '1.23'], ['21', '30'], ['3', '6']]
[['0.72', '1.32'], ['17', '31'], ['1', '3']]
[['1.31', '0.73'], ['33', '14'], ['3', '1']]
[['1.25', '0.79'], ['31', '19'], ['3', '0']]
[['1.13', '0.96'], ['36', '21'], ['3', '1']]
[['1.26', '0.85'], ['34', '19'], ['3', '4']]
[['1.12', '0.99'], ['35', '40'], ['5', '7']]
[['0.85', '1.23'], ['23', '25'], ['4', '4']]
[['1.04', '1.05'], ['54', '46'], ['10', '4']]
[['1.14', '0.94'], ['38', '36'], ['9', '5']]
[['0.95', '1.16'], ['16', '19'], ['3', '0']]
[['1.39', '0.70'], ['15', '7'], ['1', '0']]
[['0.88', '1.19'], ['15', '21'], ['3', '1']]
[['0.93', '1.15'], ['10', '18'], ['1', '0']]
[['1.17', '0.92'], ['18', '18'], ['0', '4']]
[['1.06', '1.02'], ['15', '13'], ['0', '3']]
[['0.88', '1.20'], ['10', '16'], ['1', '0']]
[['1.06', '1.00'], ['10', '17'], ['2', '0']]
[['1.09', '0.93'], ['14', '13'], ['1', '1']]
[['0.84', 

In [47]:
# Export rio data to csv
rio_data.to_csv('legends_playoffs.csv', index=False)

# Export challenger data to csv
challenger_data.to_csv('challenger.csv', index=False)