In [147]:
import requests
import pandas as pd
import json
from bs4 import BeautifulSoup
import numpy as np

In [20]:
# Browser-like User-Agent header passed to every requests.get call in this notebook.
# NOTE(review): presumably needed because the site rejects the default client User-Agent -- confirm.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}

In [21]:
# Series page to scrape; this is the URL handed to get_match_dataframe below.
url = "https://www.dotabuff.com/esports/series/2379020-the-international-11-playoff-secret-vs-tundra"

In [22]:
# Exploratory fetch of the series page (3-second timeout).
# NOTE(review): `res` is not read by the helper functions; get_match_dataframe issues its own request.
res = requests.get(url, headers=headers, timeout=3)

In [172]:
def get_matches_section(soup):
    """Locate the match-list section of a parsed series page.

    Parameters
    ----------
    soup : object with a BeautifulSoup-style ``find`` method
        The parsed series page.

    Returns
    -------
    The result of ``soup.find`` for the ``<section>`` whose class attribute is
    ``series-matches recent-esports-matches series-show``.
    """
    section_class = 'series-matches recent-esports-matches series-show'
    matches_section = soup.find('section', attrs={'class': section_class})
    return matches_section

def get_teams(soup):
    """Collect the full team names found inside ``soup``.

    Parameters
    ----------
    soup : object with a BeautifulSoup-style ``find_all`` method
        Element containing the team spans (get_match_dataframe passes the
        matches section here).

    Returns
    -------
    list
        The ``.text`` of every ``<span>`` with class
        ``team-text team-text-full``, in the order ``find_all`` yields them.
    """
    name_spans = soup.find_all('span', attrs={'class': 'team-text team-text-full'})
    return [span.text for span in name_spans]

def get_winner(soup):
    """Return the text of the ``<td class="winner">`` cell of a game row.

    Parameters
    ----------
    soup : object with a BeautifulSoup-style ``find`` method
        A single table row (``<tr>``) of the matches table.

    Raises
    ------
    AttributeError
        If ``find`` returns ``None`` because the row has no winner cell.
    """
    winner_cell = soup.find('td', attrs={'class': 'winner'})
    return winner_cell.text

def get_match_id(soup):
    """Return the text of the ``<div class="match-link">`` element of a game row.

    The text is returned verbatim, without any parsing; in this notebook's
    displayed output it looks like ``'Game 1: 6832008209'``.

    Parameters
    ----------
    soup : object with a BeautifulSoup-style ``find`` method
        A single table row (``<tr>``) of the matches table.

    Raises
    ------
    AttributeError
        If ``find`` returns ``None`` because the row has no match link.
    """
    link_div = soup.find('div', attrs={'class': 'match-link'})
    return link_div.text

def get_side_and_first_draft(soup):
    """Read each team's side and first-pick marker from a game row.

    Every ``<td class="r-none-tablet cell-xxlarge">`` cell in the row
    contributes one entry to each returned list.

    Parameters
    ----------
    soup : object with a BeautifulSoup-style ``find_all`` method
        A single table row (``<tr>``) of the matches table.

    Returns
    -------
    tuple of (list, list)
        ``sides`` holds the text of the cell's ``the-radiant`` / ``the-dire``
        span, or ``''`` when neither is present. ``first_picks`` holds the
        text of the cell's ``<acronym>``, or ``''`` when there is none.
    """
    sides, first_picks = [], []

    for cell in soup.find_all('td', attrs={'class': 'r-none-tablet cell-xxlarge'}):
        radiant_span = cell.find('span', attrs={'class': 'the-radiant'})
        dire_span = cell.find('span', attrs={'class': 'the-dire'})
        pick_marker = cell.find('acronym')

        # When both spans exist, the dire span takes precedence.
        if dire_span:
            side = dire_span.text
        else:
            side = radiant_span.text if radiant_span else ''

        sides.append(side)
        first_picks.append(pick_marker.text if pick_marker else '')

    return sides, first_picks

def get_phrase_heroes(soup, phrase):
    """Extract ``(sequence, hero name)`` pairs for one draft phase of a game row.

    Parameters
    ----------
    soup : object with a BeautifulSoup-style ``find_all`` method
        A single table row (``<tr>``) of the matches table.
    phrase : str
        CSS class of the draft entries to collect (get_heroes passes
        ``'pick'`` and ``'ban'``).

    Returns
    -------
    list of (int, str)
        For each matching ``<div>``: the integer value of its ``seq`` div and
        the ``alt`` attribute of the hero image.
    """
    icon_class = 'image-container image-container-hero image-container-medicon'

    def _parse_entry(entry):
        # Keep the lookup order: sequence text, then icon container, then alt text.
        order_text = entry.find('div', attrs={'class': 'seq'}).text
        icon_container = entry.find('div', attrs={'class': icon_class})
        name = icon_container.find('img')['alt']
        return (int(order_text), name)

    draft_entries = soup.find_all('div', attrs={'class': phrase})
    return [_parse_entry(entry) for entry in draft_entries]

def get_heroes(soup):
    """Gather every drafted hero of a game row, picks first and then bans.

    Parameters
    ----------
    soup : object accepted by ``get_phrase_heroes``
        A single table row (``<tr>``) of the matches table.

    Returns
    -------
    list
        The entries returned by ``get_phrase_heroes`` for ``'pick'`` followed
        by those for ``'ban'``.
    """
    return [
        hero
        for phrase in ('pick', 'ban')
        for hero in get_phrase_heroes(soup, phrase)
    ]


def get_game_data(soup):
    """Assemble the scraped fields of one game row into a dictionary.

    Parameters
    ----------
    soup : object accepted by the row-level helpers
        A single table row (``<tr>``) of the matches table.

    Returns
    -------
    dict
        Keys ``winner``, ``match_id``, ``team1_side``, ``team2_side``,
        ``team1_pick``, ``team2_pick`` and ``heroes``.

    Raises
    ------
    IndexError
        If the row yields fewer than two side / first-pick entries.
    """
    game_winner = get_winner(soup)
    game_id = get_match_id(soup)
    team_sides, pick_markers = get_side_and_first_draft(soup)
    drafted = get_heroes(soup)

    # Index 0 is the first draft cell of the row, index 1 the second.
    team1_side, team2_side = team_sides[0], team_sides[1]
    team1_pick, team2_pick = pick_markers[0], pick_markers[1]

    return {
        'winner': game_winner,
        'match_id': game_id,
        'team1_side': team1_side,
        'team2_side': team2_side,
        'team1_pick': team1_pick,
        'team2_pick': team2_pick,
        'heroes': drafted,
    }

def get_hero_df(hero_pairs):
    """Turn ``(sequence, hero)`` pairs into a one-row DataFrame.

    Parameters
    ----------
    hero_pairs : iterable of (key, value)
        Draft entries as produced by ``get_heroes``; each key becomes a
        column label. A repeated key keeps the last value seen.

    Returns
    -------
    pandas.DataFrame
        A single row whose columns are the keys sorted with ``np.sort``.
    """
    single_row = {seq: [name] for seq, name in hero_pairs}
    frame = pd.DataFrame(single_row)
    ordered_columns = np.sort(frame.columns)
    return frame[ordered_columns]



def get_match_dataframe(url, timeout=3, parser='html.parser'):
    """Scrape a series page into a DataFrame with one row per played game.

    Parameters
    ----------
    url : str
        Address of the series page to download.
    timeout : float, optional
        Seconds to wait for the HTTP response (default 3, the value this
        notebook used previously).
    parser : str, optional
        Parser name handed to BeautifulSoup. Naming it explicitly keeps the
        parse independent of which optional parsers are installed.

    Returns
    -------
    pandas.DataFrame
        Columns ``winner``, ``match_id``, ``team1_side``, ``team2_side``,
        ``team1_pick``, ``team2_pick``, ``team1_name``, ``team2_name``,
        followed by one column per draft sequence number. When no game of the
        series has been played, an empty frame with the eight named columns
        is returned.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page lacks the matches section, its table body, or two team
        names.
    """
    # Uses the notebook-level `headers` dict for the User-Agent.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Stop here on an HTTP error instead of parsing an error page and failing
    # later with an AttributeError on None.
    response.raise_for_status()
    page = BeautifulSoup(response.content, parser)

    section = get_matches_section(page)
    if section is None:
        raise ValueError('matches section not found at {!r}'.format(url))

    table_body = section.find('tbody')
    if table_body is None:
        raise ValueError('matches table body not found at {!r}'.format(url))

    teams = get_teams(section)
    if len(teams) < 2:
        raise ValueError('expected two team names at {!r}, found {}'.format(url, len(teams)))

    # Rows flagged "not-played" carry no draft data, so leave them out.
    rows_data = [
        get_game_data(row)
        for row in table_body.find_all('tr')
        if not row.find('td', attrs={'class': 'not-played'})
    ]

    if not rows_data:
        # With nothing played there is no 'heroes' column to drop and nothing
        # to concatenate; return a well-formed empty frame instead of raising.
        return pd.DataFrame(columns=[
            'winner', 'match_id',
            'team1_side', 'team2_side',
            'team1_pick', 'team2_pick',
            'team1_name', 'team2_name',
        ])

    draft = pd.concat(
        [get_hero_df(row_d['heroes']) for row_d in rows_data],
        ignore_index=True,
    )

    games = pd.DataFrame(rows_data).drop(columns=['heroes'])
    games['team1_name'] = teams[0]
    games['team2_name'] = teams[1]

    # Both frames share a fresh 0..n-1 index, so the join lines up row by row.
    return games.join(draft)

In [173]:
# Scrape the series page at `url` into a frame with one row per played game.
df = get_match_dataframe(url)

In [174]:
# Display the scraped series frame as the cell output.
df

Unnamed: 0,winner,match_id,team1_side,team2_side,team1_pick,team2_pick,team1_name,team2_name,1,2,...,15,16,17,18,19,20,21,22,23,24
0,Tundra,Game 1: 6832008209,The Radiant,The Dire,1st Pick,,Team Secret,Tundra Esports,Doom,Marci,...,Hoodwink,Enigma,Lich,Naga Siren,Lycan,Bloodseeker,Night Stalker,Drow Ranger,Pudge,Tidehunter
1,Tundra,Game 2: 6832140410,The Radiant,The Dire,,1st Pick,Team Secret,Tundra Esports,Nyx Assassin,Broodmother,...,Chen,Chaos Knight,Phoenix,Bristleback,Pangolier,Viper,Batrider,Hoodwink,Arc Warden,Morphling
2,Tundra,Game 3: 6832287527,The Dire,The Radiant,1st Pick,,Team Secret,Tundra Esports,Doom,Monkey King,...,Pangolier,Silencer,Naga Siren,Beastmaster,Troll Warlord,Shadow Shaman,Lifestealer,Clockwerk,Ember Spirit,Medusa
