In [50]:
"""Scrapes a WC3 profile page from battlenet and returns json."""
from bs4 import BeautifulSoup
import requests



def _get_soup(player=None, server=None):
    """Fetch a player's ladder profile page and parse it.

    Sends a GET request to the classic Battle.net Warcraft III profile URL,
    passing ``server`` as the ``Gateway`` query parameter and ``player`` as
    ``PlayerName``, and parses the response body with BeautifulSoup using the
    ``lxml`` parser.

    Args:
        player: Value sent as the ``PlayerName`` query parameter.
        server: Value sent as the ``Gateway`` query parameter.

    Returns:
        The parsed BeautifulSoup document. When the server answers with an
        HTTP error status the error is printed and the body of that response
        is still parsed and returned (best effort).

    Raises:
        requests.exceptions.RequestException: If no response was received at
            all (connection failure, invalid URL, ...).
    """
    url = 'http://classic.battle.net/war3/ladder/w3xp-player-profile.aspx'
    params = {'Gateway': server, 'PlayerName': player}
    headers = {
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.8',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
    }
    cookies = {
        'cookies': 'true',
        'loc': 'en-us',
        'optimizelyEndUserId': 'oeu1541036470495r0.7790622100412552',
        '_ga': 'GA1.2.1909275158.1540339822',
        '_gid': 'GA1.2.612761280.1541291293',
        '__utmt': '1',
        '__utmt_~1': '1',
        '__utma': '23057448.1909275158.1540339822.1541291289.1541293521.10',
        '__utmb': '23057448.12.10.1541293521',
        '__utmc': '23057448',
        '__utmz': '23057448.1541291289.9.2.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided)',
    }

    try:
        r = requests.get(url, params=params, headers=headers, cookies=cookies)
    except requests.exceptions.RequestException as e:
        # Without a response there is nothing to parse; re-raise the real
        # network error instead of failing later on an unbound variable.
        print(e)
        raise

    try:
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        # A response exists even though its status is an error: report it and
        # carry on parsing the body.
        print(e)

    return BeautifulSoup(r.content, 'lxml')


def _get_stats_tables(soup):
    """Locate the three top-level sections of a parsed profile page.

    Looks up the ``<table class="mainTable">`` element and, inside it, every
    ``<td align="center" valign="top">`` cell. The first three such cells are
    returned under the keys ``'stats'``, ``'individual'`` and ``'team'``.

    Raises:
        AttributeError: If the main table is not found.
        IndexError: If fewer than three matching cells exist.
    """
    main_table = soup.find('table', {'class': 'mainTable'})
    cells = main_table.find_all('td', {'align': 'center', 'valign': 'top'})

    stats_cell = cells[0]
    individual_cell = cells[1]
    team_cell = cells[2]

    return {'stats': stats_cell, 'individual': individual_cell, 'team': team_cell}


def _get_individual_data(table_player):
    """Collect per-game-type statistics from the individual-games section.

    For each heading text ('Team Games', 'Solo Games', 'FFA Games') found in
    ``table_player``, the element five parents above the heading is taken as
    that game type's table. The text of its ``<b>`` tags is parsed with the
    'games' layout, the level is computed, and the win percentage is added.

    Returns:
        A dict keyed by 'random_team', 'solo' and 'free_for_all' (only the
        game types present on the page), each mapping to a dict of stats.
    """
    headings = (
        ('Team Games', 'random_team'),
        ('Solo Games', 'solo'),
        ('FFA Games', 'free_for_all'),
    )
    results = {}

    for heading_text, result_key in headings:
        heading = table_player.find(text=heading_text)
        if not heading:
            continue

        # Climb five levels from the heading text to its enclosing table.
        section = heading
        for _ in range(5):
            section = section.parent

        bold_texts = [tag.get_text() for tag in section.find_all('b')]
        stats = _parse_values('games', bold_texts)
        stats['level'] = get_level(section, stats['level_base'])
        del stats['level_base']
        stats['win_percentage'] = calc_win_percentage(stats['wins'], stats['losses'])
        results[result_key] = stats

    return results


def _parse_values(type_, values):
    """Turn a flat list of extracted values into a dict of named fields.

    ``type_`` selects a layout in ``data_positions``. For every known field
    the layout gives the index of the raw value in ``values`` and an optional
    converter. Fields whose layout entry is falsy, or whose raw value is
    falsy, are left out of the result.

    Returns:
        A dict mapping field names to converted (or raw) values.
    """
    layout = data_positions[type_]
    parsed = {}

    for field in ('wins', 'losses', 'partners', 'level_base', 'rank', 'experience'):
        spec = layout[field]
        if not spec:
            continue

        raw = values[spec['position']]
        if not raw:
            continue

        convert = spec['function']
        parsed[field] = convert(raw) if convert else raw

    return parsed


def _get_team_data(table_teams):
    """Collect statistics for every arranged team listed on the page.

    Each occurrence of the text 'Partner(s):' marks one team; the element
    eight parents above it is taken as that team's table. Its values are
    extracted, parsed with the 'teams' layout, and completed with the
    computed level and win percentage.

    Returns:
        A list with one stats dict per team, in page order.
    """
    results = []

    for marker in table_teams.find_all(text='Partner(s):'):
        # Climb eight levels from the marker text to the team's table.
        team_table = marker
        for _ in range(8):
            team_table = team_table.parent

        stats = _parse_values('teams', extract_values(team_table))
        stats['level'] = get_level(team_table, stats['level_base'])
        del stats['level_base']
        stats['win_percentage'] = calc_win_percentage(stats['wins'], stats['losses'])
        results.append(stats)

    return results


def get_data(soup):
    """Extract individual and team statistics from a parsed profile page.

    Returns:
        A dict with the keys 'individual' (per-game-type stats) and 'team'
        (list of per-team stats).
    """
    sections = _get_stats_tables(soup)
    individual = _get_individual_data(sections['individual'])
    team = _get_team_data(sections['team'])
    return {'individual': individual, 'team': team}


def make_int(x):
    """Convert ``x`` to an int using the built-in ``int`` constructor."""
    number = int(x)
    return number


def get_level_base(value):
    """Return the integer found after the last tab character in ``value``.

    When ``value`` contains no tab, the whole string is converted.
    """
    _, _, tail = value.rpartition('\t')
    return int(tail)


def get_level_decimal(table):
    """Return the experience-bar fill of ``table`` as a fraction.

    Finds the ``<td>`` whose background is the experience-bar image, reads
    the ``width`` attribute of the ``<img>`` inside it (e.g. ``'45%'``) and
    converts it to a fraction (``0.45``).

    The width is divided by 100 rather than pasted after ``'0.'``, so that
    single-digit and three-digit widths are scaled correctly
    (``'5%'`` -> 0.05, ``'100%'`` -> 1.0).

    Raises:
        AttributeError: If the bar cell or its image is missing.
        ValueError: If the width is not numeric.
    """
    bar_cell = table.find('td', {'background': '/war3/images/ladder/expbar-bg.gif'})
    width = bar_cell.find('img').get('width')
    return float(width.replace('%', '')) / 100


def get_level(table, level_base):
    """Combine the base level with the experience-bar fraction of ``table``.

    Returns ``level_base - 1`` plus twice the fraction reported by
    ``get_level_decimal(table)``.
    """
    fraction = get_level_decimal(table)
    return level_base - 1 + (fraction * 2)


def get_rank(value):
    """Parse a rank string into an int.

    Returns ``None`` for the literal ``'Unranked'``; otherwise the last two
    characters are dropped and the remainder is converted to an int
    (e.g. ``'12th'`` -> 12).
    """
    if value != 'Unranked':
        digits = value[:-2]
        return int(digits)
    return None


def extract_values(table):
    """Return the text of every ``<b>`` tag in ``table`` as a list.

    The fourth ``<b>`` tag (index 3) is handled separately: the text of each
    of its children is collected into a list of partner names. When there is
    more than one child, empty strings are dropped from that list.

    All empty entries are filtered out (not just the first), and a missing
    empty entry is not an error, so teams with any number of partners are
    handled.

    Returns:
        A list of strings, with a list of partner names at index 3 when the
        table has at least four ``<b>`` tags.
    """
    values = []

    for index, tag in enumerate(table.find_all('b')):
        if index != 3:
            values.append(tag.get_text())
            continue

        partners = [child.get_text() for child in tag]
        if len(partners) > 1:
            partners = [name for name in partners if name != '']
        values.append(partners)

    return values


def calc_win_percentage(wins, losses):
    """Return the share of wins as a percentage rounded to two decimals.

    Both arguments are converted with ``int`` first, so numeric strings are
    accepted.

    Returns:
        ``100 * wins / (wins + losses)`` rounded to two decimals, or ``0.0``
        when no games have been played (avoids a ZeroDivisionError).
    """
    wins = int(wins)
    total_games = wins + int(losses)
    if total_games == 0:
        return 0.0
    return round((100 * wins) / total_games, 2)


def validate_player(player=None, soup=None):
    """Raise if the parsed page contains an error marker.

    The page is considered invalid when it contains a ``<span>`` with the
    CSS class ``colorRed``.

    Raises:
        Exception: With the message ``'Invalid player: <player>'``.
    """
    if soup.find('span', class_='colorRed') is None:
        return
    raise Exception('Invalid player: {}'.format(player))


def validate_server(server):
    """Raise unless ``server`` names a known gateway (case-insensitive).

    Raises:
        Exception: With the message ``'Invalid server: <server>'``.
    """
    known_servers = ('azeroth', 'lordaeron', 'northrend', 'kalimdor')
    if server.lower() in known_servers:
        return
    raise Exception('Invalid server: {}'.format(server))

# Layouts used by _parse_values. For each layout ('teams' or 'games') and each
# output field, 'position' is the index of the raw value in the extracted list
# and 'function' is the converter applied to it (None keeps the raw value).
# A field mapped to None is not available in that layout.
data_positions = {
    'teams': {
        'wins': {'position': 0, 'function': make_int},
        'losses': {'position': 1, 'function': make_int},
        'level_base': {'position': 2, 'function': get_level_base},
        'partners': {'position': 3, 'function': None},
        'rank': {'position': 4, 'function': get_rank},
        'experience': None,
    },
    'games': {
        'wins': {'position': 4, 'function': make_int},
        'losses': {'position': 5, 'function': make_int},
        'level_base': {'position': 1, 'function': get_level_base},
        'partners': None,
        'rank': {'position': 3, 'function': get_rank},
        'experience': {'position': 2, 'function': None},
    },
}

In [48]:
# Fetch and parse the profile page of player 'wearefoals' on the 'northrend' gateway.
soup = _get_soup(player='wearefoals', server='northrend')

In [49]:
str(soup)

'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">\n<html>\n<head>\n<title>Frozen Throne - Northrend - wearefoals - Player Profile</title>\n<link href="/war3/includes/war3-human-ie.css" rel="stylesheet" type="text/css"/>\n<link href="war3-ladder-profiles.css" rel="stylesheet" type="text/css"/>\n</head>\n<body bgcolor="#000000" leftmargin="0" marginheight="0" marginwidth="0" topmargin="0">\n<div style="POSITION: absolute; WIDTH: 100%; TEXT-ALIGN: center; TOP: 85px;"><center><div style="width: 650px;">\n<small><a href="w3xp-ladders.aspx?Gateway=Northrend">Ladders Home</a> &gt; <b>Player Profile</b> | \r\n\t\t\t\t\t\t <a href="w3xp-player-stats.aspx?Gateway=Northrend&amp;PlayerName=wearefoals">Player Statistics</a> | \r\n\t\t\t\t\t\t <a href="w3xp-player-reports-overall.aspx?Gateway=Northrend&amp;PlayerName=wearefoals">Overall Reports</a> |\r\n\t\t\t\t\t\t <a href="w3xp-player-reports-map-by-race.aspx?Gateway=Northrend&amp;PlayerName=wearefoals">Map by Race Reports</a> <br/>\

In [37]:
# Dump the parsed page to disk for offline inspection. The encoding is given
# explicitly so non-ASCII characters in the page do not depend on the
# platform's default encoding.
with open("output1.html", "w", encoding="utf-8") as out_file:
    out_file.write(str(soup))

In [36]:
soup.string

In [31]:
soup.find('wearefoals')

In [18]:
# Split the parsed page into its 'stats', 'individual' and 'team' cells.
tables = _get_stats_tables(soup)

In [26]:
soup.find('span', class_='rankingData').parent.find_all('b')

[<b>wearefoals</b>,
 <b><a href="w3xp-clan-profile.aspx?Gateway=Azeroth&amp;ClanTag=aG">aG</a></b>,
 <b></b>]

In [51]:
import requests





# Query-string parameters for the profile request, as (name, value) pairs.
params = (
    ('Gateway', 'Azeroth'),
    ('PlayerName', 'WEAREFOALS'),
)

# NOTE(review): `headers` and `cookies` are not defined in this cell; presumably
# they come from an earlier cell of the session — confirm before re-running.
response = requests.get('http://classic.battle.net/war3/ladder/w3xp-player-profile.aspx', headers=headers, params=params, cookies=cookies)

#NB. Original query string below. It seems impossible to parse and
#reproduce query strings 100% accurately so the one below is given
#in case the reproduced version is not "correct".
# response = requests.get('http://classic.battle.net/war3/ladder/w3xp-player-profile.aspx?Gateway=Azeroth&PlayerName=WEAREFOALS', headers=headers, cookies=cookies)


In [52]:
response.content

b'\r\n<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" >\r\n<HTML>\r\n\t<HEAD>\r\n\t\t<title>Frozen Throne - Azeroth - WEAREFOALS - Player Profile</title>\r\n\t\t<link rel="stylesheet" type="text/css" href="/war3/includes/war3-human-ie.css">\r\n\t\t<link rel="stylesheet" type="text/css" href="war3-ladder-profiles.css">\r\n\r\n\t</HEAD>\r\n\t<body bgcolor="#000000" marginheight="0" marginwidth="0" topmargin="0" leftmargin="0">\r\n\t\t\r\n\t\t\t\t\t\t<DIV style = "POSITION: absolute; WIDTH: 100%; TEXT-ALIGN: center; TOP: 85px;"><center><div style = "width: 650px;">\r\n\t\t\t\t\t\t<small><a href = "w3xp-ladders.aspx?Gateway=Azeroth">Ladders Home</a> > <b>Player Profile</b> | \r\n\t\t\t\t\t\t <a href = "w3xp-player-stats.aspx?Gateway=Azeroth&PlayerName=WEAREFOALS">Player Statistics</a> | \r\n\t\t\t\t\t\t <a href = "w3xp-player-reports-overall.aspx?Gateway=Azeroth&PlayerName=WEAREFOALS">Overall Reports</a> |\r\n\t\t\t\t\t\t <a href = "w3xp-player-reports-map-by-race.aspx?Gatewa