In [1]:

%run ../../load_magic/storage.py
%run ../../load_magic/dataframes.py
%pprint
%who

DATA_FOLDER: ../data/
SAVES_FOLDER: ../saves/
Pretty printing has been turned OFF
DATA_CSV_FOLDER	 DATA_FOLDER	 ENCODING_TYPE	 SAVES_CSV_FOLDER	 SAVES_FOLDER	 SAVES_PICKLE_FOLDER	 attempt_to_pickle	 example_iterrows	 get_column_descriptions	 
load_csv	 load_dataframes	 load_object	 math	 os	 pd	 pickle	 save_dataframes	 store_objects	 
sys	 


In [2]:

import pandas as pd

html_dir = os.path.join(DATA_FOLDER, 'html')
html_path = os.path.join(html_dir, 'aoestats.io_stats.html')
win_rate_df = pd.read_html(html_path)[0]
win_rate_df.set_index('Civilization', drop=True, inplace=True)
for column_name in ['Win Rate', 'Play Rate']:
    win_rate_df[column_name] = win_rate_df[column_name].map(lambda x: float(x.split('%')[0]))

def get_hours(x):
    
    return int(x.split(':')[0])

def get_minutes(x):
    
    return int(x.split(':')[1])

def get_seconds(x):
    
    return int(x.split(':')[0])
    
for column_name in ['Feudal Time', 'Castle Time', 'Imperial Time', 'Game Length']:
    win_rate_df['{} Hours'.format(column_name)] = win_rate_df[column_name].map(get_hours)
    win_rate_df['{} Minutes'.format(column_name)] = win_rate_df[column_name].map(get_minutes)
    win_rate_df['{} Seconds'.format(column_name)] = win_rate_df[column_name].map(get_seconds)
win_rate_df.sample(10).T

Civilization,Goths,Portuguese,Vikings,Italians,Britons,Chinese,Turks,Byzantines,Vietnamese,Magyars
Win Rate,48.93,40.52,55.37,43.73,51.68,46.98,46.51,49.56,40.13,50.9
Play Rate,6.94,5.62,6.86,5.6,7.05,6.82,6.87,6.92,5.61,5.88
Score,6102,5834,6468,6048,6246,6421,6089,6614,5843,6229
Research,20,19,22,21,20,22,20,22,20,22
Feudal Time,0:11:24,0:11:10,0:11:29,0:11:00,0:11:10,0:11:29,0:11:12,0:11:20,0:11:16,0:10:50
Castle Time,0:24:02,0:24:10,0:23:49,0:23:56,0:24:06,0:23:57,0:23:40,0:24:09,0:24:38,0:23:45
Imperial Time,0:40:36,0:40:07,0:39:11,0:38:45,0:40:15,0:39:35,0:39:17,0:39:20,0:39:21,0:40:44
Game Length,0:40:47,0:40:11,0:41:06,0:40:47,0:40:40,0:41:20,0:40:41,0:42:20,0:40:01,0:40:29
Feudal Time Hours,0,0,0,0,0,0,0,0,0,0
Feudal Time Minutes,11,11,11,11,11,11,11,11,11,10


In [3]:

win_rate_df.columns

Index(['Win Rate', 'Play Rate', 'Score', 'Research', 'Feudal Time',
       'Castle Time', 'Imperial Time', 'Game Length', 'Feudal Time Hours',
       'Feudal Time Minutes', 'Feudal Time Seconds', 'Castle Time Hours',
       'Castle Time Minutes', 'Castle Time Seconds', 'Imperial Time Hours',
       'Imperial Time Minutes', 'Imperial Time Seconds', 'Game Length Hours',
       'Game Length Minutes', 'Game Length Seconds'],
      dtype='object')

In [4]:

from sklearn.ensemble import RandomForestRegressor

columns_list = ['Play Rate', 'Score', 'Research', 'Feudal Time Hours',
       'Feudal Time Minutes', 'Feudal Time Seconds', 'Castle Time Hours',
       'Castle Time Minutes', 'Castle Time Seconds', 'Imperial Time Hours',
       'Imperial Time Minutes', 'Imperial Time Seconds', 'Game Length Hours',
       'Game Length Minutes', 'Game Length Seconds']
X = win_rate_df[columns_list].values
y = win_rate_df['Win Rate'].values
regr = RandomForestRegressor(max_depth=2, n_estimators=100)
regr.fit(X, y)
sorted([(cn, ci) for cn, ci in zip(columns_list, regr.feature_importances_)], key=lambda x: x[1],
       reverse=True)

[('Play Rate', 0.5736182784103575), ('Score', 0.19062673753418735), ('Research', 0.1403321408079184), ('Imperial Time Minutes', 0.05700559972277815), ('Game Length Minutes', 0.027047976080529626), ('Castle Time Minutes', 0.008641044223944147), ('Feudal Time Minutes', 0.00272822322028475), ('Feudal Time Hours', 0.0), ('Feudal Time Seconds', 0.0), ('Castle Time Hours', 0.0), ('Castle Time Seconds', 0.0), ('Imperial Time Hours', 0.0), ('Imperial Time Seconds', 0.0), ('Game Length Hours', 0.0), ('Game Length Seconds', 0.0)]

In [5]:

selector_str = '#Britons > div.Overview > ul > li.bonuses > ul'
xpath_str = '//*[@id="Britons"]/div[1]/ul/li[5]/ul'
html_str = '''<li>
    Civilization bonuses:
    <ul>
        <li>Town Centers cost -50% wood starting in the Castle Age.</li>
        <li>Foot archers (except Skirmishers) have +1/+2 range in the Castle/Imperial Age.</li>
        <li>Shepherds work 25% faster.</li>
    </ul>
</li>'''
html_folder = os.path.join(DATA_FOLDER, 'html')
file_path = os.path.join(html_folder, 'aoe2_civ_bonuses.html')
with open(file_path, 'rb') as f:
    html_doc = f.read().decode('utf-8')

In [6]:

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'html.parser')

In [7]:

for civ in soup.find_all('h3')[:3]:
    print(civ, civ.next_element)

<h3><span>Britons</span></h3> <span>Britons</span>
<h3><span>Byzantines</span></h3> <span>Byzantines</span>
<h3><span>Celts</span></h3> <span>Celts</span>


In [8]:

soup.select(selector_str)

[<ul class="bonuses">
<li>Town Centers cost -50% wood starting in the Castle Age.</li>
<li>Foot archers (except Skirmishers) have +1/+2 range in the Castle/Imperial Age.</li>
<li>Shepherds work 25% faster.</li>
</ul>]

In [9]:

sub_selector = 'div.Overview > ul > li.bonuses > ul'
civ_name_list = []
for civ_div in soup.find_all('div', id=True):
    civ_name = civ_div.get_attribute_list('id')[0]
    print(civ_name)
    bonuses_ul = civ_div.select(sub_selector)[0]
    for bonus_li in bonuses_ul.find_all('li'):
        if bonus_li.string:
            print(bonus_li.string)
    print()

Britons
Town Centers cost -50% wood starting in the Castle Age.
Foot archers (except Skirmishers) have +1/+2 range in the Castle/Imperial Age.
Shepherds work 25% faster.

Byzantines
Buildings have +10%/+20%/+30%/+40% HP in the Dark/Feudal/Castle/Imperial Age.
Camels, Skirmishers, and the Spearman lines are 25% cheaper.
Fire Ships attack 20% faster.
Advancing to the Imperial Age is 33% cheaper.
Town Watch is free.

Celts
Infantry units move 15% faster.
Lumberjacks work 15% faster.
Siege weapons fire +20% faster.
Enemy herdables can be converted regardless of enemy units next to them.

Chinese
Town Center supports ten population (instead of five).
Technologies are 10%/15%/20% cheaper in the Feudal/Castle/Imperial Age.
Demolition Ships have +50% HP.

Franks
Farm upgrades are free.
Castles are 25% cheaper.
Cavalry have +20% HP.
Foragers work 25% faster.

Goths
Infantry are 35% cheaper starting in the Feudal Age.
Infantry have +1 attack bonus against standard buildings.
+10 population cap i

In [10]:

sub_selector = 'div.Overview > ul > li.bonuses > ul'
for civ_div in soup.find_all('div', id=True):
    civ_name = civ_div.get_attribute_list('id')[0]
    bonuses_ul = civ_div.select(sub_selector)[0]
    for bonus_li in bonuses_ul.find_all('li'):
        if bonus_li.string:
            print(bonus_li.string)

Town Centers cost -50% wood starting in the Castle Age.
Foot archers (except Skirmishers) have +1/+2 range in the Castle/Imperial Age.
Shepherds work 25% faster.
Buildings have +10%/+20%/+30%/+40% HP in the Dark/Feudal/Castle/Imperial Age.
Camels, Skirmishers, and the Spearman lines are 25% cheaper.
Fire Ships attack 20% faster.
Advancing to the Imperial Age is 33% cheaper.
Town Watch is free.
Infantry units move 15% faster.
Lumberjacks work 15% faster.
Siege weapons fire +20% faster.
Enemy herdables can be converted regardless of enemy units next to them.
Town Center supports ten population (instead of five).
Technologies are 10%/15%/20% cheaper in the Feudal/Castle/Imperial Age.
Demolition Ships have +50% HP.
Farm upgrades are free.
Castles are 25% cheaper.
Cavalry have +20% HP.
Foragers work 25% faster.
Infantry are 35% cheaper starting in the Feudal Age.
Infantry have +1 attack bonus against standard buildings.
+10 population cap in the Imperial Age.
Fishing Ships have double HP, +

In [11]:

sub_selector = 'div.Overview > ul > li.bonuses > ul'
civ_name_list = []
for civ_div in soup.find_all('div', id=True):
    civ_name = civ_div.get_attribute_list('id')[0]
    civ_name_list.append(civ_name.strip())

In [14]:

import re

line_regex = re.compile('[\r\n]+', re.MULTILINE)
with open(file_path, 'rb') as f:
    feature_list = line_regex.split(f.read().decode('utf-8').strip())

In [15]:

features_df = pd.DataFrame(data=None, index=civ_name_list, columns=feature_list)

In [16]:

csv_folder = os.path.join(DATA_FOLDER, 'csv')
file_path = os.path.join(csv_folder, 'aoe2_civ_features.csv')
features_df.to_csv(file_path)

In [17]:

import urllib.request

civ_soup_dict = {}
for civ_name in civ_name_list:
    try:
        civ_url = 'https://ageofempires.fandom.com/wiki/{}_(Age_of_Empires_II)'.format(civ_name)
        with urllib.request.urlopen(civ_url) as response:
            civ_html = response.read()
    except:
        civ_url = 'https://ageofempires.fandom.com/wiki/{}'.format(civ_name)
        with urllib.request.urlopen(civ_url) as response:
            civ_html = response.read()
    civ_soup = BeautifulSoup(civ_html, 'html.parser')
    civ_soup_dict[civ_name] = civ_soup

In [None]:

for i, civ_name in enumerate(civ_soup_dict):
    print(civ_name)
    bonuses_selector = '#Civilization_bonuses'
    bonuses_span = civ_soup_dict[civ_name].select(bonuses_selector)[0]
    bonuses_ul = list(bonuses_span.next_elements)[6]
    for bonus_li in bonuses_ul.find_all('li'):
        print(bonus_li.text.strip())
    print()

In [19]:

txt_folder = os.path.join(DATA_FOLDER, 'txt')
file_path = os.path.join(txt_folder, 'aoe2_civ_features.txt')
with open(file_path, 'wb') as f:
    f.write('\n'.encode('utf-8'))
    for i, civ_name in enumerate(civ_soup_dict):
        bonuses_selector = '#Civilization_bonuses'
        bonuses_span = civ_soup_dict[civ_name].select(bonuses_selector)[0]
        bonuses_ul = list(bonuses_span.next_elements)[6]
        for bonus_li in bonuses_ul.find_all('li'):
            f.write('{}\n'.format(bonus_li.text.strip()).encode('utf-8'))


Make sure to clean up the list before running the cell below to load it back in.

In [None]:

import re

line_regex = re.compile('[\r\n]+', re.MULTILINE)
with open(file_path, 'rb') as f:
    feature_list = line_regex.split(f.read().decode('utf-8').strip())

In [None]:

space_regex = re.compile('\s+')
feature_list = [space_regex.sub('_', feature.strip()) for feature in feature_list]

In [None]:

features_df = pd.DataFrame(data=None, index=civ_name_list, columns=feature_list)

In [None]:

xlsx_folder = os.path.join(DATA_FOLDER, 'xlsx')
os.makedirs(name=xlsx_folder, exist_ok=True)
file_path = os.path.join(xlsx_folder, 'aoe2_civ_features.xlsx')
features_df.to_excel(file_path)


# More webscraping

In [22]:

wiki_url = 'https://ageofempires.fandom.com/wiki/'
tables_url = '{}Villager_(Age_of_Empires_II)'.format(wiki_url)
tables_df_list = pd.read_html(tables_url)
print(len(tables_df_list))
villager_gathering_rate_df = tables_df_list[1]

4


In [21]:

villager_gathering_rate_df

Unnamed: 0,Villager type,Resource,Gathering rate (resources/second)
0,Farmer,Food,0.53*
1,Fisherman,Food,0.43
2,Hunter,Food,0.41
3,Lumberjack,Wood,0.39
4,Gold Miner,Gold,0.38
5,Stone Miner,Stone,0.36
6,Shepherd,Food,0.33
7,Forager,Food,0.31


In [None]:

tables_url = '{}Civilizations_(Age_of_Empires_II)'.format(wiki_url)
tables_df_list = pd.read_html(tables_url)
len(tables_df_list)

In [None]:

tables_df_list[0]

In [None]:

tables_df_list[2]

In [None]:

file_path = os.path.join(html_folder, 'military_unit_links.html')
with open(file_path, 'rb') as f:
    html_doc = f.read().decode('utf-8')
soup = BeautifulSoup(html_doc, 'html.parser')


# Scraping the Availability Stats

In [52]:

summary_box_url = 'https://ageofempires.fandom.com/wiki/Age_of_Empires_II:_The_Age_of_Kings'
with urllib.request.urlopen(summary_box_url) as response:
    summary_box_html = response.read()
summary_box_soup = BeautifulSoup(summary_box_html, 'html.parser')
summary_box_selector = '#mw-content-text > table[class="navbox"]:nth-child(114)'
summary_box_table = summary_box_soup.select(summary_box_selector)[0]

In [226]:

tr_soup_dict = {}
tr_list = []
th_list = ['The Age of Kings', 'Infantry', 'Archers', 'Cavalry', 'Siege', 'Navy', 'Unique',
           'The Conquerors', 'The Forgotten', 'The African Kingdoms', 'Rise of the Rajas']
military_units_list = ['Infantry', 'Archers', 'Cavalry', 'Siege', 'Navy', 'Unique']
heading_list = []
for summary_box_th in summary_box_table.find_all('th'):
    for heading in th_list:
        if heading in summary_box_th.text:
            tr_soup_dict[heading] = summary_box_th.parent
            if heading in military_units_list:
                tr_list.append(tr_soup_dict[heading])
            else:
                tr_list.append(tr_soup_dict[heading].next_sibling)
            heading_list.append(heading)

In [216]:

def get_link_html(link):
    link_url = 'https://ageofempires.fandom.com{}'.format(link['href'])
    if '_(Age_of_Empires_II)' in link_url:
        with urllib.request.urlopen(link_url) as response:
            link_html = response.read()
    else:
        try:
            extended_link_url = '{}_(Age_of_Empires_II)'.format(link_url)
            with urllib.request.urlopen(extended_link_url) as response:
                link_html = response.read()
            link_url = extended_link_url
        except:
            with urllib.request.urlopen(link_url) as response:
                link_html = response.read()
    
    return link_url, link_html

In [265]:

def process_all_except(appl_civ_text, row_dict):
    for civ_name in civ_name_list:
        if civ_name in appl_civ_text:
            row_dict[civ_name] = 0
        else:
            row_dict[civ_name] = 1
    
    return row_dict

In [266]:

def process_article(link_soup, row_dict):
    art_table = art_table_list[0]
    art_ul_list = art_table.find_all('ul')
    assert len(art_ul_list) == 2

    # Get available civs
    available_ul = art_ul_list[0]
    for link in available_ul.find_all('a'):
        civ_name = link['title'].split('(')[0].strip()
        row_dict[civ_name] = 1

    # Get unavailable civs
    unavailable_ul = art_ul_list[1]
    for link in unavailable_ul.find_all('a'):
        civ_name = link['title'].split('(')[0].strip()
        row_dict[civ_name] = 0

    return row_dict

In [267]:

space_regex = re.compile(r'\s+')
def process_small_list_of_units(appl_civ_text, row_dict):
    appl_civ_list = space_regex.split(appl_civ_text)
    for civ_name in civ_name_list:
        row_dict[civ_name] = 0
    for civ_name in appl_civ_list:
        if civ_name in civ_name_list:
            row_dict[civ_name] = 1
    
    return row_dict

In [268]:

toc_selector = '#toc'
def process_exceptions(link_soup, row_dict):
    desc_p_list = link_soup.select(toc_selector)
    if len(desc_p_list):
        for sib in desc_p_list[0].previous_siblings:
            if 'available to all civilizations except for' in str(sib):
                break
        process_all_except(sib.text.strip(), row_dict)
    
    return row_dict

In [269]:

article_selector = '#mw-content-text > table.article-table'
def process_unit(link_soup, appl_civ_text, row_dict):
    for column_name in civ_name_list:
        columns_set.add(column_name)
    if 'All civilizations' in appl_civ_text:
        for civ_name in civ_name_list:
            row_dict[civ_name] = 1
    elif 'All except ' in appl_civ_text:
        row_dict = process_all_except(unit_title, appl_civ_text, row_dict)
    elif 'See ' in appl_civ_text:
        art_table_list = link_soup.select(article_selector)
        if len(art_table_list):
            row_dict = process_article(link_soup, row_dict)
        else:
            row_dict = process_exceptions(link_soup, row_dict)
    else:
        row_dict = process_small_list_of_units(appl_civ_text, row_dict)
    
    return row_dict

In [270]:

def consume_aside(unit_soup, row_dict):
    aside_list = unit_soup.select('#mw-content-text > aside')
    for aside_soup in aside_list:
        for data_source_div in aside_soup.find_all('div', {'data-source': True}):
            data_source_h3_list = data_source_div.find_all('h3')
            if len(data_source_h3_list):
                h3_siblings_list = list(data_source_h3_list[0].next_siblings)
                if len(h3_siblings_list):
                    column_name = data_source_div['data-source']
                    value_list = list(h3_siblings_list[1].strings)
                    column_value = ' '.join(x.strip() for x in value_list)
                    if column_name == 'Civilization':
                        row_dict = process_unit(unit_soup, column_value, row_dict)
                    else:
                        columns_set.add(column_name)
                        row_dict[column_name] = column_value
    
    return row_dict

In [271]:

appl_civ_selector = '#mw-content-text > aside > section > div[data-source="Civilization"] > div'
title_selector = '#PageHeader > div.page-header__main > h1'
def get_unit_pages(soup, row_dict):
    for link in soup.find_all('a'):
        unit_url, unit_html = get_link_html(link)
        unit_soup = BeautifulSoup(unit_html, 'html.parser')
        unit_title = unit_soup.select(title_selector)[0].string.split('(')[0].strip()
        index_list.append(unit_title)
        row_dict = consume_aside(unit_soup, row_dict)
        rows_list.append(row_dict)

In [233]:

rows_list = []
index_list = []
columns_set = set()
for soup, heading in zip(tr_list, heading_list):
    column_name = 'Heading'
    row_dict = {column_name: heading}
    columns_set.add(column_name)
    get_unit_pages(soup, row_dict)

In [234]:

other_columns_set = columns_set - set(civ_name_list)
other_columns_set = other_columns_set - set(['Heading'])
columns_list = ['Heading'] + list(other_columns_set) + civ_name_list
availability_df = pd.DataFrame(data=rows_list, index=index_list, columns=columns_list)
print(availability_df.shape)
availability_df[availability_df.isnull().any(axis=1)].shape

(98, 32)


(0, 32)

In [235]:

file_path = os.path.join(DATA_CSV_FOLDER, 'availability_df.csv')
availability_df.to_csv(file_path)

In [249]:

div = list(data_source_h3.next_siblings)[1]
dir(div)

['HTML_FORMATTERS', 'XML_FORMATTERS', '__bool__', '__call__', '__class__', '__contains__', '__copy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', '__weakref__', '_all_strings', '_find_all', '_find_one', '_formatter_for_name', '_is_xml', '_lastRecursiveChild', '_last_descendant', '_should_pretty_print', 'append', 'attrs', 'can_be_empty_element', 'childGenerator', 'children', 'clear', 'contents', 'decode', 'decode_contents', 'decompose', 'descendants', 'encode', 'encode_contents', 'extend', 'extract', 'fetchNextSiblings', 'fetchParents', 'fetchPrevious', 'fetchPreviousSiblings', 'find', 'findAll', 'findAllNext', 'findAllPre

In [255]:

div.strings?