In [None]:
import random
import time

from collections import defaultdict
from multiprocessing.pool import Pool

import requests
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

In [None]:
# Let pandas display up to 50 columns before it truncates a wide DataFrame.
pd.set_option('display.max_columns', 50)

In [None]:
# Index pages scraped with retrieve_early_battles, which reads links out of
# 'wikitable' cells.
table_pages = [
    'https://en.wikipedia.org/wiki/List_of_battles_before_301',
    'https://en.wikipedia.org/wiki/List_of_battles_301%E2%80%931300',
    'https://en.wikipedia.org/wiki/List_of_battles_1301%E2%80%931600',
    'https://en.wikipedia.org/wiki/List_of_battles_since_2001',
]

# Index pages scraped with retrieve_battles, which reads links out of <li> bullets.
list_pages = [
    'https://en.wikipedia.org/wiki/List_of_battles_1601%E2%80%931800',
    'https://en.wikipedia.org/wiki/List_of_battles_1801%E2%80%931900',
    'https://en.wikipedia.org/wiki/List_of_battles_1901%E2%80%932000',
]

In [None]:
def retrieve_battles(url):
    """Collect the first link of every titled bullet on a list-style index page.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(link_list, title_list)`` pair of equal-length lists. ``link_list``
        holds the ``href`` of the first anchor in each kept ``<li>``;
        ``title_list`` holds that anchor's ``title`` attribute, or ``None``
        when the anchor has none.
    """
    link_list = []
    title_list = []

    with requests.Session() as session:
        response = session.get(url)

    soup = BeautifulSoup(response.content)

    for bullet in soup.find_all('li'):
        bullet_html = str(bullet)

        # Keep only bullets that mention 'title' and are not part of the footer.
        if 'title' not in bullet_html or 'footer' in bullet_html:
            continue

        # The substring test can match plain text or a nested tag, so the
        # bullet may hold no anchor at all, or one without an href.
        anchor = bullet.a
        if anchor is None or not anchor.has_attr('href'):
            continue

        link_list.append(anchor['href'])
        title_list.append(anchor.get('title', None))

    return link_list, title_list

# "Them's fightin' words": a table link is kept only when its href contains
# one of these.
fightin_words = ['Fall', 'Battle', 'Siege', 'Capture', 'Operation', 'Action', 'Recapture']

def retrieve_early_battles(url):
    """Collect battle links from the 'wikitable' tables of an index page.

    For every ``<td>`` inside a ``table.wikitable`` the first anchor is
    inspected; it is kept when its ``href`` contains one of ``fightin_words``.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(link_list, title_list)`` pair of equal-length lists holding the
        ``href`` and ``title`` attributes of the kept anchors.
    """
    link_list = []
    title_list = []

    with requests.Session() as session:
        # Issue the request through the session opened above.
        response = session.get(url)

    soup = BeautifulSoup(response.content)

    for table in soup.find_all('table', {'class': 'wikitable'}):
        for cell in table.find_all('td'):
            anchor = cell.a
            if anchor is None:
                continue

            # Skip anchors lacking either attribute instead of raising KeyError.
            href = anchor.get('href')
            title = anchor.get('title')
            if href is None or title is None:
                continue

            if any(word in str(href) for word in fightin_words):
                title_list.append(title)
                link_list.append(href)

    return link_list, title_list

In [None]:
# Accumulate hrefs and titles from every bullet-list index page.
link_list, title_list = [], []

for url in list_pages:
    links, titles = retrieve_battles(url)
    link_list.extend(links)
    title_list.extend(titles)

# Remove repeated (url, title) rows, then rows whose anchor had no title.
late_battles = (
    pd.DataFrame({'url': link_list, 'title': title_list})
    .drop_duplicates()
    .dropna(subset=['title'])
)

print("# Late battles:", len(late_battles))
late_battles.sample(n=5)

In [None]:
# Accumulate hrefs and titles from every table-style index page.
link_list, title_list = [], []

for url in table_pages:
    links, titles = retrieve_early_battles(url)
    link_list.extend(links)
    title_list.extend(titles)

# Remove repeated (url, title) rows, then rows with a missing title.
early_battles = (
    pd.DataFrame({'url': link_list, 'title': title_list})
    .drop_duplicates()
    .dropna(subset=['title'])
)

print("# Early battles:", len(early_battles))
early_battles.sample(n=5)

In [None]:
# Stack both scrapes, renumber the rows, then drop rows that appear in both.
battles = (
    pd.concat([early_battles, late_battles])
    .reset_index(drop=True)
    .drop_duplicates()
)

print('Total # battles:', len(battles))
battles.sample(n=5)

In [None]:
# Exclude nonexistent pages or non-english wikipedia pages
# (There aren't that many, so better to dump them)
# regex=False throughout: these are literal substrings, not patterns.
battles = battles.loc[~battles.title.str.contains('does not exist', regex=False)]
battles = battles.loc[~battles.url.str.contains('https', regex=False)]

# Exclude wars (titles containing the standalone word 'War')
battles = battles.loc[~battles.title.map(lambda s: 'War' in s.split(' '))]

# Exclude support page links etc
# ('index.php' must be matched literally; as a regex its '.' matches any character)
unwanted_url_parts = ('Wikipedia', 'index.php', 'Special:', 'List_of', 'Category:')
for url_part in unwanted_url_parts:
    battles = battles.loc[~battles.url.str.contains(url_part, regex=False)]

# We only want battles, not extended campaigns or distributed multi-event things
conflict_terms = ('battle', 'siege', 'fall', 'sack',
                  'operation', 'capture', 'raid',
                  'action', 'destruction', 'massacre')

# Keep a title when its lower-cased text contains at least one conflict term.
battles = battles.loc[battles.title.map(lambda s: any(b in s.lower() for b in conflict_terms))]
print('Validated # battles:', len(battles))

In [None]:
# Save the filtered battle list as UTF-8, tab-separated text.
# to_csv does not create directories, so ./data must already exist.
battles.to_csv('./data/bayes_battles.tsv', encoding='utf-8', sep='\t')

In [None]:
# Page names that process_side_links refuses to yield.
omit_list = [
    'Capital punishment',
    'Military advisor',
    'Wounded in action',
    'Prisoner of war',
    'Killed in action',
    'Surrender (military)',
    'Surrendered',
    'Common military ranks in English',
]

# Link titles for which process_side_links prints a redirect notice.
redirect_names = ['Napoleon I', 'Alexander III of Macedon']

# Three-character subdomain prefixes compared against href[8:11] to spot
# links to other-language Wikipedias.
non_english_tokens = [
    'pt.', 'tr.', 'ko.', 'ja.',
    'th.', 'da.', 'es.', 'de.',
    'it.', 'fr.', 'zh.',
]

# Infobox column position -> prefix used in the battle_data keys.
sides = {0: 'lhs', 1: 'rhs'}

# An anchor whose HTML contains any of these fragments is skipped.
bad_text = (
    'class="image"',
    'class="thumbborder"',
    'cite_note',
    'disambiguation needed',
    'cnote_g',
)

def get_full_url(url):
    """Turn an href taken from a Wikipedia page into an absolute URL.

    Args:
        url: An href string; absolute, protocol-relative (``//host/...``) or
            site-relative (``/wiki/...``).

    Returns:
        ``url`` unchanged when it already starts with ``http://`` or
        ``https://``; ``https:`` + ``url`` for protocol-relative links;
        otherwise ``url`` prefixed with ``https://en.wikipedia.org``.
    """
    # Test the prefix, not a substring: a relative link may carry 'https://'
    # inside its query string and still needs the host prepended.
    if url.startswith(('http://', 'https://')):
        return url

    if url.startswith('//'):
        return f'https:{url}'

    return f'https://en.wikipedia.org{url}'
    
def process_side_links(cell, ul, session):
    """Yield the page name behind each usable link in one infobox cell.

    Every anchor in ``cell`` that mentions 'title' and contains none of the
    ``bad_text`` fragments is fetched, and the fetched page's ``<title>`` text
    (with ' - Wikipedia' removed) is yielded unless it is in ``omit_list``.

    Args:
        cell: Parsed infobox cell exposing ``find_all('a')``.
        ul: The battle page's ``ul.redirectText`` element, or ``None``. When
            present, its first link replaces the anchor's own target.
        session: ``requests`` session used for the look-ups.

    Yields:
        Page names as strings.

    Note:
        A ``requests.ConnectionError`` ends the generator, so the remaining
        anchors of the cell are not visited.
    """
    for anchor in cell.find_all('a'):
        anchor_string = str(anchor)

        if 'title' not in anchor_string:
            continue
        if any(fragment in anchor_string for fragment in bad_text):
            continue

        # 'title' may occur only inside the href or the link text, so check
        # that both attributes exist rather than index them blindly.
        title = anchor.get('title')
        href = anchor.get('href')
        if title is None or href is None:
            continue

        if title in redirect_names:
            print(f"Redirect for {title}")

        url = get_full_url(href)

        if len(href) <= 10:
            pass
        elif href[8:11] in non_english_tokens:
            # For 'https://xx.wikipedia.org/...' characters 8-10 are 'xx.'.
            print(f'Non-english page found at {url}')
        elif ul is not None:
            print(f'Redirected for {title} at {url}')
            url = get_full_url(ul.a['href'])

        try:
            response = session.get(url)
        except requests.ConnectionError:
            print(f"Belligerent request error for {url}")
            return

        soup = BeautifulSoup(response.text)
        name = soup.find('title')

        # Blank? Move on.
        if not name:
            continue

        name = name.text.replace(' - Wikipedia', '')

        # Not the real princess.
        if name in omit_list:
            continue

        yield name
                
# Infobox section headers in the order they are tested, the battle_data key
# suffix each one feeds, and whether its cells are read as links (True) or as
# plain text lines (False).
_INFOBOX_SECTIONS = (
    ('Belligerents', 'belligerents', True),
    ('Commanders and leaders', 'leaders', True),
    ('Strength', 'strengths', False),
    ('Casualties and losses', 'casualties', False),
)


def _cell_text_lines(cell):
    """Return the stripped, non-empty text lines of a cell, treating <br/> as a newline."""
    reparsed = BeautifulSoup(str(cell).replace('<br/>', '\n')).find('td')
    stripped = (piece.strip() for piece in reparsed.text.strip().split('\n'))
    return [piece for piece in stripped if piece]


def scrape_page(row):
    """Download one battle page and extract its infobox fields.

    Args:
        row: Object with ``url`` and ``title`` attributes (a row of ``battles``).

    Returns:
        ``{row.title: fields}`` where ``fields`` maps names such as
        ``lhs_belligerents``, ``rhs_leaders``, ``lhs_strengths``,
        ``rhs_casualties``, ``date``, ``loc`` and ``outcome`` to strings
        (list values are tab-joined by ``maybe_join``). Returns ``None`` when
        the request fails, the page has no ``infobox vevent`` table, or the
        infobox does not parse into exactly two tables.
    """
    battle_page_url = get_full_url(row.url)

    with requests.Session() as session:
        try:
            battle_page_response = session.get(battle_page_url)
        except requests.ConnectionError:
            print(f'Initial request to {battle_page_url} failed')
            return None

        battle_page_soup = BeautifulSoup(battle_page_response.text)
        infobox = battle_page_soup.find('table', {'class': 'infobox vevent'})

        if infobox is None:
            print(f"No infobox found for {battle_page_url}")
            return None

        try:
            # Only the second table is used. Any count other than two also
            # raises ValueError during unpacking, so such pages are skipped.
            _details, stub = pd.read_html(str(infobox), )
        except ValueError:
            print(f"Error parsing details table from HTML from {battle_page_url}")

            return None

        battle_data = defaultdict(list)
        html_table = infobox.find_all('tr')
        ul = battle_page_soup.find('ul', {'class': 'redirectText'})

        # A header row announces a section; the values sit in the row after it,
        # hence the last row is never treated as a header.
        for index, table_row in enumerate(html_table[:-1]):
            row_html = str(table_row)
            section = next((entry for entry in _INFOBOX_SECTIONS if entry[0] in row_html), None)
            if section is None:
                continue

            _header, suffix, uses_links = section
            cells = html_table[index + 1].find_all('td')

            # Only the first two cells (lhs, rhs) are read.
            for side, cell in enumerate(cells[:2]):
                key = f"{sides[side]}_{suffix}"
                if uses_links:
                    values = process_side_links(cell, ul, session)
                else:
                    values = _cell_text_lines(cell)

                # Append one by one so a key is created only when a value exists.
                for value in values:
                    battle_data[key].append(value)

        # Date / Location / Result come from the first two columns of the
        # second parsed table; a narrower table has nothing to offer.
        if stub.shape[1] >= 2:
            for _, stub_row in stub.iterrows():
                # Left and right columns
                rl, rr = stub_row.iloc[0], stub_row.iloc[1]

                if rl == 'Date':
                    # The value can be NaN (a float); only strings need cleaning.
                    battle_data['date'] = rr.replace('\xa0', '') if isinstance(rr, str) else rr
                elif rl == 'Location':
                    battle_data['loc'] = rr
                elif rl == 'Result':
                    battle_data['outcome'] = rr

    battle_data = {k: maybe_join(v) for k, v in battle_data.items()}
    return {row.title: battle_data}

def maybe_join(v):
    """Return a tab-joined string when ``v`` is exactly a ``list``; otherwise return ``v`` unchanged."""
    return '\t'.join(v) if type(v) is list else v

In [None]:
# Smoke-test the scraper on five battles. Each draw is independent, so the
# same battle can be picked more than once.
results = [scrape_page(battles.sample(n=1).iloc[0]) for _ in range(5)]

In [None]:
# Each result is {title: fields}; DataFrame(r).T turns it into one row indexed
# by the title. Pages that returned None are left out.
battles_df = pd.concat([pd.DataFrame(r).T for r in results if r is not None], axis=0)
battles_df

In [None]:
# Scrape every battle page using 12 worker processes.
battle_rows = [battle_row for _, battle_row in battles.iterrows()]

with Pool(processes=12) as pool:
    results = pool.map(scrape_page, battle_rows)

In [None]:
# A record is kept only when all of these columns are present.
gotta_have_fields = ('outcome',
                     'rhs_leaders', 'lhs_leaders',
                     'rhs_belligerents', 'lhs_belligerents')

# One single-row frame per successfully scraped page.
scraped_frames = [pd.DataFrame(r).T for r in results if r is not None]

battles_df = (
    pd.concat(scraped_frames, axis=0)
    .drop_duplicates()
    .dropna(subset=gotta_have_fields)
)

battles_df.sample(n=5)

In [None]:
# Fields are ';'-separated despite the .tsv extension; the multi-valued
# columns already use tabs between their entries.
battles_df.to_csv('./data/bayes_battle_records.tsv', sep=';')

## Inference

In [None]:
# Reload the scraped records from disk; the first column (battle title) becomes the index.
battles_df = pd.read_csv('./data/bayes_battle_records.tsv', sep=';', index_col=0)

In [None]:
# Hand-made labels. The code below reads its Battle, VorD and pos columns plus
# one column per force type.
hand_labels = pd.read_csv('./data/current_run.csv', index_col=0)
# Battle names use underscores in the file; swap them for spaces so they can be
# compared with the battles_df index.
hand_labels['Battle'] = hand_labels.Battle.str.replace('_', ' ')

In [None]:
def get_winner(battle):
    """Return the ``pos`` value of the first hand-label row marked 'V' for ``battle``.

    Returns ``None`` when ``hand_labels`` has no such row.
    """
    is_victor = (hand_labels.Battle == battle) & (hand_labels.VorD == 'V')
    victor_rows = hand_labels.loc[is_victor]

    if len(victor_rows) == 0:
        return None

    return victor_rows.iloc[0].pos


# Attach the hand-labelled winner and discard battles that have none.
battles_df['hand_outcome'] = battles_df.index.map(get_winner)
battles_df = battles_df.dropna(subset=('hand_outcome', ))

In [None]:
# Column names looked up in hand_labels, one per kind of force. Their order
# fixes the column order of the strength matrices and of strength_coeffs.
force_types = ("Infantry", "Cavalry", "Artillery", 
               "Ships", "Airforce", "Special")

In [None]:
def get_forces(battle, pos, force_type=None):
    """Look up one force-type value for one side of a battle in ``hand_labels``.

    Args:
        battle: Battle name, compared with ``hand_labels.Battle``.
        pos: Side marker, compared with ``hand_labels.pos`` ('L' or 'R').
        force_type: Column of ``hand_labels`` to read. Defaults to the
            module-level ``ft`` so existing two-argument calls keep working.

    Returns:
        The value from the first matching row, or ``None`` when no row matches.
    """
    if force_type is None:
        force_type = ft

    matches = hand_labels.loc[(hand_labels.Battle == battle) & (hand_labels.pos == pos)]
    if len(matches) == 0:
        return None

    return matches.iloc[0][force_type]


# One lhs_/rhs_ column per force type, filled straight from the hand labels.
for ft in force_types:
    battles_df[f"lhs_{ft}"] = battles_df.index.map(lambda b: get_forces(b, 'L', ft))
    battles_df[f"rhs_{ft}"] = battles_df.index.map(lambda b: get_forces(b, 'R', ft))

In [None]:
# Spot-check five random rows (sample raises ValueError if fewer than five remain).
battles_df.sample(n=5)

In [None]:
# Scraped page names that are not individual commanders (ranks, units,
# countries, error pages, ...) and must not be counted as leaders.
excluded_leaders = {'Capital punishment', 'Lieutenant general', '6th Panzer Army',
                    'Navy', 'Tsar', 'Strategos', 'Knight', 'Kurdistan Democratic Party',
                    'List of Khazar rulers', 'Commodore (rank)', 'II Corps (Pakistan)',
                    'Air marshal', 'Air chief marshal', 'Captain (armed forces)',
                    'Campuzano Polanco family', 'XX Corps (United Kingdom)',
                    'American Civil War', 'Israeli Navy', 'Archduke', 'Arab Liberation Army',
                    "Eighty Years' War", 'Central Command (India)', 'South Wales Borderers',
                    'Big Red Meat', 'XI Corps (India)', 'Prime Minister of Israel', 'Army Group B',
                    'Bangladesh Police', 'Brigadier general', 'Colonel', 'Republic of Venice',
                    'Suicide', 'Wikimedia Error', 'Sweden', 'Wikipedia:Citation needed',
                    'Ethiopian Empire', 'Major general', 'Grand vizier', 'Germanic kingship',
                    'Lieutenant colonel', 'Norway', 'Sir', 'Brigadier'}

# leader name -> number of battle sides the leader appears on.
leader_counts = defaultdict(int)

def count_leaders(leaders):
    """Add one side's leaders to ``leader_counts``.

    Args:
        leaders: Tab-separated leader names for one side of one battle. Any
            non-string value (e.g. NaN) is ignored.

    Returns:
        Always ``None``; the function is called for its side effect.
    """
    if not isinstance(leaders, str):
        return None

    # A name linked twice in the same cell is still one appearance.
    # dict.fromkeys removes repeats while keeping first-seen order.
    for leader in dict.fromkeys(leaders.split('\t')):
        if leader not in excluded_leaders:
            leader_counts[leader] += 1

    return None

# map() is used only for count_leaders' side effect on leader_counts; the
# returned Series of None is discarded (the trailing ';' hides it in the output).
battles_df.lhs_leaders.map(count_leaders)
battles_df.rhs_leaders.map(count_leaders);

In [None]:
# One row per counted leader, indexed by name.
leaders_df = pd.DataFrame.from_dict(leader_counts, orient='index', columns=['battle_count'])
# Keep only leaders with a count of at least two.
leaders_df = leaders_df.loc[leaders_df.battle_count >= 2]

In [None]:
def filter_leaders(leader_string):
    """Keep only the names in a tab-separated string that are in ``leaders_df.index``.

    Returns the surviving names re-joined with tabs, or ``None`` when the
    joined result is empty.
    """
    retained = [name for name in leader_string.split('\t') if name in leaders_df.index]
    rejoined = '\t'.join(retained)

    # An empty string is falsy, so it turns into None here.
    return rejoined or None

In [None]:
# Reduce each side to the leaders kept in leaders_df, then drop battles where
# either side has none left (filter_leaders returned None).
battles_df['lhs_filtered'] = battles_df.lhs_leaders.map(filter_leaders)
battles_df['rhs_filtered'] = battles_df.rhs_leaders.map(filter_leaders)
battles_df.dropna(subset=('lhs_filtered', 'rhs_filtered'), inplace=True)

In [None]:
# Binary target: 1 when the hand label says 'L' won, 0 when 'R' won.
# NOTE(review): any other hand_outcome value maps to NaN here, and a later
# fillna(0) would turn it into 0. Confirm only 'L'/'R' occur.
battles_df['label'] = battles_df.hand_outcome.map({'R': 0, 'L': 1})

In [None]:
# Spot-check three random rows after labelling.
battles_df.sample(n=3)

In [None]:
# Sort leaders by name so that positions are stable, then number them 0..n-1.
# 'selector' is the column index used for each leader in the leader matrices
# and in the sampled 'leader_coeffs'.
leaders_df.sort_index(inplace=True)
leaders_df['selector'] = list(range(len(leaders_df)))
leaders_df.sample(n=3)

In [None]:
# leaders_df.loc[leaders_df.battle_count > 6].sort_index().iloc[:50]

In [None]:
import theano.tensor as tt
from theano import shared as tshared

In [None]:
import pymc3 as pm

In [None]:
# Indicator matrices of shape (battles, leaders): entry [b, l] is 1 when leader
# l is listed on that side of battle b.
mat_shape = (len(battles_df), len(leaders_df))
lhs_leaders_mat = np.zeros(mat_shape, dtype=int)
rhs_leaders_mat = np.zeros(mat_shape, dtype=int)

for bi, (battle_name, row) in enumerate(battles_df.iterrows()):
    for side_mat, side_string in ((lhs_leaders_mat, row.lhs_filtered),
                                  (rhs_leaders_mat, row.rhs_filtered)):
        for leader in side_string.split('\t'):
            li = leaders_df.loc[leader].selector
            side_mat[bi, li] = 1

# Wrap both matrices as Theano shared variables for use inside the model.
lhs_leaders_shared = tshared(lhs_leaders_mat)
rhs_leaders_shared = tshared(rhs_leaders_mat)

In [None]:
# Replace every remaining NaN with 0. This applies to all columns of
# battles_df, not only the force counts.
battles_df.fillna(value=0, inplace=True)

In [None]:
# Float matrices of shape (battles, force types) holding each side's
# hand-labelled strength; entries stay 0 where the value is null.
strength_shape = (len(battles_df), len(force_types))
lhs_strength_mat = np.zeros(strength_shape)
rhs_strength_mat = np.zeros(strength_shape)

for bi, (battle_name, row) in enumerate(battles_df.iterrows()):
    for fi, ft in enumerate(force_types):
        for prefix, side_mat in (('lhs', lhs_strength_mat), ('rhs', rhs_strength_mat)):
            s = row[f"{prefix}_{ft}"]
            if pd.notnull(s):
                side_mat[bi, fi] = s

In [None]:
# Earlier idea, kept for reference: convert strengths to ratios, e.g. twice as
# many infantry as the opponent scores +2 for you and -.5 for the opponent.
#
# max_strengths = np.stack([lhs_strength_mat, rhs_strength_mat]).max(axis=0) + 1e-4
# lhs_strength_mat = lhs_strength_mat / max_strengths
# rhs_strength_mat = rhs_strength_mat / max_strengths

# Current approach: use the lhs - rhs difference (equivalent in the
# un-normalized case), rescaled by the larger of the two strengths.
# The 1e-4 keeps the divisor non-zero when both sides are 0.
max_strengths = np.maximum(lhs_strength_mat, rhs_strength_mat) + 1e-4
strength_diffs = (lhs_strength_mat - rhs_strength_mat) / max_strengths

In [None]:
with pm.Model() as model:
    # One coefficient per leader, all sharing a scale that is itself given a
    # HalfCauchy(2.5) prior.
    leaders_prior = pm.HalfCauchy('leaders_prior', 2.5)
    leader_coeffs = pm.Normal('leader_coeffs',
                              mu=0, sigma=leaders_prior,
                              shape=(len(leaders_df), ))

    strengths_prior = pm.HalfCauchy('strengths_prior', 2.5)

    # Hard prior on strength coefficients - a forces advantage should always lead to a win improvement
    # (HalfNormal keeps every coefficient non-negative.)
    strength_coeffs = pm.HalfNormal('strength_coeffs',
                                    sigma=strengths_prior,
                                    shape=(len(force_types), ))

    # The leader matrices are 0/1, so dot / row-sum is the mean coefficient of
    # the leaders listed on that side. Score = lhs mean - rhs mean + strength term.
    scores = tt.dot(lhs_leaders_shared, leader_coeffs) / lhs_leaders_shared.sum(axis=1)
    scores -= tt.dot(rhs_leaders_shared, leader_coeffs) / rhs_leaders_shared.sum(axis=1)
    scores += tt.dot(strength_diffs, strength_coeffs)

    # label is 1 for an 'L' win, so a positive score favours the lhs side.
    y = pm.Bernoulli('outcomes', logit_p=scores, observed=battles_df.label.values)

In [None]:
# Draw 2500 posterior samples per chain after 2500 tuning steps, with 4 chains on 4 cores.
# NOTE(review): no random_seed is passed, so the draws differ from run to run.
with model:
    trace = pm.sample(tune=2500, draws=2500, chains=4, cores=4)

In [None]:
# Tabulate summary statistics for every sampled variable.
trace_summary = pm.summary(trace)
trace_summary

In [None]:
import arviz as az
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
def plot_posterior(trace, name, ax):
    """Plot the posterior histogram of one leader's coefficient on ``ax``.

    Args:
        trace: Sampling result indexable by ``'leader_coeffs'``.
        name: Leader name; must be present in ``leaders_df.index``.
        ax: Matplotlib axes to draw on.

    Returns:
        ``ax``, titled with ``name``.
    """
    # NOTE(review): newer ArviZ releases spell credible_interval as hdi_prob;
    # confirm against the installed version.
    column = leaders_df.loc[name].selector
    samples = trace['leader_coeffs'][:, column]

    az.plot_posterior(
        samples,
        kind='hist', point_estimate='mean', round_to=3,
        ref_val=0, credible_interval=.95, rope=(-.1, .1),
        edgecolor='white', ax=ax,
    )

    ax.set(title=name)
    return ax

In [None]:
fig, axes = plt.subplots(2, 3, figsize=(24, 12), dpi=150, sharey=True, sharex=True)

# One panel per leader, filled row by row.
panel_leaders = ('George B. McClellan', 'William Tecumseh Sherman', 'Ulysses S. Grant',
                 'Robert E. Lee', 'Stonewall Jackson', 'James Longstreet')

for leader_name, panel in zip(panel_leaders, axes.flatten()):
    plot_posterior(trace, leader_name, panel)

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(24, 6), dpi=150, sharex=True, sharey=True)

# One panel per leader, left to right.
panel_leaders = ('Napoleon',
                 'Arthur Wellesley, 1st Duke of Wellington',
                 'Gebhard Leberecht von Blücher')

for leader_name, panel in zip(panel_leaders, axes.flatten()):
    plot_posterior(trace, leader_name, panel)

In [None]:
def plot_diff(trace, name0, name1, ax):
    """Plot the posterior of the difference between two leaders' coefficients.

    The plotted quantity is ``coeff(name0) - coeff(name1)``, computed sample
    by sample.

    Args:
        trace: Sampling result indexable by ``'leader_coeffs'``.
        name0: Leader whose coefficient is the minuend.
        name1: Leader whose coefficient is subtracted.
        ax: Matplotlib axes to draw on.

    Returns:
        ``ax``, titled ``"<name0> VS <name1>"``.
    """
    coeff_samples = trace['leader_coeffs']
    first = coeff_samples[:, leaders_df.loc[name0].selector]
    second = coeff_samples[:, leaders_df.loc[name1].selector]

    az.plot_posterior(
        first - second,
        kind='hist', point_estimate='mean', round_to=3,
        ref_val=0, credible_interval=.95, rope=(-.1, .1),
        edgecolor='white', ax=ax,
    )

    ax.set(title=f"{name0} VS {name1}")
    return ax

In [None]:
# Single-panel posterior for one leader.
fig, axes = plt.subplots(1, 1, figsize=(4, 3), dpi=150)
plot_posterior(trace, 'George Washington', axes);

In [None]:
# Posterior of the Grant-minus-Lee coefficient difference.
fig, axes = plt.subplots(1, 1, figsize=(4, 3), dpi=150)
plot_diff(trace, 'Ulysses S. Grant', 'Robert E. Lee', axes);

In [None]:
# Peek at five random leaders, e.g. to find names to plot.
leaders_df.sample(n=5)

In [None]:
# Single-panel posterior for one leader.
fig, axes = plt.subplots(1, 1, figsize=(4, 3), dpi=150)
plot_posterior(trace, 'Winfield Scott', axes);

In [None]:
def plot_coeff_posterior(trace, name, ax):
    """Plot the posterior histogram of one force type's strength coefficient.

    Args:
        trace: Sampling result indexable by ``"strength_coeffs"``.
        name: A member of ``force_types``; its position selects the column.
        ax: Matplotlib axes to draw on.

    Returns:
        ``ax``, titled with ``name``.
    """
    column = force_types.index(name)
    samples = trace["strength_coeffs"][:, column]

    az.plot_posterior(
        samples,
        kind='hist', point_estimate='mean', round_to=3,
        ref_val=0, credible_interval=.95, rope=(-.1, .1),
        edgecolor='white', ax=ax,
    )

    ax.set(title=name)
    return ax

In [None]:
fig, axes = plt.subplots(2, 3, figsize=(24, 12), dpi=150, sharey=True)

# Six panels, six force types: pair them up in order.
for ft, ax in zip(force_types, axes.flatten()):
    plot_coeff_posterior(trace, ft, ax)

In [None]:
# battles_df.loc[battles_df.lhs_leaders.str.contains("Napoleon")]

In [None]:
# leaders_df.loc[leaders_df.index.str.contains('George')]