# Scrape battles from Wikipedia (Archival)

The original system scraped Wikipedia in much the same way as the code below. I've edited it to remove the `global` statements and the side effects from the functions, so that they can be used with `threading` or `multiprocessing` (which the longer scrape further down then does).

## Imports and config

In [None]:
from collections import defaultdict
from multiprocessing.pool import Pool

import requests
import pandas as pd

from bs4 import BeautifulSoup

In [None]:
# Cap the number of rows pandas prints when a DataFrame is displayed.
pd.options.display.max_rows = 50
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## Scrape list pages to get battle page URLs

### List pages

In [None]:
# Pages passed to retrieve_early_battles(), which reads links out of
# "wikitable" tables.
table_pages = [
    'https://en.wikipedia.org/wiki/List_of_battles_before_301',
    'https://en.wikipedia.org/wiki/List_of_battles_301%E2%80%931300',
    'https://en.wikipedia.org/wiki/List_of_battles_1301%E2%80%931600',
    'https://en.wikipedia.org/wiki/List_of_battles_since_2001',
]

# Pages passed to retrieve_battles(), which reads links out of bullet
# (<li>) entries.
list_pages = [
    'https://en.wikipedia.org/wiki/List_of_battles_1601%E2%80%931800',
    'https://en.wikipedia.org/wiki/List_of_battles_1801%E2%80%931900',
    'https://en.wikipedia.org/wiki/List_of_battles_1901%E2%80%932000',
]

### Scrape functions

In [None]:
def retrieve_battles(url):
    """Collect article links from the bullet entries of a battle list page.

    Every ``<li>`` whose markup mentions ``title`` and does not mention
    ``footer`` contributes the ``href`` and ``title`` of its first anchor.
    Bullets without an anchor, or whose first anchor has no ``href``, are
    skipped instead of raising.

    Args:
        url: Address of a list page whose battles appear as bullet entries.

    Returns:
        A ``(link_list, title_list)`` pair of equal-length lists. A title is
        ``None`` when the anchor carries no ``title`` attribute.
    """
    link_list = []
    title_list = []

    with requests.Session() as session:
        response = session.get(url)

    soup = BeautifulSoup(response.content)

    for bullet in soup.find_all('li'):
        markup = str(bullet)
        if 'title' not in markup or 'footer' in markup:
            continue

        # The substring test can match text or attributes of any nested tag,
        # so the bullet is not guaranteed to hold a usable link.
        anchor = bullet.a
        if anchor is None:
            continue
        link = anchor.get('href')
        if link is None:
            continue

        link_list.append(link)
        title_list.append(anchor.get('title'))

    return link_list, title_list

# Them's fightin' words: a table link is kept by retrieve_early_battles()
# only if its href contains one of these substrings.
fightin_words = [
    'Fall',
    'Battle',
    'Siege',
    'Capture',
    'Operation',
    'Action',
    'Recapture',
]

def retrieve_early_battles(url):
    """Collect battle article links from the wikitables of a list page.

    For every ``<td>`` inside a ``table.wikitable``, the cell's first anchor
    is kept when its ``href`` contains one of the ``fightin_words``.
    Anchors without an ``href`` are skipped.

    Args:
        url: Address of a list page whose battles are laid out in tables.

    Returns:
        A ``(link_list, title_list)`` pair of equal-length lists. A title is
        ``None`` when the anchor carries no ``title`` attribute.
    """
    link_list = []
    title_list = []

    with requests.Session() as session:
        # Go through the session so the context manager owns the connection.
        response = session.get(url)

    soup = BeautifulSoup(response.content)

    for table in soup.find_all('table', {'class': 'wikitable'}):
        for cell in table.find_all('td'):
            anchor = cell.a
            if anchor is None:
                continue

            href = anchor.get('href')
            if href is None:
                continue

            if any(word in str(href) for word in fightin_words):
                link_list.append(href)
                title_list.append(anchor.get('title'))

    return link_list, title_list

### Run scrape

In [None]:
# Accumulate links and titles from every bullet-list page.
link_list = []
title_list = []

for url in list_pages:
    links, titles = retrieve_battles(url)
    link_list.extend(links)
    title_list.extend(titles)

# One row per distinct (url, title) pair; rows with no title are discarded.
late_battles = pd.DataFrame({'url': link_list, 'title': title_list})
late_battles = late_battles.drop_duplicates()
late_battles = late_battles.dropna(subset=['title'])

print("# Late battles:", len(late_battles))
late_battles.sample(n=5)

In [None]:
# Accumulate links and titles from every table-based page.
link_list = []
title_list = []

for url in table_pages:
    links, titles = retrieve_early_battles(url)
    link_list.extend(links)
    title_list.extend(titles)

# One row per distinct (url, title) pair; rows with no title are discarded.
early_battles = pd.DataFrame({'url': link_list, 'title': title_list})
early_battles = early_battles.drop_duplicates()
early_battles = early_battles.dropna(subset=['title'])

print("# Early battles:", len(early_battles))
early_battles.sample(n=5)

### Combine battles and save

In [None]:
# Stack both scrapes under a fresh 0..n-1 index, then drop repeated rows
# (the index keeps its gaps where duplicates were removed).
battles = pd.concat([early_battles, late_battles], ignore_index=True)
battles = battles.drop_duplicates()

print('Total # battles:', len(battles))
battles.sample(n=5)

In [None]:
from pathlib import Path

# to_csv does not create missing directories, so make sure ./data exists
# before writing the tab-separated battle list.
Path('./data').mkdir(parents=True, exist_ok=True)
battles.to_csv('./data/all_battles.tsv', encoding='utf-8', sep='\t')

## Load saved dataframes

(Resume work)

In [None]:
battles = pd.read_csv('./data/all_battles.tsv', encoding='utf-8', sep='\t', index_col=0)

# All substring filters below use regex=False: str.contains treats its
# pattern as a regular expression by default, which would turn the '.' in
# 'index.php' into a wildcard.

# Exclude nonexistent pages or non-english wikipedia pages
# (There aren't that many, so better to dump them)
battles = battles.loc[~battles.title.str.contains('does not exist', regex=False)]
battles = battles.loc[~battles.url.str.contains('https', regex=False)]

# Exclude wars (titles containing the standalone word 'War')
battles = battles.loc[~battles.title.map(lambda s: 'War' in s.split(' '))]

# Exclude support page links etc
support_page_fragments = ('Wikipedia', 'index.php', 'Special:', 'List_of', 'Category:')
for fragment in support_page_fragments:
    battles = battles.loc[~battles.url.str.contains(fragment, regex=False)]

# We only want battles, not extended campaigns or distributed multi-event things
conflict_terms = ('battle', 'siege', 'fall', 'sack',
                  'operation', 'capture', 'raid',
                  'action', 'destruction', 'massacre')

# Keep a row only if its lower-cased title contains one of the terms.
battles = battles.loc[battles.title.map(lambda s: any(b in s.lower() for b in conflict_terms))]
len(battles)

In [None]:
# Page titles that table_scrape() refuses to record as a leader.
omit_list = [
    'Capital punishment',
    'Military advisor',
    'Wounded in action',
    'Prisoner of war',
    'Killed in action',
    'Surrender (military)',
    'Surrendered',
    'Common military ranks in English',
]

# Anchor titles for which table_scrape() prints a "Redirect for ..." notice.
redirect_names = [
    'Napoleon I',
    'Alexander III of Macedon',
]

# Three-character language prefixes; table_scrape() compares them with
# characters 8-10 of an href (the host prefix right after 'https://').
non_english_tokens = [
    'pt.', 'tr.', 'ko.', 'ja.',
    'th.', 'da.', 'es.', 'de.',
    'it.', 'fr.', 'zh.',
]


def get_full_url(url):
    """Return an absolute URL for a link scraped from English Wikipedia.

    Args:
        url: An ``href`` value. It may be absolute (``https://...`` or
            ``http://...``), protocol-relative (``//host/path``) or
            site-relative (``/wiki/...``).

    Returns:
        ``url`` unchanged when it is already absolute, ``https:`` + ``url``
        when it is protocol-relative, otherwise ``url`` prefixed with
        ``https://en.wikipedia.org``.
    """
    # Test the prefix, not mere containment: a relative link can carry an
    # absolute URL inside its query string.
    if url.startswith(('https://', 'http://')):
        return url
    if url.startswith('//'):
        return f'https:{url}'
    return f'https://en.wikipedia.org{url}'


def _paired_values_after(details, label):
    """Return the first two cells of the row that follows the ``label`` row.

    ``details`` is the infobox table as parsed by ``pd.read_html``. The row
    after a heading such as ``'Strength'`` is read as one value per side.
    ``(None, None)`` is returned when no row's first cell equals ``label``,
    when the heading has no following row, or when the table has fewer than
    two columns.
    """
    if details.shape[1] < 2:
        return None, None

    is_heading = (details.iloc[:, 0] == label).values
    for heading in details.index[is_heading]:
        following = heading + 1
        if following in details.index:
            values = details.loc[following]
            return values.iloc[0], values.iloc[1]

    return None, None


def _rows_per_leader(base_rows, leaders):
    """Return ``base_rows`` repeated once per leader with a ``leader`` column.

    An empty ``DataFrame`` is returned when ``leaders`` is empty.
    """
    if not leaders:
        return pd.DataFrame()
    return pd.concat(
        [base_rows.assign(leader=leader) for leader in leaders],
        ignore_index=True,
    )


def table_scrape(url, name):
    """Scrape one battle page into a frame of per-leader rows.

    The page's ``infobox vevent`` table supplies the date, location and
    result, the strength and casualty figures of both sides, and the links
    under "Commanders and leaders". Each leader link is fetched and the
    leader is named after that page's ``<title>``.

    Args:
        url: Link to the battle page (absolute or site-relative).
        name: Battle title, stored in the ``Battle`` column.

    Returns:
        A ``DataFrame`` with one row per (leader, side), where ``pos`` is
        ``'L'`` or ``'R'``, ``own``/``opp`` hold the strength cells and
        ``taken``/``inflicted`` the casualty cells (``None`` when absent).
        Returns ``None`` when a request fails, the page has no infobox, the
        infobox has no belligerents, or the Date/Location/Result rows cannot
        be extracted. Returns the infobox HTML as a ``str`` when it does not
        parse into exactly two tables.
    """
    from io import StringIO

    battle_page_url = get_full_url(url)
    try:
        battle_page_response = requests.get(battle_page_url)
    except requests.RequestException:
        # Any request failure (not only a dropped connection) skips this
        # battle instead of aborting the whole pool run.
        print(f'Initial request error for {battle_page_url}')
        return None

    battle_page_soup = BeautifulSoup(battle_page_response.text)
    infobox = battle_page_soup.find('table', {'class': 'infobox vevent'})

    if infobox is None:
        print(f"No infobox found for {battle_page_url}")
        return None

    try:
        # Wrapped in StringIO because newer pandas deprecates literal HTML
        # strings. The unpacking also raises ValueError unless exactly two
        # tables are found.
        details, stub = pd.read_html(StringIO(str(infobox)))
    except ValueError:
        print(f"Error parsing details table from HTML from {battle_page_url}")
        return str(infobox)

    # TODO: also keep belligerents info, not just leader
    if 'Belligerents' not in str(infobox):
        print('Filtered: ' + name)
        return None

    try:
        clean_rows = stub.drop_duplicates(0).set_index(0, drop=True).loc[['Date', 'Location', 'Result'], :]
    except KeyError:
        print(f'Error extracting rows from table from {battle_page_url}')
        return None

    clean_col = clean_rows.transpose()
    clean_col.reset_index(drop=True, inplace=True)

    html_table = infobox.find_all('tr')
    belligerents = defaultdict(list)

    # If any of this text is in an anchor's markup,
    # it's not a link we want to keep
    bad_text = ('class="image"', 'class="thumbborder"',
                'cite_note', 'disambiguation needed',
                'cnote_g')

    # NOTE(review): the redirect target is looked up on the battle page, not
    # on the leader page - confirm this is intended.
    ul = battle_page_soup.find('ul', {'class': 'redirectText'})

    for index, row in enumerate(html_table):
        # Parse commanders and leaders cell contents
        if 'Commanders and leaders' not in str(row):
            continue
        # The leaders sit in the row after the heading; nothing to read if
        # the heading is the last row.
        if index + 1 >= len(html_table):
            continue

        leaders = html_table[index + 1]

        for side, cell in enumerate(leaders.find_all('td')):
            for anchor in cell.find_all('a'):
                anchor_string = str(anchor)

                if 'title' not in anchor_string:
                    continue
                if any(text in anchor_string for text in bad_text):
                    continue

                # 'title' may occur in the markup without being an
                # attribute; such anchors (and ones without href) are skipped.
                title = anchor.get('title')
                href = anchor.get('href')
                if title is None or href is None:
                    continue

                if title in redirect_names:
                    print(f"Redirect for {title}")

                leader_page_url = get_full_url(href)

                if len(href) <= 10:
                    pass
                elif href[8:11] in non_english_tokens:
                    print(f'Non-english page found at {leader_page_url}')
                elif ul is not None:
                    print(
                        f'Redirected for {title} at {leader_page_url}')
                    leader_page_url = get_full_url(ul.a['href'])

                try:
                    leader_response = requests.get(leader_page_url)
                except requests.RequestException:
                    # A failed leader lookup abandons the whole battle.
                    print(f"Leader request error for {leader_page_url}")
                    return None

                leader_soup = BeautifulSoup(leader_response.text)
                leader_name = leader_soup.find('title')

                # Blank leader? Move on.
                if not leader_name:
                    continue

                leader_name = leader_name.text.replace(' - Wikipedia', '')

                # These aren't actually links to leaders - move on
                if leader_name in omit_list:
                    continue

                belligerents[side].append(leader_name)

    # Only the first two sides (infobox columns) are kept.
    lhs_df = _rows_per_leader(clean_col, belligerents[0])
    rhs_df = _rows_per_leader(clean_col, belligerents[1])

    strength_x, strength_y = _paired_values_after(details, 'Strength')
    lhs_df['own'] = strength_x
    lhs_df['opp'] = strength_y
    rhs_df['own'] = strength_y
    rhs_df['opp'] = strength_x

    casualties_x, casualties_y = _paired_values_after(details, 'Casualties and losses')
    lhs_df['taken'] = casualties_x
    lhs_df['inflicted'] = casualties_y
    rhs_df['taken'] = casualties_y
    rhs_df['inflicted'] = casualties_x

    lhs_df['Battle'] = name
    rhs_df['Battle'] = name

    lhs_df['pos'] = 'L'
    rhs_df['pos'] = 'R'

    df = pd.concat([lhs_df, rhs_df])
    df.reset_index(drop=True, inplace=True)
    df.drop_duplicates(inplace=True)
    return df
        
def map_func(row):
    """Scrape the battle described by one row, as yielded by ``iterrows``.

    Single-argument adapter around ``table_scrape``: the row's ``url`` and
    ``title`` fields become its two arguments.
    """
    url, title = row.url, row.title
    return table_scrape(url, title)

In [None]:
# Vroom
# Using processes instead of threads as each function call combines IO/CPU blocking steps
# If you don't have a fat CPU, use fewer processes

with Pool(processes=32) as pool:
    results = pool.map(map_func, (row for _, row in battles.iterrows()))

# table_scrape returns None (or a raw HTML string) for pages it could not
# turn into a table, so keep only the real DataFrames.
results = pd.concat([r for r in results if isinstance(r, pd.DataFrame)])
results.reset_index(drop=True, inplace=True)
results.drop_duplicates(inplace=True)
results.sample(n=5)

In [None]:
# How many distinct battles produced rows, out of all candidate pages.
n_found = len(results.Battle.unique())
print(f"Scraped page for {n_found} found battles / {len(battles)}")

# Print a handful of random battle pages for spot-checking.
print('Sample URLS')
sampled = battles.sample(n=5)
for url in sampled.url.tolist():
    url = f"https://en.wikipedia.org{url}"
    print(url)

# Number of scraped rows per leader.
results.leader.value_counts()

In [None]:
from pathlib import Path

# to_csv does not create missing directories, so make sure ./data exists
# before writing the scraped rows.
Path('./data').mkdir(parents=True, exist_ok=True)
results.to_csv('./data/scraped_battle_data.csv', encoding='utf-8')