# Compute WAR scores

## Imports and config

In [None]:
import pickle

from multiprocessing.pool import Pool

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
%matplotlib inline

## Load saved logistic regression model

In [None]:
# Restore the fitted model from disk. Unpickling executes whatever the file
# encodes, so this path must only ever point at a trusted, locally produced file.
with open('./data/saved_logistic_regression.pkl', mode='rb') as model_file:
    logistic_regression = pickle.load(model_file)

## Load data

In [None]:
# Hand-entered rows for the Battle of Arras (21 May 1940), one per side,
# carried over from the original analysis.
# Column-oriented: every list holds the value for side 'R' first, then side 'L'.
# Strength columns with no figure available are NaN. `np.nan` is used because
# the `np.NaN` alias no longer exists in NumPy 2.0.
arras_1 = {
    'Battle': ['Battle of Arras', 'Battle of Arras'],
    'Date': ['21 May 1940', '21 May 1940'],
    'Location': ['Arras', 'Arras'],
    'Result': ['German victory', 'German victory'],
    'belligerent': ['Erwin Rommel', 'Harold Franklyn'],
    'opp': ['manual', 'manual'],
    'own': ['manual', 'manual'],
    'pos': ['R', 'L'],
    'VorD': ['V', 'D'],
    'Infantry': [7500.0, 2000.0],
    'Cavalry': [225.0, 74.0],
    'Artillery': [np.nan, np.nan],
    'Ships': [np.nan, np.nan],
    'Airforce': [np.nan, np.nan],
    'Special': [np.nan, np.nan],
    'year': [1940, 1940],
}

In [None]:
# Read the per-side battle strength table and turn underscores in battle
# names into spaces.
df_strength_all = pd.read_csv('./data/current_run.csv', index_col=0, encoding='utf-8')
df_strength_all['Battle'] = df_strength_all['Battle'].str.replace('_', ' ')

# Rows that can be scored: an outcome code (VorD) and a named leader are both present.
has_outcome = df_strength_all['VorD'].notnull()
has_leader = df_strength_all['belligerent'].notnull()
df_run = df_strength_all.loc[has_outcome & has_leader]

# One row per (battle, side), used to look up each side's strengths.
df_lookup = df_strength_all.drop_duplicates(subset=['Battle', 'pos'])

# NOTE(review): the manual Arras rows are appended only to df_strength_all,
# after df_run and df_lookup have already been derived, so they do not reach
# either frame. Confirm whether they were meant to be part of the lookup.
arras_rows = pd.DataFrame(arras_1, index=[10515, 10516])
df_strength_all = pd.concat([df_lookup, arras_rows])

df_run.sample(n=3)

## Estimate WAR scores

### Scoring function

In [None]:
# Side marker -> marker of the opposing side.
flip_sides = dict(L='R', R='L')

# Strength column -> name it takes once it describes the opponent.
# Insertion order matters: it fixes the order of the derived "*_diff" features.
# NOTE(review): ' ships_opp' starts with a space, which carries through to the
# derived ' ships_diff' feature name. It looks like a typo, but check the
# feature names the saved model expects before changing it.
opponent_column_renames = {
    'Infantry': 'infantry_opp',
    'Cavalry': 'cavalry_opp',
    'Artillery': 'artillery_opp',
    'Ships': ' ships_opp',
    'Airforce': 'airforce_opp',
    'Special': 'special_opp',
}

# Fixed credit per outcome code, used when no model prediction is applied.
outcome_values = dict(V=.5, D=-.5, I=0)

def _strength_diff_features(own_row, opp_row):
    """Build the model input: normalised own-vs-opponent strength differences."""
    non_feature_columns = ['Battle', 'Date', 'Location', 'Result', 'belligerent',
                           'opp', 'own', 'pos', 'VorD', 'year']

    # Both frames are re-indexed from 0 so the column-wise concat pairs the
    # leader's row with the opponent's row. Without this, concat aligns on the
    # unrelated CSV index labels, every difference becomes NaN, and the model
    # only ever sees all-zero features.
    own = own_row.drop(non_feature_columns, axis=1).reset_index(drop=True)
    opp = (opp_row.drop(non_feature_columns, axis=1)
           .rename(columns=opponent_column_renames)
           .reset_index(drop=True))
    pred_row = pd.concat([own, opp], axis=1)

    pred_diff = pd.DataFrame()
    for own_col, opp_col in opponent_column_renames.items():
        x, y = pred_row[own_col], pred_row[opp_col]
        pred_diff[opp_col.replace('opp', 'diff')] = (x - y) / (x + y)

    # A missing strength on either side (or 0/0) yields NaN; treat it as
    # "no difference".
    pred_diff = pred_diff.fillna(0)

    # Artillery and special-forces differences are not passed to the model
    # (presumably it was fitted without them -- confirm against the training code).
    return pred_diff.drop(['artillery_diff', 'special_diff'], axis=1)


def estimate_WAR(general):
    """Estimate the WAR score of one leader over all of their battles.

    Reads the module-level ``df_run`` (rows to score), ``df_lookup`` (one row
    per battle and side) and ``logistic_regression`` (fitted classifier).

    For each distinct battle of ``general`` a value is added to the total:

    * the fixed ``outcome_values`` entry when fewer than two sides are known
      for the battle, or when one of the first two sides has an Infantry
      figure of exactly 1.0;
    * otherwise a value derived from the model's ``predict_proba`` output for
      the strength differences: column 0 for a victory, minus column 1 for a
      defeat, and 0.5 minus column 1 for outcome ``'I'``.
      NOTE(review): this reads as "credit = result minus predicted win
      probability", which assumes column 1 is the win class -- confirm
      against the saved model's ``classes_``.

    Parameters
    ----------
    general : str
        Value of the ``belligerent`` column identifying the leader.

    Returns
    -------
    tuple of (dict, dict)
        ``{general: {'WAR', 'Battles', 'WAR_per_battle'}}`` and
        ``{general: [per-battle records]}``. ``'Battles'`` is the number of
        ``df_run`` rows for the leader. ``'WAR_per_battle'`` is 0.0 when the
        leader has no rows.

    Raises
    ------
    KeyError
        If a battle's outcome code is not in ``outcome_values`` or its side
        marker is not in ``flip_sides``.
    ValueError
        If the outcome code is in ``outcome_values`` but has no model-based
        rule here.
    """
    df_dbg = df_run.loc[df_run.belligerent == general]

    war = 0
    battle_records = []

    for battle in df_dbg.Battle.unique():
        # Only the leader's first row for a battle is scored.
        own_row = df_dbg.loc[df_dbg.Battle == battle].iloc[[0]]
        row = own_row.iloc[0]
        year = row.year
        outcome = row.VorD

        # Looked up first so an unknown outcome code fails loudly instead of
        # silently reusing the previous battle's value.
        fixed_value = outcome_values[outcome]

        df_str = df_lookup.loc[df_lookup.Battle == battle].reset_index(drop=True)
        opp_row = df_str.loc[df_str.pos == flip_sides[row.pos]]

        # NOTE(review): an Infantry figure of exactly 1.0 looks like a
        # placeholder for "unknown strength"; the origin of these checks is
        # not documented.
        if len(df_str) < 2 or (df_str.Infantry.iloc[:2] == 1.0).any():
            value = fixed_value
        else:
            features = _strength_diff_features(own_row, opp_row)
            df_pred = pd.DataFrame(logistic_regression.predict_proba(features))

            if outcome == 'V':
                value = df_pred.iloc[0, 0]
            elif outcome == 'D':
                value = 0 - df_pred.iloc[0, 1]
            elif outcome == 'I':
                value = .5 - df_pred.iloc[0, 1]
            else:
                raise ValueError(f"No model-based WAR rule for outcome {outcome!r}")

        war = war + value
        battle_records.append({'Battle': battle, 'Value': value, 'Year': year, 'Outcome': outcome})

    # Computed once after the loop, so it is defined even for a leader with no rows.
    n_rows = len(df_dbg)
    war_per = float(war) / float(n_rows) if n_rows else 0.0

    return {general: {'WAR': war, 'Battles': n_rows, 'WAR_per_battle': war_per}}, {general: battle_records}

### Compute WAR for all leaders

In [None]:
# Entries in the belligerent column that are not individual leaders (ranks,
# units, wars, offices, ...). They really belong in an earlier cleaning step,
# and this list is probably incomplete.
excluded_leaders = {
    'Capital punishment', 'Lieutenant general', '6th Panzer Army',
    'Navy', 'Tsar', 'Strategos', 'Knight', 'Kurdistan Democratic Party',
    'List of Khazar rulers', 'Commodore (rank)', 'II Corps (Pakistan)',
    'Air marshal', 'Air chief marshal', 'Captain (armed forces)',
    'Campuzano Polanco family', 'XX Corps (United Kingdom)',
    'American Civil War', 'Israeli Navy', 'Archduke', 'Arab Liberation Army',
    "Eighty Years' War", 'Central Command (India)', 'South Wales Borderers',
    'Big Red Meat', 'XI Corps (India)', 'Prime Minister of Israel', 'Army Group B',
    'Bangladesh Police',
}

# Every distinct leader in the scoring rows, minus the exclusions above.
generals = list(set(df_run.belligerent.tolist()).difference(excluded_leaders))

# Score the leaders in parallel on eight worker processes; each call returns
# a (summary, per-battle records) pair for one leader.
with Pool(processes=8) as worker_pool:
    war_results = worker_pool.map(estimate_WAR, generals)

In [None]:
# Split the pool output into two tables:
#   war_results_df    - one row per general: WAR total, battle count, WAR per battle
#   battle_records_df - one row per (general, battle): the value that battle contributed

main_war_results, battle_records = list(zip(*war_results))

# Merge the single-key {general: summary} dicts into one mapping.
summary_by_general = {}
for summary in main_war_results:
    summary_by_general.update(summary)

war_results_df = pd.DataFrame.from_dict(summary_by_general, orient='index')

# Flatten {general: [record, ...]} into one dict per battle, tagged with the general.
battle_records_ = [
    {'General': general, **record}
    for records_by_general in battle_records
    for general, record_list in records_by_general.items()
    for record in record_list
]

battle_records_df = pd.DataFrame(battle_records_)

## Inspect results

In [None]:
# Spot-check ten randomly chosen leaders.
war_results_df.sample(n=10)

In [None]:
# Scores for one leader, looked up by index label (KeyError if the name is absent).
war_results_df.loc['Pyrrhus of Epirus']

In [None]:
# First ten rows when sorted by WAR, highest first.
war_results_df.sort_values('WAR', ascending=False).head(10)

In [None]:
# Last ten rows of the same descending sort, i.e. the lowest WAR scores.
war_results_df.sort_values('WAR', ascending=False).tail(10)

## Figures

### WAR distribution

Histogram of WAR scores. Annotated with leaders of interest.

In [None]:
def annotate_hist(general, ax, offset=0, text_height=-400):
    """Label one leader's WAR score on the histogram with an arrow.

    The score is read from the module-level ``war_results_df``. The arrow
    points at ``(score, -1)`` and the label, the leader's name with the score
    to two decimals, is placed at ``(score - offset, text_height)``.

    Parameters
    ----------
    general : str
        Index label of the leader in ``war_results_df``.
    ax : matplotlib Axes
        Axes to draw the annotation on.
    offset : float
        Distance the label is moved towards lower x, relative to the score.
    text_height : float
        y coordinate of the label.

    Returns
    -------
    The same ``ax`` that was passed in.
    """
    war_score = war_results_df.loc[general, 'WAR']
    label = f"{general}\n({war_score:.2f})"
    arrow_style = {'facecolor': 'black', 'alpha': .2}

    ax.annotate(
        label,
        xy=(war_score, -1),
        xytext=(war_score - offset, text_height),
        arrowprops=arrow_style,
        zorder=-1,
    )

    return ax

# Leaders to call out on the histogram, with the label placement for each.
highlighted_leaders = [
    ('Napoleon', {'offset': .985}),
    ('Julius Caesar', {'offset': 1.325}),
    ('Ulysses S. Grant', {'offset': 1.71, 'text_height': -600}),
    ('Robert E. Lee', {'offset': 1.37}),
    ('Crazy Horse', {'offset': 1.25}),
]

with plt.style.context('default'):
    fig, axes = plt.subplots(1, 1, figsize=(8, 6), dpi=150)

    # NOTE(review): sns.distplot is deprecated in recent seaborn releases
    # (histplot is its replacement); check the installed version before switching.
    sns.distplot(
        war_results_df['WAR'].values,
        kde=False,
        bins=45,
        ax=axes,
        hist_kws={'edgecolor': 'white'},
    )

    for leader, placement in highlighted_leaders:
        annotate_hist(leader, axes, **placement)

# Title, axis label and layout are applied outside the style context.
axes.set(title='WAR Scores', ylabel='# Generals')

sns.despine()
fig.tight_layout()