### Year Estimates from Topical Wars

In [1]:
import pandas as pd
import re
import numpy as np

#### Pull in some data just to test it out

orig_data = pd.read_csv("presidential_speeches.csv")
orig_data.head()

data = orig_data.drop(columns = ['Date', 'Party', 'Speech Title', 'Summary', 'URL'])

data = data.dropna()

#### List of Wars and Start Dates

In [5]:
#read in Wikipedia data
url = 'https://en.wikipedia.org/wiki/List_of_the_lengths_of_United_States_participation_in_wars'
wars = pd.read_html(url, header = 0)

In [6]:
wars = wars[1]

In [7]:
wars = wars.drop(columns = ['Duration (graphical representation)', 'Unnamed: 0'])

In [8]:
wars = wars.dropna()
wars.head()

Unnamed: 0,War,Dates,Duration
0,Afghan War,"2001 – 2014, 2015 – present[1][2]","19.2 years (19 years, 1 month)"
2,Vietnam War,1955/11 – 1973/04[3][4][5][6][7],"17.4 years (17 years, 4 months)"
4,Moro Rebellion,1899 – 1913,14 years
6,War in North-West Pakistan,2004 – 2017,13 years
8,Northwest Indian War,1785 – 1795,10 years


###### Add a lowercase column containing the names of wars, without the word "war" at the end.

In [9]:
# To lowercase and remove the word "war." (While this may occationally lead to odd behavior, I think its best. 
# This is because
# people are more inclined to say, for instance "Vietnam" than "Vietnam War.")
def remove_war(name):
    if name[-4:len(name)] == ' War':
        return name[0:-4].lower()
    else:
        return name.lower()

In [10]:
remove_war('Afghan War')

'afghan'

In [11]:
wars['shortened_name'] = wars['War'].apply(remove_war)
wars.head()

Unnamed: 0,War,Dates,Duration,shortened_name
0,Afghan War,"2001 – 2014, 2015 – present[1][2]","19.2 years (19 years, 1 month)",afghan
2,Vietnam War,1955/11 – 1973/04[3][4][5][6][7],"17.4 years (17 years, 4 months)",vietnam
4,Moro Rebellion,1899 – 1913,14 years,moro rebellion
6,War in North-West Pakistan,2004 – 2017,13 years,war in north-west pakistan
8,Northwest Indian War,1785 – 1795,10 years,northwest indian


###### Add a column of the start date for each war.

In [12]:
# Extract year and turn it into an int.
def extract_year(date):
    year = date[0:4]
    return int(year)

In [13]:
wars['year'] = wars['Dates'].apply(extract_year)
wars.head()

Unnamed: 0,War,Dates,Duration,shortened_name,year
0,Afghan War,"2001 – 2014, 2015 – present[1][2]","19.2 years (19 years, 1 month)",afghan,2001
2,Vietnam War,1955/11 – 1973/04[3][4][5][6][7],"17.4 years (17 years, 4 months)",vietnam,1955
4,Moro Rebellion,1899 – 1913,14 years,moro rebellion,1899
6,War in North-West Pakistan,2004 – 2017,13 years,war in north-west pakistan,2004
8,Northwest Indian War,1785 – 1795,10 years,northwest indian,1785


In [14]:
wars.dtypes

War               object
Dates             object
Duration          object
shortened_name    object
year               int64
dtype: object

###### Create a regular expression to match war names.

In [15]:
class rgx():
    def __init__(self):
        self.rgx = ''
    
    def make_regex(self, name):
        self.rgx += name + '|'

In [16]:
rgx = rgx()
wars['shortened_name'].apply(rgx.make_regex)
rgx.rgx = rgx.rgx[0:-1]

In [17]:
# The "|" symbol means "or." So, the regular expression is looking for something that says "afghan" or "vietnam" 
# or "moro rebellion" or ...
rgx.rgx

"afghan|vietnam|moro rebellion|war in north-west pakistan|northwest indian|iraq|american revolutionary|second seminole|war on isil|first barbary|american civil|world war ii|korean|war of 1812|red cloud's|mexican–american|world war i|russian civil|great sioux war of 1876|libyan civil war (2011)|persian gulf|whiskey rebellion|spanish–american|kosovo|invasion of panama|cuban missile crisis|invasion of grenada|bay of pigs invasion"

###### After all that preparation, the function can now be written.

In [39]:
def year_from_wars(speeches):
    estimates = np.zeros(len(speeches)) # for holding average year
    
    # for every speech
    for idx in range(len(speeches)):
        start_pos = 0
        num_wars = 0
        
        # loop through transcript until every war mention has been found
        while re.search(rgx.rgx, speeches.iloc[idx,:]['Transcript'][start_pos:], re.IGNORECASE) is not None:
            
            found = re.search(rgx.rgx, speeches.iloc[idx,:]['Transcript'][start_pos:], re.IGNORECASE)
            
            # add the date of the beginning of the war
            estimates[idx] += wars[wars['shortened_name'] == speeches.iloc[idx,:]['Transcript'] \
                                   [found.start()+start_pos:found.end()+start_pos].lower()].iloc[0,4]
            
            # the start position for looking at the remaining transcript
            start_pos += found.end()
            
            # tick number of wars found in this speech
            num_wars += 1
        
        # if any wars were found, divide by total number of name drops
        if estimates[idx]: estimates[idx] = estimates[idx]/num_wars
            
    # scale
    estimates = estimates/2019
    
    # return as series WITH NANS
    return pd.Series(estimates, name = 'year_from_wars').replace(to_replace = 0.0, value = np.nan) 
        

###### Test it out. Looks like it works pretty well, although there are a lot of NaNs at the beginning.

estimates = year_from_wars(data)

import matplotlib.pyplot as plt
plt.scatter(estimates, range(len(estimates)))