### Find the average of the years mentioned in each speech (counting only years from 1601 to 2019), expressed as a float

In [3]:
### Imports

import pandas as pd
import re
import numpy as np

#### Pull in some data just to test it out

# Load the raw speeches table; the bare .head() call is just a quick preview.
orig_data = pd.read_csv("presidential_speeches.csv")
orig_data.head()

# Discard the columns that are not needed below, then drop every row that
# still contains a missing value.
data = orig_data.drop(
    columns=['Date', 'Party', 'Speech Title', 'Summary', 'URL']
).dropna()

###### From looking at the dataset, it appears that numbers are written as digits rather than spelled out (which is very convenient). Here's the regex for matching them.

In [7]:
# Matches a space, exactly four digits, then a space (i.e. " #### ").
# Requiring the surrounding spaces keeps it from matching inside longer
# numbers; it also means a four-digit number directly next to punctuation, or
# at the very start/end of the text, is not matched.
rgx = ' [0-9]{4} '

###### Now a function can be written to loop over all speeches.

In [33]:
def named_years(speeches, pattern=None):
    """Return each speech's average mentioned year, scaled by 1/2020.

    Every match of the year pattern in a speech's 'Transcript' text is
    converted to an integer. Values strictly between 1600 and 2020 are
    averaged per speech, and that average is divided by 2020.

    Parameters
    ----------
    speeches : pandas.DataFrame
        Table with a 'Transcript' column holding the speech text as strings.
    pattern : str, optional
        Regular expression of the form
        <one delimiter character><digits><one delimiter character>.
        Defaults to the module-level ``rgx``.

    Returns
    -------
    pandas.Series
        Named 'average_named_years', with one float per row of ``speeches``
        in positional order (default integer index, not the index of
        ``speeches``). Rows in which no in-range year was found hold NaN.
    """
    year_pattern = re.compile(rgx if pattern is None else pattern,
                              re.IGNORECASE)

    # Start from NaN so speeches without any usable year need no post-fixing.
    estimates = np.full(len(speeches), np.nan)

    for idx, transcript in enumerate(speeches['Transcript']):
        total = 0
        num_years = 0

        found = year_pattern.search(transcript)
        while found is not None:
            # Strip the single delimiter character on each side of the digits.
            num = int(found.group()[1:-1])

            if 1600 < num < 2020:
                total += num
                num_years += 1

            # Resume ON the trailing delimiter rather than after it, so that
            # it can also serve as the leading delimiter of the next year
            # (e.g. " 1800 1900 "). The max() guarantees forward progress
            # even for very short custom patterns.
            next_pos = max(found.end() - 1, found.start() + 1)
            found = year_pattern.search(transcript, next_pos)

        # Average over the number of years found in this speech.
        if num_years:
            estimates[idx] = total / num_years

    # Scale, and return as a series WITH NANS for speeches without years.
    return pd.Series(estimates / 2020, name='average_named_years')
        

###### Test it out. It looks like it might be a pretty good predictor.

# Compute the scaled average mentioned year for every speech in `data`.
estimates = named_years(data)

# Largest scaled average across all speeches (NaN entries are skipped by
# pandas' default max()).
estimates.max()

import matplotlib.pyplot as plt
%matplotlib inline
# Scatter plot: x is each speech's estimate, y is the speech's position
# (0 .. len-1) in the series.
plt.scatter(estimates, range(len(estimates)))