In [None]:
# Requirements
import csv
import numpy
import nltk
from nltk.corpus import stopwords
from nltk.collocations import *
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import re
import fnmatch

# EDGI's web monitoring scripts
# The three shell escapes below fetch and install EDGI's web-monitoring-processing
# package into the running environment so that `web_monitoring` can be imported.
# Their output is redirected to /dev/null, so a failed clone or install is not
# reported here; it would only surface as an ImportError on the last line.
# Step 1: clone the repository into the current working directory.
!git clone https://github.com/edgi-govdata-archiving/web-monitoring-processing &>/dev/null; 
# Step 2: install the dependencies listed in the cloned repository.
!pip install -r web-monitoring-processing/requirements.txt &>/dev/null;
# Step 3: run the repository's setup.py in "develop" mode.
# NOTE(review): this is invoked from the notebook's working directory rather than
# from inside the clone -- confirm setup.py resolves its paths correctly that way.
!python web-monitoring-processing/setup.py develop &>/dev/null;
# Wayback Machine helper module used by the snapshot-fetching cell.
from web_monitoring import internetarchive;

In [None]:
# Functions to help us count terms
# English stop words from NLTK's stopwords corpus, held as a set.
# `all_stopwords` is a second name bound to that same set object (not a copy).
default_stopwords = all_stopwords = set(nltk.corpus.stopwords.words('english'))

def count(term, visible_text):
    """Count case-insensitive occurrences of a single word in the page text.

    Parameters
    ----------
    term : str
        The word to look for; it is lowercased before comparison.
    visible_text : iterable of str
        Text sections of the page. Each section is split on whitespace.

    Returns
    -------
    int
        Number of whitespace-separated tokens that equal ``term`` exactly once
        punctuation has been stripped and the token lowercased.
    """
    wanted = term.lower()
    not_word_or_space = r'[^\w\s]'  # anything that is neither a word character nor whitespace
    return sum(
        1
        for chunk in visible_text
        for word in chunk.split()
        # exact equality (not substring containment) after cleaning the token
        if re.sub(not_word_or_space, '', word).lower() == wanted
    )

def two_count (term, visible_text): # this function counts phrases from the decoded HTML
    """Count case-insensitive occurrences of a multi-word phrase in the page text.

    Parameters
    ----------
    term : sequence of str
        The words of the phrase, in order, e.g. ``['climate', 'change']``.
        Any number of words is supported; the n-gram size follows ``len(term)``.
    visible_text : iterable of str
        Text sections of the page. Each section is tokenized separately, so a
        phrase that straddles two sections is not counted.

    Returns
    -------
    int
        Total number of n-grams across all sections that equal the phrase.
        An empty phrase yields 0.
    """
    if not term:
        return 0  # nothing to look for
    # Compare against the whole phrase, not just its first two words, so the
    # lookup always has the same length as the n-grams built below.
    phrase = tuple(word.lower() for word in term)
    length = len(phrase)
    tally = 0
    for section in visible_text:
        tokens = nltk.word_tokenize(section)
        # Lowercase, then strip punctuation. Tokens that were pure punctuation
        # become empty strings and stay in place, so words separated by
        # punctuation are not treated as adjacent.
        tokens = [re.sub(r'[^\w\s]', '', x.lower()) for x in tokens]
        grams = nltk.ngrams(tokens, length)
        fdist = nltk.FreqDist(grams)
        tally += fdist[phrase]  # FreqDist returns 0 for an n-gram it never saw
    return tally

In [None]:
# Parameters
# Terms to count. A plain string is counted as a single word; a list of words
# is counted as a phrase.
terms = [
    'climate',
    ['climate', 'change'],
]
# Snapshot search window as six integers: start year, month, day followed by
# end year, month, day (here Jan 1 2016 to July 1 2016, working backwards).
dates = [
    2016, 1, 1,
    2016, 7, 1,
]
# The pages we want to look at
pages = [
    'epa.gov',
    'epa.gov/climatechange',
]

In [None]:
# Result containers.
# final_urls maps each page to the snapshot URL that ends up being counted.
final_urls = dict()
row_count, column_count = len(pages), len(terms)
# One row per page, one column per term. Every cell starts at the placeholder
# value 999 and is overwritten once a real count is available.
matrix = numpy.full(shape=(row_count, column_count), fill_value=999, dtype=numpy.int16)
print("Looking for {} terms on {} pages.".format(column_count, row_count))

In [None]:
# Go get a snapshot of each page and count the terms on it.
# For every page: list its Wayback Machine snapshots inside the configured date
# window, walk them in reverse listing order, and count the terms on the first
# snapshot whose status is acceptable. If anything fails, the page keeps the
# placeholder value 999 and an empty URL.
for pos, row in enumerate(pages):
    thisPage = row #change for specific CSVs
    final_urls[thisPage] = ""
    try:
        with internetarchive.WaybackClient() as client:
            # list_versions calls the CDX API from internetarchive.py from the webmonitoring repo
            dump = client.list_versions(
                thisPage,
                from_date=datetime(dates[0], dates[1], dates[2]),
                to_date=datetime(dates[3], dates[4], dates[5])
            )
            versions = reversed(list(dump))
            for version in versions: # For each version in all the snapshots
                if version.status_code == '200' or version.status_code == '-': # If the IA snapshot was viable...
                    url = version.raw_url
                    # The timeout keeps a stalled request from hanging the cell.
                    # decode() uses strict UTF-8; a page in another encoding
                    # raises and is reported by the handler below.
                    contents = requests.get(url, timeout=120).content.decode()
                    contents = BeautifulSoup(contents, 'lxml')
                    body = contents.find('body')
                    # Drop page chrome and non-visible elements before counting.
                    for tag_name in ('footer', 'header', 'nav', 'script', 'style'):
                        for element in body(tag_name):
                            element.extract()
                    body = list(body.stripped_strings)
                    for p, t in enumerate(terms):
                        if type(t) is list:
                            page_sum = two_count(t, body) # phrase
                        else:
                            page_sum = count(t, body) # single word
                        matrix[pos][p] = page_sum
                    final_urls[thisPage] = url
                    print("Done!")
                    break # only the first viable snapshot is counted
    except Exception as error:
        # Catch Exception rather than everything, so that interrupting the
        # kernel (KeyboardInterrupt) still stops the cell, and report the
        # actual failure instead of hiding it.
        print("No snapshot or can't decode", thisPage, "-", repr(error))
        final_urls[thisPage] = ""
        matrix[pos] = 999

In [None]:
# Report results: one line per (term, page) pair, grouped by term.
for pos, term in enumerate(terms):
    # A phrase (list of words) is shown as its words joined by single spaces;
    # a single-word term is shown as-is.
    t = " ".join(term) if type(term) is list else term
    for p, page in enumerate(pages):
        print("".join([
            "'", t, "': ",
            str(matrix[p][pos]),
            " on ", page,
            " (", final_urls[page], ")",
        ]))