In [95]:
# These packages are necessary to run a REST API. The Wayback availability API returns JSON responses.
import json
import urllib
import urllib.parse
import urllib.request

# For the large-scale URL processing
import numpy as np
import pandas as pd

In [96]:
# Any URL can go here. A bare domain such as 'google.com' works (see the output below); full http(s):// addresses are accepted too.
requested_url = 'google.com'
base_url = 'http://archive.org/wayback/available?url='
constructed_url = base_url + requested_url
# Read the response inside a `with` block so the HTTP connection is closed once the JSON is parsed.
with urllib.request.urlopen(constructed_url) as response:
    data = json.load(response)
print(data) # Raw JSON reply; 'archived_snapshots' holds the closest snapshot when one exists

{'url': 'google.com', 'archived_snapshots': {'closest': {'status': '200', 'available': True, 'url': 'http://web.archive.org/web/20181107191344/http://www.google.com/', 'timestamp': '20181107191344'}}}


In [97]:
# Now, checking if Google's homepage from May 30th, 2000 is available. Must be formatted YYYYMMDD.
# The API answers with the *closest* snapshot to the requested date, which is not necessarily that exact day.
timestamp = '20000530'
constructed_url = base_url + requested_url + '&timestamp=' + timestamp

# Same process as before; the `with` block closes the HTTP connection after parsing.
with urllib.request.urlopen(constructed_url) as response:
    data = json.load(response)
print(data)

{'url': 'google.com', 'timestamp': '20000530', 'archived_snapshots': {'closest': {'status': '200', 'available': True, 'url': 'http://web.archive.org/web/20000520032820/http://www.google.com:80/', 'timestamp': '20000520032820'}}}


In [98]:
# Here's what the "no snapshot" output looks like. Nothing was archived in the year 1600, so no snapshot should come back.
# Note that the request does not raise an error: the API still answers, just with an empty result.
timestamp = '16000530'
constructed_url = base_url + requested_url + '&timestamp=' + timestamp

# Same process as before; the `with` block closes the HTTP connection after parsing.
with urllib.request.urlopen(constructed_url) as response:
    data = json.load(response)
print(data)

# The length of data['archived_snapshots'] tells us if the website is available. There is no explicit 'unavailable' message; the dictionary is simply empty when there are no snapshots.

# Next, define a function that checks every URL in a list against the Wayback Machine.
def url_processor(urls, timestamp = ''):
    """Count how many of ``urls`` have a snapshot in the Wayback Machine.

    Each URL is looked up through the Wayback availability API. A URL counts
    as found when the reply's ``archived_snapshots`` entry is non-empty.

    Parameters
    ----------
    urls : iterable
        URLs to check. Entries that are not strings (e.g. NaN coming from a
        pandas column) or that are blank are not requested; they are counted
        as not found.
    timestamp : str or int, optional
        Date to look up, e.g. ``'20170504'``. When empty (the default) no
        timestamp is sent with the request.

    Returns
    -------
    int
        Number of URLs for which a snapshot was reported. The summary line is
        also printed, as before.
    """
    base_url = 'http://archive.org/wayback/available?'
    urls = list(urls)  # allows any iterable and keeps len() and the loop consistent
    total_urls = len(urls)
    available_urls = 0  # URLs with at least one snapshot
    unchecked_urls = 0  # blank/missing entries and requests that failed

    for url in urls:
        # Missing values (NaN/None) and blank strings cannot be looked up.
        if not isinstance(url, str) or not url.strip():
            unchecked_urls += 1
            continue

        # Percent-encode the query so a '?' or '&' inside the target URL
        # cannot be mistaken for part of the API's own query string.
        params = {'url': url.strip()}
        if timestamp:
            params['timestamp'] = timestamp
        constructed_url = base_url + urllib.parse.urlencode(params)

        try:
            # The context manager closes the HTTP connection after parsing.
            with urllib.request.urlopen(constructed_url) as response:
                data = json.load(response)
        except (OSError, ValueError):
            # OSError covers URLError/HTTPError and socket failures; ValueError
            # covers malformed JSON. One bad lookup should not abort the batch.
            unchecked_urls += 1
            continue

        snapshots = data.get('archived_snapshots') if isinstance(data, dict) else None
        if snapshots:  # non-empty means the API reported a snapshot
            available_urls += 1

    print("Out of "+ str(total_urls) + " URLS, " + str(available_urls) + " were found on the Wayback Machine.")
    if unchecked_urls:
        print(str(unchecked_urls) + " URLS could not be checked (blank/missing entry or failed request) and were counted as not found.")
    return available_urls


# With the function defined, try it out on eleven charter URLs,
# asking for snapshots near May 4th, 2017.
sample_charters = [
    'http://ayaprun.lksd.org/',
    'http://www.tongassschool.org/',
    'https://www.kgbsd.org/ketchikancharter',
    'https://paideia.asdk12.org/',
    'http://anccs.asdk12.org/',
    'https://rilkeschule.asdk12.org/',
    'http://highlandacademy.asdk12.org/',
    'http://www.frontiercs.org/',
    'http://www.winterberrycharterschool.com/',
    'https://familypartnership.asdk12.org/',
    'https://aquarian.asdk12.org/',
]
url_processor(sample_charters, timestamp='20170504')


{'url': 'google.com', 'timestamp': '16000530', 'archived_snapshots': {}}
Out of 11 URLS, 11 were found on the Wayback Machine.


In [100]:
# Now the method can be run on a larger list.
df_URLs = pd.read_csv("../../nowdata/backups/charter_URLs_2016.csv", low_memory = False, usecols=["NCESSCH", "URL"]) 
#df_URLs = df_URLs[["NCESSCH", "URL"]]
large_URLlist = df_URLs['URL'].tolist()

In [101]:
# Number of entries in the URL list built from the CSV.
len(large_URLlist)

7400

In [104]:
# Check only the first 100 URLs of the full list, using the timestamp 20181106.
url_processor(large_URLlist[:100], '20181106')

Out of 100 URLS, 87 were found on the Wayback Machine.
