### Phase IA: Wikipedia API

#### API (Older implementation)

In [None]:
# Library requirements for this section
import requests

In [63]:
# MediaWiki Action API endpoint for English Wikipedia
url = "https://en.wikipedia.org/w/api.php"

# Query parameters: parse the "Coffee" page and return only its external links, as JSON
params = dict(
    action = "parse",
    page = "Coffee",
    prop = "externallinks",
    format = "json",
)

# User-Agent header sent with every request made in this notebook
headers = {}
headers["User-Agent"] = "DASC 690 (mattguilloty@gmail.com)"

In [64]:
# Make request for external links. The timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() turns an HTTP error into an explicit
# exception instead of a confusing failure while decoding the body as JSON.
apiResponse = requests.get(url, headers = headers, params = params, timeout = 10)
apiResponse.raise_for_status()
content = apiResponse.json()

In [65]:
# Write one external link per line. The with-block closes the file even when the
# response lacks the expected 'parse' / 'externallinks' keys or a write fails.
with open('outputs/phase1A.txt', 'w') as f:
    for link in content['parse']['externallinks']:
        f.write(link + '\n')

#### Page scrape (Current implementation)

In [109]:
# Library requirements for this section
import requests
from bs4 import BeautifulSoup

In [110]:
url = 'https://en.wikipedia.org/wiki/Coffee'                    # URL for Wikipedia page

headers = { "User-Agent": "DASC 690 (mattguilloty@gmail.com)" } # Set User-Agent

# Make request for page. The timeout bounds how long the cell can block, and
# raise_for_status() stops here on an HTTP error so that an error page is never
# parsed as if it were the article.
response = requests.get(url, headers = headers, timeout = 10)
response.raise_for_status()

In [111]:
pageHtml = response.text                      # HTML body of the fetched page
soup = BeautifulSoup(pageHtml, 'html.parser') # Parse HTML

def extractReferences(soup):
    """Collect the text and the hyperlinks of every reference on a parsed page.

    Parameters:
        soup: parsed document exposing find_all(). Every element it returns for
              <span class="reference-text"> must provide get_text() and
              find_all('a').

    Returns:
        (texts, links): two lists with one entry per matched span.
        texts[i] is the text of the i-th span; links[i] is the list of href
        values of the anchors inside that span, in the order find_all returns
        them. Anchors that carry no href attribute are left out.
    """
    spans = soup.find_all('span', class_ = 'reference-text')   # One span per reference
    texts = [span.get_text() for span in spans]                # Get text for each reference

    links = []
    for span in spans:
        hrefs = (anchor.get('href') for anchor in span.find_all('a'))
        # .get('href') is None for an anchor without an href; drop those so
        # every stored value is an actual link target
        links.append([href for href in hrefs if href is not None])

    return texts, links

# Pull the text and the hyperlinks of every reference out of the parsed page
texts, links = extractReferences(soup = soup)

In [118]:
# Preview the first five references. Slicing (instead of indexing 0-4) keeps the
# preview from raising IndexError when the page yields fewer than five references.
for refText, refLinks in zip(texts[:5], links[:5]):
    print(refText)
    print(refLinks)
    print('')


Ukers WH (1922). All About Coffee. Tea and Coffee Trade Journal Company. p. 5.
Johns Hopkins University Studies in Historical and Political Science. Johns Hopkins University Press. 1967. p. 25.
Elzebroek AT (2008). Guide to Cultivated Plants. CABI. p. 7. ISBN 978-1-84593-356-2.

['https://books.google.com/books?id=4O_RAAAAMAAJ', 'https://books.google.com/books?id=GSw7AAAAIAAJ', 'https://books.google.com/books?id=YvU1XnUVxFQC', '/wiki/CAB_International', '/wiki/ISBN_(identifier)', '/wiki/Special:BookSources/978-1-84593-356-2']

"Global Hot Drinks Market Size, Share | Industry Trends Report, 2025". www.grandviewresearch.com. Retrieved 18 July 2023.
['https://www.grandviewresearch.com/industry-analysis/hot-drinks-market']

Weinberg & Bealer 2001, pp. 3–4
['#CITEREFWeinbergBealer2001']

"A Guide To Different Types Of Coffee Beans, Roasts & Drinks". 13 August 2021. Retrieved 16 January 2023.
['https://fluentincoffee.com/types-of-coffee/']

"33+ Buzzing Coffee Industry Statistics [2023]: Ca

### Phase IB: Internet Archive API

In [122]:
# Library requirements for this section
import requests
from bs4 import BeautifulSoup

In [5]:
url = 'https://www.odu.edu/'

apiURL = f'http://archive.org/wayback/available?url={url}' # API endpoint

# Make request for archived site. `headers` is the User-Agent dict defined in the
# Phase IA cells. The timeout bounds how long the cell can block, and
# raise_for_status() surfaces an HTTP error before the body is decoded as JSON.
availabilityResponse = requests.get(apiURL, headers = headers, timeout = 10)
availabilityResponse.raise_for_status()
content = availabilityResponse.json()

In [6]:
# Save the closest archived snapshot of the site to outputs/phase1B.html.
snapshots = content['archived_snapshots']                  # Empty when no archived snapshot exists
closest = snapshots.get('closest') if snapshots else None  # Get the closest/latest snapshot

# A single condition covers both "no snapshot listed" and "snapshot listed but not
# available", so the message below is printed in either case.
if closest and closest['available']:
    closestURL = closest['url']                            # Get the URL for the last snapshot
    # Keep the page in its own variable: `content` stays the availability JSON,
    # so this cell can be re-run without first re-running the previous one.
    snapshotResponse = requests.get(closestURL, headers = headers, timeout = 10)
    snapshotResponse.raise_for_status()                    # Do not save an HTTP error page as the snapshot
    with open('outputs/phase1B.html', 'w', encoding = 'utf-8') as f:
        f.write(snapshotResponse.text)
else:
    print('No archives available')

In [8]:
# Read the saved snapshot back as UTF-8, the encoding it was written with;
# the platform default encoding can fail or garble characters on this file.
with open('outputs/phase1B.html', 'r', encoding = 'utf-8') as file:
    htmlContent = file.read()

# This rebinds `soup`, replacing the parsed Wikipedia page from Phase IA
soup = BeautifulSoup(htmlContent, 'html.parser')

text = soup.get_text()

# NOTE(review): deleting newlines outright joins the last word of one line to the
# first word of the next - confirm this is intended rather than replacing with ' '
text = text.replace('\n', '')

# The with-block closes the output file even if the write fails
with open('outputs/phase1B.txt', 'w', encoding = 'utf-8') as f:
    f.write(text)

### Phase IC: Google API

### Phase II

In [13]:
import time

In [17]:
# Load the links saved in Phase IA. The path uses a forward slash, matching the
# path the file was written to: a backslash followed by 'p' is an invalid string
# escape and only resolves to the right file on Windows.
with open('outputs/phase1A.txt', 'r') as file:
    urls = file.readlines()

# Trim the trailing newline from each line and skip lines that are blank
urls = [url.strip() for url in urls]
urls = [url for url in urls if url]

# Request every link and print its HTTP status code (or the request error)
for url in urls:
    print(url)
    try:
        print(requests.get(url, headers = headers, timeout = 5).status_code)
    except requests.exceptions.RequestException as e:
        print('An error occurred:', e)
    time.sleep(1)   # Pause one second between requests
    print('')



https://books.google.com/books?id=4O_RAAAAMAAJ
200

https://books.google.com/books?id=GSw7AAAAIAAJ
200

https://books.google.com/books?id=YvU1XnUVxFQC
200

https://www.grandviewresearch.com/industry-analysis/hot-drinks-market
200

https://fluentincoffee.com/types-of-coffee/
200

https://www.zippia.com/advice/coffee-industry-statistics/
200

https://archive.org/stream/oed02arch#page/588/mode/2up
200

https://www.jstor.org/stable/602112
403

https://doi.org/10.2307%2F602112
403

https://www.worldcat.org/issn/0003-0279
200

https://www.etymonline.com/word/coffee
200


KeyboardInterrupt: 