### Phase IA: Wikipedia API

#### API (Older implementation)

In [1]:
# Library requirements for this section
import requests

In [2]:
# Endpoint of the English Wikipedia MediaWiki Action API
url = "https://en.wikipedia.org/w/api.php"

# Query parameters: use the "parse" action to list the external links of one
# page and return the answer as JSON
params = dict(
    action = "parse",
    page = "Coffee",
    prop = "externallinks",
    format = "json",
)

# Sent with every request so the server can identify this client
headers = {"User-Agent": "DASC 690 (mattguilloty@gmail.com)"}

In [3]:
# Request the page's external links from the API. The timeout keeps a stalled
# connection from blocking the cell forever, and raise_for_status() reports an
# HTTP error here instead of as a confusing KeyError on 'parse' further down.
apiResponse = requests.get(url, headers = headers, params = params, timeout = 10)
apiResponse.raise_for_status()
content = apiResponse.json()

In [4]:
# Save one external link per line. The context manager closes the file even
# if a lookup or write raises part-way through.
with open('outputs/phase1A.txt', 'w') as f:
    for link in content['parse']['externallinks']:
        f.write(link + '\n')

#### Page scrape (Current implementation)

In [5]:
# Library requirements for this section
import requests
from bs4 import BeautifulSoup

In [6]:
url = 'https://en.wikipedia.org/wiki/Coffee'                    # URL for Wikipedia page

headers = { "User-Agent": "DASC 690 (mattguilloty@gmail.com)" } # Set User-Agent

response = requests.get(url, headers = headers, timeout = 10)   # Make request for page; give up if the server stalls
response.raise_for_status()                                     # Fail here on an HTTP error instead of scraping an error page

In [7]:
# Parse the downloaded page's HTML with the 'html.parser' backend
soup = BeautifulSoup(markup = response.text, features = 'html.parser')

def extractReferences(soup):
    """Collect the text and hyperlinks of every reference entry in a parsed page.

    Parameters:
        soup: parsed document exposing find_all(); reference entries are the
              <span> elements carrying the 'reference-text' class.

    Returns:
        (texts, links): two parallel lists with one item per reference span,
        in the order find_all() yields them. texts[k] is the span's text and
        links[k] is the list of 'href' values of the <a> tags inside that span.
    """
    texts = []
    links = []
    for referenceSpan in soup.find_all('span', class_ = 'reference-text'):
        texts.append(referenceSpan.get_text())                                       # Text of this reference
        links.append([anchor.get('href') for anchor in referenceSpan.find_all('a')]) # Links of this reference
    return texts, links

# Run the extraction on the parsed page: one text and one list of hrefs per reference
(texts, links) = extractReferences(soup)

In [8]:
# Preview the first few references. Slicing (rather than indexing 0..4) keeps
# this from raising IndexError when the page has fewer than five references.
for refText, refLinks in zip(texts[:5], links[:5]):
    print(refText)
    print(refLinks)
    print('')


Ukers WH (1922). All About Coffee. Tea and Coffee Trade Journal Company. p. 5.
Johns Hopkins University Studies in Historical and Political Science. Johns Hopkins University Press. 1967. p. 25.
Elzebroek AT (2008). Guide to Cultivated Plants. CABI. p. 7. ISBN 978-1-84593-356-2.

['https://books.google.com/books?id=4O_RAAAAMAAJ', 'https://books.google.com/books?id=GSw7AAAAIAAJ', 'https://books.google.com/books?id=YvU1XnUVxFQC', '/wiki/CAB_International', '/wiki/ISBN_(identifier)', '/wiki/Special:BookSources/978-1-84593-356-2']

"Global Hot Drinks Market Size, Share | Industry Trends Report, 2025". www.grandviewresearch.com. Retrieved 18 July 2023.
['https://www.grandviewresearch.com/industry-analysis/hot-drinks-market']

Weinberg & Bealer 2001, pp. 3–4
['#CITEREFWeinbergBealer2001']

"A Guide To Different Types Of Coffee Beans, Roasts & Drinks". 13 August 2021. Retrieved 16 January 2023.
['https://fluentincoffee.com/types-of-coffee/']

"33+ Buzzing Coffee Industry Statistics [2023]: Ca

### Phase IB: Internet Archive API

In [9]:
# Library requirements for this section
import requests
from bs4 import BeautifulSoup

In [10]:
from urllib.parse import quote

url = 'https://www.odu.edu/'

# Wayback Machine availability endpoint. The target URL is percent-encoded so
# that characters such as '&', '?' or '#' inside it cannot be mistaken for
# parts of the API's own query string.
apiURL = f'https://archive.org/wayback/available?url={quote(url, safe = "")}' # API endpoint

content = requests.get(apiURL, headers = headers, timeout = 10).json() # Make request for archived site; do not wait forever

In [11]:
# Look up the closest snapshot without assuming either key is present
snapshots = content.get('archived_snapshots') or {}
closest = snapshots.get('closest')                                   # Get the closest/latest snapshot

if closest and closest.get('available'):
    closestURL = closest['url']                                      # Get the URL for the last snapshot
    # Keep the API answer in `content` and store the page under its own name,
    # so this cell can be re-run without `content` having changed type.
    snapshotResponse = requests.get(closestURL, headers = headers, timeout = 30)
    with open('outputs/phase1B.html', 'w', encoding = 'utf-8') as f: # Closed automatically, even on error
        f.write(snapshotResponse.text)
else:
    # Reached both when there is no snapshot at all and when the closest one
    # is flagged as unavailable.
    print('No archives available')

In [12]:
# Read the snapshot back using the encoding it was saved with (UTF-8);
# relying on the platform default can fail or garble characters.
with open('outputs/phase1B.html', 'r', encoding = 'utf-8') as file:
    htmlContent = file.read()

soup = BeautifulSoup(htmlContent, 'html.parser')

text = soup.get_text()

# Drop every newline; text from adjacent lines is joined with no separator
text = text.replace('\n', '')

# The context manager closes the output file even if the write fails
with open('outputs/phase1B.txt', 'w', encoding = 'utf-8') as f:
    f.write(text)

### Phase IC: Google API

#### Page scrape (Older implementation)

In [13]:
# Library requirements for this section
from googlesearch import search

In [14]:
# Search for the citation text of one Wikipedia reference, asking for ten results
results = search(
    'A Guide To Different Types Of Coffee Beans, Roasts & Drinks". '
    '13 August 2021. Retrieved 16 January 2023',
    num_results = 10,
)

In [15]:
# Show each search hit on its own line
for resultURL in results:
    print(resultURL)

https://www.instacart.com/company/ideas/types-of-coffee/
https://www.allrecipes.com/article/types-of-coffee/
https://library.sweetmarias.com/using-sight-to-determine-degree-of-roast/
https://www.thepioneerwoman.com/food-cooking/a35153729/types-of-coffee/
https://www.instructables.com/Choosing-Coffee-Beans/
https://library.sweetmarias.com/how-to-roast-your-own-coffee/
http://scottjanish.com/guide-to-coffee-beers/
https://www.7-eleven.com/blog/food-and-drink/taste-the-difference-the-different-types-of-coffee-beans-and-roasts


#### Google Custom Search API (Current implementation)

In [16]:
# Library requirements for this section
import requests

In [17]:
def googleSearch(query, APIkey, CSEid, numResults = 10, timeout = 10):
    """Run a Google Custom Search query and return the result links.

    Parameters:
        query:      search terms, sent as the 'q' parameter.
        APIkey:     API key, sent as the 'key' parameter.
        CSEid:      search engine ID, sent as the 'cx' parameter.
        numResults: number of results to request, sent as 'num'.
        timeout:    seconds to wait for the server before giving up.

    Returns:
        List of the 'link' value of each entry under 'items' in the JSON
        response; an empty list when the response has no 'items'.

    Raises:
        requests.exceptions.RequestException: on connection problems, on a
        timeout, or when the server answers with an HTTP error status.
    """
    url = "https://www.googleapis.com/customsearch/v1"                # API endpoint
    params = {
        'key': APIkey,
        'cx': CSEid,
        'q': query,
        'num': numResults
    }
    response = requests.get(url, params = params, timeout = timeout)  # Make request to endpoint
    # Surface HTTP errors instead of silently returning an empty result list
    response.raise_for_status()
    searchResults = response.json()
    links = [item['link'] for item in searchResults.get('items', [])] # Get links of results
    return links

In [19]:
from getpass import getpass

# Read the key without echoing it, so the secret is not stored in the
# notebook's saved output
APIkey = getpass("Enter API Key: ")
CSEid = input("Enter CSE ID: ")
query = "Coffee"
links = googleSearch(query, APIkey, CSEid)

print('Query:', query, '\n')
for link in links:
    print(link)

Query: Coffee 

https://en.wikipedia.org/wiki/Coffee
https://www.starbucks.com/
https://www.ncausa.org/About-Coffee/What-is-Coffee
https://coffeebean.com/
https://www.reddit.com/r/Coffee/
https://www.medicalnewstoday.com/articles/270202
https://philzcoffee.com/
https://www.peets.com/
https://www.healthline.com/nutrition/top-evidence-based-health-benefits-of-coffee
https://blanchardscoffee.com/


### Phase II

In [20]:
import time

In [21]:
# Use a forward slash in the path: a backslash here forms an invalid escape
# sequence and only resolves as a path separator on Windows.
with open('outputs/phase1A.txt', 'r') as file:
    urls = [line.strip() for line in file if line.strip()] # One URL per line; ignore blank lines

for url in urls:
    print(url)
    try:
        # stream = True returns as soon as the headers arrive, so the status
        # code is available without downloading the whole body; the with-block
        # then releases the connection.
        with requests.get(url, headers = headers, timeout = 5, stream = True) as linkResponse:
            print(linkResponse.status_code)
    except requests.exceptions.RequestException as e:
        print('An error occurred:', e)
    time.sleep(1) # Pause between requests
    print('')



https://books.google.com/books?id=4O_RAAAAMAAJ
200

https://books.google.com/books?id=GSw7AAAAIAAJ
200

https://books.google.com/books?id=YvU1XnUVxFQC
200

https://www.grandviewresearch.com/industry-analysis/hot-drinks-market
200

https://fluentincoffee.com/types-of-coffee/
200

https://www.zippia.com/advice/coffee-industry-statistics/
200

https://archive.org/stream/oed02arch#page/588/mode/2up
200

https://www.jstor.org/stable/602112
403

https://doi.org/10.2307%2F602112
403

https://www.worldcat.org/issn/0003-0279
200

https://www.etymonline.com/word/coffee
200

https://www.etymonline.com/index.php?term=coffee
200

https://web.archive.org/web/20151007110923/http://www.etymonline.com/index.php?term=coffee
200

https://archive.org/details/bub_gb_XqtX_0BdDbwC
200

https://archive.org/details/allabouttea00uker
200

https://archive.org/details/allabouttea00uker/page/9


KeyboardInterrupt: 

In [22]:
# Lookup table from HTTP status code to its reason phrase, grouped by class
statusCodes = {
    # 1xx: informational
    100: "Continue", 101: "Switching Protocols", 102: "Processing",

    # 2xx: success
    200: "OK", 201: "Created", 202: "Accepted",
    203: "Non-Authoritative Information",
    204: "No Content", 205: "Reset Content", 206: "Partial Content",
    207: "Multi-Status", 208: "Already Reported", 226: "IM Used",

    # 3xx: redirection
    300: "Multiple Choices", 301: "Moved Permanently", 302: "Found",
    303: "See Other", 304: "Not Modified", 305: "Use Proxy",
    307: "Temporary Redirect", 308: "Permanent Redirect",

    # 4xx: client errors
    400: "Bad Request", 401: "Unauthorized", 402: "Payment Required",
    403: "Forbidden", 404: "Not Found", 405: "Method Not Allowed",
    406: "Not Acceptable", 407: "Proxy Authentication Required",
    408: "Request Timeout", 409: "Conflict", 410: "Gone",
    411: "Length Required", 412: "Precondition Failed",
    413: "Payload Too Large", 414: "URI Too Long",
    415: "Unsupported Media Type", 416: "Range Not Satisfiable",
    417: "Expectation Failed", 418: "I'm a teapot",
    421: "Misdirected Request", 422: "Unprocessable Entity",
    423: "Locked", 424: "Failed Dependency", 425: "Too Early",
    426: "Upgrade Required", 428: "Precondition Required",
    429: "Too Many Requests", 431: "Request Header Fields Too Large",
    451: "Unavailable For Legal Reasons",

    # 5xx: server errors
    500: "Internal Server Error", 501: "Not Implemented",
    502: "Bad Gateway", 503: "Service Unavailable",
    504: "Gateway Timeout", 505: "HTTP Version Not Supported",
    506: "Variant Also Negotiates", 507: "Insufficient Storage",
    508: "Loop Detected", 510: "Not Extended",
    511: "Network Authentication Required",
}