### Phase IA: Wikipedia API

#### API (Older implementation)

In [1]:
# Library requirements for this section
import requests

In [2]:
# Wikipedia Action API endpoint
url = "https://en.wikipedia.org/w/api.php"

# Query-string arguments for a `parse` request on one page
params = dict(
    action = "parse",
    page = "Cat",              # Page name
    prop = "externallinks",    # Get external links from the page
    format = "json",           # Output to JSON format
)

# Identify this client to the API through the User-Agent header
headers = {
    "User-Agent": "DASC 690 (mattguilloty@gmail.com)",
}

In [3]:
# Make request for external links.
# requests applies no timeout by default, so one is set to keep the cell from
# hanging indefinitely; raise_for_status() stops here on a 4xx/5xx answer
# instead of trying to decode an error page as JSON.
apiResponse = requests.get(url, headers = headers, params = params, timeout = 10)
apiResponse.raise_for_status()
content = apiResponse.json()

In [4]:
# Write one external link per line.
# The context manager closes the file even if a write raises, and UTF-8 is
# explicit so links with non-ASCII characters do not depend on the platform's
# default encoding.
with open('outputs/phase1A.txt', 'w', encoding = 'utf-8') as f:
    for link in content['parse']['externallinks']:
        f.write(link + '\n')

#### Page scrape (Current implementation)

In [113]:
# Library requirements for this section
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [107]:
pageName = 'Cat'

url = 'https://en.wikipedia.org/wiki/' + pageName               # URL for Wikipedia page

headers = { "User-Agent": "DASC 690 (mattguilloty@gmail.com)" } # Set User-Agent

# Make request for page. requests applies no timeout by default, so one is set
# to keep the cell from hanging on a stalled connection.
response = requests.get(url, headers = headers, timeout = 10)

# Fail here on a 4xx/5xx answer rather than scraping an error page and
# silently producing an empty reference list.
response.raise_for_status()

In [108]:
# Parse the downloaded page with Python's built-in HTML parser
soup = BeautifulSoup(markup = response.text, features = 'html.parser')

def extractReferences(soup):
    """
    Collect the text and the hyperlinks of every reference on a parsed page.

    Parameters:
        soup: parsed document; must support find_all('span', class_ = ...).

    Returns:
        (texts, links): two parallel lists with one entry per
        <span class="reference-text"> element. texts[i] is the span's text;
        links[i] is the list of href values of the <a> tags inside that span.
        Anchors with a missing or empty href are left out, so every entry in
        links[i] is a non-empty value.
    """
    spans = soup.find_all('span', class_ = 'reference-text')  # Find all of the instances of the References container
    texts = [span.get_text() for span in spans]               # Get text for each reference

    # .get('href') returns None for an <a> without an href; dropping those
    # keeps later substring checks on the links from raising TypeError.
    links = [
        [href for href in (anchor.get('href') for anchor in span.find_all('a')) if href]
        for span in spans
    ]
    return texts, links

# Unpack the parallel lists: reference texts and, per reference, its hyperlinks
(texts, links) = extractReferences(soup)

In [124]:
# One row per reference, one column per link position
df = pd.DataFrame(data = links)

# Place the reference text in front of the link columns
df.insert(loc = 0, column = 'text', value = texts)

# Save for the later phases (the row index is written as well, as the first column)
df.to_csv(path_or_buf = 'outputs/phase1A.csv')

In [101]:
# Write every reference link that contains 'http', one per line.
# - `with` closes the file even if a write raises.
# - An <a> without an href comes back as None from .get('href'); the
#   truthiness check skips it instead of raising TypeError on `in`.
# - UTF-8 is explicit so non-ASCII URLs do not depend on the platform default.
with open('outputs/phase1A.txt', 'w', encoding = 'utf-8') as f:
    for link in links:
        for sublink in link:
            if sublink and 'http' in sublink:
                f.write(sublink + '\n')

### Phase IB: Internet Archive API

In [13]:
# Library requirements for this section
import requests
from bs4 import BeautifulSoup

In [30]:
from urllib.parse import quote

url = 'https://www.youtube.com/'

# Defined here as well so this section runs on a fresh kernel without
# depending on the Phase IA cells.
headers = { "User-Agent": "DASC 690 (mattguilloty@gmail.com)" } # Set User-Agent

# Percent-encode the target so its own '?', '&' and '#' characters are not
# read as part of the API request's query string.
apiURL = 'http://archive.org/wayback/available?url=' + quote(url, safe = '') # API endpoint

# Make request for archived site (timeout keeps the cell from hanging)
content = requests.get(apiURL, headers = headers, timeout = 10).json()

In [31]:
content # Display the decoded JSON response from the availability request

{'url': 'https://www.youtube.com/',
 'archived_snapshots': {'closest': {'status': '200',
   'available': True,
   'url': 'http://web.archive.org/web/20240626121548/https://www.youtube.com/',
   'timestamp': '20240626121548'}}}

In [33]:
# Download the closest archived snapshot, if there is one, and save its HTML.
# .get() tolerates a missing or empty 'archived_snapshots' entry.
closest = content.get('archived_snapshots', {}).get('closest')       # Closest/latest snapshot, or None

if closest and closest.get('available'):
    closestURL = closest['url']                                       # Get the URL for the last snapshot
    # The page goes into its own variable: rebinding `content` to the response
    # would make this cell fail if it were run a second time.
    snapshot = requests.get(closestURL, headers = headers, timeout = 30)
    with open('outputs/phase1B.html', 'w', encoding = 'utf-8') as f:  # Closed even if the write raises
        f.write(snapshot.text)
else:
    # Also reached when a snapshot entry exists but is flagged unavailable
    print('No archives available')

In [35]:
# Load the saved snapshot
with open('outputs/phase1B.html', 'r', encoding = 'utf-8') as file:
    htmlContent = file.read()

soup = BeautifulSoup(htmlContent, 'html.parser')

# Extract the document's text, then remove newlines (lines are joined with no separator)
text = soup.get_text()

text = text.replace('\n', '')

# Context manager so the output file is closed even if the write raises
with open('outputs/phase1B.txt', 'w', encoding = 'utf-8') as f:
    f.write(text)

### Phase IC: Google API

#### Page scrape (Older implementation)

In [13]:
# Library requirements for this section
from googlesearch import search

In [14]:
# A reference's text, used verbatim as the search query
searchQuery = 'A Guide To Different Types Of Coffee Beans, Roasts & Drinks". 13 August 2021. Retrieved 16 January 2023'

# Request up to 10 results for that query
results = search(searchQuery, num_results = 10)

In [15]:
# Show each search result on its own line
for resultURL in results:
    print(resultURL)

https://www.instacart.com/company/ideas/types-of-coffee/
https://www.allrecipes.com/article/types-of-coffee/
https://library.sweetmarias.com/using-sight-to-determine-degree-of-roast/
https://www.thepioneerwoman.com/food-cooking/a35153729/types-of-coffee/
https://www.instructables.com/Choosing-Coffee-Beans/
https://library.sweetmarias.com/how-to-roast-your-own-coffee/
http://scottjanish.com/guide-to-coffee-beers/
https://www.7-eleven.com/blog/food-and-drink/taste-the-difference-the-different-types-of-coffee-beans-and-roasts


#### Google Custom Search API (Current implementation)

In [36]:
# Library requirements for this section
import requests

In [37]:
def googleSearch(query, APIkey, CSEid, numResults = 10):
    """
    Query the Google Custom Search JSON API and return the result links.

    Parameters:
        query: search terms, sent as the 'q' parameter.
        APIkey: API key, sent as the 'key' parameter.
        CSEid: search engine ID, sent as the 'cx' parameter.
        numResults: number of results requested, sent as the 'num' parameter.

    Returns:
        List with the 'link' value of every entry in the response's 'items';
        an empty list when the response has no 'items'.

    Raises:
        requests.exceptions.HTTPError: the API answered with a 4xx/5xx status.
            Without this check an error response had no 'items' and looked
            exactly like a search with zero results.
        requests.exceptions.RequestException: the request failed or timed out.
    """
    url = "https://www.googleapis.com/customsearch/v1"                # API endpoint
    params = {
        'key': APIkey,
        'cx': CSEid,
        'q': query,
        'num': numResults
    }
    # requests applies no timeout by default; bound the wait explicitly
    response = requests.get(url, params = params, timeout = 10)       # Make request to endpoint
    response.raise_for_status()                                       # Surface API errors instead of returning []
    searchResults = response.json()
    links = [item['link'] for item in searchResults.get('items', [])] # Get links of results
    return links

In [38]:
from getpass import getpass

# getpass() reads the key without echoing it, so the secret does not end up in
# the saved cell output the way text typed into input() does.
APIkey = getpass("Enter API Key: ")
CSEid = input("Enter CSE ID: ")
query = "Cat"
links = googleSearch(query, APIkey, CSEid)

print('Query:', query, '\n')
for link in links:  # The enumerate() index was never used
    print(link)

Query: Cat 

https://www.cat.com/global-selector.html
https://en.wikipedia.org/wiki/Cat
https://www.cat.com/en_US.html
https://www.caterpillar.com/
https://www.catphones.com/
http://www.catfootwear.com/en/home
https://www.ohchr.org/en/treaty-bodies/cat
https://www.facebook.com/YusufCatStevens/
https://www.narescue.com/combat-application-tourniquet-c-a-t.html
https://www.vetstreet.com/cats


### Phase II: Reference link status checks

In [2]:
import time
import requests
import pandas as pd

In [3]:
# Lookup table from numeric HTTP status code to its reason phrase; used to
# label each checked URL in the status log.
# NOTE(review): a server can answer with a code that is not listed here, so
# lookups should tolerate a missing key.
statusCodes = {
    # 1xx
    100: "Continue",
    101: "Switching Protocols",
    102: "Processing",
    # 2xx
    200: "OK",
    201: "Created",
    202: "Accepted",
    203: "Non-Authoritative Information",
    204: "No Content",
    205: "Reset Content",
    206: "Partial Content",
    207: "Multi-Status",
    208: "Already Reported",
    226: "IM Used",
    # 3xx
    300: "Multiple Choices",
    301: "Moved Permanently",
    302: "Found",
    303: "See Other",
    304: "Not Modified",
    305: "Use Proxy",
    307: "Temporary Redirect",
    308: "Permanent Redirect",
    # 4xx
    400: "Bad Request",
    401: "Unauthorized",
    402: "Payment Required",
    403: "Forbidden",
    404: "Not Found",
    405: "Method Not Allowed",
    406: "Not Acceptable",
    407: "Proxy Authentication Required",
    408: "Request Timeout",
    409: "Conflict",
    410: "Gone",
    411: "Length Required",
    412: "Precondition Failed",
    413: "Payload Too Large",
    414: "URI Too Long",
    415: "Unsupported Media Type",
    416: "Range Not Satisfiable",
    417: "Expectation Failed",
    418: "I'm a teapot",
    421: "Misdirected Request",
    422: "Unprocessable Entity",
    423: "Locked",
    424: "Failed Dependency",
    425: "Too Early",
    426: "Upgrade Required",
    428: "Precondition Required",
    429: "Too Many Requests",
    431: "Request Header Fields Too Large",
    451: "Unavailable For Legal Reasons",
    # 5xx
    500: "Internal Server Error",
    501: "Not Implemented",
    502: "Bad Gateway",
    503: "Service Unavailable",
    504: "Gateway Timeout",
    505: "HTTP Version Not Supported",
    506: "Variant Also Negotiates",
    507: "Insufficient Storage",
    508: "Loop Detected",
    510: "Not Extended",
    511: "Network Authentication Required"
}

In [4]:
# Reload the reference table saved by Phase IA
df = pd.read_csv(filepath_or_buffer = 'outputs/phase1A.csv')

In [5]:
referenceText = []
urls = []

# As saved by Phase IA, column 0 is the row index and column 1 is the reference
# text, so only columns 2 onward hold links. Scanning the text column as well
# would add the entire reference text as a "URL" whenever that text happens to
# mention 'http'.
for index, row in df.iterrows():
    for value in row.iloc[2:]:
        if 'http' in str(value):               # str() also copes with empty (NaN) cells
            referenceText.append(row.iloc[1])  # Text of the reference this link belongs to
            urls.append(value)

In [6]:
# User-Agent sent with every status check
headers = {
    "User-Agent": "DASC 690 (mattguilloty@gmail.com)",
}

def get_status_code(url):
    """
    Fetch `url` with a GET request and report the HTTP status code.

    Returns the integer status code of the response, or the string 'Error'
    when the request raises a requests RequestException. The request uses
    the module-level `headers` and a 4-second timeout.
    """
    try:
        statusCode = requests.get(url, headers = headers, timeout = 4).status_code
    except requests.exceptions.RequestException:
        return 'Error'
    return statusCode
    
statuses = []

# Check every URL and log the result to the console and to outputs/phase2.txt.
# The log is opened once in append mode; `with` closes it even if the run is
# interrupted part-way through.
with open('outputs/phase2.txt', 'a', encoding = 'utf-8') as f:
    for url in urls:
        status = get_status_code(url)
        if status != 'Error':
            # .get() avoids a KeyError when a server answers with a code that
            # is not listed in statusCodes.
            line = 'URL:' + url + ' Status Code:' + str(status) + ' - ' + statusCodes.get(status, 'Unknown') + '\n'
        else:
            # Same layout as the success line (the old error line ran the URL
            # straight into the 'Status Code' label).
            line = 'URL:' + url + ' Status Code:Error' + '\n'

        f.write(line)
        f.flush()          # Keep the log current during a long run
        print(line)

        statuses.append(status)

        time.sleep(1)      # Pause between requests; this was outside the loop and ran only once

URL:https://archive.org/details/mobot31753000798865/page/42 Status Code:200 - OK

URL: http://www.departments.bucknell.edu/biology/resources/msw3/browse.asp?id=14000031Status Code:Error

URL:http://www.google.com/books?id=JgAMbNSt8ikC&pg=PA534–535 Status Code:200 - OK

URL:https://www.worldcat.org/oclc/62265494 Status Code:200 - OK

URL:https://archive.org/details/iochristpolycerx00erxl/page/520 Status Code:200 - OK

URL:https://archive.org/details/englishwordsthei00mckn/page/300 Status Code:200 - OK

URL:https://books.google.com/books?id=n1_qqgNTsX8C&pg=PA407 Status Code:200 - OK

URL:https://web.archive.org/web/20210331062414/https://books.google.com/books?id=n1_qqgNTsX8C&pg=PA407 Status Code:200 - OK

URL:http://www.oed.com/view/Entry/155147#eid27609702 Status Code:200 - OK

URL:https://web.archive.org/web/20150903215025/http://www.oed.com/view/Entry/155147#eid27609702 Status Code:200 - OK

URL:http://www.oed.com/view/Entry/203100#eid18281825 Status Code:200 - OK

URL:http://www.oed

KeyboardInterrupt: 