### Phase IA: Wikipedia Page Scrape

In [93]:
# Library requirements for this section
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [104]:
pageName = 'Boot'  # Title of the Wikipedia article whose references are scraped

url = 'https://en.wikipedia.org/wiki/' + pageName  # URL for Wikipedia page

# Identify this client to the server; later cells reuse this dict for their own requests.
headers = { "User-Agent": "DASC 690 (mattguilloty@gmail.com)" }

# A timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops here on an HTTP error instead of letting the
# next cell parse an error page as if it were the article.
response = requests.get(url, headers = headers, timeout = 10)
response.raise_for_status()

In [105]:
# Build the parse tree for the downloaded page using Python's built-in HTML parser
soup = BeautifulSoup(response.text, features = 'html.parser')

def extractReferences(soup):
    """Collect the text and the link targets of every reference on a parsed page.

    Parameters:
        soup: parsed document exposing find_all(); each matched element must
              expose get_text() and find_all().

    Returns:
        (texts, links) where texts[i] is the text of the i-th
        <span class="reference-text"> element and links[i] is the list of
        'href' values of the <a> tags inside it (None for a tag without href).
    """
    texts = []
    links = []
    for refSpan in soup.find_all('span', class_ = 'reference-text'):
        texts.append(refSpan.get_text())
        # Keep one list of hrefs per reference so both results stay aligned by index
        links.append([anchor.get('href') for anchor in refSpan.find_all('a')])
    return texts, links

# texts[i] is the text of reference i and links[i] is the list of hrefs found inside it
texts, links = extractReferences(soup)

In [106]:
import os  # imported in this cell because it is only needed to prepare the output folder

# One row per reference and one column per link position; references with
# fewer links than the longest one are padded with missing values.
df = pd.DataFrame(links)

# Put the reference text in front of its links
df.insert(0, 'reference', texts)

# to_csv does not create missing folders, so make sure 'outputs' exists first
os.makedirs('outputs', exist_ok = True)
df.to_csv('outputs/phase1A.csv', index = False)

#### **phase1A.csv:**

In [8]:
# Read the saved CSV back and show its first rows
pd.read_csv('outputs/phase1A.csv').head()

Unnamed: 0,reference,0,1,2,3
0,Fiona McDonald (30 July 2006). Shoes and Boots...,https://books.google.com/books?id=WCyp2q7nQAkC,/wiki/ISBN_(identifier),/wiki/Special:BookSources/978-0-8368-6857-9,
1,"""Making Sure Your Work Boots Make the Grade"". ...",https://drewsboots.com/content/making-sure-you...,,,
2,"Collection, Thomas George. ""What are the most ...",https://tgcollection.com.au/blogs/tgc-blog/mos...,,,
3,Margo DeMello (1 September 2009). Feet and foo...,https://books.google.com/books?id=5QdKSxajwP0C...,/wiki/ISBN_(identifier),/wiki/Special:BookSources/978-0-313-35714-5,https://web.archive.org/web/20130602195904/htt...
4,"Alex Henderson (Jan 8, 2014). ""Kinky Boots: An...",https://www.xbiz.com/features/173135/kinky-boo...,,,


### Phase IB: Internet Archive API

In [13]:
# Library requirements for this section
import requests
from bs4 import BeautifulSoup

In [30]:
# Site to look up in the Internet Archive
url = 'https://www.youtube.com/'

# Availability endpoint with the target site passed as the `url` query value
apiURL = 'http://archive.org/wayback/available?url={}'.format(url)

# Query the endpoint and decode the JSON body of the reply
content = requests.get(apiURL, headers = headers).json()

In [31]:
# Show the decoded JSON returned by the availability request
content

{'url': 'https://www.youtube.com/',
 'archived_snapshots': {'closest': {'status': '200',
   'available': True,
   'url': 'http://web.archive.org/web/20240626121548/https://www.youtube.com/',
   'timestamp': '20240626121548'}}}

In [33]:
# Pull the closest snapshot entry out of the API reply without assuming the
# keys exist; an empty or missing 'archived_snapshots' yields closest = None.
snapshots = content.get('archived_snapshots') or {}
closest = snapshots.get('closest')

if closest and closest.get('available'):
    closestURL = closest['url']                                      # URL of the closest snapshot
    # Keep the page in its own variable: rebinding `content` to the response
    # would make this cell fail when it is run a second time.
    snapshotResponse = requests.get(closestURL, headers = headers)   # Make request
    # The context manager closes the file even if the write raises
    with open('outputs/phase1B.html', 'w', encoding='utf-8') as f:
        f.write(snapshotResponse.text)
else:
    # Reached both when no snapshot is listed and when the listed one is not available
    print('No archives available')

In [35]:
# Load the snapshot HTML that was saved to disk
with open('outputs/phase1B.html', 'r', encoding = 'utf-8') as file:
    htmlContent = file.read()

soup = BeautifulSoup(htmlContent, 'html.parser')

# Text content of the parsed page, with the markup removed
text = soup.get_text()

# Remove every newline so the text is stored as one continuous line
text = text.replace('\n', '')

# Same context-manager pattern as the read above: the file is closed even if the write fails
with open('outputs/phase1B.txt', 'w', encoding = 'utf-8') as f:
    f.write(text)

### Phase IC: Google Custom Search API

In [36]:
# Library requirements for this section
import requests

In [37]:
def googleSearch(query, APIkey, CSEid, numResults = 10):
    """Run a query against the Google Custom Search JSON API and return result links.

    Parameters:
        query:      search terms, sent as the 'q' parameter.
        APIkey:     API key, sent as the 'key' parameter.
        CSEid:      search engine ID, sent as the 'cx' parameter.
        numResults: number of results requested, sent as the 'num' parameter.

    Returns:
        List with the 'link' value of each entry under 'items' in the JSON
        reply; an empty list when the reply has no 'items' key.
    """
    endpoint = "https://www.googleapis.com/customsearch/v1"
    payload = requests.get(
        endpoint,
        params = {'key': APIkey, 'cx': CSEid, 'q': query, 'num': numResults},
    ).json()

    found = []
    for entry in payload.get('items', []):
        found.append(entry['link'])
    return found

In [38]:
from getpass import getpass  # standard library; imported here because only this cell needs it

# getpass hides what is typed, so the key is not echoed into the notebook's saved output
APIkey = getpass("Enter API Key: ")
CSEid = input("Enter CSE ID: ")
query = "Cat"
links = googleSearch(query, APIkey, CSEid)

# Print the query followed by one result link per line
print('Query:', query, '\n')
for link in links:
    print(link)

Query: Cat 

https://www.cat.com/global-selector.html
https://en.wikipedia.org/wiki/Cat
https://www.cat.com/en_US.html
https://www.caterpillar.com/
https://www.catphones.com/
http://www.catfootwear.com/en/home
https://www.ohchr.org/en/treaty-bodies/cat
https://www.facebook.com/YusufCatStevens/
https://www.narescue.com/combat-application-tourniquet-c-a-t.html
https://www.vetstreet.com/cats


### Phase II: Reference URL Status Check

In [2]:
import time
import requests
import pandas as pd
import time

In [3]:
# Lookup table from HTTP status code to a short description.
# The extra 'Error' key maps the string 'Error' to itself so it can be looked up like a code.
statusCodes = {
    # 1xx: informational
    100: "Continue", 101: "Switching Protocols", 102: "Processing",
    # 2xx: success
    200: "OK", 201: "Created", 202: "Accepted", 203: "Non-Authoritative Information",
    204: "No Content", 205: "Reset Content", 206: "Partial Content", 207: "Multi-Status",
    208: "Already Reported", 226: "IM Used",
    # 3xx: redirection
    300: "Multiple Choices", 301: "Moved Permanently", 302: "Found", 303: "See Other",
    304: "Not Modified", 305: "Use Proxy", 307: "Temporary Redirect", 308: "Permanent Redirect",
    # 4xx: client errors
    400: "Bad Request", 401: "Unauthorized", 402: "Payment Required", 403: "Forbidden",
    404: "Not Found", 405: "Method Not Allowed", 406: "Not Acceptable",
    407: "Proxy Authentication Required", 408: "Request Timeout", 409: "Conflict",
    410: "Gone", 411: "Length Required", 412: "Precondition Failed", 413: "Payload Too Large",
    414: "URI Too Long", 415: "Unsupported Media Type", 416: "Range Not Satisfiable",
    417: "Expectation Failed", 418: "I'm a teapot", 421: "Misdirected Request",
    422: "Unprocessable Entity", 423: "Locked", 424: "Failed Dependency", 425: "Too Early",
    426: "Upgrade Required", 428: "Precondition Required", 429: "Too Many Requests",
    431: "Request Header Fields Too Large", 451: "Unavailable For Legal Reasons",
    # 5xx: server errors
    500: "Internal Server Error", 501: "Not Implemented", 502: "Bad Gateway",
    503: "Service Unavailable", 504: "Gateway Timeout", 505: "HTTP Version Not Supported",
    506: "Variant Also Negotiates", 507: "Insufficient Storage", 508: "Loop Detected",
    510: "Not Extended", 511: "Network Authentication Required",
    # Non-numeric entry
    'Error': 'Error'
}

In [4]:
# Load the reference table written to outputs/phase1A.csv and preview its first rows
df = pd.read_csv('outputs/phase1A.csv')
df.head()

Unnamed: 0,reference,0,1,2,3
0,Fiona McDonald (30 July 2006). Shoes and Boots...,https://books.google.com/books?id=WCyp2q7nQAkC,/wiki/ISBN_(identifier),/wiki/Special:BookSources/978-0-8368-6857-9,
1,"""Making Sure Your Work Boots Make the Grade"". ...",https://drewsboots.com/content/making-sure-you...,,,
2,"Collection, Thomas George. ""What are the most ...",https://tgcollection.com.au/blogs/tgc-blog/mos...,,,
3,Margo DeMello (1 September 2009). Feet and foo...,https://books.google.com/books?id=5QdKSxajwP0C...,/wiki/ISBN_(identifier),/wiki/Special:BookSources/978-0-313-35714-5,https://web.archive.org/web/20130602195904/htt...
4,"Alex Henderson (Jan 8, 2014). ""Kinky Boots: An...",https://www.xbiz.com/features/173135/kinky-boo...,,,


In [5]:
# Parallel lists: referenceText[k] is the reference that urls[k] was found in
referenceText = []
urls = []

for _, row in df.iterrows():
    reference = row.iloc[0]  # first column holds the reference text
    # Scan only the link columns. Including column 0 would record the
    # reference text itself as a URL whenever the citation mentions "http".
    for value in row.iloc[1:]:
        # Empty cells are not strings; relative links such as '/wiki/...' are
        # skipped because they cannot be requested on their own.
        if isinstance(value, str) and value.startswith(('http://', 'https://')):
            referenceText.append(reference)
            urls.append(value)

In [6]:
# Browser-style User-Agent sent with every status check below; get_status_code reads this module-level name
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

def get_status_code(url):
    """Request `url` and report the HTTP status code of the reply.

    The request is sent with the module-level `headers` and a 5 second timeout.

    Returns:
        The status code of the response, or the string 'Error' when the
        request raises requests.exceptions.RequestException.
    """
    try:
        reply = requests.get(url, timeout = 5, headers = headers)
    except requests.exceptions.RequestException:
        return 'Error'
    return reply.status_code
    
# Parallel to `urls`: the status of each URL and its human-readable description
statuses = []
description = []

for url in urls:
    status = get_status_code(url)
    # Fall back to 'Unknown' for codes that are not in the statusCodes table;
    # indexing the dict directly would raise KeyError and abort the whole run.
    label = statusCodes.get(status, 'Unknown')

    print(f'{url} - {status} : {label}')

    statuses.append(status)
    description.append(label)

    time.sleep(1)  # wait one second between requests


# One row per checked URL, alongside the reference it came from
df = pd.DataFrame({
    'reference': referenceText,
    'URL': urls,
    'status': statuses,
    'description': description
})

df.to_csv('outputs/phase2.csv', index = False)

https://books.google.com/books?id=WCyp2q7nQAkC - 200 : OK
https://drewsboots.com/content/making-sure-your-work-boots-make-grade - 200 : OK
https://tgcollection.com.au/blogs/tgc-blog/most-comfortable-mens-boots - 200 : OK
https://books.google.com/books?id=5QdKSxajwP0C&pg=PA65 - 200 : OK
https://web.archive.org/web/20130602195904/http://books.google.com/books?id=5QdKSxajwP0C&pg=PA65 - 200 : OK
https://www.xbiz.com/features/173135/kinky-boots-an-enduring-symbol-in-fetish-fashion - 200 : OK
https://www.westernbootbarn.com.au/blog/the-history-of-cowboy-boots/ - 200 : OK
https://buffalojackson.com/blogs/journal/the-history-of-cowboy-boots - 200 : OK
https://www.thezoereport.com/fashion/cowboy-boot-trend - 200 : OK
https://www.vogue.co.uk/fashion/article/cowboy-boots-trend - 200 : OK
https://www.hellomagazine.com/shopping/20220430139158/best-cowboy-boots-to-shop-now/ - 404 : Not Found
http://www.macmillandictionary.com/thesaurus/british/tough#as-tough-as-old-boots_1 - 200 : OK
https://web.arc

#### **phase2.csv:**

In [7]:
# Read the saved status table back and display it
pd.read_csv('outputs/phase2.csv')

Unnamed: 0,reference,URL,status,description
0,Fiona McDonald (30 July 2006). Shoes and Boots...,https://books.google.com/books?id=WCyp2q7nQAkC,200,OK
1,"""Making Sure Your Work Boots Make the Grade"". ...",https://drewsboots.com/content/making-sure-you...,200,OK
2,"Collection, Thomas George. ""What are the most ...",https://tgcollection.com.au/blogs/tgc-blog/mos...,200,OK
3,Margo DeMello (1 September 2009). Feet and foo...,https://books.google.com/books?id=5QdKSxajwP0C...,200,OK
4,Margo DeMello (1 September 2009). Feet and foo...,https://web.archive.org/web/20130602195904/htt...,200,OK
5,"Alex Henderson (Jan 8, 2014). ""Kinky Boots: An...",https://www.xbiz.com/features/173135/kinky-boo...,200,OK
6,"""The History Of Cowboy Boots"". Western Boot Ba...",https://www.westernbootbarn.com.au/blog/the-hi...,200,OK
7,"Co, Buffalo Jackson Trading. ""The History Of C...",https://buffalojackson.com/blogs/journal/the-h...,200,OK
8,"""Hand-Painted Floral Cowboy Boots? Yes, Please...",https://www.thezoereport.com/fashion/cowboy-bo...,200,OK
9,"""Cowboy Boots Will Be In Every Festival Field ...",https://www.vogue.co.uk/fashion/article/cowboy...,200,OK
