### Phase IA: Wikipedia Page Scrape

In [17]:
# Library requirements for this section
import getpass
import os

import requests
from bs4 import BeautifulSoup
import pandas as pd

In [18]:
# Download the Wikipedia article named by pageName and parse it into `soup`.
pageName = 'Hat'

url = 'https://en.wikipedia.org/wiki/' + pageName               # URL for Wikipedia page

headers = { "User-Agent": "DASC 690 (mattguilloty@gmail.com)" } # Set User-Agent

response = requests.get(url, headers = headers, timeout = 10)   # Make request for page (bounded wait)
response.raise_for_status()                                     # Stop on 4xx/5xx instead of parsing an error page

soup = BeautifulSoup(response.text, 'html.parser') # Parse HTML

def extractReferences(soup):
    """Collect the text and the link targets of every reference on a parsed page.

    Parameters:
        soup: parsed document exposing find_all (the BeautifulSoup object
              built from the Wikipedia page in this notebook).

    Returns:
        (texts, links) where texts[i] is the text of the i-th
        <span class="reference-text"> element and links[i] is the list of
        href values of the <a> elements inside that same span.
    """
    texts = []
    links = []
    for referenceSpan in soup.find_all('span', class_ = 'reference-text'):
        texts.append(referenceSpan.get_text())       # Text of this reference
        hrefs = []
        for anchor in referenceSpan.find_all('a'):   # Every link inside this reference
            hrefs.append(anchor.get('href'))
        links.append(hrefs)
    return texts, links

# Build the Phase IA table (one row per reference) and save it to disk.
texts, links = extractReferences(soup)

df = pd.DataFrame(links)          # One row per reference, one integer-named column per link position

df.insert(0, 'reference', texts)  # Reference text becomes the first column

os.makedirs('outputs', exist_ok = True) # to_csv does not create a missing directory
df.to_csv('outputs/phase1A.csv', index = False)

#### **phase1A.csv:**

In [21]:
# Read the saved Phase IA file back from disk and show its first rows
pd.read_csv('outputs/phase1A.csv').head()

Unnamed: 0,reference,0,1,2,3,4,5,6,7,8,9,10,11
0,"Pauline Thomas (2007-09-08). ""The Wearing of H...",http://www.fashion-era.com/hats-hair/hats_hair...,,,,,,,,,,,
1,"""The social meanings of hats"". University of C...",http://www.press.uchicago.edu/Misc/Chicago/117...,,,,,,,,,,,
2,"""Insignia:The Way You Tell Who's Who in the Mi...",https://web.archive.org/web/20120414202044/htt...,/wiki/United_States_Department_of_Defense,http://www.defense.gov/news/newsarticle.aspx?i...,,,,,,,,,
3,"""What are Church Hats?"". Southern Living. Arch...",https://web.archive.org/web/20210514230453/htt...,/wiki/Southern_Living,https://www.southernliving.com/culture/church-...,,,,,,,,,
4,"""BBC News | SCI/TECH | World's oldest hat reve...",http://news.bbc.co.uk/2/hi/science/nature/7258...,,,,,,,,,,,


### Phase IB: Internet Archive API

In [16]:
# Library requirements for this section
import requests
from bs4 import BeautifulSoup

In [17]:
def getArchiveText(url, timeout = 10):
    """Return the text of the closest Wayback Machine snapshot of `url`.

    Parameters:
        url: address of the page to look up in the Internet Archive.
        timeout: seconds to wait for each HTTP request before requests
                 raises a timeout error (default 10).

    Returns:
        The snapshot's text with newlines replaced by spaces, or None when
        the availability API reports no available snapshot.
    """
    apiURL = 'https://archive.org/wayback/available' # API endpoint

    headers = { "User-Agent": "DASC 690 (mattguilloty@gmail.com)" } # Set User-Agent

    # Send the target as a query parameter so requests percent-encodes it;
    # pasted into the URL by hand, a target containing '?', '&' or '#' would
    # be split into extra parameters and the lookup would use a truncated URL
    content = requests.get(apiURL, headers = headers, params = { 'url': url }, timeout = timeout).json()

    closest = (content.get('archived_snapshots') or {}).get('closest') # Closest snapshot, if any
    if not closest or not closest.get('available'):
        return None

    snapshot = requests.get(closest['url'], headers = headers, timeout = timeout) # Make request for the snapshot
    soup = BeautifulSoup(snapshot.text, 'html.parser')

    # Replace newlines with spaces rather than deleting them, so the last word
    # of one line is not fused with the first word of the next
    return soup.get_text().replace('\n', ' ')

### Phase IC: Google Custom Search API

In [53]:
# Library requirements for this section
import requests

In [54]:
def googleSearch(query, APIkey, CSEid, numResults = 10, timeout = 10):
    """Run a Google Custom Search query and return the result links.

    Parameters:
        query: search string.
        APIkey: Google API key, sent as the 'key' parameter.
        CSEid: Custom Search Engine ID, sent as the 'cx' parameter.
        numResults: number of results requested, sent as 'num' (default 10).
        timeout: seconds to wait for the response before requests raises a
                 timeout error (default 10).

    Returns:
        List of the 'link' value of every item in the response; empty when
        the response contains no 'items'.

    Raises:
        requests.exceptions.HTTPError: the API answered with a 4xx/5xx
        status, so a rejected request is not mistaken for a search with
        no results.
    """
    url = "https://www.googleapis.com/customsearch/v1"                # API endpoint
    params = {
        'key': APIkey,
        'cx': CSEid,
        'q': query,
        'num': numResults
    }
    response = requests.get(url, params = params, timeout = timeout)  # Make request to endpoint
    response.raise_for_status()                                       # Surface API errors instead of returning []
    searchResults = response.json()
    links = [item['link'] for item in searchResults.get('items', [])] # Get links of results
    return links

In [56]:
# Ask for the credentials, run one example query and print the returned links.
APIkey = getpass.getpass("Enter API Key: ") # Hidden prompt keeps the key out of the saved cell output
CSEid = input("Enter CSE ID: ")
query = "Malema under fire over slur on Indians"
links = googleSearch(query, APIkey, CSEid)

print('Query:', query)
for link in links:
    print(link)

Query: Malema under fire over slur on Indians
https://www.the-star.co.ke/news/world/2018-06-18-malema-under-fire-over-racist-indians-slur/
http://www.indianexpress.com/news/sa-leader-fires-racial-slur-against-indians/862763/
https://www.youtube.com/watch?v=ZEX7CAP0pjU
https://en.wikipedia.org/wiki/Racism_in_South_Africa
https://www.reddit.com/r/afrikaans/comments/1auelp0/what_do_you_guys_think_about_black_americans/
https://en.wikipedia.org/wiki/Coolie
https://www.dailymaverick.co.za/opinionista/2015-11-15-anti-indian-statements-are-racism-of-the-worst-order/
https://www.icfj.org/sites/default/files/2022-11/ICFJ_UNESCO_The%20Chilling_2022_1.pdf
https://www.studocu.com/row/document/zhejiang-university-of-science-and-technology/asian-economy/unit6multiple-choice-questions-2001-04-085754/50970178
https://www.saflii.org/za/journals/PER/2020/12.html


### Phase II

In [14]:
import time
import requests
import pandas as pd
import time

In [15]:
# Lookup table from a status value to its description. Keys are the integer
# HTTP status codes plus the string 'Error', which get_status_code returns
# when a request raises an exception.
statusCodes = {
    # 1xx
    100: "Continue",
    101: "Switching Protocols",
    102: "Processing",
    # 2xx
    200: "OK",
    201: "Created",
    202: "Accepted",
    203: "Non-Authoritative Information",
    204: "No Content",
    205: "Reset Content",
    206: "Partial Content",
    207: "Multi-Status",
    208: "Already Reported",
    226: "IM Used",
    # 3xx
    300: "Multiple Choices",
    301: "Moved Permanently",
    302: "Found",
    303: "See Other",
    304: "Not Modified",
    305: "Use Proxy",
    307: "Temporary Redirect",
    308: "Permanent Redirect",
    # 4xx
    400: "Bad Request",
    401: "Unauthorized",
    402: "Payment Required",
    403: "Forbidden",
    404: "Not Found",
    405: "Method Not Allowed",
    406: "Not Acceptable",
    407: "Proxy Authentication Required",
    408: "Request Timeout",
    409: "Conflict",
    410: "Gone",
    411: "Length Required",
    412: "Precondition Failed",
    413: "Payload Too Large",
    414: "URI Too Long",
    415: "Unsupported Media Type",
    416: "Range Not Satisfiable",
    417: "Expectation Failed",
    418: "I'm a teapot",
    421: "Misdirected Request",
    422: "Unprocessable Entity",
    423: "Locked",
    424: "Failed Dependency",
    425: "Too Early",
    426: "Upgrade Required",
    428: "Precondition Required",
    429: "Too Many Requests",
    431: "Request Header Fields Too Large",
    439: "Application Inactive",
    451: "Unavailable For Legal Reasons",
    # 5xx
    500: "Internal Server Error",
    501: "Not Implemented",
    502: "Bad Gateway",
    503: "Service Unavailable",
    504: "Gateway Timeout",
    505: "HTTP Version Not Supported",
    506: "Variant Also Negotiates",
    507: "Insufficient Storage",
    508: "Loop Detected",
    510: "Not Extended",
    511: "Network Authentication Required",
    # Request raised an exception, so no status code was received
    'Error': 'Error',
}

In [33]:
# Load the Phase IA references table from disk and preview its first rows
df = pd.read_csv('outputs/phase1A.csv')
df.head()

Unnamed: 0,reference,0,1,2,3,4,5,6,7,8,9,10,11
0,"Pauline Thomas (2007-09-08). ""The Wearing of H...",http://www.fashion-era.com/hats-hair/hats_hair...,,,,,,,,,,,
1,"""The social meanings of hats"". University of C...",http://www.press.uchicago.edu/Misc/Chicago/117...,,,,,,,,,,,
2,"""Insignia:The Way You Tell Who's Who in the Mi...",https://web.archive.org/web/20120414202044/htt...,/wiki/United_States_Department_of_Defense,http://www.defense.gov/news/newsarticle.aspx?i...,,,,,,,,,
3,"""What are Church Hats?"". Southern Living. Arch...",https://web.archive.org/web/20210514230453/htt...,/wiki/Southern_Living,https://www.southernliving.com/culture/church-...,,,,,,,,,
4,"""BBC News | SCI/TECH | World's oldest hat reve...",http://news.bbc.co.uk/2/hi/science/nature/7258...,,,,,,,,,,,


In [34]:
# Flatten the wide references table into two parallel lists:
# one (reference text, URL) pair per absolute link.
referenceText = []
urls = []

for _, row in df.iterrows():
    reference = row.iloc[0]
    # Scan only the link columns: column 0 is the reference text itself, and a
    # citation that merely mentions a URL must not be recorded as a link
    for cell in row.iloc[1:]:
        # Empty cells are not strings; relative links such as /wiki/... are
        # skipped because they cannot be requested on their own
        if isinstance(cell, str) and cell.startswith(('http://', 'https://')):
            referenceText.append(reference)
            urls.append(cell)

In [35]:
# Browser-style User-Agent sent with every status check
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

def get_status_code(url):
    """Request `url` and report the status code of the response.

    Uses the module-level `headers` and a 5 second timeout.

    Returns:
        The response's status_code, or the string 'Error' when the request
        raises a requests.exceptions.RequestException.
    """
    try:
        return requests.get(url, headers = headers, timeout = 5).status_code
    except requests.exceptions.RequestException:
        return 'Error'
    
# Check every collected URL, record its status and description, and save the result.
statuses = []
description = []

for url in urls:
    status = get_status_code(url)

    statuses.append(status)
    # A server can answer with a code that is not listed in statusCodes;
    # use a placeholder instead of raising KeyError part-way through the run
    description.append(statusCodes.get(status, 'Unknown'))

    time.sleep(1) # Pause one second between requests


df = pd.DataFrame({
    'reference': referenceText,
    'URL': urls,
    'status': statuses,
    'description': description
})

df.to_csv('outputs/phase2.csv', index = False)

http://www.fashion-era.com/hats-hair/hats_hair_1_wearing_hats_fashion_history.htm - 200 : OK
http://www.press.uchicago.edu/Misc/Chicago/117987.html - 200 : OK
https://web.archive.org/web/20120414202044/http://www.defense.gov//News/NewsArticle.aspx?ID=42199 - 200 : OK
http://www.defense.gov/news/newsarticle.aspx?id=42199 - 200 : OK
https://web.archive.org/web/20210514230453/https://www.southernliving.com/culture/church-hats - 200 : OK
https://www.southernliving.com/culture/church-hats - 404 : Not Found
http://news.bbc.co.uk/2/hi/science/nature/725803.stm - 200 : OK
https://www.theguardian.com/science/2016/aug/18/it-becometh-the-iceman-otzi-clothing-study-reveals-stylish-secrets-of-leather-loving-ancient - 200 : OK
https://web.archive.org/web/20160830164637/https://www.theguardian.com/science/2016/aug/18/it-becometh-the-iceman-otzi-clothing-study-reveals-stylish-secrets-of-leather-loving-ancient - 200 : OK
https://web.archive.org/web/20160819105927/http://news.nationalgeographic.com/2016

#### **phase2.csv:**

In [36]:
# Read the saved Phase II file back from disk and display it
pd.read_csv('outputs/phase2.csv')

Unnamed: 0,reference,URL,status,description
0,"Pauline Thomas (2007-09-08). ""The Wearing of H...",http://www.fashion-era.com/hats-hair/hats_hair...,200,OK
1,"""The social meanings of hats"". University of C...",http://www.press.uchicago.edu/Misc/Chicago/117...,200,OK
2,"""Insignia:The Way You Tell Who's Who in the Mi...",https://web.archive.org/web/20120414202044/htt...,200,OK
3,"""Insignia:The Way You Tell Who's Who in the Mi...",http://www.defense.gov/news/newsarticle.aspx?i...,200,OK
4,"""What are Church Hats?"". Southern Living. Arch...",https://web.archive.org/web/20210514230453/htt...,200,OK
5,"""What are Church Hats?"". Southern Living. Arch...",https://www.southernliving.com/culture/church-...,404,Not Found
6,"""BBC News | SCI/TECH | World's oldest hat reve...",http://news.bbc.co.uk/2/hi/science/nature/7258...,200,OK
7,"Davis, Nicola (30 August 2016). ""It becometh t...",https://www.theguardian.com/science/2016/aug/1...,200,OK
8,"Davis, Nicola (30 August 2016). ""It becometh t...",https://web.archive.org/web/20160830164637/htt...,200,OK
9,"Romey, Kristin (18 August 2016). ""Here's What ...",https://web.archive.org/web/20160819105927/htt...,200,OK


### Phase III

In [23]:
# Library requirements for this section
import requests
from bs4 import BeautifulSoup
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer # For Cosine Similarity and Euclidean Distance
from sklearn.metrics.pairwise import cosine_similarity # For Cosine Similarity

import Levenshtein # For Levenshtein Distance

from scipy.spatial import distance # For Euclidean Distance

In [24]:
# Load the Phase II results and keep only the links that did not load successfully.
df = pd.read_csv('outputs/phase2.csv')

codesToIgnore = ['200', '201', '202', '206'] # Statuses treated as a working link

# Compare as strings: the 'status' column is read back as integers when every
# row holds a numeric code and as strings when any row holds 'Error', and an
# integer column never matches the string codes above
df = df[~df['status'].astype(str).isin(codesToIgnore)].reset_index(drop=True)

#### Cosine Similarity

 Scale of 0 to 1; 1 is most similar and 0 is least

In [25]:
def cosineSim(text1, text2):
    """Return the cosine similarity between the token-count vectors of two texts.

    Both texts are vectorized together with a default CountVectorizer, and the
    entry of the pairwise similarity matrix that compares text1 with text2 is
    returned.
    """
    counts = CountVectorizer().fit_transform([text1, text2])
    similarityMatrix = cosine_similarity(counts.toarray())
    return similarityMatrix[0][1] # Row 0 is text1, column 1 is text2

text1 = "I love programming in Python"
text2 = "Python programming is fun"

print(f"Cosine Similarity: {cosineSim(text1, text2)}")

Cosine Similarity: 0.5


#### Jaccard Index

Scale of 0 to 1; 1 is most similar and 0 is least

In [26]:
def jaccard_index(text1, text2):
    """Return the Jaccard index of the whitespace-separated tokens of two texts.

    The index is the number of distinct tokens the texts share divided by the
    number of distinct tokens appearing in either text.

    Returns:
        A float between 0 and 1. When neither text contains a token the
        result is 1.0: the texts are identical, and this avoids dividing by
        the size of an empty union.
    """
    set1 = set(text1.split())
    set2 = set(text2.split())
    union = set1 | set2
    if not union: # Both texts are empty or whitespace-only
        return 1.0
    return len(set1 & set2) / len(union)

text1 = "I love programming in Python"
text2 = "I love programming in Java"
print(f"Jaccard Index: {jaccard_index(text1, text2)}")

Jaccard Index: 0.6666666666666666


#### Levenshtein Distance

Scale of 0 to the length of the longer string; the higher the number, the less similar the texts are

In [27]:
def levenshteinDist(text1, text2):
    """Return the Levenshtein distance between two strings, as computed by Levenshtein.distance."""
    editDistance = Levenshtein.distance(text1, text2)
    return editDistance

text1 = "kitten"
text2 = "apple"
print(f"Levenshtein Distance: {levenshteinDist(text1, text2)}")

Levenshtein Distance: 5


#### Euclidean Distance

Scale of 0 upward with no fixed upper bound (the value grows with the difference in word counts); the higher the number, the less similar the texts are

In [28]:
def euclidean_dist(text1, text2):
    """Return the Euclidean distance between the token-count vectors of two texts.

    Both texts are vectorized together with a default CountVectorizer, so the
    two vectors share one vocabulary.
    """
    counts = CountVectorizer().fit_transform([text1, text2])
    firstCounts, secondCounts = counts.toarray() # One row of counts per text
    return distance.euclidean(firstCounts, secondCounts)

text1 = "I love programming in Python"
text2 = "I love programming in Java"
print(f"Euclidean Distance: {euclidean_dist(text1, text2)}")

Euclidean Distance: 1.4142135623730951


In [52]:
# for index, row in df.iterrows():
#     print(row['reference'])
#     print(row['URL'])
#     print(getArchiveText(row['URL']))
#     print('')

"What are Church Hats?". Southern Living. Archived from the original on 14 May 2021. Retrieved 10 May 2022. Church hats have been a key part of churchgoers' Sunday best for years, and are still an important aspect of dress in some churches today. The practice of covering one's head for church originally came from the Bible—1 Corinthians 11:15, to be precise. The simple head covering has been adapted and expanded to become a stylish part of Southern women's churchgoing attire. At the turn of the century, many Southern ladies wore simple hats to church out of respect, reverence for the service, and continuity with passed-down traditions. The church hat tradition continues today, with hats—sometimes called crowns—in bright colors, bold patterns, and eye-catching styles at Sunday services across the South.
https://www.southernliving.com/culture/church-hats
What are Church Hats? | Southern Living  Skip to contentTop Navigation ExploreSouthern LivingSouthern LivingFoodHolidays & Entertaining

In [29]:
# Display the rows that remain after the status filter
df

Unnamed: 0,reference,URL,status,description
0,"""What are Church Hats?"". Southern Living. Arch...",https://www.southernliving.com/culture/church-...,404,Not Found
1,"""The Tollund Man – Appearance"". The Tollund Ma...",http://www.tollundman.dk/udseende.asp,404,Not Found
2,"""Hat history"". Hatsuk.com. Archived from the o...",http://www.hatsuk.com/hatsuk/hatsukhtml/bible/...,403,Forbidden
3,"""History of Women's Hats"". Vintagefashionguild...",http://vintagefashionguild.org/fashion-history...,404,Not Found
4,"Lauren Turner (2012-06-21). ""New dress code a ...",https://ghostarchive.org/archive/20220509/http...,503,Service Unavailable
5,"""Hats in History: The Kentucky Derby"". Hats-pl...",http://www.hats-plus.com/chronicles/?p=634,439,Application Inactive
6,Philip Treacy: King of Royal wedding hats Arch...,http://www.independent.ie/world-news/royal-wed...,404,Not Found
7,"""Cavanagh Hats"". Bernard Hats. Retrieved 2019-...",https://bernardhats.com/hat-companies/cavanagh...,502,Bad Gateway
8,"Klinkenborg, Verlyn (2009-02-03). ""Season of t...",http://www.iht.com/articles/2009/01/23/opinion...,404,Not Found
9,"""Malema under fire over slur on Indians"". News...",http://www.news24.com/SouthAfrica/Politics/Mal...,404,Not Found


In [32]:
# Print the archived text of every URL in df, each followed by an empty line
for i in range(len(df)):
    text = getArchiveText(df.loc[i, 'URL'])
    print(text, end = '\n\n')

What are Church Hats? | Southern Living  Skip to contentTop Navigation ExploreSouthern LivingSouthern LivingFoodHolidays & EntertainingHome & GardenStyle & CultureNewsVideo SearchCloseProfile Menu Your AccountYour AccountAccountJoin Now Email Preferences Newsletters Manage Your Subscription this link opens in a new tab Logout MoreGive a Gift Subscription this link opens in a new tab Southern Living Books this link opens in a new tab  Login           Subscribe         PinFBClose this dialog windowExplore Southern Living Southern LivingSouthern Living SearchExploreExploreThe Most Popular Hairstyles of 2021 The Most Popular Hairstyles of 2021 The trendy haircuts you’ll be seeing everywhere this year.Read More How To Season A Cast-Iron Skillet How To Season A Cast-Iron Skillet Learn how to season this Southern kitchen staple in five easy steps.Read More The Right Way to Heat a Pre-Cooked Ham The Right Way to Heat a Pre-Cooked Ham It's so easy, trust us.Read More FoodFoodSee All FoodHow Lon