In [None]:
# In an attempt to make this notebook as organized (re: readable) as possible, I will explain its structure here. 
## The first cell is dedicated to initializing all dependencies used. 
## Each cell after is then dedicated to defining each function in the module
### Finally, the last cell is this project's driver. That is where we will be putting the pieces of the puzzle together.

In [13]:
# Imports. Requests for https requests. BeautifulSoup for html scraping. Pandas for data analysis. 
# sklearn for similarity functions, such as word counter and cosine similarity

import requests
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# This can get initialized up here, as it will be constant throughout. 
count_vectorizer = CountVectorizer(stop_words='english')

In [32]:
# wikiArticle class. Named 'wikiArticle' for lack of inspiration. Will hold all relevant data on an article. 

class WikiArticle:
    def __init__(self, url):
        self.url = url
        self.soup = BeautifulSoup(requests.get(self.url).text, "html")
        self.main_title = self.soup.find_all("h1")[0].get_text()
        self.secondary_titles = ""
        
    def get_secondary_titles(self):
        # Check length to make sure secondary_titles list hasnt already been filled. Don't want duplicate data messing us up. 
        if(len(self.secondary_titles) == 0):
            for secondary_title in self.soup.find_all("h2"):
                self.secondary_titles += " " + secondary_title.get_text()
            


In [33]:
def jaccard_analysis(article_one, article_two):
    """Parameters
       ----------
       Right now this function takes two strings as its parameters (article_one, article_two). In the future, it should take 
       WikiArticle instances to allow multiple sub-headers to be analyzed together. 
       
       Returns
       --------
       Jaccard Similarity Percentage."""
    a = set(article_one.split(" "))
    b = set(article_two.split(" "))
    comparison = a.intersection(b)
    return float(len(comparison)) / (len(a) + len(b) - len(comparison))

In [None]:
def is_over_threshold(similarity, **keyword_parameters):
    """Parameters
       ----------
       similarity (float): similarity value that will be checked against threshold.
       threshold (float): Optional paramter to provide value for threshold. Must be passed as "threshold = (value)". Default is 50.
    
       Returns
       ----------
       Boolean value. True if threshold limit is met or exceeded, else False.
    """
    
    if('threshold' in keyword_paramaters):
        threshold = keyword_paramaters['threshold']
    else:
        threshold = 50
    return (similarity >= threshold)

In [44]:
### Driver. 
# This cell is used as the driver for the project. 

article_one = WikiArticle("https://en.wikipedia.org/wiki/IBM_mainframe")
article_two = WikiArticle("https://en.wikipedia.org/wiki/History_of_IBM")

print("Main title similarity " + str(jaccard_analysis(article_one.main_title, article_two.main_title)))

Main title similarity 0.25
Secondary title similarity 0.3548387096774194


 Contents First and second generation[edit] Smaller machines[edit] IBM System/360[edit] Today's systems[edit] See also[edit] References[edit] Further reading[edit] External links[edit] Navigation menu
----------------------------
 Contents Chronology[edit] Twentieth-century market power and antitrust[edit] Products and technologies[edit] Organization[edit] See also[edit] Notes and references[edit] Further reading[edit] External links[edit] Navigation menu
