In [None]:
# In an attempt to make this notebook as organized (re: readable) as possible, I will explain its structure here. 
## The first cell is dedicated to initializing all dependencies used. 
## Each cell after is then dedicated to defining each function in the module
### Finally, the last cell is this project's driver. That is where we will be putting the pieces of the puzzle together.

In [3]:
# Imports. 'Requests' for https requests. 'BeautifulSoup' for html scraping. 'Pandas' for data analysis. 
# 'sklearn' for similarity functions, such as word counter and cosine similarity. 'gensim' for Word2Vec.
# 'nltk' for pre-processing main text. 're' for regex. 'scipy' for spacial cosine. 

import requests
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
import re
from scipy import spatial

# This can get initialized up here, as it will be constant throughout. 
count_vectorizer = CountVectorizer(stop_words='english')

In [22]:
# wikiArticle class. Named 'wikiArticle' for lack of inspiration. Will hold all relevant data on an article. 

class WikiArticle:
    def __init__(self, url):
        self.url = url
        self.soup = BeautifulSoup(requests.get(self.url).text, "html")
        self.main_title = self.soup.find_all("h1")[0].get_text()
        self.secondary_titles = ""
        self.main_text = ""
        
    def get_secondary_titles(self):
        # Check length to make sure secondary_titles list hasn't already been filled. Don't want duplicate data messing us up. 
        if(len(self.secondary_titles) == 0):
            for secondary_title in self.soup.find_all("h2"):
                self.secondary_titles += " " + secondary_title.get_text()
                
    def get_main_text(self):
        """
        Function: self.main_text set to <string> pre-processed main text of article.
        ============================================================================
           Parameters
           ----------
           Takes no parameters.

           Returns
           ----------
           Returns nothing."""
        
        # Gets text from the article
        paragraphs = self.soup.find_all("p")
        for p in paragraphs:
            article_text = p.text
        
        # Prepares text for analysis.
        vocabulary = self.pre_process(article_text)
        print(vocabulary)
    
    
    def pre_process(self, text):
        """
        Function: pre-processes text to prepare for analysis. 
        =====================================================
           Parameters
           ----------
           Takes <string> text to be pre-processed.

           Returns
           ----------
           Returns <string> pre-processed text."""
        
        # Cleaing the text
        processed_article = text.lower()
        processed_article = re.sub('[^a-zA-Z]', ' ', processed_article )
        processed_article = re.sub(r'\s+', ' ', processed_article)
        
        # Preparing the dataset
        all_sentences = nltk.sent_tokenize(processed_article)
        all_words = [nltk.word_tokenize(sentence) for sentence in all_sentences]
        
        # Removing Stop Words
        for i in range(len(all_words)):
            all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]
        
        word2vec = Word2Vec(all_words, min_count = 1)
        vocabulary = word2vec.wv.vocab
        return vocabulary
        
    
    
            


In [5]:
def jaccard_analysis(article_one, article_two):
    
    """Parameters
       ----------
       Right now this function takes two strings as its parameters (article_one, article_two). In the future, it should take 
       WikiArticle instances to allow multiple sub-headers to be analyzed together. 
       
       Returns
       --------
       Jaccard Similarity Percentage."""
    
    a = set(article_one.split(" "))
    b = set(article_two.split(" "))
    comparison = a.intersection(b)
    return float(len(comparison)) / (len(a) + len(b) - len(comparison))

In [6]:
def is_over_threshold(similarity, **keyword_parameters):
    
    """Parameters
       ----------
       similarity (float): similarity value that will be checked against threshold.
       threshold (float): Optional paramter to provide value for threshold. Must be passed as "threshold = (value)". Default is 50.
    
       Returns
       ----------
       Boolean value. True if threshold limit is met or exceeded, else False."""
    
    if('threshold' in keyword_paramaters):
        threshold = keyword_paramaters['threshold']
    else:
        threshold = 50
    return (similarity >= threshold)

In [44]:
### Driver ###
##          ##
# ========== #


article_one = WikiArticle("https://en.wikipedia.org/wiki/IBM_mainframe")
article_two = WikiArticle("https://en.wikipedia.org/wiki/History_of_IBM")

print("Main title similarity " + str(jaccard_analysis(article_one.main_title, article_two.main_title)))

Main title similarity 0.25
Secondary title similarity 0.3548387096774194


In [23]:
### Scratch Work ###
##                ##
# ================ #

article_one = WikiArticle("https://en.wikipedia.org/wiki/IBM_mainframe")
article_two = WikiArticle("https://en.wikipedia.org/wiki/History_of_IBM")
article_one.get_main_text()


{'software': <gensim.models.keyedvectors.Vocab object at 0x11b077748>, 'based': <gensim.models.keyedvectors.Vocab object at 0x11b0777f0>, 'emulators': <gensim.models.keyedvectors.Vocab object at 0x11b0778d0>, 'system': <gensim.models.keyedvectors.Vocab object at 0x11b077940>, 'z': <gensim.models.keyedvectors.Vocab object at 0x11b077978>, 'hardware': <gensim.models.keyedvectors.Vocab object at 0x11b0779e8>, 'including': <gensim.models.keyedvectors.Vocab object at 0x11b077a20>, 'flex': <gensim.models.keyedvectors.Vocab object at 0x11b077a90>, 'es': <gensim.models.keyedvectors.Vocab object at 0x11b077b00>, 'runs': <gensim.models.keyedvectors.Vocab object at 0x11b077c18>, 'unixware': <gensim.models.keyedvectors.Vocab object at 0x11b077cf8>, 'linux': <gensim.models.keyedvectors.Vocab object at 0x11b077e10>, 'freely': <gensim.models.keyedvectors.Vocab object at 0x11b077e80>, 'available': <gensim.models.keyedvectors.Vocab object at 0x11b077ef0>, 'hercules': <gensim.models.keyedvectors.Vocab o

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1056)>


False