In [1]:
# To keep this notebook as organized (read: readable) as possible, its structure is explained here.
## The first cell is dedicated to initializing all dependencies used.
## Each cell after that is dedicated to defining one class or function of the module.
### Finally, the last cell is this project's driver. That is where we put the pieces of the puzzle together.

In [2]:
# Imports. 'requests' for HTTPS requests. 'BeautifulSoup' for HTML scraping. 'pandas' for data analysis.
# 'sklearn' for similarity functions, such as the word counter and cosine similarity. 'gensim' for Doc2Vec.
# 'nltk' for pre-processing main text. 're' for regex. 'scipy' for spatial cosine distance.

import requests
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import doc2vec
import nltk
from nltk.corpus import stopwords
import re
from scipy import spatial
from gensim.test.utils import common_texts
from gensim.test.utils import get_tmpfile
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Train a small Doc2Vec model on gensim's bundled toy corpus (`common_texts`).
# Each document is tagged with its index so gensim can learn one vector per document.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

# Persist the trained model to a temporary file and load it back.
fname = get_tmpfile("my_doc2vec_model")
model.save(fname)
model = Doc2Vec.load(fname)

# gensim 3.x offers this call to free training-only state; the method no longer
# exists in gensim 4.x, so only call it when the installed version provides it.
if hasattr(model, "delete_temporary_training_data"):
    model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

# This can get initialized up here, as it will be constant throughout.
count_vectorizer = CountVectorizer(stop_words='english')



In [74]:
# WikiArticle class. Named 'WikiArticle' for lack of inspiration. Holds all relevant data on an article.

class WikiArticle:
    """A web article fetched over HTTP and parsed for similarity analysis.

       Attributes
       ----------
       url              : <string> address the page was fetched from.
       soup             : BeautifulSoup parse tree of the fetched page.
       main_title       : <string> text of the first <h1> ("" if the page has none).
       secondary_titles : <string> text of every <h2>, each prefixed with a space.
                          Empty until get_secondary_titles() is called.
       main_text        : value returned by pre_process() for the article body.
                          Empty string until get_main_text() is called.
       related          : <list> of WikiArticle objects found under "See also".
                          Empty until get_related() is called."""

    # Seconds to wait for the server; without a timeout requests may block forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, url):
        """Fetch `url` and parse it. Performs one HTTP request."""
        self.url = url
        response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        # Name the parser explicitly so results do not depend on which optional
        # parsers happen to be installed alongside BeautifulSoup.
        self.soup = BeautifulSoup(response.text, "html.parser")
        heading = self.soup.find("h1")
        self.main_title = heading.get_text() if heading is not None else ""
        self.secondary_titles = ""
        self.main_text = ""
        self.related = []

    def get_secondary_titles(self):
        """Function: self.secondary_titles set to <string> of all <h2> texts.
           ============================================================================
              Parameters
              ----------
              Takes no parameters.

              Returns
              ----------
              Returns nothing."""

        # Only fill the string once. Don't want duplicate data messing us up.
        if not self.secondary_titles:
            self.secondary_titles = "".join(
                " " + heading.get_text() for heading in self.soup.find_all("h2")
            )

    def get_main_text(self):
        """Function: self.main_text set to the pre-processed main text of article.
           ============================================================================
              Parameters
              ----------
              Takes no parameters.

              Returns
              ----------
              Returns nothing."""

        # Join paragraphs with a space so the last word of one paragraph cannot
        # fuse with the first word of the next.
        article_text = " ".join(p.get_text() for p in self.soup.find_all("p"))

        # Prepares text for analysis.
        self.main_text = self.pre_process(article_text)

    def pre_process(self, text):
        """ Function: pre-processes text to prepare for analysis.
            =====================================================
               Parameters
               ----------
               Takes <string> text to be pre-processed.

               Returns
               ----------
               Returns whatever model.infer_vector yields for the remaining
               words (the Doc2Vec vector inferred for the text)."""

        # Cleaning the text: lower-case, then turn every non-letter into a space.
        cleaned = re.sub('[^a-zA-Z]', ' ', text.lower())

        # Split into word tokens.
        tokens = nltk.word_tokenize(cleaned)

        # Removing stop words. Build the lookup set once per call, and filter
        # whole tokens (not the characters inside each token).
        stop_words = set(stopwords.words('english'))
        content_words = [token for token in tokens if token not in stop_words]

        return model.infer_vector(content_words)

    def get_related(self):
        """ Function: Get list of related articles
            =====================================================
                Parameters
                ----------
                This function takes no parameter.

                Returns
                -------
                This function returns the list of WikiArticle objects built from
                the links listed under the element with id "See_also". The list is
                empty when the page has no such element or no list follows it.
                Constructing each WikiArticle performs one HTTP request.
               """
        self.related = []

        see_also = self.soup.find(id="See_also")
        if see_also is None or see_also.parent is None:
            return self.related

        link_list = see_also.parent.find_next('ul')
        if link_list is None:
            return self.related

        for item in link_list.find_all('li'):
            link = item.find('a')
            href = link.get('href') if link is not None else None
            # Only site-relative links can be prefixed with the Wikipedia host;
            # skip entries without a link and absolute/protocol-relative hrefs.
            if not href or not href.startswith('/') or href.startswith('//'):
                continue
            self.related.append(WikiArticle("https://en.wikipedia.org" + href))

        return self.related
        
        
    

In [75]:
def jaccard_analysis(article_one, article_two):

    """Jaccard similarity between the word sets of two strings.

       Parameters
       ----------
       Right now this function takes two strings as its parameters (article_one, article_two). In the future, it should take
       WikiArticle instances to allow multiple sub-headers to be analyzed together.

       Returns
       --------
       Jaccard similarity as a float between 0.0 and 1.0 (a fraction, not a percentage):
       shared words divided by distinct words overall. Two strings that both contain
       no words are treated as identical and score 1.0."""

    # split() with no argument splits on any run of whitespace, so repeated,
    # leading or trailing spaces (and tabs/newlines) never produce empty tokens.
    words_one = set(article_one.split())
    words_two = set(article_two.split())

    all_words = words_one | words_two
    if not all_words:
        # Nothing to compare on either side; avoid dividing by zero.
        return 1.0

    return len(words_one & words_two) / len(all_words)

In [76]:
def cosine_analysis(article_one, article_two):

    """Parameters
       ----------
       article_one, article_two: the two vectors to compare, passed straight to
       scipy's spatial.distance.cosine.

       Returns
       --------
       Cosine similarity, computed as one minus the cosine distance."""

    cosine_distance = spatial.distance.cosine(article_one, article_two)
    return 1 - cosine_distance

In [77]:
def is_over_threshold(similarity, *args, threshold=50):

    """Parameters
       ----------
       similarity (float): similarity value that will be checked against threshold.
       threshold (float): Optional parameter to provide value for threshold. May be passed either as
       "threshold = (value)" or as a single extra positional argument. Default is 50.
       If exactly one positional value is given it takes precedence over the keyword.
       Any other number of extra positional values is ignored.

       Returns
       ----------
       Boolean value. True if threshold limit is met or exceeded, else False."""

    # Keep supporting the original positional form: is_over_threshold(sim, 0.10).
    if len(args) == 1:
        threshold = args[0]
    return similarity >= threshold

In [78]:
### Driver ###
##          ##
# ========== #


# Fetch the two articles to compare (each constructor downloads its page).
article_one = WikiArticle("https://en.wikipedia.org/wiki/IBM_mainframe")
article_two = WikiArticle("https://en.wikipedia.org/wiki/History_of_IBM")

# Score the main titles once; the value is used for the check and the printout.
title_similarity = jaccard_analysis(article_one.main_title, article_two.main_title)

# Check if main title similarity is over threshold
print("is over threshold." if is_over_threshold(title_similarity, 0.10) else "is not over threshold")

print(title_similarity)

is over threshold.
0.25


In [80]:
### Scratch Work ###
##                ##
# ================ #


# Show the URL of the second article listed under "See also" for article_one.
related_articles = article_one.get_related()
print(related_articles[1].url)

https://en.wikipedia.org/wiki/Amdahl_Corporation
