In [None]:
# In an attempt to make this notebook as organized (re: readable) as possible, I will explain its structure here. 
## The first cell is dedicated to initializing all dependencies used. 
## Each cell after is then dedicated to defining each function in the module
### Finally, the last cell is this project's driver. That is where we will be putting the pieces of the puzzle together.

In [3]:
# Imports. 'Requests' for https requests. 'BeautifulSoup' for html scraping. 'Pandas' for data analysis. 
# 'sklearn' for similarity functions, such as word counter and cosine similarity. 'gensim' for Doc2Vec.
# 'nltk' for pre-processing main text. 're' for regex. 'scipy' for spacial cosine. 

import requests
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import doc2vec
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
from scipy import spatial
from gensim.test.utils import common_texts
from gensim.test.utils import get_tmpfile
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from collections import Counter
import copy

# initializes training of doc2vec model. 
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)
fname = get_tmpfile("my_doc2vec_model")
model.save(fname)
model = Doc2Vec.load(fname)
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

# This can get initialized up here, as it will be constant throughout. 
count_vectorizer = CountVectorizer(stop_words='english')



In [156]:
# Corpus class. This will hold all the article objects in our corpus and allow us to execute some corpus-wide methods.

class Corpus:
    def __init__(self):
        self.articles = []
        self.current_article_ndx = 0
        self.corpus_stopwords = []
    
    def new_article(self, url):
        """Function: creates and stores a new article object. 
           ============================================================================
           Parameters
           ----------
           (Wiki) Article url

           Returns
           ----------
           Returns index of new article."""
        self.articles.append(WikiArticle(url))
        return len(self.articles) - 1
    
    def filter_corpus_by_frequency(self, *args):
        """Function: Finds a variable amount (default 3) of the most frequent words in the corpus. 
           These words are then removed from all of the article.word_frequency[] dictionaries. 
           ============================================================================
           Parameters
           ----------
           (Optional) <int> Number of stop words to get. Default is 3. 

           Returns
           ----------
           No return. Filters most frequent words in corpus out of all articles word_frequency dictionaries."""

        
        ## Lets loop through the articles in the corpus, get each articles word_frequency count, and merge them into a common dictionary.
        # ...this will be fun
        total_frequency = {}
        for article in self.articles:
            total_frequency = mergeDict(total_frequency, article.get_word_frequency())
        total_frequency = Counter(total_frequency)
        ## Okay now that's done, let's get the most frequently found words in the corpus (aka our new corpus stop words).
        
        # Check if optional paramater was passed. 
        if(len(args) == 1):
            count = args[0]
        else:
            count = 3
        
        print(type(total_frequency))
        # Get 3 most frequently found words and store them in corpus_stopwords list. 
        print(total_frequency.most_common(count))
        
        # Loop through all articles in corpus, filtering out the newly obtained corpus stop words.
        for article in self.articles:
            article.filter_corpus_stopwords(self.corpus_stopwords)
        
        
# Class agnostic function to help merging word_frequency dicts
def mergeDict(dict1, dict2):
    ''' Merge dictionaries and keep values of common keys in list'''
    dict3 = {**dict1, **dict2}
    for key, value in dict3.items():
        if key in dict1 and key in dict2:
            dict3[key] = [value + dict1[key]]
    return Counter(dict3)

In [157]:
# wikiArticle class. Named 'wikiArticle' for lack of inspiration. Will hold all relevant data on an article. 

class WikiArticle:
    def __init__(self, url):
        self.url = url
        self.soup = BeautifulSoup(requests.get(self.url).text, "html")
        self.main_title = self.soup.find_all("h1")[0].get_text()
        self.secondary_titles = ""
        self.main_text = ""
        self.word_frequency = {}
        
    def get_secondary_titles(self):
        # Check length to make sure secondary_titles list hasn't already been filled. Don't want duplicate data messing us up. 
        if(len(self.secondary_titles) == 0):
            for secondary_title in self.soup.find_all("h2"):
                self.secondary_titles += " " + secondary_title.get_text()
                
    def get_main_text(self):
        """
        Function: self.main_text set to <string> pre-processed main text of article.
        ============================================================================
           Parameters
           ----------
           Takes no parameters.

           Returns
           ----------
           Returns nothing."""
        
        # Gets text from the article
        paragraphs = self.soup.find_all("p")
        article_text = ""
        for p in paragraphs:
            article_text = article_text + " " + p.text
        
        # Prepares text for analysis.
        self.main_text = self.pre_process(article_text)
    
    def get_word_frequency(self):
        """ 
        Function: gets word frequencies for article and stores in self.word_frequency dictionary. 
        ===========================================
           Parameters
           ----------
           Takes no paramaters

           Returns
           ----------
           Returns word frequency. Result is stored in self.word_frequency"""
        
        self.word_frequency = Counter(self.main_text)
        return self.word_frequency
        
        
    def filter_corpus_stopwords(self, corpus_stop_words):
        """ 
        Function: removes all occurences of the most frequent words in the corpus from self.word_frequency. This function should only be called from within a corpus class method.
        ===========================================
        Parameters
        ----------
        <list> corpus stop words

        Returns
        ----------
        No return. self.word_frequency"""
        
        filtered_text = Counter({})
        for k, v in self.word_frequency:
            if not k in corpus_stop_words:
                filtered_text[k] = v
                
        self.word_frequency = filtered_text
    
    
    def pre_process(self, text):
        """
        Function: pre-processes text to prepare for analysis. 
        =====================================================
           Parameters
           ----------
           Takes <string> text to be pre-processed.

           Returns
           ----------
           Returns <dict> Doc2Vec of pre-processed text."""
        
        # Cleaing the text
        processed_article = text.lower()
        
        # Preparing the dataset
        all_words = word_tokenize(processed_article)
        processed_article = re.sub('[^a-zA-Z]', ' ', processed_article )
        processed_article = re.sub(r'\s+', ' ', processed_article)
        
        # Removing Stop Words
        processed_text = []
        for w in all_words:
            if not w in stopwords.words('english'):
                processed_text.append(w)
        
        return processed_text

    
    def get_related(self):
        """ Function: Get list of related articles
            =====================================================
                Parameters
                ----------
                This function takes no paramater 
                
                Returns
                -------
                This function returns the array of wikiArticle objects from listed related articles. 
               
               """
        related_list = self.soup.find(id="See_also").parent.find_next('ul').findChildren('li')
        
        articles = []
        for item in related_list:
            link = item.findChild('a')
            articles.append(WikiArticle("https://en.wikipedia.org"+link.get('href')))
        
        self.related = articles
        return self.related



In [158]:
def jaccard_analysis(article_one, article_two):
    
    """Parameters
       ----------
       Right now this function takes two strings as its parameters (article_one, article_two). In the future, it should take 
       WikiArticle instances to allow multiple sub-headers to be analyzed together. 
       
       Returns
       --------
       Jaccard Similarity Percentage."""
    
    a = set(article_one.split(" "))
    b = set(article_two.split(" "))
    comparison = a.intersection(b)
    return float(len(comparison)) / (len(a) + len(b) - len(comparison))

In [159]:
def cosine_analysis(text_one, text_two):
    # count word frequency. 
    # We can't reuse the WikiArticle.get_word_frequency() method since we're dealing with strings and not article objects... 
    # ... that's okay cause this gives us more flexibility as to what we can perform cosine similarity analysis on.  
    article_one_vals = Counter(text_one)
    article_two_vals = Counter(text_two)

    # convert to word-vectors
    words  = list(article_one_vals.keys() | article_two_vals.keys())
    a_vect = [article_one_vals.get(word, 0) for word in words]        
    b_vect = [article_two_vals.get(word, 0) for word in words]       

    # find cosine
    len_a  = sum(av*av for av in a_vect) ** 0.5             
    len_b  = sum(bv*bv for bv in b_vect) ** 0.5             
    dot    = sum(av*bv for av,bv in zip(a_vect, b_vect))    
    cosine = dot / (len_a * len_b)
    
    # return cosine
    return cosine

In [160]:
def is_over_threshold(similarity, *args):
    
    """Parameters
       ----------
       similarity (float): similarity value that will be checked against threshold.
       threshold (float): Optional paramater to provide value for threshold. Must be passed as "threshold = (value)". Default is 50.
    
       Returns
       ----------
       Boolean value. True if threshold limit is met or exceeded, else False."""
    
    if(len(args) == 1):
        threshold = args[0]
    else:
        threshold = 50
    return (similarity >= threshold)

In [161]:
### Driver ###
##          ##
# ========== #

corpus = Corpus()

corpus.new_article("https://en.wikipedia.org/wiki/IBM_mainframe")
corpus.new_article("https://en.wikipedia.org/wiki/History_of_IBM")

# Check if main title similarity is over threshold
if(is_over_threshold(jaccard_analysis(corpus.articles[0].main_title, corpus.articles[1].main_title), 0.10)):
    print("is over threshold.")
else:
    print("is not over threshold")

print(jaccard_analysis(corpus.articles[0].main_title, corpus.articles[1].main_title))

is over threshold.
0.25


In [166]:
### Scratch Work ###
##                ##
# ================ #
%xmode context

corpus.articles[0].get_main_text()
corpus.articles[1].get_main_text()

corpus.filter_corpus_by_frequency(1)

print(corpus.corpus_stopwords)




Exception reporting mode: Context
<class 'collections.Counter'>


TypeError: '>' not supported between instances of 'int' and 'list'

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1056)>


False