In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
from datasketch import MinHash, MinHashLSHForest
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
import re

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/leandrocorona/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/leandrocorona/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
data = pd.read_csv('top30.csv')
data['lenght']=data['content'].apply(len)
data=data.sort_values(by='lenght',ascending=False)[4:]
data = data[data['author']!='Breitbart News']
names = data.author.value_counts().index.tolist()
minarticles = data.author[data.author == names[-1]].value_counts()
#names.remove('Breitbart News')

In [11]:
#Preprocess split a text into individual tokens based on whitespace.
def preprocess(text):
    text = re.sub(r'[^\w\s]','',text)
    tokens = text.lower()
    tokens = tokens.split()
    return tokens

In [53]:
def get_forest(data, perms):    
    #A mapping of the whole dataset as minhashes so it is stored and it can be used with different purposes
    minhash = []
    for text in data['content']:
        tokens = preprocess(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
        
    forest = MinHashLSHForest(num_perm=perms)
    
    for i,m in enumerate(minhash):
        forest.add(i,m)
    #So the indices are searchable:
    forest.index()    
    return forest

In [55]:
permutations = 128
forest = get_forest(data,permutations)

In [232]:
def get_minhash(text):
        #list of minhash for the input text
        minhash1 = []
        #list of minhash for the text to be compared
        minhash2 = []
        #dict that will store the name of the document plus the Jaccard Similarity
        similarity = {}
        
        tokens1 = preprocess(text)
        m1 = MinHash(num_perm=128)
        for s in tokens1:
            m1.update(s.encode('utf8'))
        minhash1.append(m1)
        #getting the indices of the top 5 articles by Jaccard Similarity (if the input article is from the the same dataset the top 1 will be itself)
        result =forest.query(m1, 5)
        for item in result:
            tokens = preprocess(data.content[item])
            m2 = MinHash(num_perm=128)
            for t in tokens:
                m2.update(t.encode('utf8'))
            minhash2.append(m2)
            #key with the Title and value with the Jaccard Similarity
            similarity[data.title[item]] = m1.jaccard(m2)
        return similarity

In [233]:
data['jaccard_sim']= data.content.apply(get_minhash)

In [294]:
#Example of the top 5 articles by Jaccard Similarity, 
data.jaccard_sim.iloc[0]

{'DNC Chair Brazile: Hillary Speech Leaks ’Crap’ Postmarked From Russia - Breitbart': 0.125,
 'Arianna Huffington: Trump ’a Clear and Present Danger’ - Breitbart': 0.9765625,
 'Carl Bernstein: Trump Has ’Lied as No President of the United States in My Lifetime Has’ - Breitbart': 0.3203125,
 '‘The View’ Co-host Joy Behar: Trump Is ‘The Most Dangerous Man Alive’ - Breitbart': 0.203125,
 'Arianna Huffington: Trump Is Like Kim Jong-un - ’Dangerous,’ ’a Buffoon’ - Breitbart': 1.0}

In [298]:
data[data['title']=='Arianna Huffington: Trump ’a Clear and Present Danger’ - Breitbart']['content']

16269    Sunday on CNN’s “Reliable Sources,” the   and ...
Name: content, dtype: object

In [299]:
data[data['title']=='Arianna Huffington: Trump Is Like Kim Jong-un - ’Dangerous,’ ’a Buffoon’ - Breitbart']['content']

16345    Sunday on CNN’s “Reliable Sources,” Arianna Hu...
Name: content, dtype: object