In [115]:
import numpy as np
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
import json
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from bs4.element import Tag
import random

import warnings
warnings.filterwarnings('ignore')

In [2]:
def remove_hidden(l):
    return [el for el in l if el[0] != "."]

def get_relative_path_to_dirs(start_path):
    subdirs = [x[1] for x in os.walk(start_path)][0]
    subdirs = remove_hidden(subdirs)
    subdirs = [start_path + "/" + subdir for subdir in subdirs]
    return subdirs

def get_relative_path_to_files(start_path):
    files = [f for f in listdir(start_path) if isfile(join(start_path, f))]
    files = remove_hidden(files)
    files = [start_path + "/" + file for file in files]
    return files

In [3]:
home_articles_directory = "../articles"

subdirs = get_relative_path_to_dirs(home_articles_directory)
subdirs

['../articles/medium',
 '../articles/splinters',
 '../articles/thehistoryblog',
 '../articles/tutorialspoint',
 '../articles/chemistry-blog',
 '../articles/wikihow',
 '../articles/kdnuggets',
 '../articles/smartdatacollective']

## Read dataset

In [4]:
dataset = []

for subdir in subdirs:
    subsubdirs = get_relative_path_to_dirs(subdir)
    for subsubdir in subsubdirs:
        onlyfiles = get_relative_path_to_files(subsubdir)
        read_json_list = []
        for file in onlyfiles:
            with open(file, 'r') as infile:
                d = json.load(infile)
                dataset.append(d)

In [5]:
len(dataset)

1294

## TF-IDF

In [6]:
idf = pd.read_csv("../resources/wiki-30k-10-IDF.csv")

In [7]:
idf

Unnamed: 0,term,freq,idf,logidf
0,,18,882.000000,6.782192
1,kalmar,55,288.654545,5.665231
2,rickman,58,273.724138,5.612121
3,multi-point,17,933.882353,6.839350
4,jiankang,10,1587.600000,7.369979
5,henstridg,12,1323.000000,7.187657
6,rebel,2188,7.255941,1.981821
7,gainsbourg,29,547.448276,6.305268
8,mclaurin,12,1323.000000,7.187657
9,blassi,12,1323.000000,7.187657


In [8]:
idf = idf.set_index('term')
idf

Unnamed: 0_level_0,freq,idf,logidf
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,18,882.000000,6.782192
kalmar,55,288.654545,5.665231
rickman,58,273.724138,5.612121
multi-point,17,933.882353,6.839350
jiankang,10,1587.600000,7.369979
henstridg,12,1323.000000,7.187657
rebel,2188,7.255941,1.981821
gainsbourg,29,547.448276,6.305268
mclaurin,12,1323.000000,7.187657
blassi,12,1323.000000,7.187657


### Take sample content

In [15]:
sample = dataset[369]["content"]
sample

'\n I keep bumping into this question.  Casey Perin gave a talk on it at UCR; Daniel Greco has a forthcoming paper on it in Phil Review.  Benj Hellie launched an extended Facebook conversation about it.  Can the radical skeptic live his skepticism?  I submit the following for your consideration.\n\nFirst, a bit about belief.  I\'ve argued that to believe some proposition P is nothing more or less than to be disposed to act and react in a broadly belief-that-P-ish way -- that is, to be disposed, circumstances to being right, to say things like "P", to build one\'s plans on the likelihood of P\'s truth, to feel surprised should P prove false, etc.  Among the relevant dispositions is the disposition to consciously judge that P is the case, that is, to momentarily explicitly regard P as true, to endorse P intellectually (though not necessarily in language).  Dispositions to judge that P often pull apart from the other dispositions constitutive of belief, for example in self-deception, impl

## Compute tf-idf for each article (against wikipedia corpus)

In [28]:
import os
import sys

import nltk
import nltk.data
from nltk.tag.perceptron import PerceptronTagger
from nltk.probability import FreqDist

#Set tokenizers, tagger and stemmer
tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
sentTokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
stemmer = nltk.stem.snowball.EnglishStemmer()
tagger = PerceptronTagger()

import pandas as pd
import string
import re

In [18]:
def to_token_list(content):
    textList = nltk.word_tokenize(content)
    tokenList = []
    for token in textList:
        try:
            thisToken = token
            uselessUnicode = [u'\u2013', u'\u2014', u'\u201d', u'\u201c'] ### don't include these when they are alone
            if thisToken not in uselessUnicode:
                thisToken = thisToken.replace(u'\u201d','') # delete this (unicode quote)
                thisToken = thisToken.replace(u'\u201c','') # delete this (unicode quote)
                tokenList.append(thisToken)
        except:
            tokenList.append('**CODEC_ERROR**')
            # #######################prints word on CODEC ERROR
            print('**CODEC_ERROR**')
            print(token) 
            print('****')
    return tokenList

import string
punctuation = set(string.punctuation)
import re

def cleanTokens(tokenList):
    #Convert all text to lower case
    textList=[word.lower() for word in tokenList]
    
    #Remove punctuation
    textList=[word for word in textList if word not in punctuation]
    textList=["".join(c for c in word if c not in punctuation) for word in textList ]
    
    #convert digits into NUM
    textList=[re.sub("\d+", "NUM", word) for word in textList]  
    
    #Stem words 
    textList=[stemmer.stem(word) for word in textList]
    
    #Remove blanks
    textList=[word for word in textList if word!= ' ']
    textList=[word for word in textList if word!= '']
    
    #Extract tokens
    return textList

In [66]:
def from_sample_to_tfidf(sample):
    tl = to_token_list(sample)
    raw_text = ' '.join(tl)
    tokens = cleanTokens(tl)
    
    ## create FreqDF with word frequencies
    freq = FreqDist(tokens)
    
    # convert it to a data frame
    freqDF = pd.DataFrame.from_dict(freq, orient='index')
    freqDF.columns = ['freq']
    
    ## merge freqDF with idf data frame
    freqit = freqDF.join(idf[['idf', 'logidf']])
    
    # replace null values with max
    maxidf = max(freqit['idf'].dropna())
    maxlogidf = max(freqit['logidf'].dropna())
    freqit.loc[pd.isnull(freqit['idf']), 'idf'] = maxidf
    freqit.loc[pd.isnull(freqit['logidf']), 'logidf'] = maxlogidf
    
    ## create tfidf columns
    freqit['tfidf'] = freqit['freq'] * freqit['idf']
    freqit['logtfidf'] = freqit['freq'] * freqit['logidf']
    
    ## order by logtfidf weight
    freqit = freqit.sort_values(by='logtfidf', ascending=False) 
    
    return freqit

In [67]:
for i,sample in enumerate(dataset):
    sample["tfidf"] = from_sample_to_tfidf(sample["content"])
    if i % 50 == 0:
        print(i)

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250


## Compute similarity between two articles

In [123]:
import math

In [154]:
def cosine_similarity_on_articles(article_1, article_2):
    d = {}
    try:
        w1 = article_1["tfidf"].loc[list(article_2["tfidf"].index)].dropna()
        w2 = article_2["tfidf"].loc[list(article_1["tfidf"].index)].dropna()
        norm_1 = 0
        norm_2 = 0
        for i,row in article_1["tfidf"].iterrows():
            norm_1 += row["logtfidf"] * row["logtfidf"]
        for i,row in article_2["tfidf"].iterrows():
            norm_2 += row["logtfidf"] * row["logtfidf"]
        cosine_similarity = 0
        for i,row in w1.iterrows():
            v1 = row["logtfidf"]
            v2 = w2.loc[i]["logtfidf"]
            delta = (v1 * v2) / (math.sqrt(norm_1) * math.sqrt(norm_2))
            cosine_similarity += delta
            d[i] = delta
        for k in d.keys():
            d[k] = d[k] / cosine_similarity
        return cosine_similarity, d
    except: # e.g. the case where no index overlaps
        return 0, d

## Compute similarity matrix

In [87]:
N = len(dataset)
sim_matrix = [[0 for j in range(N)] for i in range(N)]
N

1294

In [88]:
print("Computing {0} similarities.".format(N*N))
for i in range(N-1):
    for j in range(i+1, N):
        sim, d = cosine_similarity_on_articles(dataset[i], dataset[j])
        sim_matrix[i][j] = sim
        sim_matrix[j][i] = sim
        num_sim = i*N + j
        if num_sim % 100 == 0:
            print(num_sim)

Computing 1674436 similarities.


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  This is separate from the ipykernel package so we can avoid doing imports until


100
200
300
400
500
600
700
800


KeyError: "None of [Index(['memcach', 'tutori', 'pdf', 'page', 'job', 'search', 'resourc', 'guid',\n       'quick', 'discuss', 'version', 'next', 'previous'],\n      dtype='object')] are in the [index]"

## Recommendation example

In [131]:
def get_random_article(dataset):
    random_index = random.randint(0, len(dataset) - 1)
    print(random_index)
    return dataset[random_index]

In [169]:
def start_recommender(dataset, top_n=5):
    random_article = get_random_article(dataset)
    
    print("Selected the following article:")
    print("Title: {0}".format(random_article["title"]))
    print("URL: {0}".format(random_article["url"]))
    
    print("Computing most similar articles...")
    similarities = []
    for i,article in enumerate(dataset):
        cos_sim, d = cosine_similarity_on_articles(random_article, article)
        d = d.items() # convert to list of tuples (key, value)
        d = sorted(d, key=lambda t:t[1], reverse=True)
        similarities.append((i, cos_sim, d))
    similarities = sorted(similarities, key=lambda t:t[1], reverse=True)
    
    print("The top {0} similar articles are:".format(top_n))
    for i in range(1, top_n+1):
        print("--------------------------------")
        print("Title: {0}".format(dataset[similarities[i][0]]["title"]))
        print("URL: {0}".format(dataset[similarities[i][0]]["url"]))
        print("Similarity score: {0}".format(similarities[i][1]))
        print("Index: {0}".format(similarities[i][0]))
        print("Most important words:")
        for j in range(min(5, len(similarities[i][2]))):
            print("\t{0}, with percentage {1}%".format(similarities[i][2][j][0], similarities[i][2][j][1]))

In [171]:
start_recommender(dataset, 10)

164
Selected the following article:
Title: An Evolving Manifesto — Part 1: The Intro
URL: https://medium.com/empathic-futures-lab/an-evolving-manifesto-part-1-the-intro-e2640358e2ce
Computing most similar articles...
The top 10 similar articles are:
--------------------------------
Title: 
Two Views of the Relationship Between Philosophy and Science Fiction

URL: https://schwitzsplinters.blogspot.com/2014/11/two-views-of-relationship-between.html
Similarity score: 0.06358378035196
Index: 351
Most important words:
	intro, with percentage 0.06358376889835327%
	an, with percentage 1.1453606725156235e-08%
	the, with percentage 0.0%
--------------------------------
Title: Mmm… Futuricious…
URL: http://www.thehistoryblog.com/archives/292
Similarity score: 0.04926920057803024
Index: 463
Most important words:
	manifesto, with percentage 0.04926919460589389%
	an, with percentage 4.480160695100559e-09%
	num, with percentage 1.4919756537115058e-09%
	the, with percentage 0.0%
---------------------

## Extract content headers

In [None]:
hs = ["h1", "h2", "h3"]

for d in dataset:
    headers = []
    soup = BeautifulSoup(d["content_html"])
    for h in hs:
        headers += [el.text for el in soup.select(h)]
    d["headers"] = headers

In [None]:
print("Average number of headers per article: {0}".format(sum([len(d["headers"]) for d in dataset]) / len(dataset)))

In [87]:
import re

def clean_html(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    cleantext = re.sub("(<!--.*?-->)", "", cleantext, flags=re.DOTALL)
    return cleantext

def remove_newlines(content):
    return content.replace("\n", " ")

def remove_white_spaces(content):
    content = re.sub(' +', ' ', content)
    content = content.strip()
    return content

def remove_urls(content):
    content = re.sub(r'https?:\/\/.*[\r\n]*', '', content, flags=re.MULTILINE)
    content = re.sub(r'http?:\/\/.*[\r\n]*', '', content, flags=re.MULTILINE)
    return content

def remove_code(content):
    content = re.sub(r'(\w+(\.\w+)*\([^\)]*\))', '', content, flags=re.MULTILINE) # matches a.b.c(d)
    return content

def remove_alt_html(content):
    content = content.split("&lt")[0]
    return content

def clean_content(content):
    content = clean_html(content)
    content = remove_newlines(content)
    content = remove_white_spaces(content)
    content = remove_urls(content)
    content = remove_code(content)
    content = remove_alt_html(content)
    return content

In [90]:
print(clean_content(dataset[155]["content"]))

Bringing VR and AR to the Enterprise: A Conversation with Handsome’s New EVP HandsomeBlockedUnblockFollowFollowingOct 24, 2018


In [91]:
dataset[155]["url"]

'https://medium.com/handsome-perspectives/bringing-vr-and-ar-to-the-enterprise-a-conversation-with-handsomes-new-evp-7dc5ed4dc2f3'