In [None]:
import numpy as np
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
import json
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from bs4.element import Tag
import random

import warnings
warnings.filterwarnings('ignore')

In [None]:
def remove_hidden(l):
    return [el for el in l if el[0] != "."]

def get_relative_path_to_files(start_path):
    files = [f for f in listdir(start_path) if isfile(join(start_path, f))]
    files = remove_hidden(files)
    files = [start_path + "/" + file for file in files]
    return files

## Read dataset

In [None]:
dataset = []

onlyfiles = get_relative_path_to_files("../preprocessed")
read_json_list = []
for file in onlyfiles:
    with open(file, 'r') as infile:
        d = json.load(infile)
        d["tfidf"] = pd.DataFrame(d["tfidf"])
        dataset.append(d)

In [None]:
len(dataset)

#### Read stem_dictionary

In [None]:
with open('../stemmer/stem_dictionary.json', 'r') as infile:
    stem_dictionary = json.load(infile)

## Compute similarity between two articles

In [None]:
import math

In [112]:
def old_cosine_similarity_on_articles(article_1, article_2):
    d = {}
    try:
        w1 = article_1["tfidf"].loc[list(article_2["tfidf"].index)].dropna()
        w2 = article_2["tfidf"].loc[list(article_1["tfidf"].index)].dropna()
        norm_1 = 0
        norm_2 = 0
        for i,row in article_1["tfidf"].iterrows():
            norm_1 += row["logtfidf"] * row["logtfidf"]
        for i,row in article_2["tfidf"].iterrows():
            norm_2 += row["logtfidf"] * row["logtfidf"]
        cosine_similarity = 0
        for i,row in w1.iterrows():
            v1 = row["logtfidf"]
            v2 = w2.loc[i]["logtfidf"]
            delta = (v1 * v2) / (math.sqrt(norm_1) * math.sqrt(norm_2))
            cosine_similarity += delta
            d[i] = delta
        for k in d.keys():
            d[k] = d[k] / cosine_similarity
        return cosine_similarity, d
    except: # e.g. the case where no index overlaps
        return 0, d

def cosine_similarity_on_articles(article_1, article_2):
    d = {}
    try:
        a = article_1["tfidf"].loc[list(article_2["tfidf"].index)].dropna()
        b = article_2["tfidf"].loc[list(article_1["tfidf"].index)].dropna()
        prod = np.multiply(a["logtfidf"].values, b["logtfidf"].values) 
        norm_1 = np.linalg.norm(article_1["tfidf"]["logtfidf"].values)
        norm_2 = np.linalg.norm(article_2["tfidf"]["logtfidf"].values)
        cosine_similarity = np.sum(prod) / (norm_1 * norm_2) # default is norm 2
        keys = a.index.values
        values = list(zip(prod / sum(prod), a["idf"])) # values are a tuple(perc_in_similarity, idf_of_word)
        d = dict(zip(keys, values))
        return cosine_similarity, d
    except: # e.g. the case where no index overlaps
        return 0, d

In [99]:
%%timeit
a = old_cosine_similarity_on_articles(dataset[100], dataset[200])

85.5 ms ± 7.04 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [107]:
%%timeit
a = cosine_similarity_on_articles(dataset[100], dataset[200])

5.24 ms ± 194 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Recommendation example

In [124]:
def get_random_article(dataset):
    random_index = random.randint(0, len(dataset) - 1)
    print("Random index: {0}".format(random_index))
    return dataset[random_index]

In [125]:
def retrieve_best_unstemmed_word(stem_word, stem_dictionary):
    d = stem_dictionary[stem_word]
    return max(d, key=d.get)

In [128]:
def start_recommender(dataset, top_n=5, max_words=10, max_sim_explained=0.9, idf_threshold=1.5):
    random_article = get_random_article(dataset)
    
    print("Selected the following article:")
    print("\tTitle: {0}".format(random_article["title"]))
    print("\tURL: {0}".format(random_article["url"]))
    
    print("Computing most similar articles...")
    print("")
    similarities = []
    for i,article in enumerate(dataset):
        cos_sim, d = cosine_similarity_on_articles(random_article, article)
        d = d.items() # convert to list of tuples (key, (perc, idf))
        d = sorted(d, key=lambda t:t[1][0], reverse=True) # sort by word importance
        similarities.append((i, cos_sim, d)) # similarities = [(i, cos_sim, {"w": [(perc, idf), ...], ...})]
    similarities = sorted(similarities, key=lambda t:t[1], reverse=True) # sort by article similarity
    
    print("The top {0} similar articles are:".format(top_n))
    for i in range(1, top_n+1):
        print("--------------------------------")
        print("Title: {0}".format(dataset[similarities[i][0]]["title"]))
        print("URL: {0}".format(dataset[similarities[i][0]]["url"]))
        print("Similarity score: {0:.2f}%".format(similarities[i][1]))
        print("Index: {0}".format(similarities[i][0]))
        print("Most important words:")
        sim_tot = 0
        for j in range(min(max_words, len(similarities[i][2]))):
            sw = similarities[i][2][j][0]
            w = retrieve_best_unstemmed_word(sw, stem_dictionary)
            score = similarities[i][2][j][1][0]
            idf_score = similarities[i][2][j][1][1]
            if idf_score >= idf_threshold:
                print("\t{0}, with percentage {1:.2f}%".format(w.capitalize(), score))
            sim_tot += score
            if sim_tot >= max_sim_explained:
                break

In [130]:
start_recommender(dataset, top_n=10, max_words=10, max_sim_explained=0.9, idf_threshold=2.5)

Random index: 553
Selected the following article:
	Title: Neopets as We Know It Is About to Be Obliterated
	URL: https://medium.com/s/love-hate/neopets-as-we-know-it-is-about-to-be-obliterated-78db2076412c
Computing most similar articles...

The top 10 similar articles are:
--------------------------------
Title: How to get HTTPS working on your local development environment in 5 minutes
URL: https://medium.freecodecamp.org/how-to-get-https-working-on-your-local-development-environment-in-5-minutes-7af615770eec
Similarity score: 0.04%
Index: 694
Most important words:
	Isn, with percentage 0.97%
--------------------------------
Title: What I Gave My Kid Instead of a Smartphone
URL: https://humanparts.medium.com/what-i-gave-my-kid-instead-of-a-smartphone-27c0f028ea78
Similarity score: 0.04%
Index: 886
Most important words:
	Isn, with percentage 0.46%
	App, with percentage 0.13%
	Smartphone, with percentage 0.10%
	Kids, with percentage 0.07%
	Opt, with percentage 0.02%
	Media, with percen

## Extract content headers

In [None]:
hs = ["h1", "h2", "h3"]

for d in dataset:
    headers = []
    soup = BeautifulSoup(d["content_html"])
    for h in hs:
        headers += [el.text for el in soup.select(h)]
    d["headers"] = headers

In [None]:
print("Average number of headers per article: {0}".format(sum([len(d["headers"]) for d in dataset]) / len(dataset)))