In [1]:
import numpy as np
import pandas as pd
import math
from collections import namedtuple
import random

import nbimporter
from preprocessed_data_reader import ReaderPreprocessedData
from utils_os import UtilsOS

import warnings
warnings.filterwarnings('ignore')

Importing Jupyter notebook from preprocessed_data_reader.ipynb
Importing Jupyter notebook from utils_os.ipynb


# Recommender

In [2]:
class UserState:
    """Reading history of a single user.

    Entries are stored exactly as passed in; the recommenders in this notebook
    use them to index the dataset.
    """

    def __init__(self):
        # Articles in the order they were read, oldest first.
        self.last_read_articles = list()

    def add_read_article(self, article):
        """Record `article` as the most recently read item."""
        history = self.last_read_articles
        history.insert(len(history), article)

In [3]:
class Recommender:
    """Common interface and shared record types for the recommenders."""

    # Contribution of one word to a similarity score:
    # (word, share of the similarity due to this word, idf of the word).
    WordInfo = namedtuple('WordInfo', 'word perc_sim idf')
    # Similarity of one dataset entry to a reference article:
    # (dataset index, similarity score, list of WordInfo).
    SimData = namedtuple('SimData', 'index similarity words_importance_list')

    def recommend_articles(self, UserState, how_many=-1):
        """Return recommended articles for a user; subclasses must override.

        The first parameter keeps its historical name `UserState` (which
        shadows the class of the same name inside this method) so that keyword
        callers keep working; subclasses in this notebook call it `user_state`.

        Raises:
            NotImplementedError: always, because this is the abstract base.
        """
        # `NotImplementedException` does not exist in Python and produced a
        # NameError; `NotImplementedError` is the built-in for abstract methods.
        raise NotImplementedError

# Recommender based on TF-IDF

In [4]:
class RecommenderTFIDF(Recommender):
    """Recommender that ranks articles by cosine similarity of TF-IDF vectors.

    Every dataset entry is read through ``entry["tfidf"]``. That value is used
    as a pandas object with an ``.index`` of words, a column selected by `on`
    ("tfidf" or "logtfidf") and an "idf" column.
    NOTE(review): presumably a DataFrame indexed by word with unique labels;
    confirm against the preprocessing notebook.
    """

    def __init__(self, dataset):
        """Keep a reference to `dataset`, an indexable collection of articles."""
        self._dataset = dataset

    def _cosine_similarity_on_tfidf_vectors(self, tfidf_1, tfidf_2, on="tfidf"):
        """Cosine similarity of two TF-IDF vectors plus per-word contributions.

        Args:
            tfidf_1, tfidf_2: word-indexed frames exposing `on` and "idf".
            on: weight column to compare, "tfidf" or "logtfidf".

        Returns:
            (similarity, words): `words` is a list of `Recommender.WordInfo`,
            one per shared word, whose `perc_sim` is that word's share of the
            dot product. Returns ``(0, [])`` when the vectors share no word or
            the similarity is undefined (zero norm or zero dot product).

        Raises:
            ValueError: if `on` is not an admissible column name.
        """
        admissible_on = ["tfidf", "logtfidf"]
        if on not in admissible_on:
            raise ValueError("on must be one of {0}".format(admissible_on))

        # Select the shared vocabulary explicitly. Indexing `.loc` with labels
        # missing from the frame raises KeyError on current pandas, which the
        # former bare `except` turned into a silent similarity of 0 for any
        # two articles with different vocabularies. Using one common index
        # also guarantees both sides are in the same word order before the
        # element-wise product.
        shared_words = tfidf_1.index.intersection(tfidf_2.index)
        a = tfidf_1.loc[shared_words].dropna()
        b = tfidf_2.loc[a.index].dropna()
        a = a.loc[b.index]  # realign in case `b` lost rows containing NaN
        if len(a) == 0:
            return 0, []

        prod = np.multiply(a[on].values, b[on].values)
        dot = np.sum(prod)
        # Norms are taken over the full vectors, not only the shared words.
        norm_1 = np.linalg.norm(tfidf_1[on].values)  # default is the L2 norm
        norm_2 = np.linalg.norm(tfidf_2[on].values)
        if dot == 0 or norm_1 == 0 or norm_2 == 0:
            # Both the cosine and the per-word shares would be 0/0 here.
            return 0, []

        similarity = dot / (norm_1 * norm_2)
        perc_in_similarity = prod / dot
        words_importance_list = [
            Recommender.WordInfo(word, perc, idf)
            for word, perc, idf in zip(a.index.values, perc_in_similarity, a["idf"].values)
        ]
        return similarity, words_importance_list

    def _order_dataset_by_similarity(self, reference_article):
        """Score every dataset entry against `reference_article`.

        Returns a list of `Recommender.SimData` sorted by decreasing
        similarity; each entry's words are sorted by decreasing `perc_sim`.
        The comparison uses the "logtfidf" column.
        """
        similarities = []
        for i, article in enumerate(self._dataset):
            cos_sim, words_importance_list = self._cosine_similarity_on_tfidf_vectors(
                reference_article["tfidf"], article["tfidf"], on="logtfidf")
            words_importance_list = sorted(
                words_importance_list, key=lambda t: t.perc_sim, reverse=True)
            similarities.append(Recommender.SimData(i, cos_sim, words_importance_list))
        return sorted(similarities, key=lambda t: t.similarity, reverse=True)

    def recommend_articles(self, user_state, how_many=-1):
        """Return dataset indices ordered by similarity to the last read article.

        Args:
            user_state: a `UserState`; its last entry of `last_read_articles`
                is used as an index into the dataset.
            how_many: maximum number of indices to return; ``-1`` (or any
                negative value, or None) returns the whole ranking.

        The ranking covers the whole dataset, so the reference article itself
        is part of the result.
        """
        last_article_read = self._dataset[user_state.last_read_articles[-1]]
        ranking = [t.index for t in self._order_dataset_by_similarity(last_article_read)]

        # A negative count used as a slice bound would drop items from the
        # tail instead of meaning "no limit".
        if how_many is None or how_many < 0:
            return ranking
        return ranking[:how_many]

# Recommender based on BERT (the code below currently loads the ELMo TF-Hub module)

In [5]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from sklearn import preprocessing

#!python -m spacy download en_core_web_md #you will need to install this on first load
#import spacy
#from spacy.lang.en import English
#from spacy import displacy
#nlp = spacy.load('en_core_web_md')
from IPython.display import HTML

import logging
logging.getLogger('tensorflow').disabled = True #OPTIONAL - to disable outputs from Tensorflow

from sklearn.metrics.pairwise import cosine_similarity

W0430 18:05:24.005646 4575921600 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [13]:
class RecommenderBERT(Recommender):
    """Recommender that ranks articles by cosine similarity of text embeddings.

    NOTE(review): despite the name, `_load_bert` loads the ELMo TF-Hub module
    (https://tfhub.dev/google/elmo/2). The class and attribute names are kept
    so existing callers keep working.
    """

    def __init__(self, dataset):
        """Load the embedding module and embed every article of `dataset`.

        Each article's text is its "content_tokenized" tokens joined by spaces.
        """
        self._dataset = dataset
        self._bert = self._load_bert()

        contents = [" ".join(sample["content_tokenized"]) for sample in dataset]
        self._dataset_bert_vectorized = self._from_texts_to_vectors(contents)

    def _load_bert(self):
        """Return the TF-Hub text embedding module used by this recommender."""
        url = "https://tfhub.dev/google/elmo/2"
        return hub.Module(url)

    def _from_texts_to_vectors(self, texts):
        """Embed `texts` (a list of strings) and return one vector per text.

        A new TensorFlow session is opened, and the variables and tables are
        initialized, on every call.
        """
        embeddings = self._bert(
            texts,
            signature="default",
            as_dict=True)["default"]

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.tables_initializer())
            x = sess.run(embeddings)

        return x  # array with length len(texts), where each element is an array of length 1024

    def _get_n_most_similar_to_search(self, search_string, n):
        """Return the `n` largest cosine similarities to `search_string`.

        The result is a pandas Series whose index is the position of the
        article in the dataset and whose values are the similarities.
        """
        # `_from_texts_to_vectors` takes only the texts; passing `self._bert`
        # as a second argument raised TypeError.
        search_vect = self._from_texts_to_vectors([search_string])
        cosine_similarities = pd.Series(
            cosine_similarity(search_vect, self._dataset_bert_vectorized).flatten())
        return cosine_similarities.nlargest(n)

    def recommend_articles(self, user_state, how_many=-1):
        """Return dataset indices ordered by similarity to the read articles.

        The query vector is the mean of the embeddings of all articles listed
        in `user_state.last_read_articles` (used as dataset indices).

        Args:
            user_state: a `UserState` with at least one read article.
            how_many: maximum number of indices to return; ``-1`` (or any
                negative value, or None) returns the whole ranking.

        Raises:
            ValueError: if the user has not read any article yet.
        """
        read_indices = list(user_state.last_read_articles)
        if not read_indices:
            raise ValueError("user_state has no read articles to base a recommendation on")

        # The read articles are dataset entries, so their embeddings were
        # already computed in __init__; reuse them instead of running the
        # model again.
        search_vectors = np.asarray(self._dataset_bert_vectorized)[read_indices]
        # Average over the articles (axis 0) and keep a 2-D (1, dim) shape,
        # which is what `cosine_similarity` expects. A bare `np.mean` would
        # collapse everything to a single scalar.
        search_vect = np.mean(search_vectors, axis=0, keepdims=True)
        cosine_similarities = pd.Series(
            cosine_similarity(search_vect, self._dataset_bert_vectorized).flatten())

        # `nlargest(-1)` returns nothing, so "no limit" has to be translated
        # into the full length explicitly.
        if how_many is None or how_many < 0:
            how_many = len(cosine_similarities)
        # The Series index is the dataset position; `.index` replaces
        # `iteritems()`, which no longer exists in pandas 2.
        return cosine_similarities.nlargest(how_many).index.tolist()

In [7]:
# Load the preprocessed articles. Later cells index `dataset` by integer
# position and read the keys "title", "url", "read_time", "content_tokenized"
# and "tfidf" from its entries.
dataset = ReaderPreprocessedData.read_data("../preprocessed")

In [None]:
def get_random_article(dataset):
    """Return one uniformly chosen element of `dataset`.

    Raises ValueError when `dataset` is empty.
    """
    position = random.randrange(len(dataset))
    chosen = dataset[position]
    return chosen

# Interactive demo: show a few random articles, then repeatedly let the user
# pick one and show the articles most similar to what has been read so far.
counter = 0  # number of articles read so far
recommender = RecommenderBERT(dataset)
user_state = UserState()
top_n = 10

# Sample dataset *indices* rather than article dicts: the recommender needs
# the dataset position of what the user read, and that position cannot be
# recovered from the article dict alone.
candidates = random.sample(range(len(dataset)), min(top_n, len(dataset)))

print("Titles:")
for j, dataset_index in enumerate(candidates):
    article = dataset[dataset_index]
    print("{0} - Title: {1}".format(j + 1, article["title"]))
    print("URL: {0}".format(article["url"]))
    print("--------------------------------")

print()
print()
print()
time_to_read = int(input("Indicate available time [min]: "))
time_to_read *= 60  # minutes -> seconds, the unit of article["read_time"]

while candidates:
    answer = input("Choose an article (empty to stop): ").strip()
    if not answer:
        break

    # Lists are displayed 1-based, so convert the answer to a 0-based position.
    position = int(answer) - 1 if answer.isdigit() else -1
    if not 0 <= position < len(candidates):
        print("Please enter a number between 1 and {0}.".format(len(candidates)))
        continue

    # Map the position in the displayed list back to the dataset index; that
    # index (not the list position) is what the recommender must receive.
    chosen_index = candidates[position]
    chosen_article = dataset[chosen_index]
    user_state.add_read_article(chosen_index)
    counter += 1

    similarities = recommender.recommend_articles(user_state, how_many=top_n)
    # Keep only unread articles that fit in the available reading time.
    similarities = [
        sim for sim in similarities
        if sim not in user_state.last_read_articles
        and dataset[sim]["read_time"] < time_to_read
    ]

    # The filter can leave fewer than `top_n` entries, so iterate over what is
    # actually there instead of over range(top_n).
    print("The top {0} similar articles are:".format(len(similarities)))
    for i, sim in enumerate(similarities):
        print("--------------------------------")
        print("{0} - Title: {1}".format(i + 1, dataset[sim]["title"]))
        print("URL: {0}".format(dataset[sim]["url"]))
        print("Time to read [min]: {0:.2f}".format(dataset[sim]["read_time"] / 60))

    print()
    print()
    print()
    # The next choice refers to the list that was just displayed.
    candidates = similarities

if not candidates:
    print("No article left to choose from.")

In [1]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def to_tsne(x):
    """Project `x` to two dimensions.

    `x` is first reduced to 50 components with PCA; the result is then
    embedded in 2 dimensions with t-SNE and returned.
    """
    pca_reducer = PCA(n_components=50)
    tsne_reducer = TSNE(n_components=2)
    return tsne_reducer.fit_transform(pca_reducer.fit_transform(x))

In [2]:
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

def scatter_plot(y, labels=None):
    """Write an interactive scatter plot of 2-D points to 'Sentence encode.html'.

    Args:
        y: sequence of points; element 0 of each point is plotted on the x
            axis and element 1 on the y axis.
        labels: optional hover text, one string per point. Each marker is
            coloured by the length of its label. When omitted, the
            notebook-level variable `sentences` is used if it exists (the
            behaviour of earlier versions of this function); otherwise the
            point positions "0", "1", ... are used.
    """
    if labels is None:
        # Earlier versions read a global `sentences` unconditionally and
        # failed with NameError when it was not defined.
        labels = globals().get("sentences")
    if labels is None:
        labels = [str(position) for position in range(len(y))]
    labels = list(labels)

    data = [
        go.Scatter(
            x=[point[0] for point in y],
            y=[point[1] for point in y],
            mode='markers',
            text=labels,
            marker=dict(
                size=16,
                color=[len(label) for label in labels],  # colour encodes label length
                opacity=0.8,
                colorscale='Viridis',
                showscale=False
            )
        )
    ]
    layout = dict(
        yaxis=dict(zeroline=False),
        xaxis=dict(zeroline=False)
    )
    fig = go.Figure(data=data, layout=layout)
    plot(fig, filename='Sentence encode.html')

In [None]:
# `self` does not exist at notebook top level; reuse the embeddings that the
# recommender computed for every article when it was constructed.
dataset_bert_vectorized = recommender._dataset_bert_vectorized

In [None]:
# `to_tsne` requires the matrix to reduce: project the article embeddings to 2-D.
tsne = to_tsne(dataset_bert_vectorized)

In [None]:
# `scatter_plot` takes the 2-D points as its argument and labels them from the
# notebook-level `sentences`, which was never defined; use the article titles.
sentences = [article["title"] for article in dataset]
scatter_plot(tsne)