In [None]:
import numpy as np
import pandas as pd
import math
from collections import namedtuple

import nbimporter
from preprocessed_data_reader import ReaderPreprocessedData
from utils_os import UtilsOS

# Recommender

In [None]:
class Recommender:
    """Base class for article recommenders.

    Subclasses must override `recommend_articles`. The two namedtuples below
    are the record types used to describe a recommendation result.
    """

    # One word's contribution to a similarity score:
    # (word, share of the similarity it accounts for, idf value of the word).
    WordInfo = namedtuple('WordInfo', 'word perc_sim idf')
    # One candidate article:
    # (position in the dataset, similarity score, list of WordInfo).
    SimData = namedtuple('SimData', 'index similarity words_importance_list')

    def recommend_articles(self, article, how_many=-1):
        """Return recommendations for `article`; must be overridden.

        Args:
            article: the reference article to find recommendations for.
            how_many: maximum number of recommendations to return; the
                default of -1 is the "no limit" sentinel.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # The built-in is NotImplementedError; "NotImplementedException" does
        # not exist in Python and would surface as a confusing NameError.
        raise NotImplementedError

# Recommender based on TF-IDF

In [None]:
class RecommenderTFIDF(Recommender):
    """Recommender that ranks a dataset of articles by TF-IDF cosine similarity.

    Every article (both the dataset entries and the reference article) is
    accessed as ``article["tfidf"]``; that object must support ``.index``,
    ``.loc`` and column access for ``"idf"`` and the weight column
    (``"tfidf"`` / ``"logtfidf"``).
    NOTE(review): presumably a pandas DataFrame indexed by word with unique
    labels -- confirm against the preprocessing code.
    """

    def __init__(self, dataset):
        """Store the iterable of articles that recommendations are drawn from."""
        self._dataset = dataset

    def _cosine_similarity_on_tfidf_vectors(self, tfidf_1, tfidf_2, on="tfidf"):
        """Compute the cosine similarity between two sparse TF-IDF vectors.

        Args:
            tfidf_1: TF-IDF table of the first article.
            tfidf_2: TF-IDF table of the second article.
            on: name of the weight column to use, "tfidf" or "logtfidf".

        Returns:
            A tuple ``(cosine_similarity, words_importance_list)`` where the
            list holds one ``Recommender.WordInfo`` per shared index label:
            the label, its share of the dot product, and its "idf" value
            taken from ``tfidf_1``. ``(0, [])`` is returned when the vectors
            share no label, when either norm or the dot product is zero, or
            when the inputs lack the required columns.

        Raises:
            ValueError: if `on` is not an admissible column name.
        """
        admissible_on = ["tfidf", "logtfidf"]
        if on not in admissible_on:
            raise ValueError("on must be one of {0}".format(admissible_on))

        try:
            # Select the shared labels once and use them for BOTH vectors.
            # `.loc` with a list containing missing labels raises KeyError in
            # current pandas, and selecting each side by the other's index
            # would leave the two sides in different orders.
            common = tfidf_1.index.intersection(tfidf_2.index)
            if len(common) == 0:
                return 0, []  # no overlap -> orthogonal vectors

            a = tfidf_1.loc[common]
            b = tfidf_2.loc[common]
            prod = np.multiply(a[on].values, b[on].values)
            dot = np.sum(prod)

            # Norms are taken over the full vectors (default is the 2-norm),
            # not only over the shared labels.
            norm_product = np.linalg.norm(tfidf_1[on].values) * np.linalg.norm(tfidf_2[on].values)
            if norm_product == 0 or dot == 0:
                # Avoid NaN/inf results, which would also break sorting later.
                return 0, []

            cosine_similarity = dot / norm_product
            perc_in_similarity = prod / dot  # share of each word in the dot product
            words_importance_list = [
                Recommender.WordInfo(*t)
                for t in zip(a.index.values, perc_in_similarity, a["idf"].values)
            ]  # [WordInfo(word, perc_sim, idf), ...]
            return cosine_similarity, words_importance_list
        except (KeyError, ValueError):
            # Best effort, as before: malformed vectors (e.g. a missing
            # column, or mismatched shapes caused by duplicate labels) count
            # as "not similar". Unlike a bare `except`, this no longer hides
            # KeyboardInterrupt/SystemExit or unrelated programming errors.
            return 0, []

    def _order_dataset_by_similarity(self, reference_article):
        """Score every dataset article against `reference_article`.

        Similarity is computed on the "logtfidf" column.

        Returns:
            A list of ``Recommender.SimData`` (dataset position, similarity,
            WordInfo list sorted by decreasing ``perc_sim``), sorted by
            decreasing similarity.
        """
        similarities = []
        for i, article in enumerate(self._dataset):
            cos_sim, words_importance_list = self._cosine_similarity_on_tfidf_vectors(
                reference_article["tfidf"], article["tfidf"], on="logtfidf")
            # Most influential words first.
            words_importance_list = sorted(words_importance_list, key=lambda t: t.perc_sim, reverse=True)
            similarities.append(Recommender.SimData(i, cos_sim, words_importance_list))
        # Most similar articles first.
        return sorted(similarities, key=lambda t: t.similarity, reverse=True)

    def recommend_articles(self, article, how_many=-1):
        """Return the dataset articles most similar to `article`.

        Args:
            article: reference article; its ``"tfidf"`` entry is compared
                against every article in the dataset.
            how_many: maximum number of results. ``-1`` (the default), any
                other negative value, or ``None`` returns the full ranking.

        Returns:
            A list of ``Recommender.SimData`` sorted by decreasing similarity.
        """
        similarities = self._order_dataset_by_similarity(article)
        if how_many is None or how_many < 0:
            # A negative slice bound would silently drop items from the end.
            return similarities
        return similarities[:how_many]