In [64]:
import pandas
import sys
import numpy as np
from utils import jaccard_similarity, load_data
from functools import reduce


In [65]:
dataset = load_data("./active1000")

In [66]:
# Remove unused columns.
dataset.drop(columns=["eventId", "category", "activeTime", "title", "publishtime", "time", "documentId"], inplace=True)

# Remove events related to the home page.
dataset.drop(index=dataset[dataset.url == "http://adressa.no"].index, inplace=True)

print(dataset)

                                                       url  \
1        http://adressa.no/nyheter/trondheim/2017/03/17...   
7        http://adressa.no/nyheter/okonomi/2017/03/20/h...   
8        http://adressa.no/100sport/idrettspolitikk/her...   
9        http://adressa.no/pluss/nyheter/2017/03/20/ott...   
10       http://adressa.no/pluss/nyheter/2017/03/20/i-e...   
...                                                    ...   
2207599  http://adressa.no/nyheter/trondheim/2017/03/07...   
2207600  http://adressa.no/100sport/fotball/slik-reager...   
2207603  http://adressa.no/100sport/meninger/na-tar-kar...   
2207604  http://adressa.no/pluss/nyheter/2017/03/07/reg...   
2207606  http://adressa.no/nyheter/nordtrondelag/2017/0...   

                                              userId  
1           cx:13675964253951224092221:1wgstmuzo9vjl  
7                   cx:i7m47c8k5538rd2u:sdl67619goo2  
8            cx:1359724185772431277628:1bo1nngcb9hcr  
9           cx:13675964253951224092

In [67]:
# Create sets of articles per user
user_sets = dataset.groupby(by="userId")["url"].apply(set)

In [68]:
print(user_sets)
print(type(user_sets))

userId
cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7    {http://adressa.no/pluss/okonomi/2017/03/05/ko...
cx:11asuyo07mbi13b849lp87hlje:1i9z1hu1xx694    {http://adressa.no/nyheter/sortrondelag/2017/0...
cx:13077170178592105431908:2so1nc52xur4d       {http://adressa.no/pluss/kultur/2017/03/02/ise...
cx:13082126012361420762846:3nhzg71lk2zqc       {http://adressa.no/100sport/vintersport/langre...
cx:13082926635761580649288:30q2cgmwa3mzi       {http://adressa.no/100sport/vintersport/langre...
                                                                     ...                        
cx:ymgd4qz84t6z31b22zecmrpz3:2bclyyq01sfwd     {http://adressa.no/100sport/vintersport/sa-mye...
cx:ywxiqgtmxyfc24eqhyzpz7jer:28kehwib7kyi8     {http://adressa.no/100sport/vintersport/langre...
cx:z5nmgvym79d91yxajr8rncjp3:dd23bgkvv19x      {http://adressa.no/meninger/2017/03/21/tren-de...
cx:zdxhidle29xquzztafvsik92:2w17zz40qj619      {http://adressa.no/pluss/okonomi/2017/03/05/ko...
cx:ztquyfd3pug92dd4jpqz

In [69]:
def jaccard_user(user_id: str, user_sets: pandas.Series) -> pandas.DataFrame:
  """ Find the jaccard similarity between the user with supplied ID and all other users.

  Args:
    user_id: The ID of the user to compare to other users.
    user_sets: The series of sets of articles read by each user.
  """
  sim_map = {}
  user_set = user_sets[user_id]

  for other_user, other_user_set in user_sets.iteritems():
    jacc = jaccard_similarity(user_set, other_user_set)
    sim_map[other_user] = jacc

  return pandas.DataFrame.from_dict(sim_map, columns=['Similarity'], orient='index')

In [70]:
jaccard_user('cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7', user_sets)

Unnamed: 0,Similarity
cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7,1.000000
cx:11asuyo07mbi13b849lp87hlje:1i9z1hu1xx694,0.075800
cx:13077170178592105431908:2so1nc52xur4d,0.049149
cx:13082126012361420762846:3nhzg71lk2zqc,0.127131
cx:13082926635761580649288:30q2cgmwa3mzi,0.137699
...,...
cx:ymgd4qz84t6z31b22zecmrpz3:2bclyyq01sfwd,0.085466
cx:ywxiqgtmxyfc24eqhyzpz7jer:28kehwib7kyi8,0.120022
cx:z5nmgvym79d91yxajr8rncjp3:dd23bgkvv19x,0.035971
cx:zdxhidle29xquzztafvsik92:2w17zz40qj619,0.127793


In [71]:
def knn_user(user_id: str, k: int, similarities: pandas.DataFrame, user_sets: pandas.Series) -> pandas.Series:
  """ Given the supplied similarity series, find the recommended articles based on kNN.

  Args:
    user_id: The ID of the user to find recommended articles for.
    k: The number of similar users recommendations should be created from.
    similarities: Similarities between the supplied user and all other users.
    user_sets: The series of sets of articles read by each user.
  """
  knn = similarities.drop(user_id).nlargest(k, columns=['Similarity'])

  weighted_articles = {}

  # Iterate all similar users
  for other_user_id, row in knn.iterrows():
    other_user_set = user_sets[other_user_id]

    # Add influence by user on article
    for article in other_user_set:
      if article not in weighted_articles:
        weighted_articles[article] = 0
      weighted_articles[article] += row['Similarity']

  # Remove articles already seen by user
  for article in user_sets[user_id]:
    weighted_articles.pop(article, None)

  # Sort as list and return
  weighted_articles = sorted(weighted_articles.items(), key=lambda x: x[1], reverse=True)
  return weighted_articles
      

  

In [104]:
# Jaccard example
user_id = 'cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7'
for article, weight in knn_user(user_id, 20, jaccard_user(user_id, user_sets), user_sets)[0:20]:
  print(weight, article)

2.8858390960725 http://adressa.no/100sport/fotball/kona-til-bruttern-blir-styreleder-i-klubben-der-kare-fikk-sparken-230166b.html
2.7287577072358755 http://adressa.no/100sport/vintersport/langrenn/full-krangel-etter-at-northug-ble-vraket-229290b.html
2.5939002495354164 http://adressa.no/100sport/vintersport/langrenn/emil-iversen-roper-arsaken-til-den-kraftige-reaksjonen---jeg-gikk-for-en-kompis-som-har-hatt-det-toft-229576b.html
2.5922344449097094 http://adressa.no/100sport/fotball/fotballtopp-idrettsforbundet-er-verre-enn-donald-trump-231685b.html
2.5913971325535554 http://adressa.no/100sport/vintersport/langrenn/sjokkmaling-viser-krise-for-skiforbundet---vi-kan-ikke-leve-med-dette-231298b.html
2.587172276224065 http://adressa.no/100sport/vintersport/langrenn/-bjorgens-eks-trener--jeg-fikk-skylda-uten-a-kunne-forsvare-meg-231655b.html
2.58016827056009 http://adressa.no/100sport/vintersport/tanken-pa-at-lofshus-far-sparken_-skremmer-meg-231134b.html
2.577516172247162 http://adressa.no/

In [73]:
articles = dataset["url"].unique()

def recommend_random(n, user_id):
  recommended_articles = []
  for i in np.random.randint(0, len(articles), n*5):
    recommended_articles.append(articles[i])

  # Remove articles already seen by user
  for article in user_sets[user_id]:
    try: 
      recommended_articles.remove(article)
    except:
      pass

  return recommended_articles[0:n]

In [74]:
# Random example
user_id = 'cx:ztquyfd3pug92dd4jpqzsvxfw:dixlo02oz5eu'
for article in recommend_random(20, user_id):
  print(article)

http://adressa.no/meninger/2017/02/06/mer-enn-pennestr%c3%b8k-m%c3%a5-til-for-%c3%a5-avskaffe-fattigdom-14178267.ece
http://adressa.no/100sport/fotball/svensson-pa-tribunen-da-friday-scoret-to-i-alkmaar-tap-230000b.html
http://adressa.no/nyheter/okonomi/2015/10/27/mener-%c3%b8nsket-om-bilfri-midtby-er-%c2%abbasert-p%c3%a5-f%c3%b8lelser-ikke-fakta%c2%bb-11737603.ece
http://adressa.no/100sport/fotball/erling-braut-haland-spiller-testkamp-for-fc-kobenhavn-228733b.html
http://adressa.no/pluss/okonomi/2016/11/27/hver-arbeidsdag-er-en-konkurranse-13819845.ece
http://adressa.no/digital/fra-1g-til-4g-4538b.html
http://adressa.no/video/article14114338.ece
http://adressa.no/bolig/bygger-mikrohus-pa-under-20-kvm-mener-nordmenn-ma-slutte-a-breie-seg-9518b.html
http://adressa.no/100sport/vintersport/langrenn/northug---tromso-bor-bli-norges-nye-hovedstad-216539b.html
http://adressa.no/kultur/2016/06/15/rune-langlo-kan-f%c3%a5-fire-amanda-priser-12894922.ece
http://adressa.no/nyheter/utenriks/2017/03

In [101]:
"""
Perform tests of the jaccard recommendations and compare them to the random-recomendation benchmark. 
The tests calculates average precision and MRR for recommendation with 100 users and a 80%-20% train-test data split. 
"""

# Save precisions for different users
precisions = []
precisions_random = []

# Save MRRs for different users
MRRs = []
MRRs_random =[]

# Save unique arrticles that have been recommended – used to calculate catalogue coverrage.
randomly_predicted = set()
jaccard_predicted = set()

user_sets = dataset.groupby(by="userId")["url"].apply(set)

for user_id in user_sets.keys()[0:100]:
  # Copy users sets to avoid lastnig changes
  user_sets_2 = user_sets.copy()
  train_set = user_sets_2[user_id]

  # Create a test consisting of 20% of the users data.
  # Predictions are tested against this test set. 
  test_set = set()
  for _ in range(0, len(train_set)//5):
    # 20% of data is removes from train set and addded to test set
    test_set.add(train_set.pop())
    # Skip empty users
  if(len(test_set) == 0):
    continue
  # Hide test set from training data
  user_sets_2[user_id] = train_set

  # Jaccard precision
  predictions = knn_user(user_id, 30, jaccard_user(user_id, user_sets_2), user_sets_2)[0: len(test_set)]
  predictions_set = set([prediction[0] for prediction in predictions])
  correct_guesses = len(predictions_set.intersection(test_set))
  precision = correct_guesses/len(predictions_set)
  precisions.append(precision)

  # Save Jaccard catalogue coverage
  jaccard_predicted = jaccard_predicted.union(predictions_set)

  # Jaccard MRR 
  ranks = []
  for corrct_guess in predictions_set.intersection(test_set):
    # Janky way to get rank, as it is stores in test_set
    rank = [prediction[0] for prediction in predictions].index(corrct_guess)+1
    ranks.append(1/rank)
  if correct_guesses > 0:
    MRR = sum(ranks)/correct_guesses
  else:
    MRR = 0
  MRRs.append(MRR)

  # Random precision
  predictions_random = recommend_random(len(test_set), user_id)
  predictions_random_set = set(predictions_random)
  correct_random_guesses = len(predictions_random_set.intersection(test_set))
  precision_random = correct_random_guesses/len(predictions_random_set)
  precisions_random.append(precision_random)

  # Save random catalogue coverage
  randomly_predicted = randomly_predicted.union(predictions_random_set)

  # Random MRR
  ranks = []
  for corrct_guess in predictions_random_set.intersection(test_set):
    rank = predictions_random.index(corrct_guess) + 1
    ranks.append(1/rank)
  if correct_random_guesses > 0:
    MRR = sum(ranks)/correct_random_guesses
  else:
    MRR = 0
  MRRs_random.append(MRR)

# Calculate averages
average_precision = sum(precisions)/len(precisions)
average_precision_random = sum(precisions_random)/len(precisions_random)
average_mrr = sum(MRRs)/len(MRRs)
average_mrr_random = sum(MRRs_random)/len(MRRs_random)

# Print results
print("Average precision, jacc: ", average_precision)
print("Average precision, rand: ", average_precision_random)
print("Max precision, jacc:", max(precisions))
print("Max precision, random:", max(precisions_random))
print("Average MRR, jacc:", average_mrr)
print("Average MRR, random:", average_mrr_random)
print("Highest MRR, jacc:", max(MRRs))
print("Highest MRR, random:", max(MRRs_random))
print("Catalogue coverage, jacc:", len(jaccard_predicted)/len(articles))
print("Catalogue coverage, random:", len(randomly_predicted)/len(articles))



Average precision, jacc:  0.1322572793838549
Average precision, rand:  0.006915479883608633
Max precision, jacc: 0.36423841059602646
Max precision, random: 0.025252525252525252
Average MRR, jacc: 0.030083034463995455
Average MRR, random: 0.014388325612142412
Highest MRR, jacc: 0.13798665290142204
Highest MRR, random: 0.14285714285714285
Catalogue coverage, jacc: 0.06554285494361697
Catalogue coverage, random: 0.49998075664857794


In [102]:
user_sets_2[user_id]
for article, weight in knn_user(user_id, 20, jaccard_user(user_id, user_sets), user_sets)[0:20]:
  print(weight, article)

2.9178567939308335 http://adressa.no/pluss/nyheter/2017/01/04/da-han-kom-tilbake-fra-alpinbakken-s%c3%a5-bilen-slik-ut-14017129.ece
2.7188937043408234 http://adressa.no/pluss/okonomi/2017/01/28/fylte-et-tomt-lokale-med-barnekl%c3%a6r-14112692.ece
2.6082256904356793 http://adressa.no/nyheter/trondheim/2017/01/22/sovnet-i-dusjen-og-vekket-nabo-med-oversv%c3%b8mmelse-14096923.ece
2.6045121486920175 http://adressa.no/pluss/magasin/2017/02/22/malins-s%c3%b8nn-d%c3%b8de-i-magen-14296129.ece
2.5861598490239284 http://adressa.no/bolig/eksklusivt-hjemmehotell-solgte-alle-moblene-for-a-fa-hotellfolelsen-hjemme-9607b.html
2.58428599462314 http://adressa.no/nyheter/trondheim/2017/01/16/ble-forbikj%c3%b8rt-av-bussen-til-skistua-savner-ekstrabusser-til-bymarka-i-helgene-14066765.ece
2.5837552004056183 http://adressa.no/pluss/okonomi/2017/01/31/39-mister-jobben-etter-konkurs-14145829.ece
2.582731438094611 http://adressa.no/pluss/nyheter/2017/02/06/kun-den-som-har-mistet-et-barn-selv-kan-vite-hvor-hje