In [64]:
import pandas
import sys
import numpy as np
from utils import jaccard_similarity, load_data
from functools import reduce


In [65]:
dataset = load_data("./active1000")

In [66]:
# Remove unused columns.
dataset.drop(columns=["eventId", "category", "activeTime", "title", "publishtime", "time", "documentId"], inplace=True)

# Remove events related to the home page.
dataset.drop(index=dataset[dataset.url == "http://adressa.no"].index, inplace=True)

print(dataset)

                                                       url  \
1        http://adressa.no/nyheter/trondheim/2017/03/17...   
7        http://adressa.no/nyheter/okonomi/2017/03/20/h...   
8        http://adressa.no/100sport/idrettspolitikk/her...   
9        http://adressa.no/pluss/nyheter/2017/03/20/ott...   
10       http://adressa.no/pluss/nyheter/2017/03/20/i-e...   
...                                                    ...   
2207599  http://adressa.no/nyheter/trondheim/2017/03/07...   
2207600  http://adressa.no/100sport/fotball/slik-reager...   
2207603  http://adressa.no/100sport/meninger/na-tar-kar...   
2207604  http://adressa.no/pluss/nyheter/2017/03/07/reg...   
2207606  http://adressa.no/nyheter/nordtrondelag/2017/0...   

                                              userId  
1           cx:13675964253951224092221:1wgstmuzo9vjl  
7                   cx:i7m47c8k5538rd2u:sdl67619goo2  
8            cx:1359724185772431277628:1bo1nngcb9hcr  
9           cx:13675964253951224092

In [67]:
# Create sets of articles per user
user_sets = dataset.groupby(by="userId")["url"].apply(set)

In [68]:
print(user_sets)
print(type(user_sets))

userId
cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7    {http://adressa.no/pluss/okonomi/2017/03/05/ko...
cx:11asuyo07mbi13b849lp87hlje:1i9z1hu1xx694    {http://adressa.no/nyheter/sortrondelag/2017/0...
cx:13077170178592105431908:2so1nc52xur4d       {http://adressa.no/pluss/kultur/2017/03/02/ise...
cx:13082126012361420762846:3nhzg71lk2zqc       {http://adressa.no/100sport/vintersport/langre...
cx:13082926635761580649288:30q2cgmwa3mzi       {http://adressa.no/100sport/vintersport/langre...
                                                                     ...                        
cx:ymgd4qz84t6z31b22zecmrpz3:2bclyyq01sfwd     {http://adressa.no/100sport/vintersport/sa-mye...
cx:ywxiqgtmxyfc24eqhyzpz7jer:28kehwib7kyi8     {http://adressa.no/100sport/vintersport/langre...
cx:z5nmgvym79d91yxajr8rncjp3:dd23bgkvv19x      {http://adressa.no/meninger/2017/03/21/tren-de...
cx:zdxhidle29xquzztafvsik92:2w17zz40qj619      {http://adressa.no/pluss/okonomi/2017/03/05/ko...
cx:ztquyfd3pug92dd4jpqz

In [69]:
def jaccard_user(user_id: str, user_sets: pandas.Series) -> pandas.DataFrame:
  """ Find the jaccard similarity between the user with supplied ID and all other users.

  Args:
    user_id: The ID of the user to compare to other users.
    user_sets: The series of sets of articles read by each user.
  """
  sim_map = {}
  user_set = user_sets[user_id]

  for other_user, other_user_set in user_sets.iteritems():
    jacc = jaccard_similarity(user_set, other_user_set)
    sim_map[other_user] = jacc

  return pandas.DataFrame.from_dict(sim_map, columns=['Similarity'], orient='index')

In [70]:
jaccard_user('cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7', user_sets)

Unnamed: 0,Similarity
cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7,1.000000
cx:11asuyo07mbi13b849lp87hlje:1i9z1hu1xx694,0.075800
cx:13077170178592105431908:2so1nc52xur4d,0.049149
cx:13082126012361420762846:3nhzg71lk2zqc,0.127131
cx:13082926635761580649288:30q2cgmwa3mzi,0.137699
...,...
cx:ymgd4qz84t6z31b22zecmrpz3:2bclyyq01sfwd,0.085466
cx:ywxiqgtmxyfc24eqhyzpz7jer:28kehwib7kyi8,0.120022
cx:z5nmgvym79d91yxajr8rncjp3:dd23bgkvv19x,0.035971
cx:zdxhidle29xquzztafvsik92:2w17zz40qj619,0.127793


In [71]:
def knn_user(user_id: str, k: int, similarities: pandas.DataFrame, user_sets: pandas.Series) -> pandas.Series:
  """ Given the supplied similarity series, find the recommended articles based on kNN.

  Args:
    user_id: The ID of the user to find recommended articles for.
    k: The number of similar users recommendations should be created from.
    similarities: Similarities between the supplied user and all other users.
    user_sets: The series of sets of articles read by each user.
  """
  knn = similarities.drop(user_id).nlargest(k, columns=['Similarity'])

  weighted_articles = {}

  # Iterate all similar users
  for other_user_id, row in knn.iterrows():
    other_user_set = user_sets[other_user_id]

    # Add influence by user on article
    for article in other_user_set:
      if article not in weighted_articles:
        weighted_articles[article] = 0
      weighted_articles[article] += row['Similarity']

  # Remove articles already seen by user
  for article in user_sets[user_id]:
    weighted_articles.pop(article, None)

  # Sort as list and return
  weighted_articles = sorted(weighted_articles.items(), key=lambda x: x[1], reverse=True)
  return weighted_articles
      

  

In [72]:
# Jaccard example
user_id = 'cx:ztquyfd3pug92dd4jpqzsvxfw:dixlo02oz5eu'
for article, weight in knn_user(user_id, 20, jaccard_user(user_id, user_sets), user_sets)[0:20]:
  print(weight, article)

4.358620085401448 http://adressa.no/nyheter/trondheim/2017/02/26/kastet-m%c3%b8bler-ut-vinduet-14314808.ece
4.358620085401448 http://adressa.no/nyheter/sortrondelag/2017/01/31/skulle-til-hitra-havnet-p%c3%a5-frosta-14145824.ece
4.358620085401448 http://adressa.no/pluss/nyheter/2017/02/28/frost-kj%c3%b8per-meny-eiendommen-p%c3%a5-lade-14327926.ece
4.14008998375954 http://adressa.no/pluss/okonomi/2017/03/08/legger-ned-butikk-i-midtbyen-etter-30-%c3%a5r-14406848.ece
4.139332873661406 http://adressa.no/nyheter/sortrondelag/2017/01/05/vi-skal-ikke-drive-med-biljakt-14018144.ece
3.9293355279310402 http://adressa.no/100sport/vintersport/langrenn/pappa-northugs-vm-tabbe-her-ma-han-forlate-lahti-rett-for-sprinten-231071b.html
3.923755048914975 http://adressa.no/nyheter/moreromsdal/2017/01/02/navnet-p%c3%a5-druknet-40-%c3%a5ring-er-frigitt-14006579.ece
3.9103626382207994 http://adressa.no/100sport/vintersport/langrenn/svensk-skistjerne-tror-ikke-pa-northug---har-ikke-bevist-mye-230452b.html
3.71

In [73]:
articles = dataset["url"].unique()

def recommend_random(n, user_id):
  recommended_articles = []
  for i in np.random.randint(0, len(articles), n*5):
    recommended_articles.append(articles[i])

  # Remove articles already seen by user
  for article in user_sets[user_id]:
    try: 
      recommended_articles.remove(article)
    except:
      pass

  return recommended_articles[0:n]

In [74]:
# Random example
user_id = 'cx:ztquyfd3pug92dd4jpqzsvxfw:dixlo02oz5eu'
for article in recommend_random(20, user_id):
  print(article)

http://adressa.no/meninger/2017/02/06/mer-enn-pennestr%c3%b8k-m%c3%a5-til-for-%c3%a5-avskaffe-fattigdom-14178267.ece
http://adressa.no/100sport/fotball/svensson-pa-tribunen-da-friday-scoret-to-i-alkmaar-tap-230000b.html
http://adressa.no/nyheter/okonomi/2015/10/27/mener-%c3%b8nsket-om-bilfri-midtby-er-%c2%abbasert-p%c3%a5-f%c3%b8lelser-ikke-fakta%c2%bb-11737603.ece
http://adressa.no/100sport/fotball/erling-braut-haland-spiller-testkamp-for-fc-kobenhavn-228733b.html
http://adressa.no/pluss/okonomi/2016/11/27/hver-arbeidsdag-er-en-konkurranse-13819845.ece
http://adressa.no/digital/fra-1g-til-4g-4538b.html
http://adressa.no/video/article14114338.ece
http://adressa.no/bolig/bygger-mikrohus-pa-under-20-kvm-mener-nordmenn-ma-slutte-a-breie-seg-9518b.html
http://adressa.no/100sport/vintersport/langrenn/northug---tromso-bor-bli-norges-nye-hovedstad-216539b.html
http://adressa.no/kultur/2016/06/15/rune-langlo-kan-f%c3%a5-fire-amanda-priser-12894922.ece
http://adressa.no/nyheter/utenriks/2017/03

In [99]:
# Test Jacc

recalls = []
recalls_random = []

MRRs = []
MRRs_random =[]

randomly_predicted = set()
jaccard_predicted = set()

user_sets = dataset.groupby(by="userId")["url"].apply(set)

predictions_made = 0

print("Recall, predictions_made")
for user_id in user_sets.keys()[0:100]:
  user_sets_2 = user_sets.copy()
  this_user_set = user_sets_2[user_id]

  test_set = set()
  for _ in range(0, len(this_user_set)//5):
    test_set.add(this_user_set.pop())

  if(len(test_set) == 0):
    continue

  predictions_made += len(test_set)

  user_sets_2[user_id] = this_user_set

  ### Jaccasrd
  predictions = knn_user(user_id, 30, jaccard_user(user_id, user_sets_2), user_sets_2)[0: len(test_set)]
  predictions_2 = set([prediction[0] for prediction in predictions])
  jaccard_predicted = jaccard_predicted.union(predictions_2)
  correct_guesses = len(predictions_2.intersection(test_set))
  recall = correct_guesses/len(predictions_2)
  recalls.append(recall)


  ### MRRR 
  ranks = []
  for corrct_guess in predictions_2.intersection(test_set):
    # Janky way to get rank, as it is stores in test_set
    rank = [prediction[0] for prediction in predictions].index(corrct_guess)+1
    ranks.append(1/rank)
  if correct_guesses > 0:
    MRR = sum(ranks)/correct_guesses
  else:
    MRR = 0
  MRRs.append(MRR)
  print(MRR, len(predictions_2))


  # print(recall, len(predictions_2))

  #### Random benchmark

  predictions_random = recommend_random(len(test_set), user_id)
  predictions_random_2 = set(predictions_random)
  randomly_predicted = randomly_predicted.union(predictions_random_2)
  correct_random_guesses = len(predictions_random_2.intersection(test_set))
  recall_random = correct_random_guesses/len(predictions_random_2)
  recalls_random.append(recall_random)

  ### MRRR 
  ranks = []
  for corrct_guess in predictions_random_2.intersection(test_set):
    rank = predictions_random.index(corrct_guess) + 1
    ranks.append(1/rank)
  if correct_random_guesses > 0:
    MRR = sum(ranks)/correct_random_guesses
  else:
    MRR = 0
  MRRs_random.append(MRR)


#average_ctr = reduce(lambda x, y: x + y, ctrs)/len(ctrs)
average_recall = sum(recalls)/len(recalls)
average_recall_random = sum(recalls_random)/len(recalls_random)

average_mrr = sum(MRRs)/len(MRRs)
average_mrr_random = sum(MRRs_random)/len(MRRs_random)

print("Average recall jacc: ", average_recall)
print("Max recall jacc", max(recalls))
print("Average recall rand: ", average_recall_random)
print("Max random recall", max(recalls_random))
print("Unique articles predicted, jacc", len(jaccard_predicted)/len(articles))
print("Unique articles predicted, random", len(randomly_predicted)/len(articles))
print("Average MRR, jacc", average_mrr)
print("Average MRR, random", average_mrr_random)
print("Highest MRR, jacc", max(MRRs))
print("Highest MRR, random", max(MRRs_random))



Recall, predictions_made
0.06409188411494875 173
0.07864651715593901 136
0.04866975390217758 48
0.1027211211609051 144
0.0625612919598121 198
0.06243320259488796 179
0.13798665290142204 125
0.03225806451612903 34
0.017975679535560837 149
0.12165817971506528 103
0.09119102924402822 95
0.06354185024667523 105
0.030018018933301412 222
0.0611945988969977 112
0.05239571972724163 177
0.053594942266019456 118
0.10281264710548964 129
0.03331730590337459 250
0.028442471311716875 219
0.03735581427488453 212
0.0232971833003618 198
0.03988721757643216 334
0.024062665749589894 192
0.02284780675529793 142
0.05961855331011165 181
0.049584676571418865 123
0.017836319459585914 126
0.03990446462698049 129
0.020168067226890754 86
0.053248407751367105 135
0.031537267318851106 160
0.04308967195533653 204
0.021404018013530913 144
0.038115982840677756 243
0.013797675472975317 169
0.0254902062565321 151
0.052786519780598506 214
0.027020328393594983 203
0.033591863276889025 207
0.029688413776749096 142
0.03200

In [None]:
# test random


In [None]:
user_sets_2[user_id]
for article, weight in knn_user(user_id, 20, jaccard_user(user_id, user_sets), user_sets)[0:20]:
  print(weight, article)