In [2]:
import pandas
import sys
from utils import jaccard_similarity, load_data

In [3]:
dataset = load_data("./active1000")

In [4]:
# Remove unused columns.
dataset.drop(columns=["eventId", "category", "activeTime", "title", "publishtime", "time", "documentId"], inplace=True)

# Remove events related to the home page.
dataset.drop(index=dataset[dataset.url == "http://adressa.no"].index, inplace=True)

print(dataset)

                                                       url  \
2        http://adressa.no/nyheter/trondheim/2016/12/31...   
4        http://adressa.no/pluss/kultur/2016/12/31/bare...   
15       http://adressa.no/nyheter/2016/12/31/se-lesern...   
16       http://adressa.no/nyheter/trondheim/2016/12/31...   
17       http://adressa.no/nyheter/2016/12/31/se-lesern...   
...                                                    ...   
2207591  http://adressa.no/nyheter/moreromsdal/2017/03/...   
2207598  http://adressa.no/nyheter/trondheim/2017/03/31...   
2207599  http://adressa.no/nyheter/trondheim/2017/03/31...   
2207603  http://adressa.no/pluss/magasin/2017/03/31/tre...   
2207607  http://adressa.no/incoming/2017/03/31/transpar...   

                                              userId  
2                  cx:iimz2wwcwxu7d721:2r8odp9zhg5yp  
4                  cx:iimz2wwcwxu7d721:2r8odp9zhg5yp  
15                 cx:il0sdznsjgg9uxgy:3bi2ksost85yi  
16                  cx:ihnzu06beuaz

In [23]:
# Create sets of articles per user
user_sets = dataset.groupby(by="userId")["url"].apply(set)

In [28]:
print(user_sets)
print(type(user_sets))

userId
cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7    {http://adressa.no/100sport/fotball/bendtners-...
cx:11asuyo07mbi13b849lp87hlje:1i9z1hu1xx694    {http://adressa.no/pluss/okonomi/2017/01/05/i-...
cx:13077170178592105431908:2so1nc52xur4d       {http://adressa.no/pluss/kultur/2017/03/15/thr...
cx:13082126012361420762846:3nhzg71lk2zqc       {http://adressa.no/nyheter/utenriks/2017/01/14...
cx:13082926635761580649288:30q2cgmwa3mzi       {http://adressa.no/familieogoppvekst/sjekk-hvo...
                                                                     ...                        
cx:ymgd4qz84t6z31b22zecmrpz3:2bclyyq01sfwd     {http://adressa.no/nyheter/utenriks/2017/01/14...
cx:ywxiqgtmxyfc24eqhyzpz7jer:28kehwib7kyi8     {http://adressa.no/familieogoppvekst/sjekk-hvo...
cx:z5nmgvym79d91yxajr8rncjp3:dd23bgkvv19x      {http://adressa.no/kultur/2017/01/02/det-er-fu...
cx:zdxhidle29xquzztafvsik92:2w17zz40qj619      {http://adressa.no/familieogoppvekst/sjekk-hvo...
cx:ztquyfd3pug92dd4jpqz

In [56]:
def jaccard_user(user_id: str, user_sets: pandas.Series) -> pandas.DataFrame:
  """ Find the jaccard similarity between the user with supplied ID and all other users.

  Args:
    user_id: The ID of the user to compare to other users.
    user_sets: The series of sets of articles read by each user.
  """
  sim_map = {}
  user_set = user_sets[user_id]

  for other_user, other_user_set in user_sets.iteritems():
    jacc = jaccard_similarity(user_set, other_user_set)
    sim_map[other_user] = jacc

  return pandas.DataFrame.from_dict(sim_map, columns=['Similarity'], orient='index')

In [26]:
jaccard_user('cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7', user_sets)

Unnamed: 0,Jaccard
cx:10k2wzm1b3jsk2y5ym1utnjo97:2kefzqaxe9jx7,1.000000
cx:11asuyo07mbi13b849lp87hlje:1i9z1hu1xx694,0.075800
cx:13077170178592105431908:2so1nc52xur4d,0.049149
cx:13082126012361420762846:3nhzg71lk2zqc,0.127131
cx:13082926635761580649288:30q2cgmwa3mzi,0.137699
...,...
cx:ymgd4qz84t6z31b22zecmrpz3:2bclyyq01sfwd,0.085466
cx:ywxiqgtmxyfc24eqhyzpz7jer:28kehwib7kyi8,0.120022
cx:z5nmgvym79d91yxajr8rncjp3:dd23bgkvv19x,0.035971
cx:zdxhidle29xquzztafvsik92:2w17zz40qj619,0.127793


In [75]:
def knn_user(user_id: str, k: int, similarities: pandas.DataFrame, user_sets: pandas.Series) -> pandas.Series:
  """ Given the supplied similarity series, find the recommended articles based on kNN.

  Args:
    user_id: The ID of the user to find recommended articles for.
    k: The number of similar users recommendations should be created from.
    similarities: Similarities between the supplied user and all other users.
    user_sets: The series of sets of articles read by each user.
  """
  knn = similarities.drop(user_id).nlargest(k, columns=['Similarity'])

  weighted_articles = {}

  # Iterate all similar users
  for other_user_id, row in knn.iterrows():
    other_user_set = user_sets[other_user_id]

    # Add influence by user on article
    for article in other_user_set:
      if article not in weighted_articles:
        weighted_articles[article] = 0
      weighted_articles[article] += row['Similarity']

  # Remove articles already seen by user
  for article in user_sets[user_id]:
    weighted_articles.pop(article, None)

  # Sort as list and return
  weighted_articles = sorted(weighted_articles.items(), key=lambda x: x[1], reverse=True)
  print(weighted_articles[0:20])
      

  

In [78]:
user_id = 'cx:13082926635761580649288:30q2cgmwa3mzi'
knn_user(user_id, 20, jaccard_user(user_id, user_sets), user_sets)

[('http://adressa.no/pluss/okonomi/2017/02/19/klesbutikk-lagt-ned-etter-25-%c3%a5r-i-midtbyen-14258480.ece', 3.459773815462648), ('http://adressa.no/nyheter/nordtrondelag/2017/03/19/trafikkuhell-ved-bensinstasjon-14471307.ece', 3.284526579199455), ('http://adressa.no/nyheter/trondheim/2017/01/13/fotgjenger-raste-mot-syklist-p%c3%a5-gangstien-14055414.ece', 3.2839320196315414), ('http://adressa.no/nyheter/okonomi/2017/02/09/turoperat%c3%b8r-er-konkurs-14191484.ece', 3.2818640418052016), ('http://adressa.no/pluss/nyheter/2017/02/27/et-halvt-%c3%a5r-etter-blir-det-levert-i-trondheim-14321349.ece', 3.280898313807316), ('http://adressa.no/pluss/okonomi/2017/01/15/city-syd-kan-bli-st%c3%b8rst-igjen-i-jubileums%c3%a5ret-14048772.ece', 3.277353953463934), ('http://adressa.no/100sport/langrenn_old/skimammaens-dod-skaper-frykt-i-smoreboden-229674b.html', 3.2754299061156638), ('http://adressa.no/nyheter/sortrondelag/2017/03/19/fant-skjelett-trolig-fra-istida-14471445.ece', 3.269390050484574), ('h