In [66]:
import pandas as pd
import numpy as np
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
import scipy.sparse as sparse
from implicit.als import AlternatingLeastSquares
from tqdm import tqdm_notebook

In [47]:
user_taggedbookmarks = pd.read_csv('hetrec2011-delicious-2k/user_taggedbookmarks.dat', sep='\t')
bookmarks = pd.read_csv('hetrec2011-delicious-2k/bookmarks.dat', sep='\t', encoding='koi8-r')

In [48]:
user_taggedbookmarks.head()

Unnamed: 0,userID,bookmarkID,tagID,day,month,year,hour,minute,second
0,8,1,1,8,11,2010,23,29,22
1,8,2,1,8,11,2010,23,25,59
2,8,7,1,8,11,2010,18,55,1
3,8,7,6,8,11,2010,18,55,1
4,8,7,7,8,11,2010,18,55,1


In [49]:
bookmarks = bookmarks.drop(["md5","url", "title",  "md5Principal"], axis = 1)


In [78]:
bookmarks.columns = ["bookmarkID","urlPrincipal" ]
bookmarks.head()

Unnamed: 0,bookmarkID,urlPrincipal
0,1,www.ifla.org
1,2,archive.ifla.org
2,7,www.edselect.com
3,8,www.collectionscanada.gc.ca
4,9,www.kidsreads.com


In [51]:
added_bkmrks = user_taggedbookmarks.groupby(by=['userID', 'bookmarkID']).count().reset_index()[['userID', 'bookmarkID']]

In [52]:
added_bkmrks['usr_num'] = added_bkmrks.userID.astype("category").cat.codes
added_bkmrks['bmk_num'] = added_bkmrks.bookmarkID.astype("category").cat.codes

In [53]:

added_bkmrks.head()

Unnamed: 0,userID,bookmarkID,usr_num,bmk_num
0,8,1,0,0
1,8,2,0,1
2,8,7,0,2
3,8,8,0,3
4,8,9,0,4


In [54]:
common_table = pd.merge(added_bkmrks , bookmarks,  on = "bookmarkID").

In [60]:
common_table.sort_values(by=["usr_num"])

Unnamed: 0,userID,bookmarkID,usr_num,bmk_num,urlPrincipal
0,8,1,0,0,www.ifla.org
143,8,65,0,43,www.loc.gov
141,8,64,0,42,www.loc.gov
138,8,63,0,41,www.slideshare.net
135,8,62,0,40,www.gigglepoetry.com
...,...,...,...,...,...
51263,108035,29934,1866,20443,hbr.org
15252,108035,7626,1866,5452,www.cosplaygate.com
35948,108035,19790,1866,13799,biased-bbc.blogspot.com
52693,108035,30959,1866,21085,googlesystem.blogspot.com


In [63]:
item_lookup = common_table[['bmk_num', 'urlPrincipal']].drop_duplicates()
item_lookup['bmk_num'] = item_lookup.bmk_num.astype(str)

In [64]:
item_lookup.head()

Unnamed: 0,bmk_num,urlPrincipal
0,0,www.ifla.org
2,1,archive.ifla.org
4,2,www.edselect.com
6,3,www.collectionscanada.gc.ca
8,4,www.kidsreads.com


In [67]:
bookmark_id_name = {}

for index, row in tqdm_notebook(item_lookup.iterrows()):
    bookmark_id_name[row.bmk_num] = row.urlPrincipal

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [71]:
users = list(added_bkmrks.userID.unique())
bkmrks = list(added_bkmrks.bookmarkID.unique())
data = [1 for i in range(len(added_bkmrks))]

In [72]:
rows = added_bkmrks.usr_num.astype(int)
cols = added_bkmrks.bmk_num.astype(int)

In [73]:
data_sparse = sparse.csr_matrix((data, (rows, cols)), shape=(len(users), len(bkmrks)))

In [74]:
model = AlternatingLeastSquares(factors=50)
model.fit(data_sparse)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [75]:
user_num = 100

user_items = data_sparse.T.tocsr()
recommendations = model.recommend(user_num, user_items)

In [76]:
recommendations

[(1458, 0.098666295),
 (3, 0.077085234),
 (4, 0.07212396),
 (74, 0.040472507),
 (86, 0.038329937),
 (1538, 0.0369792),
 (1550, 0.034216374),
 (251, 0.033557698),
 (1737, 0.03288185),
 (1480, 0.028505333)]

In [77]:
for r in recommendations:
    print(bookmark_id_name[str(r[0])])

www.diylife.com
www.collectionscanada.gc.ca
www.kidsreads.com
celestinechua.com
www.mixriot.com
www.viewzi.com
www.typo-shark.com
www.thegnomonworkshop.com
roomfordebate.blogs.nytimes.com
lifehacker.com
