In [1]:
!unzip /content/ml-100k.zip -d sample_data

Archive:  /content/ml-100k.zip
   creating: sample_data/ml-100k/
  inflating: sample_data/ml-100k/allbut.pl  
  inflating: sample_data/ml-100k/mku.sh  
  inflating: sample_data/ml-100k/README  
  inflating: sample_data/ml-100k/u.data  
  inflating: sample_data/ml-100k/u.genre  
  inflating: sample_data/ml-100k/u.info  
  inflating: sample_data/ml-100k/u.item  
  inflating: sample_data/ml-100k/u.occupation  
  inflating: sample_data/ml-100k/u.user  
  inflating: sample_data/ml-100k/u1.base  
  inflating: sample_data/ml-100k/u1.test  
  inflating: sample_data/ml-100k/u2.base  
  inflating: sample_data/ml-100k/u2.test  
  inflating: sample_data/ml-100k/u3.base  
  inflating: sample_data/ml-100k/u3.test  
  inflating: sample_data/ml-100k/u4.base  
  inflating: sample_data/ml-100k/u4.test  
  inflating: sample_data/ml-100k/u5.base  
  inflating: sample_data/ml-100k/u5.test  
  inflating: sample_data/ml-100k/ua.base  
  inflating: sample_data/ml-100k/ua.test  
  inflating: sample_data/ml-100

In [2]:
import pandas as pd
import numpy as np
import os
import time
from datetime import datetime
import json

In [3]:
dir = '/content/sample_data/ml-100k'
col_names = ['user id', 'item id', 'rating', 'timestamp']
data = pd.read_csv(os.path.join(dir, 'u.data'), delimiter='\t', names=col_names, header=None)
data['timestamp'] = data['timestamp'].apply(lambda x: datetime.fromtimestamp(x))

with open(os.path.join(dir, 'u.item'), encoding = "ISO-8859-1") as f:
  movie = pd.read_csv(f, delimiter='|', header=None)

movie.columns = ['item id', 'title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

with open(os.path.join(dir, 'u.user'), encoding = "ISO-8859-1") as f:
  user = pd.read_csv(f, delimiter='|', header=None)

user.columns = ['user id', 'age', 'gender', 'occupation', 'zip code']

ratings = data.merge(movie[['item id', 'title']], on='item id')

ratings['like'] = ratings['rating'] > 3

In [4]:
ratings.head

Unnamed: 0,user id,item id,rating,timestamp,title,like
0,196,242,3,1997-12-04 15:55:49,Kolya (1996),False
1,63,242,3,1997-10-01 23:06:30,Kolya (1996),False
2,226,242,5,1998-01-04 04:37:51,Kolya (1996),True
3,154,242,3,1997-11-10 05:03:55,Kolya (1996),False
4,306,242,5,1997-10-10 17:16:33,Kolya (1996),True


In [None]:
ratings.to_csv('/content/sample_data/ratings.csv')

In [5]:
ratings.sort_values(by=['user id'], ascending=[True]).head(10)

Unnamed: 0,user id,item id,rating,timestamp,title,like
43606,1,12,5,1997-11-03 07:42:40,"Usual Suspects, The (1995)",True
79998,1,254,1,1997-11-03 07:16:32,Batman & Robin (1997),False
35906,1,189,3,1998-03-01 06:15:28,"Grand Day Out, A (1992)",False
69127,1,87,5,1997-11-03 07:52:21,Searching for Bobby Fischer (1993),True
51383,1,187,4,1997-09-22 22:01:18,"Godfather: Part II, The (1974)",True
71026,1,180,3,1997-09-24 03:42:53,Apocalypse Now (1979),False
97387,1,46,4,1997-10-15 05:27:10,Exotica (1994),True
6611,1,201,3,1997-11-03 07:42:40,Evil Dead II (1987),False
36126,1,64,5,1997-09-24 03:40:04,"Shawshank Redemption, The (1994)",True
6751,1,241,4,1997-11-03 07:45:33,"Last of the Mohicans, The (1992)",True


In [6]:
train_ratio = 0.9
train_size = int(len(ratings)*train_ratio)
ratings_train = ratings.sample(train_size, random_state=42)
ratings_test = ratings[~ratings.index.isin(ratings_train.index)]

In [7]:
!pip install implicit

Collecting implicit
  Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.7.2


In [8]:
from scipy.sparse import csr_matrix

n_users = ratings_train['user id'].max()
n_item = ratings_train['item id'].max()
ratings_train_pos = ratings_train[ratings_train['like']]
ratings_test_pos = ratings_test[ratings_test['like']]


row=ratings_train_pos['user id'].values - 1
col=ratings_train_pos['item id'].values - 1
data=np.ones(len(ratings_train_pos))
user_item_data = csr_matrix((data, (row, col)), shape=(n_users, n_item))

In [9]:
user_item_data

<943x1682 sparse matrix of type '<class 'numpy.float64'>'
	with 49835 stored elements in Compressed Sparse Row format>

In [10]:
import implicit

# initialize a model
model = implicit.als.AlternatingLeastSquares(factors=50, random_state=42)

# train the model on a sparse matrix of user/item/confidence weights
model.fit(user_item_data)

  check_blas_config()


  0%|          | 0/15 [00:00<?, ?it/s]

In [11]:
from sklearn.metrics import dcg_score, ndcg_score

def precision_k(actuals, recs, k=5):
  return len(set(recs[0:k]).intersection(set(actuals)))/k

def recall_k(actuals, recs, k=5):
  return len(set(recs[0:k]).intersection(set(actuals)))/len(actuals)

def dcg_k(actuals, recs, k=5):
  relevance = np.array([[float(i in actuals) for i in recs[0:k]]])
  score = k - np.arange(k)
  return dcg_score(relevance, score.reshape(1,-1), k=k)

def ndcg_k(actuals, recs, k=5):
  relevance = np.array([[float(i in actuals) for i in recs[0:k]]])
  score = k - np.arange(k)
  return ndcg_score(relevance, score.reshape(1,-1), k=k)

def recall_stage(model, user_id, user_item_data, ratings_train, N):
  filter_items = ratings_train[ratings_train['user id']==user_id]['item id'].values
  filter_items = filter_items - 1
  user_id = user_id - 1

  recs, scores = model.recommend(user_id,
                                 user_item_data[user_id],
                                 filter_items=filter_items,
                                 N=N_recall)
  recs = recs.flatten() + 1
  return recs

def evaluate(user_id, ratings_test_pos, recs, k=5):
  actuals = ratings_test_pos[ratings_test_pos['user id']==user_id]['item id'].values
  return precision_k(actuals, recs, k), recall_k(actuals, recs, k), dcg_k(actuals, recs, k)

In [12]:
# recommend items for a user
N_recall=30
user_id=1
recs = recall_stage(model, user_id, user_item_data, ratings_train, N_recall)
evaluate(user_id, ratings_test_pos, recs, 20)

(0.2, 0.26666666666666666, 1.8389804011272912)

In [25]:
# prompt: i want to show the top 10 recommended movies for user 1

recommended_movies = movie[movie['item id'].isin(recs[0:10])]
recommended_movies[['title', 'item id']]


Unnamed: 0,title,item id
182,Alien (1979),183
196,"Graduate, The (1967)",197
275,Leaving Las Vegas (1995),276
301,L.A. Confidential (1997),302
317,Schindler's List (1993),318
379,Star Trek: Generations (1994),380
461,Like Water For Chocolate (Como agua para choco...,462
473,Dr. Strangelove or: How I Learned to Stop Worr...,474
507,"People vs. Larry Flynt, The (1996)",508
1072,Shallow Grave (1994),1073


In [26]:
# prompt: i want to loop through each user and get the top movie recommendation

user_ids = ratings_train['user id'].unique()
top_movies = []
for user_id in user_ids:
  recs = recall_stage(model, user_id, user_item_data, ratings_train, N_recall)
  recommended_movies = movie[movie['item id'].isin(recs[0:1])]
  top_movies.append(recommended_movies['title'].values[0])

top_movies_df = pd.DataFrame({'user_id': user_ids, 'top_movie': top_movies})
top_movies_df.head()


Unnamed: 0,user_id,top_movie
0,498,"Shining, The (1980)"
1,642,"Jungle Book, The (1994)"
2,58,Terminator 2: Judgment Day (1991)
3,495,Star Trek: The Wrath of Khan (1982)
4,618,One Flew Over the Cuckoo's Nest (1975)


In [23]:
view = ratings_test_pos[ratings_test_pos['user id']==user_id]['item id'].values
view = view - 1
view = pd.DataFrame(view, columns=['item id'])
view['title'] = view['item id'].apply(lambda x: movie[movie['item id']==x]['title'].values[0])
view.head()

Unnamed: 0,item id,title
0,94,Home Alone (1990)
1,24,Rumble in the Bronx (1995)
2,126,"Spitfire Grill, The (1996)"
3,81,"Hudsucker Proxy, The (1994)"
4,182,GoodFellas (1990)
