In [1]:
import pandas as pd
import sys
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from sklearn.preprocessing import MinMaxScaler

import implicit

# Read dataset

In [2]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users =  pd.read_csv('/home/egor/datasets/movielens/ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')
print(users.shape)
users

(943, 5)


Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [3]:
i_cols = ['movie_id', 'movie_title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('/home/egor/datasets/movielens/ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')
print(items.shape)
items

(1682, 24)


Unnamed: 0,movie_id,movie_title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('/home/egor/datasets/movielens/ml-100k/u.data', sep='\t', names=r_cols, encoding='latin-1')
print(ratings.shape)
ratings

(100000, 4)


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


## Merge dataset

In [5]:
dataset = pd.merge(pd.merge(items, ratings),users)
dataset.head()

Unnamed: 0,movie_id,movie_title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Thriller,War,Western,user_id,rating,unix_timestamp,age,sex,occupation,zip_code
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,308,4,887736532,60,M,retired,95076
1,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,308,5,887737890,60,M,retired,95076
2,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,1,0,0,308,4,887739608,60,M,retired,95076
3,7,Twelve Monkeys (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Twelve%20Monk...,0,0,0,0,0,...,0,0,0,308,4,887738847,60,M,retired,95076
4,8,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),0,0,0,0,1,...,0,0,0,308,5,887736696,60,M,retired,95076


## Creating Sparse Matrix
As we'll be using implicit library, so as The implicit library expects data as a item-user matrix so we create two matricies, one for fitting the model (item-user) and one for recommendations (user-item).

In [6]:
sparse_item_user = sparse.csr_matrix((dataset['rating'].astype(float),(dataset['movie_id'], dataset['user_id'])))

In [7]:
sparse_user_item = sparse.csr_matrix((dataset['rating'].astype(float),(dataset['user_id'], dataset['movie_id'])))

# Build model

In [8]:
model = implicit.als.AlternatingLeastSquares(factors=20,regularization=0.1,iterations=20)

  check_blas_config()


In [9]:
# Calculating the Confidence by multiplying it by our Alpha value.

alpha_val = 15
data_conf = (sparse_item_user * alpha_val).astype('double')

In [10]:
model.fit(data_conf)

  0%|          | 0/20 [00:00<?, ?it/s]

# Find similar

In [11]:
# Finding the 5 most similar movies to Braveheart(movie_id = 22)

item_id = 22
n_similar = 5
indexes, scores = model.similar_items(item_id,n_similar)

for i, id in enumerate(indexes):
    print(dataset.movie_title.loc[dataset.movie_id == id].iloc[0], scores[i])

Braveheart (1995) 1.0
Searching for Bobby Fischer (1993) 0.7926564
Phenomenon (1996) 0.68268716
Silence of the Lambs, The (1991) 0.63409394
Return of Martin Guerre, The (Retour de Martin Guerre, Le) (1982) 0.6304626


# Create User Recommendations

In [12]:
# Let's randomly create recommendations for user with user_id = 936

user_id = 1
indexes, scores = model.recommend(user_id, sparse_user_item[user_id])

movies = []
probability = []

for i, idx in enumerate(indexes):
    movies.append(dataset.movie_title.loc[dataset.movie_id == idx].iloc[0])
    probability.append(scores[i])

recommendations = pd.DataFrame({'movies': movies, 'probability':probability})
recommendations

Unnamed: 0,movies,probability
0,For Whom the Bell Tolls (1943),1.106072
1,Father of the Bride Part II (1995),1.100413
2,Cliffhanger (1993),1.090937
3,Sabrina (1995),1.081941
4,"Chamber, The (1996)",1.081341
5,Nil By Mouth (1997),1.080336
6,Duck Soup (1933),1.077301
7,Washington Square (1997),1.076242
8,"Kid in King Arthur's Court, A (1995)",1.074847
9,Last Man Standing (1996),1.073827
