In [4]:
# to open files
import pandas as pd

# for numerical operations
import numpy as np

# scikit-learn, used to measure pairwise distances
from sklearn.metrics.pairwise import pairwise_distances

Reading the Data

In [5]:
# Column names are passed through `names`, so pandas treats the first line
# of the tab-separated ratings file as data rather than as a header row.
header = ['user_id', 'item_id', 'rating', 'timestamp']
movielens_data = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=header,
)

# Preview the first five ratings.
movielens_data.head(5)

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
# Size of the ratings table as (rows, columns).
movielens_data.shape

(100000, 4)

In [7]:
# Count the distinct user and movie identifiers present in the ratings table.
n_users = movielens_data['user_id'].nunique()
n_movies = movielens_data['item_id'].nunique()

n_users, n_movies

(943, 1682)

Building the User-Item Rating Matrix

In [8]:

# Build a dense (user x movie) rating matrix; cells that never receive a
# rating stay at 0.
# The IDs are used as 1-based positions, so subtract 1 to get 0-based indices.
# Columns are addressed by name rather than by tuple position.
user_idx = movielens_data['user_id'].to_numpy() - 1
movie_idx = movielens_data['item_id'].to_numpy() - 1

# n_users / n_movies are distinct-ID counts, so they are valid matrix
# dimensions only if the IDs run contiguously from 1. Check this explicitly:
# an ID of 0 would otherwise silently wrap around to the last row/column, and
# an ID above the count would fail with an IndexError.
assert ((user_idx >= 0) & (user_idx < n_users)).all(), "user_id outside 1..n_users"
assert ((movie_idx >= 0) & (movie_idx < n_movies)).all(), "item_id outside 1..n_movies"

train_data_matrix = np.zeros((n_users, n_movies))

# One vectorized assignment instead of a Python-level loop over every row.
# NOTE: assumes each (user, item) pair appears at most once; for repeated
# pairs NumPy does not guarantee which of the ratings is kept.
train_data_matrix[user_idx, movie_idx] = movielens_data['rating'].to_numpy()

train_data_matrix.shape

(943, 1682)

In [9]:
# Inspect the rating matrix (rows: users, columns: movies); entries that were
# never assigned a rating remain 0.
train_data_matrix

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [10]:
# Transposing the (users x movies) matrix puts movies on the rows, so the same
# row-wise distance routine can compare movies as well as users.
train_data_matrix_transpose = train_data_matrix.T

# Cosine distance between every pair of rows: users first, then movies.
user_distances = pairwise_distances(train_data_matrix, metric="cosine")
movie_distances = pairwise_distances(train_data_matrix_transpose, metric="cosine")

# Both results are square: one row and one column per user / per movie.
user_distances.shape, movie_distances.shape

((943, 943), (1682, 1682))

Distances

In [11]:
# Inspect the user-to-user cosine distance matrix.
user_distances

array([[0.        , 0.83306902, 0.95254046, ..., 0.85138306, 0.82049212,
        0.60182526],
       [0.83306902, 0.        , 0.88940868, ..., 0.83851522, 0.82773219,
        0.89420212],
       [0.95254046, 0.88940868, 0.        , ..., 0.89875744, 0.86658385,
        0.97344413],
       ...,
       [0.85138306, 0.83851522, 0.89875744, ..., 0.        , 0.8983582 ,
        0.90488042],
       [0.82049212, 0.82773219, 0.86658385, ..., 0.8983582 , 0.        ,
        0.81753534],
       [0.60182526, 0.89420212, 0.97344413, ..., 0.90488042, 0.81753534,
        0.        ]])

In [12]:
# Inspect the movie-to-movie cosine distance matrix.
movie_distances


array([[0.        , 0.59761782, 0.66975521, ..., 1.        , 0.95281693,
        0.95281693],
       [0.59761782, 0.        , 0.72693082, ..., 1.        , 0.92170064,
        0.92170064],
       [0.66975521, 0.72693082, 0.        , ..., 1.        , 1.        ,
        0.90312495],
       ...,
       [1.        , 1.        , 1.        , ..., 0.        , 1.        ,
        1.        ],
       [0.95281693, 0.92170064, 1.        , ..., 1.        , 0.        ,
        1.        ],
       [0.95281693, 0.92170064, 0.90312495, ..., 1.        , 1.        ,
        0.        ]])

Similarities 

In [13]:
# Cosine distance is defined as 1 - cosine similarity, so subtracting the
# distances from 1 turns them back into similarities (higher = more alike).
user_similarity = 1.0 - user_distances
movie_similarity = 1.0 - movie_distances

In [14]:
# Inspect the user-to-user cosine similarity matrix.
user_similarity


array([[1.        , 0.16693098, 0.04745954, ..., 0.14861694, 0.17950788,
        0.39817474],
       [0.16693098, 1.        , 0.11059132, ..., 0.16148478, 0.17226781,
        0.10579788],
       [0.04745954, 0.11059132, 1.        , ..., 0.10124256, 0.13341615,
        0.02655587],
       ...,
       [0.14861694, 0.16148478, 0.10124256, ..., 1.        , 0.1016418 ,
        0.09511958],
       [0.17950788, 0.17226781, 0.13341615, ..., 0.1016418 , 1.        ,
        0.18246466],
       [0.39817474, 0.10579788, 0.02655587, ..., 0.09511958, 0.18246466,
        1.        ]])

In [15]:
# Inspect the movie-to-movie cosine similarity matrix.
movie_similarity


array([[1.        , 0.40238218, 0.33024479, ..., 0.        , 0.04718307,
        0.04718307],
       [0.40238218, 1.        , 0.27306918, ..., 0.        , 0.07829936,
        0.07829936],
       [0.33024479, 0.27306918, 1.        , ..., 0.        , 0.        ,
        0.09687505],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.04718307, 0.07829936, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.04718307, 0.07829936, 0.09687505, ..., 0.        , 0.        ,
        1.        ]])

In [16]:
# Map 0-based movie index -> title, read from the pipe-delimited item file.
idx_to_movie = {}

with open('ml-100k/u.item', 'r', encoding="ISO-8859-1") as f:
    # Iterate over the file object directly; there is no need to load every
    # line into a list first with readlines().
    for line in f:
        if not line.strip():
            # A blank line would otherwise make int() raise ValueError.
            continue
        info = line.split('|')
        # First field: item id (shifted by 1 to get a 0-based index).
        # Second field: movie title.
        idx_to_movie[int(info[0])-1] = info[1]

# Reverse lookup: title -> index. If the same title occurs under several
# indices, only the last one encountered is kept.
movie_to_idx = {v: k for k, v in idx_to_movie.items()}

Identify Movies by Index

In [17]:
# Spot-check the index -> title mapping for indices 1 through 4.
tuple(idx_to_movie[i] for i in range(1, 5))

('GoldenEye (1995)',
 'Four Rooms (1995)',
 'Get Shorty (1995)',
 'Copycat (1995)')

In [18]:
# Spot-check the title -> index mapping for the first four movies.
tuple(
    movie_to_idx[title]
    for title in (
        'Toy Story (1995)',
        'GoldenEye (1995)',
        'Four Rooms (1995)',
        'Get Shorty (1995)',
    )
)

(0, 1, 2, 3)

In [19]:
def top_k_movies(similarity, mapper, movie_idx, k=6):
    """Return the labels of the ``k + 1`` entries most similar to ``movie_idx``.

    Parameters
    ----------
    similarity : 2-D array indexed as ``similarity[movie_idx, :]``
        Row ``movie_idx`` holds the similarity of that movie to every other one.
    mapper : mapping
        Translates a column index of ``similarity`` into the value to return
        (e.g. a movie title).
    movie_idx : int
        Row of ``similarity`` to rank.
    k : int, default 6
        Number of neighbours wanted in addition to the top-ranked entry.

    Returns
    -------
    list
        ``k + 1`` mapped values ordered from most to least similar (fewer if
        the row has fewer columns). The query movie itself is usually the
        first element because its self-similarity is normally the row maximum.
    """
    # Sort ascending, then flip so the highest similarity comes first.
    ranked = np.argsort(similarity[movie_idx, :])[::-1]
    best = ranked[:k + 1]

    labels = []
    for idx in best:
        labels.append(mapper[idx])
    return labels

Choose the movie to analyse

In [22]:
# Title of the movie to find neighbours for; it must match a key of
# movie_to_idx exactly, otherwise the lookup raises KeyError.
favorite_movie_name = 'Richard III (1995)'
# Row/column position of that movie in the similarity matrix.
movie_index = movie_to_idx[favorite_movie_name]
movie_index

9

In [23]:
# Number of similar movies to display.
how_much_movie_to_show = 7

# top_k_movies returns k + 1 titles, because the query movie is normally among
# its own nearest neighbours.
movies = top_k_movies(movie_similarity, idx_to_movie, movie_index, k = how_much_movie_to_show)

# Drop the query movie by title instead of assuming it sits at position 0:
# when another movie ties with it at similarity 1.0, the sort order among the
# tied entries is not guaranteed, so a blind movies[1:] could discard a real
# neighbour and keep the query movie in its own recommendations.
[title for title in movies if title != favorite_movie_name][:how_much_movie_to_show]

['Henry V (1989)',
 'Angels and Insects (1995)',
 'Piano, The (1993)',
 'Madness of King George, The (1994)',
 'Othello (1995)',
 'Much Ado About Nothing (1993)',
 'Sense and Sensibility (1995)']