In [1]:
import csv
import numpy as np
import pandas as pd

In [84]:
ratings = np.genfromtxt('ratings.csv', delimiter=',')[1:]
ratings[:,2] = ratings[:,2]*2 #met le rating sur 10
ratings = ratings.astype(int)

movies = pd.read_csv('movies.csv', header=None).values[1:]
movies[:,0] = movies[:,0].astype(int)
for idx, genres in enumerate(movies[:,2]):
    movies[idx][2] = genres.split('|')
    
#creates dictionnary that links idx to movies ids
movie_idx_from_id = {}
for i in range(movies.shape[0]):
    movie_idx_from_id[movies[i,0]] = i
    
#retrieves all existing genres    
genres = set() 
for gs in movies[:,2]:
    for g in gs:
        genres.add(g)

#creates dictionnary that links ids with genres string
id_from_genre = {}
for idx, genre in enumerate(genres):
    id_from_genre[genre] = idx
    
links = np.genfromtxt('links.csv', delimiter=',')[1:].astype(int)

tags = pd.read_csv('tags.csv', header=None).values[1:]
tags[:,0:2] = tags[:,0:2].astype(int)
tags[:,3] = tags[:,3].astype(int)

In [3]:
TF = np.zeros((len(genres), movies.shape[0]))
for idx, gs in enumerate(movies[:,2]):
    for g in gs:
        TF[id_from_genre[g],idx] = 1

In [8]:
#computing the inverse frequency

occurences = np.zeros(TF.shape[0])

for g in genres:
    occurences[id_from_genre[g]] = int(np.sum(TF[:][id_from_genre[g]]))
    
IDF = np.log(TF.shape[1]/occurences)

In [25]:
#computing the TD-IDF score for every pair of feature-item

TF_IDF = IDF*TF.T

In [31]:
TF, IDF, TF_IDF

(array([[1., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([2.76910476, 0.80374505, 3.37306072, 2.833316  , 1.63775544,
        3.23878111, 2.04295659, 1.67322396, 2.68591956, 4.12160668,
        2.29664914, 4.7182936 , 0.95309187, 3.09742699, 2.09495856,
        5.65784119, 1.80894594, 4.0662079 , 2.52619067, 2.29869204]),
 array([[2.76910476, 0.        , 0.        , ..., 0.        , 2.52619067,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 2.52619067,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.80374505, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [2.76910476, 0.        , 0.        , ..., 0.        , 0.        ,
      

In [396]:
nb_ids = len(set(ratings[:,0])) #610 ids with at least 1 rating from each

like_threshold = 8

liked_films = [[] for _ in range(nb_ids)] #real id is +1 because array starts with 0
given_scores = [[] for _ in range(nb_ids)] 

for rating in ratings:
    if(rating[2] >= like_threshold):
        liked_films[rating[0]-1].append(movie_idx_from_id[rating[1]])
        given_scores[rating[0]-1].append(rating[2])

In [401]:
user_profils = np.zeros((nb_ids,len(genres)))

for i in range(nb_ids):
    profil = np.zeros(len(genres))
    for j in range(len(liked_films[i])): #using the weighted average aggregation method
        profil += given_scores[i][j]*TF_IDF[liked_films[i][j]] #sum the profils of liked movies with the weight given by the user rating score
    profil /= max(np.sum(given_scores[i]),1) #avoid division by 0 when user has no liked movie
    user_profils[i] =profil

In [410]:
user_profils[0]

array([0.38959266, 0.25921648, 0.35409837, 0.19318064, 0.34917513,
       0.32598122, 0.75394827, 0.63017526, 0.51160373, 0.        ,
       0.3604049 , 0.02553189, 0.32801214, 0.        , 0.40810881,
       0.        , 0.20751977, 0.11881776, 0.5085189 , 0.09702272])

In [403]:
#cosine similarity avoiding usage of for loop

top = user_profils@TF_IDF.T #scalar products
bottom = (np.linalg.norm(user_profils, axis = 1).reshape((user_profils.shape[0],1))@np.linalg.norm(TF_IDF, axis = 1).reshape((1,TF_IDF.shape[0]))) #mul of norms
bottom[bottom == 0] = 1 #avoids div by 0
scores = top/bottom

In [412]:
user = 120
k = 10

def recommended(user,k): 
    scores_not_watched = scores[user].copy()
    scores_not_watched[liked_films[user]] = 0 #removes already noted movies
    return np.argpartition(scores_not_watched, k*-1)[k*-1:] #gets the k unwatched movies with the highest predicted score

answer = recommended(user,k)
print(f"{k} predicted movies for user_{user}:\n")
for l in answer:
    print(f"id:{l}{(4-int(np.log10(l)))*' '} movieId:{movies[l,0]}{(6-int(np.log10(movies[l,0])))*' '} Title:{movies[l,1]}")

10 predicted movies for user_120:

id:8785  movieId:129354  Title:Focus (2015)
id:5928  movieId:33903   Title:Edukators, The (Die Fetten Jahre sind vorbei) (2004)
id:5682  movieId:27716   Title:Green Butchers, The (Grønne slagtere, De) (2003)
id:7209  movieId:72919   Title:Did You Hear About the Morgans? (2009)
id:4180  movieId:6023    Title:Band of Outsiders (Bande à part) (1964)
id:5774  movieId:31367   Title:Chase, The (1994)
id:400   movieId:459     Title:Getaway, The (1994)
id:2903  movieId:3893    Title:Nurse Betty (2000)
id:1394  movieId:1912    Title:Out of Sight (1998)
id:9106  movieId:144606  Title:Confessions of a Dangerous Mind (2002)


### Verification des résultats

In [405]:
np.argpartition([len(liked_films[i]) for i in range(nb_ids)],10)[:10]  #on prend les id des users ayant le moins de films aimé pour avoir des profils niches

array([ 53, 244, 310, 292, 213, 507, 441, 193, 162, 477], dtype=int64)

In [406]:
len(liked_films[53])

3

In [407]:
user = 53

answer = recommended(user,k)
print(f"{k} predicted movies for user_{user}:\n")
for l in answer:
    print(f"id:{l}{(4-int(np.log10(l)))*' '} movieId:{movies[l,0]}{(6-int(np.log10(movies[l,0])))*' '} Title:{movies[l,1]}")

10 predicted movies for user_53:

id:9106  movieId:144606  Title:Confessions of a Dangerous Mind (2002)
id:6003  movieId:37720   Title:Exorcism of Emily Rose, The (2005)
id:6753  movieId:59604   Title:Girl Next Door, The (2007)
id:7046  movieId:69140   Title:Sweeney Todd (2006)
id:5985  movieId:36529   Title:Lord of War (2005)
id:6698  movieId:58303   Title:Counterfeiters, The (Die Fälscher) (2007)
id:966   movieId:1267    Title:Manchurian Candidate, The (1962)
id:4448  movieId:6567    Title:Buffalo Soldiers (2001)
id:6074  movieId:41527   Title:Paradise Now (2005)
id:1369  movieId:1873    Title:Misérables, Les (1998)


In [409]:
user_profils[user], TF_IDF[1369] #on compare son profil avec un film proposé

(array([0.        , 0.53583003, 0.        , 0.        , 0.54591848,
        1.0795937 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.31769729, 0.        , 1.39663904,
        0.        , 0.60298198, 0.        , 0.        , 0.76623068]),
 array([0.        , 0.80374505, 0.        , 0.        , 0.        ,
        3.23878111, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 2.09495856,
        0.        , 1.80894594, 0.        , 0.        , 0.        ]))

Les profils matchent.