In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [0]:
#links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
#tags = pd.read_csv('tags.csv')

In [0]:
newRatings = ratings.merge(movies, on = 'movieId')

In [5]:
newRatings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [0]:
ratings['userId'] = ratings['userId'].astype('str')
ratings['movieId'] = ratings['movieId'].astype('str')
movies['movieId'] = movies['movieId'].astype('str')

# Statistics of the dataset

In [7]:
userId = ratings.userId.unique()
movieId = ratings.movieId.unique()
num_users = len(userId)
num_items =len(movieId)
print('number of unique users:', num_users)
print('number of unique movies:', num_items)

number of unique users: 610
number of unique movies: 9724


In [8]:
sparsity = 1 - len(ratings) / (num_users * num_items)
print('matrix sparsity:',sparsity)

matrix sparsity: 0.9830003169443864


#CF

## Data Sampling

In [9]:
# filtering movies. keeping only those that were rated at least 50 times.
num = 50
# get the number of times a movie has been rated.
movieRatedFreq = pd.DataFrame(ratings.groupby('movieId').size(), columns=['count'])
# get the ID of all movies that have been rated more than 50 times.
popular_movies = list(set(movieRatedFreq.query('count >= @num').index))
# filter the rating DF to contain only the popular movies.
ratingsPopularMovies = ratings[ratings.movieId.isin(popular_movies)]
print('Shape of ratings', ratings.shape)
print('Shape of ratingsPopularMovies', ratingsPopularMovies.shape)

Shape of ratings (100836, 4)
Shape of ratingsPopularMovies (41360, 4)


In [10]:
# get the number of times a user has rated a movie.
UserRatedMovieFreq = pd.DataFrame(ratingsPopularMovies.groupby('userId').size(), columns=['count'])
active_users = list(set(UserRatedMovieFreq.query('count >= @num').index))
ratingsPopularMoviesAndUsers = ratingsPopularMovies[ratingsPopularMovies.userId.isin(active_users)]
print('Shape of DF after removing both user and movie < 50:', ratingsPopularMoviesAndUsers.shape)

Shape of DF after removing both user and movie < 50: (32999, 4)


In [35]:
from scipy.sparse import csr_matrix

# create the sparse matrix
features = ratings.pivot(index='movieId', columns='userId', values='rating').fillna(0)
matrix_features = csr_matrix(features.values)
features.head()

userId,1,10,100,101,102,103,104,105,106,107,108,109,11,110,111,112,113,114,115,116,117,118,119,12,120,121,122,123,124,125,126,127,128,129,13,130,131,132,133,134,...,63,64,65,66,67,68,69,7,70,71,72,73,74,75,76,77,78,79,8,80,81,82,83,84,85,86,87,88,89,9,90,91,92,93,94,95,96,97,98,99
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,3.0,...,5.0,4.0,0.0,4.0,0.0,2.5,0.0,4.5,0.0,5.0,0.0,4.5,0.0,0.0,0.5,0.0,4.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,4.0,0.0,0.0,3.0,0.0,3.0,4.0,0.0,3.0,0.0,0.0,5.0,0.0,4.5,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,3.5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,4.0,3.0,0.0,0.0,0.0,0.0,4.0
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100068,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
temp = movies.copy()
temp.title = temp.title+'|'+temp.movieId

In [0]:
#map movie titles to images
movieToIndex = {
    movie: i for i, movie in 
    enumerate(list(temp.set_index('movieId').loc[features.index].title))
}

## KNN

In [38]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
# fit the dataset
model_knn.fit(matrix_features)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

In [39]:
matrix_features.shape

(9724, 610)

## Prediction

In [0]:
# Build a 1-dimensional array with movie titles
titles = movies['title']
movie_indices = pd.Series(movies.index, index=movies['title'])

In [0]:
def pred(title):
  #this list is the list of recommended movies' ID
  thislist = []
  idx = movie_indices[title]
  distances, indices = model_knn.kneighbors(matrix_features[idx], n_neighbors=20)

  recommended = pd.DataFrame(data=indices[0], columns=['idx'])
  recommended['distances'] = distances[0]
  recommended = recommended.sort_values(by='distances', ascending=False)

  reverse_mapper = {v: k for k, v in movieToIndex.items()}
  for i,row in recommended.iterrows():
    thislist.append(reverse_mapper[row['idx']].split('|')[1])
    print('{0}, with distance of {1}'.format(reverse_mapper[row['idx']], row['distances']))
  return thislist

##Calculate ratings from neighbors

In [0]:
newRatings = newRatings.drop(labels=['timestamp'],axis=1)

In [0]:
user1 = newRatings.loc[newRatings.userId==257]

In [44]:
user1.movieId = user1.movieId.astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [0]:
user1

In [24]:
for i, row in user1.iterrows():
  if(row.movieId in thislist):
    print(row.title)

Toy Story (1995)
Star Wars: Episode IV - A New Hope (1977)
Pulp Fiction (1994)
Forrest Gump (1994)
Jurassic Park (1993)
Star Wars: Episode V - The Empire Strikes Back (1980)
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
Star Wars: Episode VI - Return of the Jedi (1983)
Groundhog Day (1993)
Back to the Future (1985)
Indiana Jones and the Last Crusade (1989)
Men in Black (a.k.a. MIB) (1997)
Saving Private Ryan (1998)
Matrix, The (1999)


#Case Study

##User 1

In [51]:
recMovieIdUser1 = pred('Dragonheart (1996)')

549
Er ist wieder da (2015)|143969, with distance of 0.18181818181818177
Machete Kills (Machete 2) (2013)|105585, with distance of 0.1425070742874559
Battleship (2012)|94018, with distance of 0.12712843905603044
Contraband (2012)|91842, with distance of 0.10557280900008414
They Call Me Trinity (1971)|26249, with distance of 0.10557280900008414
The Purge: Election Year (2016)|160565, with distance of 0.02381293981604715
Serbian Film, A (Srpski film) (2010)|79251, with distance of 0.007722123286332261
Battle For Sevastopol (2015)|140627, with distance of 0.0
Walk Among the Tombstones, A (2014)|114246, with distance of 0.0
Homefront (2013)|106785, with distance of 0.0
Che: Part Two (2008)|64501, with distance of 0.0
SORI: Voice from the Heart (2016)|158027, with distance of 0.0
Trumbo (2015)|145418, with distance of 0.0
Black Sea (2015)|118706, with distance of 0.0
Book Thief, The (2013)|106441, with distance of 0.0
Agora (2009)|74624, with distance of 0.0
ARQ (2016)|163985, with distance

In [55]:
movies[movies['movieId'].isin(recMovieIdUser1)]

Unnamed: 0,movieId,title,genres
5479,26249,They Call Me Trinity (1971),Comedy|Western
6919,64499,Che: Part One (2008),Drama|War
6920,64501,Che: Part Two (2008),Drama|War
7266,74624,Agora (2009),Adventure|Drama|Romance
7378,79251,"Serbian Film, A (Srpski film) (2010)",Horror|Thriller
7783,91842,Contraband (2012),Action|Crime|Drama|Thriller
7866,94018,Battleship (2012),Action|Sci-Fi|Thriller|IMAX
8276,105585,Machete Kills (Machete 2) (2013),Action|Crime|Thriller
8294,106441,"Book Thief, The (2013)",Children|Drama|War
8306,106785,Homefront (2013),Action|Crime|Thriller


##User 2

In [83]:
movies[movies['movieId']=='59421'].title

6748    What Happens in Vegas... (2008)
Name: title, dtype: object

In [78]:
recMovieIdUser2 = pred('What Happens in Vegas... (2008)')

Helter Skelter (2004)|43549, with distance of 0.24074339763470343
Two for the Money (2005)|38992, with distance of 0.24074339763470343
State Property 2 (2005)|33132, with distance of 0.24074339763470343
City of the Living Dead (a.k.a. Gates of Hell, The) (Paura nella città dei morti viventi) (1980)|3652, with distance of 0.24074339763470343
Black Christmas (2006)|50147, with distance of 0.24074339763470343
Bless the Child (2000)|3857, with distance of 0.24074339763470343
Masterminds (1997)|8241, with distance of 0.24074339763470343
Teeth (2007)|57910, with distance of 0.24074339763470343
Primeval (2007)|50440, with distance of 0.24074339763470343
Bats (1999)|2974, with distance of 0.24074339763470343
Silent Night, Deadly Night (1984)|26523, with distance of 0.24074339763470343
Valentine (2001)|4143, with distance of 0.24074339763470343
Riki-Oh: The Story of Ricky (Lik Wong) (1991)|26736, with distance of 0.24074339763470343
Many Adventures of Winnie the Pooh, The (1977)|31193, with dis

In [79]:
movies[movies['movieId'].isin(recMovieIdUser2)]

Unnamed: 0,movieId,title,genres
2239,2974,Bats (1999),Horror|Thriller
2720,3652,"City of the Living Dead (a.k.a. Gates of Hell,...",Horror
2883,3857,Bless the Child (2000),Thriller
3086,4143,Valentine (2001),Horror|Mystery
4092,5853,Scanners (1981),Horror|Sci-Fi|Thriller
4373,6405,Treasure Island (1950),Adventure|Children
4707,7024,"Salo, or The 120 Days of Sodom (Salò o le 120 ...",Drama
5140,8241,Masterminds (1997),Action|Comedy|Thriller
5341,8906,Cannibal Holocaust (1980),Horror
5518,26523,"Silent Night, Deadly Night (1984)",Horror|Thriller


##User 3

In [84]:
movies[movies['movieId']=='3578'].title

2674    Gladiator (2000)
Name: title, dtype: object

In [86]:
recMovieIdUser3 = pred('Gladiator (2000)')

Punisher, The (2004)|7439, with distance of 0.593134389983557
Alien: Resurrection (1997)|1690, with distance of 0.5869296966227997
Antitrust (2001)|4052, with distance of 0.5860022174397568
28 Weeks Later (2007)|53000, with distance of 0.585189367969372
Evolution (2001)|4343, with distance of 0.5833276373139393
Pitch Black (2000)|3300, with distance of 0.5824988048113962
Ninth Gate, The (1999)|3355, with distance of 0.578314019599964
Gremlins (1984)|2003, with distance of 0.5753826736356247
Mariachi, El (1992)|3267, with distance of 0.5742400415017564
Resident Evil (2002)|5219, with distance of 0.5622399121740249
Silent Hill (2006)|45081, with distance of 0.562053333947105
Pi (1998)|1921, with distance of 0.5606781811228129
Doom (2005)|37380, with distance of 0.5585705126202524
American Psycho (2000)|3535, with distance of 0.5533301351754519
Cabin in the Woods, The (2012)|93840, with distance of 0.5528046155325
Boondock Saints, The (2000)|3275, with distance of 0.5459922672100452
Mache

In [87]:
movies[movies['movieId'].isin(recMovieIdUser3)]

Unnamed: 0,movieId,title,genres
1275,1690,Alien: Resurrection (1997),Action|Horror|Sci-Fi
1402,1921,Pi (1998),Drama|Sci-Fi|Thriller
1478,2003,Gremlins (1984),Comedy|Horror
1662,2232,Cube (1997),Horror|Mystery|Sci-Fi|Thriller
2454,3267,"Mariachi, El (1992)",Action|Crime|Thriller|Western
2462,3275,"Boondock Saints, The (2000)",Action|Crime|Drama|Thriller
2477,3300,Pitch Black (2000),Horror|Sci-Fi|Thriller
2508,3355,"Ninth Gate, The (1999)",Fantasy|Horror|Mystery|Thriller
2641,3535,American Psycho (2000),Crime|Horror|Mystery|Thriller
3028,4052,Antitrust (2001),Crime|Drama|Thriller
