In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [0]:
#links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
#tags = pd.read_csv('tags.csv')

In [3]:
movies.shape

(9742, 3)

In [0]:
ratings['userId'] = ratings['userId'].astype('str')
ratings['movieId'] = ratings['movieId'].astype('str')
movies['movieId'] = movies['movieId'].astype('str')

# Statistics of the dataset

In [5]:
userId = ratings.userId.unique()
movieId = ratings.movieId.unique()
num_users = len(userId)
num_items =len(movieId)
print('number of unique users:', num_users)
print('number of unique movies:', num_items)

number of unique users: 610
number of unique movies: 9724


In [6]:
sparsity = 1 - len(ratings) / (num_users * num_items)
print('matrix sparsity:',sparsity)

matrix sparsity: 0.9830003169443864


In [0]:
#Number of movies rated by each user.
numMoviesRatedByUser = ratings.groupby(['userId']).size().reset_index()

In [8]:
numMoviesRatedByUser.sort_values(by=0, ascending=True)

Unnamed: 0,userId,0
175,257,20
105,194,20
529,576,20
521,569,20
120,207,20
...,...,...
194,274,1346
387,448,1864
416,474,2108
554,599,2478


# Content Based

## TF-IDF

In [0]:
movies.genres = movies.genres.str.split('|')

In [10]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [11]:
movies.genres = movies.genres.fillna("").astype('str')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"['Adventure', 'Animation', 'Children', 'Comedy..."
1,2,Jumanji (1995),"['Adventure', 'Children', 'Fantasy']"
2,3,Grumpier Old Men (1995),"['Comedy', 'Romance']"
3,4,Waiting to Exhale (1995),"['Comedy', 'Drama', 'Romance']"
4,5,Father of the Bride Part II (1995),['Comedy']


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])
tfidf_matrix.shape

(9742, 177)

In [13]:
tfidf_matrix

<9742x177 sparse matrix of type '<class 'numpy.float64'>'
	with 36628 stored elements in Compressed Sparse Row format>

In [14]:
print(tf.get_feature_names())

['action', 'action adventure', 'action animation', 'action children', 'action comedy', 'action crime', 'action documentary', 'action drama', 'action fantasy', 'action horror', 'action imax', 'action mystery', 'action romance', 'action sci', 'action thriller', 'action war', 'action western', 'adventure', 'adventure animation', 'adventure children', 'adventure comedy', 'adventure crime', 'adventure documentary', 'adventure drama', 'adventure fantasy', 'adventure film', 'adventure horror', 'adventure mystery', 'adventure romance', 'adventure sci', 'adventure thriller', 'adventure war', 'adventure western', 'animation', 'animation children', 'animation comedy', 'animation crime', 'animation documentary', 'animation drama', 'animation fantasy', 'animation film', 'animation horror', 'animation musical', 'animation mystery', 'animation romance', 'animation sci', 'children', 'children comedy', 'children crime', 'children documentary', 'children drama', 'children fantasy', 'children horror', 'c

## Cosine Similarity

In [0]:
from sklearn.metrics.pairwise import cosine_similarity
sim = cosine_similarity(tfidf_matrix)

In [16]:
sim.shape

(9742, 9742)

In [17]:
sim[:4, :4]

array([[1.        , 0.31379419, 0.0611029 , 0.05271111],
       [0.31379419, 1.        , 0.        , 0.        ],
       [0.0611029 , 0.        , 1.        , 0.35172407],
       [0.05271111, 0.        , 0.35172407, 1.        ]])

##Predictions

In [0]:
# Build a 1-dimensional array with movie titles
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

In [0]:
# get movie recommendations based on the cosine similarity score of movie genres
def pred(title):
  # get the index number for the title.
  idx = indices[title]
  #get the row for thar idx as a list.
  sim_scores = list(enumerate(sim[idx]))
  # sort the list.
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
  # get the top 10 score indices.
  topScores = sim_scores[1:11]
  # get the movie index
  movie_indices = [i[0] for i in topScores]
  return movies.iloc[movie_indices], movie_indices

#Case Study


##User 1


In [0]:
simMoviesU1, simMovieIndexpredU1 = pred('Dragonheart (1996)')

In [21]:
simMoviesU1

Unnamed: 0,movieId,title,genres
349,393,Street Fighter (1994),"['Action', 'Adventure', 'Fantasy']"
549,653,Dragonheart (1996),"['Action', 'Adventure', 'Fantasy']"
974,1275,Highlander (1986),"['Action', 'Adventure', 'Fantasy']"
1190,1587,Conan the Barbarian (1982),"['Action', 'Adventure', 'Fantasy']"
1266,1681,Mortal Kombat: Annihilation (1997),"['Action', 'Adventure', 'Fantasy']"
1576,2115,Indiana Jones and the Temple of Doom (1984),"['Action', 'Adventure', 'Fantasy']"
1644,2193,Willow (1988),"['Action', 'Adventure', 'Fantasy']"
1775,2373,Red Sonja (1985),"['Action', 'Adventure', 'Fantasy']"
2126,2826,"13th Warrior, The (1999)","['Action', 'Adventure', 'Fantasy']"
2376,3153,"7th Voyage of Sinbad, The (1958)","['Action', 'Adventure', 'Fantasy']"


## User 2

In [0]:
simMoviesU2, simMovieIndexpredU2 = pred('What Happens in Vegas... (2008)')

In [23]:
simMoviesU2

Unnamed: 0,movieId,title,genres
6,7,Sabrina (1995),"['Comedy', 'Romance']"
35,39,Clueless (1995),"['Comedy', 'Romance']"
57,64,Two if by Sea (1996),"['Comedy', 'Romance']"
60,68,French Twist (Gazon maudit) (1995),"['Comedy', 'Romance']"
103,118,If Lucy Fell (1996),"['Comedy', 'Romance']"
106,122,Boomerang (1992),"['Comedy', 'Romance']"
111,129,Pie in the Sky (1996),"['Comedy', 'Romance']"
152,180,Mallrats (1995),"['Comedy', 'Romance']"
157,186,Nine Months (1995),"['Comedy', 'Romance']"
203,237,Forget Paris (1995),"['Comedy', 'Romance']"


##User 3

In [0]:
simMoviesU3, simMovieIndexpredU3 = pred('Gladiator (2000)')

In [25]:
simMoviesU3

Unnamed: 0,movieId,title,genres
986,1287,Ben-Hur (1959),"['Action', 'Adventure', 'Drama']"
1331,1801,"Man in the Iron Mask, The (1998)","['Action', 'Adventure', 'Drama']"
1488,2013,"Poseidon Adventure, The (1972)","['Action', 'Adventure', 'Drama']"
1494,2019,Seven Samurai (Shichinin no samurai) (1954),"['Action', 'Adventure', 'Drama']"
1772,2370,"Emerald Forest, The (1985)","['Action', 'Adventure', 'Drama']"
1821,2421,"Karate Kid, Part II, The (1986)","['Action', 'Adventure', 'Drama']"
2177,2893,Plunkett & MaCleane (1999),"['Action', 'Adventure', 'Drama']"
2187,2905,Sanjuro (Tsubaki Sanjûrô) (1962),"['Action', 'Adventure', 'Drama']"
2674,3578,Gladiator (2000),"['Action', 'Adventure', 'Drama']"
4391,6448,"Flight of the Phoenix, The (1965)","['Action', 'Adventure', 'Drama']"


#Evaluation

In [0]:
user1 = ratings.loc[ratings.userId=='1']

In [0]:
# all movies that user has rated.
user1 = user1.merge(movies, how='left', on='movieId')

In [0]:
user1.movieId = user1.movieId.astype(int)

In [29]:
# all movies that user has rated.
user1

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),"['Adventure', 'Animation', 'Children', 'Comedy..."
1,1,3,4.0,964981247,Grumpier Old Men (1995),"['Comedy', 'Romance']"
2,1,6,4.0,964982224,Heat (1995),"['Action', 'Crime', 'Thriller']"
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),"['Mystery', 'Thriller']"
4,1,50,5.0,964982931,"Usual Suspects, The (1995)","['Crime', 'Mystery', 'Thriller']"
...,...,...,...,...,...,...
227,1,3744,4.0,964980694,Shaft (2000),"['Action', 'Crime', 'Thriller']"
228,1,3793,5.0,964981855,X-Men (2000),"['Action', 'Adventure', 'Sci-Fi']"
229,1,3809,4.0,964981220,What About Bob? (1991),['Comedy']
230,1,4006,4.0,964982903,Transformers: The Movie (1986),"['Adventure', 'Animation', 'Children', 'Sci-Fi']"


In [30]:
simMovies, simMovieIndex = pred('Toy Story (1995)')
simMovies.head(10)

Unnamed: 0,movieId,title,genres
1706,2294,Antz (1998),"['Adventure', 'Animation', 'Children', 'Comedy..."
2355,3114,Toy Story 2 (1999),"['Adventure', 'Animation', 'Children', 'Comedy..."
2809,3754,"Adventures of Rocky and Bullwinkle, The (2000)","['Adventure', 'Animation', 'Children', 'Comedy..."
3000,4016,"Emperor's New Groove, The (2000)","['Adventure', 'Animation', 'Children', 'Comedy..."
3568,4886,"Monsters, Inc. (2001)","['Adventure', 'Animation', 'Children', 'Comedy..."
6194,45074,"Wild, The (2006)","['Adventure', 'Animation', 'Children', 'Comedy..."
6486,53121,Shrek the Third (2007),"['Adventure', 'Animation', 'Children', 'Comedy..."
6948,65577,"Tale of Despereaux, The (2008)","['Adventure', 'Animation', 'Children', 'Comedy..."
7760,91355,Asterix and the Vikings (Astérix et les Viking...,"['Adventure', 'Animation', 'Children', 'Comedy..."
8219,103755,Turbo (2013),"['Adventure', 'Animation', 'Children', 'Comedy..."


In [31]:
simMovieIndex

[1706, 2355, 2809, 3000, 3568, 6194, 6486, 6948, 7760, 8219]

In [0]:
for i, row in user1.iterrows():
  if(row.movieId in simMovieIndex):
    print(row)

In [33]:
user1.loc[user1.movieId == 6194]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
