In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Directory holding the TMDB 5000 CSV files. Set the MOVIE_RECC_DATA_DIR
# environment variable to point somewhere else; the fallback is the original
# local path, so existing setups keep working unchanged.
DATA_DIR = os.environ.get(
    'MOVIE_RECC_DATA_DIR',
    '/Users/merrillsequeira/Google Drive/UPGRAD_DATASETS/MOVIE_RECC',
)

# df1 is read from the credits file, df2 from the movies file.
df1 = pd.read_csv(os.path.join(DATA_DIR, 'tmdb_5000_credits.csv'))
df2 = pd.read_csv(os.path.join(DATA_DIR, 'tmdb_5000_movies.csv'))

In [3]:
# Rename the credits columns so the first one is 'id', the key shared with df2.
# The spelling 'tittle' keeps that column distinct from df2's own 'title'
# after the merge.
# NOTE: this cell reassigns df2, so running it twice merges the credit
# columns in a second time.
df1.columns = ['id', 'tittle', 'cast', 'crew']
df2 = pd.merge(df2, df1, how='inner', on='id')

In [4]:
df2.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,tittle,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [5]:
df2.shape

(4803, 23)

In [6]:
df2.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'tittle', 'cast', 'crew'],
      dtype='object')

####  RECOMMENDER SYSTEM 1 - using a score-based system to recommend movies based on a vote-count-weighted rating

In [7]:
# C: mean of the 'vote_average' column over every movie in df2
C = df2.vote_average.mean()

# m: 90th percentile of 'vote_count', used below as the vote-count cut-off
m = df2.vote_count.quantile(q=0.9)


In [8]:
# Keep only the movies with at least `m` votes, as an independent copy so
# that columns added to q_movies later never touch df2.
q_movies = df2[df2['vote_count'].ge(m)].copy()
q_movies.shape


(481, 23)

In [9]:
# Weighted rating of a movie, following the IMDB formula
def weighted_rating(x, m=m, C=C):
    """Blend a movie's own average vote with the overall mean vote.

    Parameters
    ----------
    x : object indexable by 'vote_count' and 'vote_average'
        For example one row of the movies DataFrame.
    m : number
        Weight (in votes) given to the overall mean. Defaults to the
        module-level `m` captured when this function is defined.
    C : number
        Overall mean vote. Defaults to the module-level `C` captured when
        this function is defined.

    Returns
    -------
    (v / (v + m)) * R + (m / (v + m)) * C, where v is the vote count and
    R the average vote read from `x`.
    """
    votes = x['vote_count']
    average = x['vote_average']
    total = votes + m
    own_share = votes / total
    prior_share = m / total
    return (own_share * average) + (prior_share * C)

In [10]:
# Define a new feature 'score' and calculate its value with `weighted_rating()`.
# The formula is plain element-wise arithmetic on the 'vote_count' and
# 'vote_average' columns, so it is evaluated on the whole frame in one call
# instead of once per row through DataFrame.apply(axis=1).
q_movies['score'] = weighted_rating(q_movies)

In [11]:
# Order the qualifying movies from highest to lowest weighted score
q_movies = q_movies.sort_values(by='score', ascending=False)

# Show the 10 best-scoring movies as the recommendation
print('Recomended popular movies are ...')
q_movies.head(10)[['title', 'vote_count', 'vote_average', 'score']]

Recomended popular movies are ...


Unnamed: 0,title,vote_count,vote_average,score
1881,The Shawshank Redemption,8205,8.5,8.059258
662,Fight Club,9413,8.3,7.939256
65,The Dark Knight,12002,8.2,7.92002
3232,Pulp Fiction,8428,8.3,7.904645
96,Inception,13752,8.1,7.863239
3337,The Godfather,5893,8.4,7.851236
95,Interstellar,10867,8.1,7.809479
809,Forrest Gump,7927,8.2,7.803188
329,The Lord of the Rings: The Return of the King,8064,8.1,7.727243
1990,The Empire Strikes Back,5879,8.2,7.697884


####  RECOMMENDER SYSTEM 2 - plot based

In [12]:
df2.overview.head(2)

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
Name: overview, dtype: object

##### This is designed using term frequency–inverse document frequency (TF-IDF), which creates a matrix with the movies as rows and the relevant frequent words as columns. I'll be using scikit-learn to produce the matrix.

In [13]:
# Import TfidfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace missing overviews with empty strings before vectorising
df2['overview'] = df2.overview.fillna('')

# TF-IDF vectoriser that drops common English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Fit the vocabulary on the overviews and build the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(df2.overview)

tfidf_matrix.shape

(4803, 20978)

In [14]:
#### this outputs a grid of 20978 different words against 4803 movies 
tfidf_matrix

<4803x20978 sparse matrix of type '<class 'numpy.float64'>'
	with 125840 stored elements in Compressed Sparse Row format>

##### Now we can compute the dot product of the TF-IDF matrix with itself to get the similarity scores (the TF-IDF rows are L2-normalised by default, so the dot product equals the cosine similarity). We then define a function that takes a movie title as input and outputs a list of the 20 most similar movies.

In [15]:
# Import linear_kernel (plain dot-product kernel)
from sklearn.metrics.pairwise import linear_kernel

# Pairwise similarity of every overview against every other overview;
# stored under the name cosine_sim and used as the similarity matrix below
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

In [16]:
# Construct a reverse map from movie title to row index.
# Series.drop_duplicates() compares the *values* (the row indices, which are
# all unique), so it would leave repeated titles in place and
# `indices[title]` would return several rows for such a title. Drop repeated
# index labels instead, keeping the first occurrence of each title.
indices = pd.Series(df2.index, index=df2['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [17]:
indices.head()

title
Avatar                                      0
Pirates of the Caribbean: At World's End    1
Spectre                                     2
The Dark Knight Rises                       3
John Carter                                 4
dtype: int64

In [18]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=20):
    """Return the titles of the movies most similar to `title`.

    Parameters
    ----------
    title : str
        Movie title, looked up in the module-level `indices` Series.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie. Defaults to the module-level `cosine_sim`
        captured when this function is defined.
    top_n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles taken from ``df2['title']``, most similar first. The queried
        movie itself is never included.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]
    # A title shared by several movies yields a Series; use its first row
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every *other* movie with its similarity score. Excluding the
    # queried movie explicitly (rather than dropping whatever sorts first)
    # stays correct when scores are tied, e.g. for an empty overview.
    # Assumes df2 has its default 0..n-1 index so the stored labels match
    # the row positions of cosine_sim - TODO confirm if df2 is re-indexed.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort by similarity, highest first (stable, so ties keep row order)
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)

    # Keep at most `top_n` entries; a negative request is treated as zero
    movie_indices = [position for position, _ in sim_scores[:max(int(top_n), 0)]]

    # Return the titles of the most similar movies
    return df2['title'].iloc[movie_indices]

In [20]:
 #output the recomendations 
get_recommendations('John Carter')

1254                             Get Carter
4161            The Marine 4: Moving Target
2932                           Raising Cain
3349                              Desperado
1307                          The Hurricane
3068                            Rescue Dawn
345                             Rush Hour 2
581                 Star Trek: Insurrection
2998                                  Devil
4274       Eddie: The Sleepwalking Cannibal
46               X-Men: Days of Future Past
1735                         Ghosts of Mars
2755                          Earth to Echo
2968                   In the Land of Women
1057                           Coach Carter
85      Captain America: The Winter Soldier
322                       The Fifth Element
1300                         The Ugly Truth
2300           The Fall of the Roman Empire
2155                        The World's End
Name: title, dtype: object

####  RECOMMENDER SYSTEM 3 - collaborative filtering
##### Collaborative filtering works on the principle that a user's predicted rating depends on the ratings given by other users with similar interests. Here I will use Singular Value Decomposition (SVD) as the method to predict user affinity. SVD decreases the dimension of the utility matrix A by extracting its latent factors. It maps each user and each item into a low-dimensional latent space. This mapping facilitates a clear representation of relationships between users and items.

Here I'll be using the 'surprise' module, a library for building and analyzing recommender systems that deal with explicit rating data.

In [25]:
from surprise import Reader, Dataset, SVD


In [26]:
from surprise.model_selection import cross_validate


In [27]:
# Reader built with Surprise's default settings; it is passed to
# Dataset.load_from_df further down.
# NOTE(review): no rating_scale is given here, while the ratings file shows
# half-star values (e.g. 2.5, 3.5) - confirm that the default scale covers
# the minimum and maximum ratings in the data.
reader = Reader()

In [28]:
# Import a dataframe with user IDs, movie IDs and ratings.
# The directory can be overridden through the MOVIE_RECC_DATA_DIR environment
# variable; the fallback is the original local path, so existing setups keep
# working unchanged.
ratings = pd.read_csv(os.path.join(
    os.environ.get(
        'MOVIE_RECC_DATA_DIR',
        '/Users/merrillsequeira/Google Drive/UPGRAD_DATASETS/MOVIE_RECC',
    ),
    'ratings_small.csv',
))
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [31]:

# Make the evaluation reproducible: seed NumPy's global random generator
# (the Surprise FAQ recommends this for repeatable fold shuffling) and give
# SVD an explicit random_state for its factor initialisation.
np.random.seed(42)

# Surprise expects the columns in the order user, item, rating
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
svd = SVD(random_state=42)
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8944  0.8980  0.8978  0.8971  0.8935  0.8962  0.0018  
MAE (testset)     0.6879  0.6925  0.6912  0.6913  0.6879  0.6902  0.0019  
Fit time          5.56    6.07    6.17    6.39    7.28    6.30    0.56    
Test time         0.27    0.16    0.16    0.37    0.22    0.23    0.08    


{'test_rmse': array([0.89442636, 0.89798747, 0.89779815, 0.89711081, 0.89354396]),
 'test_mae': array([0.68786794, 0.69247387, 0.69121579, 0.69125315, 0.68794556]),
 'fit_time': (5.562578916549683,
  6.068567991256714,
  6.171259164810181,
  6.393385410308838,
  7.279313802719116),
 'test_time': (0.2654588222503662,
  0.16109228134155273,
  0.159013032913208,
  0.3654909133911133,
  0.21677708625793457)}

In [33]:
trainset = data.build_full_trainset()


In [34]:
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x12432eb10>

In [35]:
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [38]:
svd.predict(1, 10, 3)

Prediction(uid=1, iid=10, r_ui=3, est=2.7273491429450987, details={'was_impossible': False})

#### For user 1 and movie ID 10 the model predicts a rating of about 2.73 (est=2.7273491429450987)