# RECOMMENDATION ENGINE:

### Notebook Contents:

- Content-Based Filtering using TMDB 5000 Dataset
    
- Collaborative Filtering using MovieLens Dataset
    
- Evaluation
    
- Conclusion

In [1]:
import pandas as pd
import numpy as np

### 1. Content-Based Filtering

In [2]:
import ast
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
movies=pd.read_csv('dataset/tmdb_5000_movies.csv')
credits=pd.read_csv('dataset/tmdb_5000_credits.csv')

'''
from pymongo import MongoClient
client = MongoClient()
db1 = client.moviesdatabase
movies = pd.DataFrame(list(db1.tmdb5000.find()))
db2 = client.creditsdatabase
credits = pd.DataFrame(list(db2.tmdb5000.find()))
'''

'\nfrom pymongo import MongoClient\nclient = MongoClient()\ndb1 = client.moviesdatabase\nmovies = pd.DataFrame(list(db1.tmdb5000.find()))\ndb2 = client.creditsdatabase\ncredits = pd.DataFrame(list(db2.tmdb5000.find()))\n'

In [4]:
movies=movies.merge(credits, on='title')

In [5]:
movies=movies[['id','title','overview','genres','keywords','cast','crew']]

In [6]:
def convert(obj):
    L=[]
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

movies['genres']=movies['genres'].apply(convert)
movies['keywords']=movies['keywords'].apply(convert)

In [7]:
def convert2(obj):
    L=[]
    count=0
    for i in ast.literal_eval(obj):
        if count!=3:
            L.append(i['name'])
            count+=1
        else:
            break
    return L

movies['cast']=movies['cast'].apply(convert2)

In [8]:
def convert3(obj):
    L=[]
    for i in ast.literal_eval(obj):
        if i['job']=='Director':
            L.append(i['name'])
            break
    return L

movies['crew']=movies['crew'].apply(convert3)

In [9]:
movies['keywords']=movies['keywords'].apply(lambda x: [i.replace(" ","") for i in x])
movies['cast']=movies['cast'].apply(lambda x: [i.replace(" ","") for i in x])
movies['crew']=movies['crew'].apply(lambda x: [i.replace(" ","") for i in x])

In [10]:
movies['overview']=movies['overview'].apply(lambda x: str(x).split())

In [11]:
movies['tags']=movies['overview']+movies['genres']+movies['keywords']+movies['cast']+movies['crew']
new_df=movies[['id','title','tags']]
new_df['tags']=new_df['tags'].apply(lambda x: " ".join(x))
new_df['tags']=new_df['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x: x.lower())


In [12]:
import nltk
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [13]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

new_df['tags']=new_df['tags'].apply(stem)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(stem)


In [14]:
cv=CountVectorizer(max_features=5000, stop_words='english')
vectors=cv.fit_transform(new_df['tags']).toarray()

In [15]:
from sklearn.metrics.pairwise import cosine_similarity
similarity=cosine_similarity(vectors)
sorted(list(enumerate(similarity[0])), reverse=True, key=lambda x:x[1])[1:6]

[(1216, 0.29986532511593345),
 (2409, 0.28644594961577313),
 (507, 0.2710098294963041),
 (539, 0.2706659809803833),
 (1204, 0.2625754538144587)]

In [16]:
def recommend(movie):
    movie_index=(new_df[new_df['title']==movie].index[0])
    distances=similarity[movie_index]
    movies_list=sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]

    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [17]:
import pickle
pickle.dump(new_df.to_dict(),open('movies_dict.pkl','wb'))
pickle.dump(similarity,open('recommend_1.pkl','wb'))

### 2. Collaborative Filtering

In [18]:
ratings = pd.read_csv('dataset/ratings.csv')

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(ratings, test_size = 0.30, random_state = 42)

user_data = X_train.pivot(index = 'userId', columns = 'movieId', values = 'rating').fillna(0)

In [20]:
# make a copy of train and test datasets
dummy_train = X_train.copy()
dummy_test = X_test.copy()

dummy_train['rating'] = dummy_train['rating'].apply(lambda x: 0 if x > 0 else 1)
dummy_test['rating'] = dummy_test['rating'].apply(lambda x: 1 if x > 0 else 0)

In [21]:
# The movies not rated by user is marked as 1 for prediction 
dummy_train = dummy_train.pivot(index = 'userId', columns = 'movieId', values = 'rating').fillna(1)

# The movies not rated by user is marked as 0 for evaluation 
dummy_test = dummy_test.pivot(index ='userId', columns = 'movieId', values = 'rating').fillna(0)

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

# User Similarity Matrix using Cosine similarity as a similarity measure between Users
user_similarity = cosine_similarity(user_data)
user_similarity[np.isnan(user_similarity)] = 0

In [23]:
user_predicted_ratings = np.dot(user_similarity, user_data)
user_final_ratings = np.multiply(user_predicted_ratings, dummy_train)

In [25]:
pickle.dump(user_similarity,open('recommend_2.pkl','wb'))