# Movie Recommender System

## Data

In [1]:
# import modules
import numpy as np
import pandas as pd
import ast

In [2]:
# load the datasets

In [3]:
movie = pd.read_csv('movies_metadata.csv')
movie.head(2)

In [4]:
key = pd.read_csv('keywords.csv')
key['id'] = key['id'].astype(str)
key.head(2)

In [5]:
credit = pd.read_csv('credits.csv')
credit['id'] = credit['id'].astype(str)
credit.head(2)

In [6]:
# drop rows that are exact duplicates and renumber the index from 0
movie = movie.drop_duplicates().reset_index(drop=True)
key = key.drop_duplicates().reset_index(drop=True)
credit = credit.drop_duplicates().reset_index(drop=True)
# movie.duplicated().sum()
# key.duplicated().sum()
# credit.duplicated().sum()

In [7]:
# keep only the first row seen for each id, because the merge below needs unique ids
movie = movie.drop_duplicates(subset='id')
key = key.drop_duplicates(subset='id')
credit = credit.drop_duplicates(subset='id')
# sanity check: prints the number of ids that are still duplicated in each frame
for frame in (movie, key, credit):
    print(frame['id'].duplicated().sum())

<div style="color:purple; font-weight:bold; font-size:16px">
    Merge the datasets (using the DataFrame merge() method)
</div>

In [8]:
# join keywords with credits on 'id', then join that result onto the metadata on 'id';
# no `how` argument is given, so pandas' default join type applies to both merges
movies=movie.merge(key.merge(credit, on='id'),  on='id')
movies.head(1)

<div style="color:purple; font-weight:bold; font-size:16px">
    Picking the 3000 most popular movies (ranked by vote count)
</div>

In [9]:
pick_movies_count = 3000
# rank by number of votes (highest first), keep the leading rows, renumber them from 0
ranked_by_votes = movies.sort_values(by='vote_count', ascending=False)
movies = ranked_by_votes[:pick_movies_count].reset_index(drop=True)
movies[['title', 'vote_count']].tail(3)

In [10]:
movies.shape

In [11]:
movie.columns

In [12]:
movies.columns

In [13]:
# selecting only required columns
required_columns = [
    'id', 'imdb_id', 'title', 'adult', 'belongs_to_collection', 'genres',
    'original_language', 'production_companies', 'production_countries',
    'keywords', 'cast', 'crew', 'overview',
]
# .copy() makes this an independent frame: the following cells assign to its
# columns, and doing that on a bare column selection is chained assignment
# (pandas reports it as SettingWithCopyWarning)
movies = movies[required_columns].copy()
movies.head(1)

<div style="color:purple; font-weight:bold; font-size:16px">
    Handling the null values
</div>

In [14]:
movies.isnull().sum()

In [15]:
# placeholder substituted for missing values, per column
null_fill_values = {
    'original_language': 'en',
    # string form of a dict, so the later literal_eval-based parsing still works
    'belongs_to_collection': "{'name': ''}",
    'imdb_id': "na_imdb_id",
    'overview': '',
}
for column_name, placeholder in null_fill_values.items():
    movies[column_name] = movies[column_name].fillna(placeholder)

In [16]:
movies.isnull().sum().sum()

In [17]:
# movies2 = movies2.dropna(subset=['title'], axis=0)
# movies2[movies2['title'].isnull()]

In [18]:
# movies2['production_countries'] = movies2['production_countries'].fillna("[{'iso_3166_1': 'US', 'name': 'United States of America'}]")
# movies2.isnull().sum()

<div style="color:purple; font-weight:bold; font-size:16px">
    Making a tag for each movie
</div>

In [19]:
# make a new dataframe having only 3 columns: movie_id, title, tag (the tag column is built from the remaining columns)
#     first, convert the remaining columns into a usable format

In [20]:
movies.sample(2)

In [21]:
# relabel the raw flag: the string 'True' becomes 'adult', anything else 'not_adult'
adult_labels = movies['adult'].apply(lambda flag: 'adult' if flag == 'True' else 'not_adult')
movies['adult'] = adult_labels
movies['adult'].value_counts()
# all movies carry the label 'not_adult' (per the original author's check),
# so the column adds no information and is removed
movies.drop(columns='adult', inplace=True)

In [22]:
def convert1(collection):
    """Return a movie's collection name as a one-element list of tag tokens.

    `collection` is the string form of a dict literal, for example
    "{'id': 1, 'name': 'Toy Story Collection'}". The name is lower-cased and
    its whitespace-separated words are joined with underscores so the whole
    name forms a single token.

    Returns [''] when the dict has no 'name' entry or the name is empty/None,
    so the result is always a list that can be concatenated with other lists.
    """
    # `or ''` covers both a missing key and an explicit None/empty name
    name = ast.literal_eval(collection).get('name') or ''
    return ['_'.join(name.split()).lower()]

In [23]:
movies['belongs_to_collection'] = movies['belongs_to_collection'].apply(convert1)

In [24]:
def convert2(genere):
    """Turn the string form of a list of dicts into a list of name tokens.

    Each dict's 'name' value is lower-cased and its whitespace-separated
    words are joined with underscores; the tokens keep the input order.
    """
    entries = ast.literal_eval(genere)
    return ['_'.join(entry['name'].split()).lower() for entry in entries]

In [25]:
movies['genres'] = movies['genres'].apply(convert2)

In [26]:
movies['production_companies'] = movies['production_companies'].apply(convert2)

In [27]:
movies['production_countries'] = movies['production_countries'].apply(convert2)

In [28]:
movies['keywords'] = movies['keywords'].apply(convert2)

In [29]:
def convert3(cast):
    """Return name tokens for the first three entries of a cast list.

    `cast` is the string form of a list of dicts. Each selected dict's 'name'
    is lower-cased and its words are joined with underscores.
    """
    leading_entries = ast.literal_eval(cast)[:3]
    return ['_'.join(member['name'].split()).lower() for member in leading_entries]

In [30]:
movies['cast'] = movies['cast'].apply(convert3)

In [31]:
def convert4(crew):
    """Return name tokens for crew entries whose department is 'Directing'.

    `crew` is the string form of a list of dicts. For each matching dict the
    'name' is lower-cased and its words are joined with underscores; entries
    from other departments are skipped.
    """
    return [
        '_'.join(member['name'].split()).lower()
        for member in ast.literal_eval(crew)
        if member['department'] == 'Directing'
    ]

In [32]:
movies['crew'] = movies['crew'].apply(convert4)

In [33]:
# title and language are scalars: wrap each in a one-element list so that every
# column can be concatenated row by row with the list-valued columns
title_tokens = movies['title'].apply(lambda title: [title])
language_tokens = movies['original_language'].apply(lambda lan: [lan])
tag_tokens = (
    title_tokens
    + movies['belongs_to_collection']
    + movies['genres']
    + language_tokens
    + movies['production_companies']
    + movies['production_countries']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)
# one space-separated string of tokens per movie
movies['tag'] = tag_tokens.apply(' '.join)
# movies['tag'] = (movies['title'].apply(lambda title : [title])+movies['belongs_to_collection']+movies['genres']+movies['production_companies']).apply(lambda tag : ' '.join(tag))
# movies['tag'] = (movies['belongs_to_collection']).apply(lambda tag : ' '.join(tag))
movies['tag'].iloc[0]

In [34]:
# .copy() makes movies1 an independent frame; its 'tag' column is reassigned
# later, and assigning into a bare column selection is chained assignment
# (pandas reports it as SettingWithCopyWarning)
movies1 = movies[['id', 'imdb_id', 'belongs_to_collection', 'title', 'tag']].copy()
movies1.sample(5)

In [35]:
# movies3['tag'][23]

In [36]:
# import nltk

# nltk.download('punkt')  # Downloads the punkt tokenizer
# nltk.download('averaged_perceptron_tagger')  # Downloads the part-of-speech tagger
# nltk.download('wordnet')  # Downloads the WordNet lemmatizer and corpus
# nltk.download('stopwords')  # Downloads the stopwords corpus
# nltk.download('maxent_ne_chunker')  # Downloads the named entity chunker
# nltk.download('words')  # Downloads the list of words


In [37]:
# we are unable to build ML model with this large dataset, so we take a slice of the dataset

# movies4 = movies3.iloc[0:40000:5]
# movies4.shape
# movies4.head()

<div style="color:purple; font-weight:bold; font-size:16px">
    Stemming of tags
</div>

In [38]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
ps = PorterStemmer()
def stemming(sentence):
    """Tokenize `sentence`, stem every token with `ps`, and re-join with spaces."""
    return ' '.join(ps.stem(token) for token in word_tokenize(sentence))

In [39]:
movies1['tag'] = movies1['tag'].apply(stemming)

In [40]:
print(movies1['tag'][0])

<div style="color:purple; font-weight:bold; font-size:16px">
    Movie Search Engine
</div>

In [41]:
def movie_search_engine(movie_name):
    """Return the rows of `movies1` whose title contains `movie_name`.

    The match is a case-insensitive substring test. Rows are returned in
    their existing order; an empty frame is returned when nothing matches.
    """
    needle = movie_name.lower()  # lower-case the query once, not once per row
    # Iterate titles by position so the result does not depend on the frame's
    # index labels; non-string titles (e.g. NaN) can never match and are skipped.
    positions = [
        position
        for position, title in enumerate(movies1['title'])
        if isinstance(title, str) and needle in title.lower()
    ]
    return movies1.iloc[positions]
movie_search_engine(movie_name='harry')
# movies.iloc[[2,5]]

<div style="color:purple; font-weight:bold; font-size:16px">
    Vectorization
</div>

In [42]:
from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import HashingVectorizer
# bag-of-words counts over the tag strings: vocabulary capped at 5000 features,
# with scikit-learn's built-in English stop-word list filtered out
vectorizer = CountVectorizer(max_features=5000, stop_words='english')
# vectorizer = HashingVectorizer(n_features=15000, stop_words='english')
# learn the vocabulary from the tags and build one count vector per movie
vectors = vectorizer.fit_transform(movies1['tag'])

In [43]:
# a = vectorizer.get_feature_names_out()
# print(a[:20])
# vectorizer.get_stop_words()

In [44]:
vectors.dtype

In [45]:
a = vectorizer.get_feature_names_out()
sorted(a)[50:]

In [46]:
# the count matrix may take too much space, so store the counts as 32-bit integers.
# astype() converts the values into a new matrix; assigning to `.dtype` instead
# would only reinterpret the existing buffer and can corrupt the stored counts.
vectors = vectors.astype(np.int32)

In [47]:
vectors_arr = vectors.toarray()

In [48]:
vectors_arr.dtype

In [49]:
a = vectors_arr
i = 25
print(np.sum(a[i]==0))
print(np.sum(a[i]!=0))
# print(a[a(i)==0])

<div style="color:purple; font-weight:bold; font-size:16px">
    Cosine Similarities
</div>

In [50]:
len(vectors_arr)

In [51]:
# def vector_chunks_generator(vectors_arr, chunks):
#     chunk_size = len(vectors_arr) / chunks
#     for i in range(chunks):
#         yield vectors_arr[i*chunk_size : (i+1)*chunk_size+1]
# vector_chunks = vector_chunks_generator(vectors_arr, 10)
# vector_chunks.next()

In [52]:
# from scipy.sparse import csr_matrix
# vectors_arr_sparse = csr_matrix(vectors_arr)

In [53]:
# cosine similarity

from sklearn.metrics.pairwise import cosine_similarity
similarities = cosine_similarity(vectors_arr)

In [54]:
# def calculate_cosine_similarity(array):
#     n = len(array)
#     similarities = []
#     for i in range(n):
#         arr = []
#         for j in range(n):
#             cs = cosine_similarity([array[i], array[j]])
#             if cs[0][1]: arr.append((j, cs[0][1]))
#         similarities.append(arr)
#         print(i)
#     return similarities
# similarities = calculate_cosine_similarity(vectors_arr)

In [55]:
print(movie_search_engine('harry'))
similarities[32][64]

In [56]:
# sorted(enumerate(similarities[0]), reverse  = True, key=lambda x : x[1])

In [57]:
# def numpy_to_indexed_2d_list(arr):
#     lis = []
#     for a in arr:
#         lis.append(list(a))
#     return lis 

In [58]:
# similarities2 = numpy_to_indexed_2d_list(similarities)
# similarities2

In [59]:
# movies3[movies3['title']=='The American President']
# sorted(similarities[0], reverse=1)
# a = enumerate(similarities[0])
# a = list(a)
# sorted(a, reverse=1, key= lambda x : x[1])

In [60]:
def from_same_series(movie_index):
    """Return the row numbers of the other movies in the same collection.

    `movie_index` identifies the reference movie in `movies1`. A movie whose
    collection is [''] (no collection) has no series, so the result is [].
    The reference movie itself is never included.

    NOTE(review): assumes `movies1` has the default 0..n-1 index, so that
    labels and row positions coincide — confirm if the index ever changes.
    """
    collections = movies1['belongs_to_collection']
    series = collections[movie_index]
    # the "no collection" check does not depend on the row being scanned,
    # so decide it once instead of on every iteration
    if series == ['']:
        return []
    return [
        position
        for position, other in enumerate(collections)
        if position != movie_index and other == series
    ]
def recommend(movie, suggest_count=10):
    """Return up to `suggest_count` recommended titles for the movie `movie`.

    Movies from the same collection (see `from_same_series`) come first,
    ordered by decreasing similarity; the remaining slots are filled with the
    most similar other movies. The queried movie is never recommended.

    Parameters:
        movie: exact title to look up in `movies1['title']`; the first match
            is used when several rows share the title.
        suggest_count: maximum number of titles to return (default 10).

    Raises:
        IndexError: if no row of `movies1` has the title `movie`.

    NOTE(review): the matched index label is also used as the row number into
    `similarities`, which assumes `movies1` has the default 0..n-1 index.
    """
    title_matches = movies1.index[movies1['title'] == movie]
    if len(title_matches) == 0:
        raise IndexError(f"no movie titled {movie!r} in movies1")
    movie_index = title_matches[0]

    # (row number, similarity) pairs, most similar first
    ranked = sorted(
        enumerate(similarities[movie_index]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # same-collection movies first, in similarity order
    same_series = set(from_same_series(movie_index))
    picks = [index for index, _ in ranked if index in same_series]
    picks = picks[:max(suggest_count, 0)]

    # Fill the remaining slots. The queried movie is excluded by its index,
    # not by assuming it is ranked first: a tie in similarity could otherwise
    # put another movie first and let the queried movie recommend itself.
    chosen = set(picks)
    for index, _ in ranked:
        if len(picks) >= suggest_count:
            break
        if index != movie_index and index not in chosen:
            picks.append(index)
            chosen.add(index)
    return list(movies1['title'].iloc[picks])

In [61]:
recommend("Interstellar")

In [62]:
movie_search_engine('interstellar')

In [63]:
from_same_series(32)

In [65]:
similarities.dtype

In [64]:
import pickle
# context managers make sure each file is flushed and closed once its dump finishes
with open("movies.pkl", 'wb') as movies_file:
    pickle.dump(movies1, movies_file)
with open("similarities.pkl", 'wb') as similarities_file:
    pickle.dump(similarities, similarities_file)