In [40]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem.porter import PorterStemmer
import pickle

In [41]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [42]:
movies = movies.merge(credits, on='title')

In [43]:
# Eleminating other data except title, id, keywords, genres, overview, cast, crew
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [44]:
# Data Preprocessing
movies.isnull().sum()
movies.dropna(inplace=True)
movies.duplicated().sum()

0

In [45]:
# Function for genres and keywords
def convert(obj):
    list = []
    for i in ast.literal_eval(obj):
        list.append(i['name'])
    return list
    
# Preprocessing genres & keywords
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

# Function for Cast
def convertCast(obj):
    list = []
    count = 0
    for i in ast.literal_eval(obj):
        if count != 3:
            list.append(i['name'])
            count += 1
        else:
            break
    return list

# Preprocessing Cast
movies['cast'] = movies['cast'].apply(convertCast)

# Function for Crew
def convertCrew(obj):
    list = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            list.append(i['name'])
            break
    return list

# Preprocessing Crew
movies['crew'] = movies['crew'].apply(convertCrew)
                                      
# Preprocessing Overview
movies['overview'] = movies['overview'].apply(lambda x:x.split())        

# Removing space by applying the transformer
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ", "") for i in x])

In [46]:
# Creating tags using five categories
# Converting tags list items to a single string
# converting the string to lowercase
# then developing new datasets for the system

movies['tags'] = movies['overview']+movies['genres']+movies['keywords']+movies['cast']+movies['crew']
movies['tags'] = movies['tags'].apply(lambda x:" ".join(x))
movies['tags'] = movies['tags'].apply(lambda x:x.lower())
new_df = movies[['movie_id','title','tags']]

In [47]:
# implementing stemming
ps = PorterStemmer()
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [48]:
# Vectorization
cv = CountVectorizer(max_features = 5000, stop_words = 'english')
vectors = cv.fit_transform(new_df['tags']).toarray()

In [49]:
similarity = cosine_similarity(vectors)

In [50]:
similarity[0]

array([1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
       0.        ])

In [51]:
sorted(list(enumerate(similarity[0])),reverse = True,key =lambda x:x[1])[1:6]

[(1216, 0.28676966733820225),
 (2409, 0.26901379342448517),
 (3730, 0.2605130246476754),
 (507, 0.255608593705383),
 (539, 0.25038669783359574)]

In [52]:
# movie recommendation fucntions
def recommend(movie):
    movieIndex = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movieIndex]
    movieList = sorted(list(enumerate(distances)),reverse = True,key =lambda x:x[1])[1:6]

    for i in movieList:
        print(new_df.iloc[i[0]].title)

In [53]:
recommend('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


In [54]:
pickle.dump(new_df.to_dict(),open('movie.pkl','wb'))

In [55]:
pickle.dump(similarity,open('similarity.pkl','wb'))