In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv
/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv


In [2]:
movies = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv')
credits = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv')

In [3]:
#movies.head()
#credits.head()

# EDA

In [4]:
df = movies.merge(credits, on='title')

In [5]:
# Columns to remove
# budget, homepage, original_language, original_title, popularity, production_companies, production_countries, release_date, revenue, runtime, spoken_languages, status, tagline, vote_average, vote_count, movie_id

In [6]:
df = df.drop(columns = ['budget', 'homepage', 'original_language', 'original_title', 'popularity', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'vote_average', 'vote_count', 'movie_id'])
#df.head()

In [7]:
#df.isnull().sum()
#df.duplicated().sum()
df.dropna(inplace=True)

# Data Pre-Processing

In [8]:
import ast

In [9]:
def convert_genre(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [10]:
df['genres'] = df['genres'].apply(convert_genre)
df['keywords'] = df['keywords'].apply(convert_genre)
#df.head()

In [11]:
def convert_cast(obj):
    L = []
    count = 0
    for i in ast.literal_eval(obj):
        if count != 3:
            L.append(i['name'])
            count+=1
        else:
            break
    return L

In [12]:
df['cast'] = df['cast'].apply(convert_cast)
#df.head()

In [13]:
def convert_crew(obj):
    L=[]
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [14]:
df['crew'] = df['crew'].apply(convert_crew)
#df.head()

In [15]:
df['overview'] = df['overview'].apply(lambda x:x.split())
#df.head()

In [16]:
df['genres'] = df['genres'].apply(lambda x:[i.replace(' ', '') for i in x])
df['keywords'] = df['keywords'].apply(lambda x:[i.replace(' ', '') for i in x])
df['cast'] = df['cast'].apply(lambda x:[i.replace(' ', '') for i in x])
df['crew'] = df['crew'].apply(lambda x:[i.replace(' ', '') for i in x])
#df.head()

In [17]:
df['tags'] = df['overview'] + df['genres'] + df['keywords'] + df['cast'] + df['crew']
#df.head()

In [18]:
final_df = df[['id', 'title', 'tags']]
final_df['tags'] = final_df['tags'].apply(lambda x:" ".join(x))
final_df['tags'] = final_df['tags'].apply(lambda x:x.lower())
#final_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['tags'] = final_df['tags'].apply(lambda x:" ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['tags'] = final_df['tags'].apply(lambda x:x.lower())


# Vectorization

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

In [20]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [21]:
def stem(text) :
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return ' '.join(y)

In [22]:
final_df['tags'] = final_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['tags'] = final_df['tags'].apply(stem)


In [23]:
vectors = cv.fit_transform(final_df['tags']).toarray()

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
similarity = cosine_similarity(vectors)

In [26]:
# to get index position along with the similarity number
#list(enumerate(similarity[0]))

In [27]:
# 1. Find index -> 2. Sort the array -> 3. Find top 5
def recommend(movie):
    movie_index =  final_df[final_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]

    for i in movies_list:
        print(final_df.iloc[i[0]].title)
    return

In [28]:
recommend('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


# Exporting for Deployment

In [29]:
import pickle

In [30]:
pickle.dump(final_df, open('movies.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))