In [None]:
import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
data=pd.read_csv('movies_dataset.csv')

In [None]:
data.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [None]:
data.shape

(4806, 3)

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4806 entries, 0 to 4805
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4806 non-null   int64 
 1   title     4806 non-null   object
 2   tags      4806 non-null   object
dtypes: int64(1), object(2)
memory usage: 112.8+ KB


In [None]:
data.isnull().sum()

Unnamed: 0,0
movie_id,0
title,0
tags,0


In [None]:
data.duplicated().sum()

np.int64(0)

In [None]:
#stemming converts every word to its base from like programmable,programs,programmed ->program
ps=PorterStemmer()
ps.stem("programmable")

'programm'

In [None]:
ps.stem("ate")

'ate'

In [None]:
data['tags'][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron'

In [None]:
#convert to lowercase
data['tags']=data['tags'].apply(lambda x:x.lower())

In [None]:
#remove punctuations
import string

def remove_punctuations(text):
  clean_text=''

  for i in text:
    if i not in string.punctuation:
      clean_text= clean_text +i

  return clean_text

data['tags']=data['tags'].apply(remove_punctuations)


In [None]:
#stemming the tags
def stem(text):
  words=text.split()
  stemmed_words=[ps.stem(word) for word in words]
  return ' '.join(stemmed_words)

data['tags']=data['tags'].apply(stem)

In [None]:
data['tags'][0]

'in the 22nd centuri a parapleg marin is dispatch to the moon pandora on a uniqu mission but becom torn between follow order and protect an alien civil action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron'

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
stopwords=stopwords.words('english')
def remove_stopwords(text):
  words=text.split()
  clean_text=[]

  for word in words:
    if word not in stopwords:
      clean_text.append(word)
  return ' '.join(clean_text)

In [None]:
#remove numbers

def remove_numbers(text):
  clean_text=''
  for i in text:
    if not i.isdigit():
      clean_text=clean_text+i
  return clean_text

data['tags']=data['tags'].apply(remove_numbers)

In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemma=WordNetLemmatizer()

lemma.lemmatize("bearable")

[nltk_data] Downloading package wordnet to /root/nltk_data...


'bearable'

In [None]:
lemma.lemmatize("eating") #not accurate

'eating'

In [None]:
import spacy
!pip install spacy
!pip -m spacy download en_core_web_sm



Usage:   
  pip3 <command> [options]

no such option: -m


In [None]:
nlp=spacy.load('en_core_web_sm')

In [None]:
def word_lemmatize(text):
  words=text.split()
  doc=nlp(text)
  lemmatized=[]

  for word in doc:
    lemmatized.append(word.lemma_)

  return ' '.join(lemmatized)

data['tags']=data['tags'].apply(word_lemmatize)

In [None]:
countvec=CountVectorizer(max_features=5000)
vector=countvec.fit_transform(data['tags']).toarray()

In [None]:
vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
similarity=cosine_similarity(vector)

In [None]:
#compares the movie to every other movie present
similarity[0]

array([1.        , 0.2764379 , 0.2393494 , ..., 0.26778222, 0.19215378,
       0.14504813])

In [None]:
data[data['title'].str.contains("The Lord of the Rings: The Fellowship")]

Unnamed: 0,movie_id,title,tags
262,120,The Lord of the Rings: The Fellowship of the Ring,young hobbit frodo baggin after inherit a myst...


In [None]:
index=data[data['title'].str.contains("The Lord of the Rings: The Fellowship")].index[0]

In [None]:
scores=list(enumerate(similarity[index]))

In [None]:
scores

[(0, np.float64(0.2688774478590815)),
 (1, np.float64(0.3539424572287375)),
 (2, np.float64(0.3300290050991338)),
 (3, np.float64(0.3467255099282034)),
 (4, np.float64(0.40946331782446055)),
 (5, np.float64(0.2395350687902042)),
 (6, np.float64(0.23116561100820565)),
 (7, np.float64(0.4945242492550593)),
 (8, np.float64(0.19329200569671784)),
 (9, np.float64(0.27659127289275987)),
 (10, np.float64(0.31416904015569347)),
 (11, np.float64(0.4185625026650041)),
 (12, np.float64(0.2112500740992099)),
 (13, np.float64(0.32653333223400444)),
 (14, np.float64(0.3440991148196579)),
 (15, np.float64(0.5014287473124103)),
 (16, np.float64(0.3239105320715664)),
 (17, np.float64(0.36596906257588235)),
 (18, np.float64(0.4119127712848206)),
 (19, np.float64(0.5434973268216643)),
 (20, np.float64(0.36604471829822377)),
 (21, np.float64(0.384110639798688)),
 (22, np.float64(0.48325675746658886)),
 (23, np.float64(0.3042609649822954)),
 (24, np.float64(0.2535508041519128)),
 (25, np.float64(0.36924666

In [None]:
sorted_scores=sorted(scores,key=lambda x:x[1], reverse=True)[1:6]

In [None]:
sorted_scores

[(330, np.float64(0.5472774683663488)),
 (19, np.float64(0.5434973268216643)),
 (2734, np.float64(0.5409713236187681)),
 (847, np.float64(0.5340404900482003)),
 (329, np.float64(0.5241896362718121))]

In [None]:
for idx,prob in sorted_scores:
  print(data['title'][idx])

The Lord of the Rings: The Two Towers
The Hobbit: The Battle of the Five Armies
Justin Bieber: Never Say Never
Semi-Pro
The Lord of the Rings: The Return of the King


In [None]:
def recommend(movie):
    #movie_index = new_df[new_df['title'] == movie].index[0]
    movie_list = data[data['title'].str.contains(movie)]
    if len(movie_list):
        movie_idx= movie_list.index[0]
        distances = similarity[movie_idx]
        movies_list = sorted(list(enumerate(distances)),reverse=True, key=lambda x:x[1])[1:6]

        #
        print('Recommendations for {0} :\n'.format(movie_list.iloc[0]['title']))
        for i in movies_list:
            print(data.iloc[i[0]].title)
    else:
        return "No movies found. Please check your input"

In [None]:
recommend("Interstellar")

Recommendations for Interstellar :

The Blade of Don Juan
The Right Stuff
Capricorn One
Prometheus
Serenity


In [None]:
recommend('The Matrix')

Recommendations for The Matrix Revolutions :

The Matrix
The Matrix Reloaded
The Work and the Glory II: American Zion
300: Rise of an Empire
Terminator Genisys


In [None]:
import pickle

In [None]:
with open('similarity.pkl', 'wb') as f:
  pickle.dump(similarity,f)

In [None]:
data.to_pickle('movies.pkl')

In [None]:
from google.colab import files
files.download('similarity.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>