# Cosine Similarity

## Using NumPy

In [1]:
from numpy import dot
from numpy.linalg import norm
import numpy as np
def cos_sim(A, B):
    """Return the cosine similarity between two vectors.

    Computed as ``dot(A, B) / (norm(A) * norm(B))``.

    Parameters
    ----------
    A, B : array-like
        Numeric vectors of equal length (this notebook passes 1-D arrays).

    Returns
    -------
    float
        The cosine similarity. If either vector has zero norm the result
        is defined as 0.0 instead of dividing by zero (which would yield
        nan and a RuntimeWarning).
    """
    denom = norm(A) * norm(B)
    # A zero vector has no direction; report "no similarity" rather than nan.
    if denom == 0:
        return 0.0
    return dot(A, B) / denom

In [2]:
# Toy term-count vectors for three "documents"; doc3 is doc2 scaled by 2,
# so the two point in the same direction.
doc1, doc2, doc3 = (
    np.array(counts)
    for counts in ([0, 1, 1, 1], [1, 0, 1, 1], [2, 0, 2, 2])
)

In [3]:
# All three pairwise similarities; doc2 and doc3 are parallel, so that pair
# should be 1 up to floating-point rounding.
cos_sim(doc1, doc2), cos_sim(doc1, doc3), cos_sim(doc2, doc3)

(0.6666666666666667, 0.6666666666666667, 1.0000000000000002)

## Using Scikit-Learn

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
# Each vector is wrapped in a list so it is passed as a one-row matrix;
# the result is a 1x1 similarity matrix.
cosine_similarity([doc1], [doc2])

array([[0.66666667]])

In [5]:
# Same check for the parallel pair (doc3 is 2 * doc2).
cosine_similarity([doc2], [doc3])

array([[1.]])

## Recommender System Using Similarity

In [6]:
from google.colab import files
# Colab-only upload helper; the returned mapping is keyed by uploaded file name.
uploaded = files.upload()
# Use the name of the first uploaded file as the CSV path.
# NOTE(review): raises IndexError if nothing was uploaded.
filename = list(uploaded.keys())[0]

Saving movies_metadata.csv to movies_metadata.csv


In [7]:
import pandas as pd
# Load the uploaded CSV. low_memory = False has pandas infer each column's
# dtype from the whole file rather than chunk by chunk.
movie = pd.read_csv(filename, low_memory = False)
# Preview a single row to see which columns are available.
movie.head(1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0


In [8]:
# (rows, columns) of the raw metadata table.
movie.shape

(45466, 24)

In [9]:
# Keep only the two columns the recommender uses: the title (for lookup)
# and the overview text (for similarity).
df = movie[['title', 'overview']]
df.head()

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [10]:
# Full overview text of the first movie (head() truncates it).
df.overview[0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

In [11]:
# Work on the first 10,000 movies only, which keeps the pairwise similarity
# matrix built later (n x n) to a manageable size.
# .copy() makes this an independent frame rather than a slice of `movie`,
# so the later column assignment and row drops do not trigger pandas'
# SettingWithCopyWarning.
df = df.head(10000).copy()

### Preprocessing Data

In [12]:
# Count missing values per column; overviews with no text must be removed
# before vectorizing.
df.isnull().sum()

title        0
overview    29
dtype: int64

In [13]:
import warnings
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including deprecation/behaviour-change warnings from pandas. Prefer fixing
# the cause or filtering a specific warning category.
warnings.filterwarnings('ignore')

In [14]:
# Drop rows with a missing title or overview. Rebinding the result (instead
# of inplace = True) avoids mutating a frame that was produced by head().
df = df.dropna(how = 'any')
df.shape

(9971, 2)

In [15]:
# After dropping rows the index labels still carry the old row numbers,
# so they no longer match the row positions.
df.tail(3)

Unnamed: 0,title,overview
9997,The Frisco Kid,Rabbi Avram arrives in Philadelphia from Polan...
9998,Onmyoji: The Yin Yang Master,"During a dark time in the Heian period, when e..."
9999,State Property 2,Three gangsters vie for control of the streets...


In [16]:
# Renumber the rows 0..n-1 so index labels equal row positions; the
# recommender below uses these labels to index rows of the similarity matrix.
# reset_index(drop = True) does this in one step, replacing the
# set_index('title') / reset_index() round trip.
df = df.reset_index(drop = True)
df.tail(3)

Unnamed: 0,title,overview
9968,The Frisco Kid,Rabbi Avram arrives in Philadelphia from Polan...
9969,Onmyoji: The Yin Yang Master,"During a dark time in the Heian period, when e..."
9970,State Property 2,Three gangsters vie for control of the streets...


#### Preprocessing Text

In [17]:
# Strip everything except ASCII letters and spaces from each overview.
# regex = True must be explicit: when the pattern is treated as a literal
# string (the default in recent pandas), '[^A-Za-z ]' matches nothing and
# the text is left unchanged.
df['clean_doc'] = df.overview.str.replace('[^A-Za-z ]', '', regex = True)
df.head(3)

Unnamed: 0,title,overview,clean_doc
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Led by Woody Andys toys live happily in his ro...
1,Jumanji,When siblings Judy and Peter discover an encha...,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,A family wedding reignites the ancient feud be...


#### Document-Term Matrix (DTM)

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer that ignores scikit-learn's built-in English stop words.
tvect = TfidfVectorizer(stop_words = 'english')
# Fit on the raw overviews: one row per movie, one column per vocabulary term.
tfidf_matrix = tvect.fit_transform(df.overview)
tfidf_matrix.shape

(9971, 32350)

In [19]:
# Vectorize the cleaned text with its own vectorizer. Re-fitting `tvect`
# would overwrite its vocabulary, leaving it inconsistent with the
# `tfidf_matrix` that was built from the raw overviews.
tvect_clean = TfidfVectorizer(stop_words = 'english')
tfidf_clean = tvect_clean.fit_transform(df.clean_doc)
tfidf_clean.shape

(9971, 36150)

#### Table with Title & Index

In [20]:
# Lookup table: movie title -> row position in df.
indices = pd.Series(df.index, index = df.title)
# Series.drop_duplicates() compares the *values* (the already-unique row
# numbers), so it never removed repeated titles. De-duplicate on the index
# labels instead, keeping the first occurrence, so indices[title] is always
# a single integer.
indices = indices[~indices.index.duplicated(keep = 'first')]
indices.head()

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

In [21]:
# Sanity check: look up the row position of a known title.
indices['Jumanji']

1

#### Function that Finds Similar Movies using Cosine Similarity

In [22]:
from sklearn.metrics.pairwise import linear_kernel
# Pairwise dot products between all TF-IDF rows (raw overviews, not clean_doc).
# NOTE(review): a dot product equals cosine similarity only when the rows are
# unit-length; this relies on TfidfVectorizer's default L2 normalisation.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [23]:
# One row and one column per movie.
cosine_sim.shape

(9971, 9971)

In [24]:
# Similarities of movie 1 to the first five movies; position 1 is the movie
# compared with itself.
cosine_sim[1, :5]

array([0.01682702, 1.        , 0.04871756, 0.        , 0.        ])

In [25]:
def get_recommendations(title, cosine_sim = cosine_sim, top_n = 10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``indices`` Series.
    cosine_sim : 2-D array, optional
        Pairwise similarity matrix whose rows/columns follow the row order
        of ``df``. Defaults to the matrix bound when this cell was run.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``df.title``, most similar first, with
        the queried movie itself excluded.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields a Series of positions;
    # fall back to the first occurrence instead of failing further down.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable descending order: ties keep ascending row order, matching the
    # behaviour of sorted(..., reverse = True) on (position, score) pairs.
    ranked = np.argsort(-scores, kind = 'stable')
    # Remove the query row explicitly rather than assuming it sorts first
    # (a movie with an identical overview and a lower row number would
    # otherwise take its place).
    ranked = ranked[ranked != idx][:top_n]
    return df.title.iloc[ranked]

In [27]:
# Example query: the ten movies whose overviews are closest to Jumanji's.
get_recommendations('Jumanji')

6143               Brainscan
8776                 Quintet
9475               Word Wars
8055                 Masques
6032        Poolhall Junkies
2468                eXistenZ
9081                 Nirvana
1490      The Innocent Sleep
7725      The Last of Sheila
7882    The Last Starfighter
Name: title, dtype: object