In [17]:
import pandas as pd 

movies = pd.read_csv('data/movies.csv')
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [18]:
movies.tail(10)

Unnamed: 0,movieId,title,genres
62413,209145,Liberté (2019),Drama
62414,209147,The Carpet of Horror (1962),Crime|Horror
62415,209151,Mao Zedong 1949 (2019),(no genres listed)
62416,209153,Happy Flight (2008),Comedy|Drama
62417,209155,Santosh Subramaniam (2008),Action|Comedy|Romance
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)
62422,209171,Women of Devil's Island (1962),Action|Adventure|Drama


In [19]:
# extra characters like parentheses in the movie titles will make it difficult to search for movies.
# I will use regular expressions to clean up the title column by removing any character that isn't a letter, digit, or a space.

import re 

def clean_title(title: str) -> str:
    return re.sub("[^a-zA-Z0-9 ]", "", title)

movies['clean_title'] = movies['title'].apply(lambda title: clean_title(title))
movies['clean_title'].head(10)

0                      Toy Story 1995
1                        Jumanji 1995
2               Grumpier Old Men 1995
3              Waiting to Exhale 1995
4    Father of the Bride Part II 1995
5                           Heat 1995
6                        Sabrina 1995
7                   Tom and Huck 1995
8                   Sudden Death 1995
9                      GoldenEye 1995
Name: clean_title, dtype: object

Since the movie titles are text-based and ML models don't work directly on words, I will use the `TfidfVectorizer` tool in python to transform the text data into numerical form that captures how important each word is.

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 2))

tfidf = vectorizer.fit_transform(movies['clean_title'])
print(tfidf)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 446566 stored elements and shape (62423, 170073)>
  Coords	Values
  (0, 153609)	0.4788631896261391
  (0, 138134)	0.30818287987354687
  (0, 763)	0.2947573407787223
  (0, 153617)	0.5236464902527855
  (0, 138180)	0.5609151642422612
  (1, 763)	0.3284429867728573
  (1, 76515)	0.6556226145512709
  (1, 76516)	0.679914841526996
  (2, 763)	0.22159051090518359
  (2, 61531)	0.4587178998289233
  (2, 107020)	0.2945915056134832
  (2, 93306)	0.2658829644982531
  (2, 61532)	0.4587178998289233
  (2, 107075)	0.4026827592738571
  (2, 93339)	0.4587178998289233
  (3, 763)	0.21653641961669365
  (3, 161345)	0.3375257478128795
  (3, 151795)	0.18830007825002149
  (3, 47814)	0.4482553482876627
  (3, 161363)	0.4482553482876627
  (3, 151964)	0.4482553482876627
  (3, 47815)	0.4482553482876627
  (4, 763)	0.19764049948025617
  (4, 49639)	0.2743880168654919
  (4, 104437)	0.1176070632906874
  :	:
  (62419, 165119)	0.3842738783112516
  (62419, 842)	0.2166429

With the `tfidf` matrix calculated, computing the similarity between our search term and the titles in our data becomes much more straightforward. The `scikit-learn` library has a handy function called `cosine_similarity()` to accomplish this.

In [21]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 

# for each given search term (title), this function will create and return an NDArray whose cells are values between 0 and 1 indicating how similar the search term and the title at the corresponding location in the tfidf matrix we created previously. A value closer to 1 means they are more similar.
def search(title: str) -> np.ndarray:
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    return similarity

# We remember that movies.head() showed a few titles with '1995' string in it, so we should expect when we call search("1995") that we get values > 0 for the first few elements of the ndarray
search("1995")


array([0.29475734, 0.32844299, 0.22159051, ..., 0.        , 0.        ,
       0.        ], shape=(62423,))

In [22]:
# return the top 5 highest matching movies based on the search term
similarity = search("Toy Story 1995")
indices = np.argpartition(similarity, -5)[-5:]
top_5_movies = movies.iloc[indices]
top_5_movies

Unnamed: 0,movieId,title,genres,clean_title
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995


Ok, I think I want to create another search function that handles the fetching of rows from the `movies` dataframe. The new function `nsearch(title, n)` will accept a second parameter, `n`, which corresponds to the top N matches for the given title. 


In [31]:
def nsearch(title: str, n: int = 1) -> pd.DataFrame:
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -n)[-n:]
    results = movies.iloc[indices].sort_index()
    return results

# lets see if it works as expected
top_matching_movie = nsearch("Toy Story 1995")
top_matching_movie # should be the row containing 'Toy Story 1995'

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995


In [32]:
top_5_matching_movies = nsearch("Toy Story 1995", n=5)
top_5_matching_movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
