# Import Libraries

In [1]:
# import libraries
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz



# Import Data

In [2]:
# load data
movies = pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# extract genres data from movies
genres = movies['genres']
genres

0       Adventure|Animation|Children|Comedy|Fantasy
1                        Adventure|Children|Fantasy
2                                    Comedy|Romance
3                              Comedy|Drama|Romance
4                                            Comedy
                           ...                     
9737                Action|Animation|Comedy|Fantasy
9738                       Animation|Comedy|Fantasy
9739                                          Drama
9740                               Action|Animation
9741                                         Comedy
Name: genres, Length: 9742, dtype: object

# Preprocess Raw Data

In [4]:
# tokenize genres
genre_tokens = [word_tokenize(genre.replace("|", " ")) for genre in genres]
genre_tokens

[['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy'],
 ['Adventure', 'Children', 'Fantasy'],
 ['Comedy', 'Romance'],
 ['Comedy', 'Drama', 'Romance'],
 ['Comedy'],
 ['Action', 'Crime', 'Thriller'],
 ['Comedy', 'Romance'],
 ['Adventure', 'Children'],
 ['Action'],
 ['Action', 'Adventure', 'Thriller'],
 ['Comedy', 'Drama', 'Romance'],
 ['Comedy', 'Horror'],
 ['Adventure', 'Animation', 'Children'],
 ['Drama'],
 ['Action', 'Adventure', 'Romance'],
 ['Crime', 'Drama'],
 ['Drama', 'Romance'],
 ['Comedy'],
 ['Comedy'],
 ['Action', 'Comedy', 'Crime', 'Drama', 'Thriller'],
 ['Comedy', 'Crime', 'Thriller'],
 ['Crime', 'Drama', 'Horror', 'Mystery', 'Thriller'],
 ['Action', 'Crime', 'Thriller'],
 ['Drama', 'Sci-Fi'],
 ['Drama', 'Romance'],
 ['Drama'],
 ['Children', 'Drama'],
 ['Drama', 'Romance'],
 ['Adventure', 'Drama', 'Fantasy', 'Mystery', 'Sci-Fi'],
 ['Crime', 'Drama'],
 ['Drama'],
 ['Mystery', 'Sci-Fi', 'Thriller'],
 ['Children', 'Drama'],
 ['Crime', 'Drama'],
 ['Children', 'Comedy'],
 

In [5]:
#  apply lemmatizer
lemmatizer = WordNetLemmatizer()
genre_lemmas = [[lemmatizer.lemmatize(word) for word in tokens] for tokens in genre_tokens]
genre_lemmas

[['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy'],
 ['Adventure', 'Children', 'Fantasy'],
 ['Comedy', 'Romance'],
 ['Comedy', 'Drama', 'Romance'],
 ['Comedy'],
 ['Action', 'Crime', 'Thriller'],
 ['Comedy', 'Romance'],
 ['Adventure', 'Children'],
 ['Action'],
 ['Action', 'Adventure', 'Thriller'],
 ['Comedy', 'Drama', 'Romance'],
 ['Comedy', 'Horror'],
 ['Adventure', 'Animation', 'Children'],
 ['Drama'],
 ['Action', 'Adventure', 'Romance'],
 ['Crime', 'Drama'],
 ['Drama', 'Romance'],
 ['Comedy'],
 ['Comedy'],
 ['Action', 'Comedy', 'Crime', 'Drama', 'Thriller'],
 ['Comedy', 'Crime', 'Thriller'],
 ['Crime', 'Drama', 'Horror', 'Mystery', 'Thriller'],
 ['Action', 'Crime', 'Thriller'],
 ['Drama', 'Sci-Fi'],
 ['Drama', 'Romance'],
 ['Drama'],
 ['Children', 'Drama'],
 ['Drama', 'Romance'],
 ['Adventure', 'Drama', 'Fantasy', 'Mystery', 'Sci-Fi'],
 ['Crime', 'Drama'],
 ['Drama'],
 ['Mystery', 'Sci-Fi', 'Thriller'],
 ['Children', 'Drama'],
 ['Crime', 'Drama'],
 ['Children', 'Comedy'],
 

In [6]:
# apply CountVectorizer
cv = CountVectorizer(lowercase=True)
feature_matrix = cv.fit_transform([" ".join(lemmas) for lemmas in genre_lemmas]).toarray()
feature_matrix

array([[0, 1, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [7]:
# printing the result of countvectorization
print("Count Vector : \n", feature_matrix)
print("\nNote: First row of above count vector: ",feature_matrix[0])
print("\nColumns Coresponding to above count vector is :\n", cv.get_feature_names_out())

Count Vector : 
 [[0 1 1 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [1 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Note: First row of above count vector:  [0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

Columns Coresponding to above count vector is :
 ['action' 'adventure' 'animation' 'children' 'comedy' 'crime'
 'documentary' 'drama' 'fantasy' 'fi' 'film' 'genre' 'horror' 'imax'
 'listed' 'musical' 'mystery' 'no' 'noir' 'romance' 'sci' 'thriller' 'war'
 'western']


# Merge movies_data and feature_matrix

In [8]:
# extract movieId and title from movies and assign to movies_data
movies_data = movies.loc[:,['movieId','title']]

# group movies_data and feature_matrix together
movies_data = movies_data.join(pd.DataFrame(feature_matrix))
movies_data.head()

Unnamed: 0,movieId,title,0,1,2,3,4,5,6,7,...,14,15,16,17,18,19,20,21,22,23
0,1,Toy Story (1995),0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Compute cosine similarity

In [9]:
# Compute the pairwise cosine similarity matrix
similarity_matrix = cosine_similarity(feature_matrix)
similarity_matrix

array([[1.        , 0.77459667, 0.31622777, ..., 0.        , 0.31622777,
        0.4472136 ],
       [0.77459667, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 1.        , ..., 0.        , 0.        ,
        0.70710678],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.4472136 , 0.        , 0.70710678, ..., 0.        , 0.        ,
        1.        ]])

# Input Data

In [10]:
# prompt the user to enter partial or full movie title
movie_title = input('Finding similar movies.\nPlease enter the movie title: ')

Finding similar movies.
Please enter the movie title: feature


# Get Closest Match

## Define get_closest_match()

In [11]:
# Find the closest match to the user's input using fuzzy matching
def get_closest_match(title):
    titles = movies_data['title']
    highest_ratio = 0
    closest_match = ''
    for t in titles:
        ratio = fuzz.ratio(title.lower(), t.lower())
        if ratio > highest_ratio:
            highest_ratio = ratio
            closest_match = t
    return closest_match

## Printing the closest match

In [12]:
closest_match = get_closest_match(movie_title)
print(f"Closest match found: {closest_match}")

Closest match found: Fracture (2007)


# Derive Recommended Movies List

## Define recommend_movies()

In [13]:
def recommend_movies(closest_match, similarity_matrix, movies_data):
    # Find the index of the movie title in the movies_data DataFrame
    movie_index = movies_data.index[movies_data['title'] == closest_match][0]

    # Get the similarity scores of the input movie with all other movies
    similarity_scores = list(enumerate(similarity_matrix[movie_index]))

    # Sort the similarity scores in descending order
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Get the top 11 most similar movies (including the input movie)
    top_11_movies = similarity_scores[:11]

    # Get the titles of the top 11 movies
    top_11_titles = [movies_data.iloc[movie[0]]['title'] for movie in top_11_movies]

    # Remove the closest match from the list of recommended movies
    top_10_movies = [title for title in top_11_titles if title != closest_match][:10]

    # Print the recommended movies
    if len(top_10_movies) > 0:
        print(f"Top 10 recommended movies for {closest_match}:")
        for i, movie in enumerate(top_10_movies, start = 1):
            print(f"{i}. {movie}")
    else:
        print(f"No movies found similar to {closest_match}.")

## Printing the Recommended Movies List

In [14]:
recommend_movies(closest_match, similarity_matrix, movies_data)

Top 10 recommended movies for Fracture (2007):
1. Primal Fear (1996)
2. La Cérémonie (1995)
3. Murder at 1600 (1997)
4. Kiss the Girls (1997)
5. Wild Things (1998)
6. Spanish Prisoner, The (1997)
7. Godfather: Part III, The (1990)
8. Name of the Rose, The (Name der Rose, Der) (1986)
9. General's Daughter, The (1999)
10. Twin Peaks: Fire Walk with Me (1992)
