## Import required dependencies

In [1]:
import pandas as pd
# To convert the user input into meaningful input suitable for the dataset
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer # Convert textual columns 
from sklearn.metrics.pairwise import cosine_similarity
import pickle as pkl
import os

In [2]:
# Load the dataset
Movies_dataset = pd.read_csv('preparingsimilarities\movies.csv')
# Show the first five rows in the dataset
Movies_dataset.head()
# Show the last five rows in the dataset
Movies_dataset.tail()
# Show the dataset shape
Movies_dataset.shape

(4803, 24)

In [3]:
# Select the most relevant features for the recommendation
selected_features = ['genres','keywords','tagline','cast','director']
selected_features

['genres', 'keywords', 'tagline', 'cast', 'director']

In [4]:
# Check about the none(missing) values in the dataset to decide if will make a data cleaning or not
Movies_dataset.isnull().sum()
# Make a data cleaning for 'genres','keywords','tagline','cast' and 'director' choosed columns 
for feature in selected_features:
    Movies_dataset[feature].fillna('',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Movies_dataset[feature].fillna('',inplace=True)


In [5]:
# Choose the selected features for the recommendation and make a new dataset
combined_features = Movies_dataset['genres']+' '+Movies_dataset['keywords']+' '+Movies_dataset['tagline']+' '+Movies_dataset['cast']+' '+Movies_dataset['director']
combined_features

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai Daniel Henney Eliza...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object

In [6]:
# Convert textual columns in selected features into meaningful numeric columns
Vectorizer = TfidfVectorizer(min_df=1,stop_words='english', lowercase=True)
vectorized_features = Vectorizer.fit_transform(combined_features)
print(vectorized_features)
# Find the similarity scores using cosine similarity
similarity = cosine_similarity(vectorized_features)
similarity
# Create a list for all the movies names
movies_names = Movies_dataset['title'].tolist()

  (0, 2387)	0.17387762611499494
  (0, 7649)	0.11355691458889829
  (0, 12866)	0.19553338573938647
  (0, 10114)	0.16165931117025054
  (0, 8649)	0.22860674889222501
  (0, 14432)	0.1525185407602397
  (0, 16454)	0.19975784468313598
  (0, 13897)	0.2073363856597466
  (0, 13161)	0.21920124762693652
  (0, 17051)	0.20332801528845756
  (0, 16774)	0.23801225015751357
  (0, 13191)	0.15121581535902417
  (0, 11351)	0.2739303704041666
  (0, 16765)	0.12906888267705946
  (0, 4882)	0.2418630583829073
  (0, 14101)	0.21535043940154278
  (0, 3177)	0.25126855964819583
  (0, 16375)	0.1263324200909883
  (0, 14202)	0.3418956777020543
  (0, 5754)	0.16577485025092195
  (0, 3018)	0.22356693392633634
  (0, 3627)	0.21535043940154278
  (0, 5366)	0.10433355352725562
  (0, 13440)	0.10433355352725562
  (0, 5204)	0.11182749749508697
  :	:
  (4801, 13017)	0.29414364992580866
  (4801, 13672)	0.28404411818176767
  (4801, 17027)	0.29414364992580866
  (4801, 4774)	0.2518762905014918
  (4801, 397)	0.18067495553272095
  (4801, 

In [7]:
# Show similarities
similarity

array([[1.        , 0.04720973, 0.03891121, ..., 0.        , 0.        ,
        0.        ],
       [0.04720973, 1.        , 0.021542  , ..., 0.01267122, 0.        ,
        0.        ],
       [0.03891121, 0.021542  , 1.        , ..., 0.        , 0.05626942,
        0.        ],
       ...,
       [0.        , 0.01267122, 0.        , ..., 1.        , 0.        ,
        0.02730605],
       [0.        , 0.        , 0.05626942, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.02730605, 0.        ,
        1.        ]])

In [8]:
# Making a recommendation system by taking the user movie name and find the first 20 similar movies for the user input

def recommend(user_movie_name):

    # Find the close matches between the user input and the movies names
    close_matches_names = difflib.get_close_matches(user_movie_name,movies_names)
    if close_matches_names:
        close_matches_names = close_matches_names[0]
    print(close_matches_names)

    # Find the index of the closest movie of the user input
    index_closest_movie = Movies_dataset[Movies_dataset.title==close_matches_names]['index'].values[0]
    print(index_closest_movie)

    # Get a list of similar movies
    similarity_list = list(enumerate(similarity[index_closest_movie]))
    similarity_list

    # Sort the list of similarity based on the similarity values and reverse it from biggest to smallest
    similarity_list_sorted = sorted(similarity_list,key=lambda x:x[1],reverse=True)
    similarity_list_sorted

    # Make the Recommendation program for the user input
    num_recommended_movies=20
    list_movies = []
    for NumMovie in range(len(similarity_list_sorted)):
        if NumMovie==num_recommended_movies:
            break
        list_movies.append(f"({NumMovie+1}) {Movies_dataset.iloc[similarity_list_sorted[NumMovie][0],:].title}")
    return list_movies


In [9]:
# Try to use recommend function to recommend first 20 similar films for iron man
print(recommend('iron man'))

Iron Man
68
['(1) Iron Man', '(2) Iron Man 2', '(3) Iron Man 3', '(4) Avengers: Age of Ultron', '(5) The Avengers', '(6) Captain America: Civil War', '(7) Captain America: The Winter Soldier', '(8) Ant-Man', '(9) X-Men', '(10) Made', '(11) X2', '(12) X-Men: Apocalypse', '(13) X-Men: First Class', '(14) The Incredible Hulk', '(15) The Helix... Loaded', '(16) Captain America: The First Avenger', '(17) Guardians of the Galaxy', '(18) Kick-Ass 2', '(19) Thor: The Dark World', '(20) Deadpool']


In [10]:
file_path = 'preparingsimilarities\similarity.pkl'

# Check if the file already exists
if not os.path.exists(file_path):
    # Save the similarity object to the file
    pkl.dump(similarity, open(file_path, 'wb'))
    print(f"Similarity object has been saved to '{file_path}'")

Similarity object has been saved to 'preparingsimilarities\similarity.pkl'
