In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
dataset_path = 'imdb-movies-dataset.csv'
df = pd.read_csv(dataset_path)

In [10]:
print("Dataset columns: ", df.columns.tolist())

Dataset columns:  ['Poster', 'Title', 'Year', 'Certificate', 'Duration (min)', 'Genre', 'Rating', 'Metascore', 'Director', 'Cast', 'Votes', 'Description', 'Review Count', 'Review Title', 'Review']


In [11]:
# Ensure no missing values in the columns
for col in ['Title', 'Genre', 'Director', 'Cast', 'Description']:
    df[col] = df[col].fillna("")

In [12]:
# Merge the relevant colums
df['combined_features'] = df['Title'] + " " + df['Genre'] + " " + df['Director'] + " " + df['Cast'] + " " + df['Description']

In [13]:
# Convert text into numerical features using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined_features'])
print("TF-IDF matrix shape:", tfidf_matrix)

TF-IDF matrix shape: <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 285626 stored elements and shape (10000, 39319)>
  Coords	Values
  (0, 16844)	0.18607796928551193
  (0, 7434)	0.056039812073635155
  (0, 10410)	0.04321188469107035
  (0, 30100)	0.14905328337699258
  (0, 23135)	0.09197146169001279
  (0, 32195)	0.21868067007403053
  (0, 1773)	0.13776458915157996
  (0, 15539)	0.16886332778089597
  (0, 24873)	0.14967349123234813
  (0, 13589)	0.21203085379571776
  (0, 11142)	0.18107059848573379
  (0, 30337)	0.2151474100283804
  (0, 1779)	0.18199911990531434
  (0, 24117)	0.22758377433107352
  (0, 32987)	0.24110029453858625
  (0, 300)	0.18961122933116206
  (0, 38745)	0.2288451351845244
  (0, 25606)	0.2179855819528031
  (0, 32476)	0.15169875279123685
  (0, 23671)	0.16829427046675224
  (0, 3642)	0.13147234784542508
  (0, 36626)	0.14677445943885875
  (0, 264)	0.1909022241623573
  (0, 15612)	0.20230080269780346
  (0, 5792)	0.15700852854264344
  :	:
  (9999, 36610)	0.196163410944415

In [14]:
def get_recommendations(query, df, tfidf_matrix, vectorizer, top_n=5):
    # Convert the query into a vector
    query_vec = vectorizer.transform([query])
    # Compute cosine similarity between the query and all movies
    similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
    # Get the indices of the top matching movies
    top_indices = similarity_scores.argsort()[::-1][:top_n]
    # Retrieve and return the recommended movies along with their similarity scores
    recommendations = df.iloc[top_indices][['Title', 'Genre', 'Description']].copy()
    recommendations['similarity'] = similarity_scores[top_indices]
    return recommendations

In [17]:
user_query = "I love thrilling action movies set in space, with a comedic twist."
results = get_recommendations(user_query, df, tfidf_matrix, vectorizer, top_n=5)

In [18]:
print("Top Recommendations:")
print(results)

Top Recommendations:
                             Title                      Genre  \
8255                      Amarcord              Comedy, Drama   
1028                 Lost in Space  Action, Adventure, Family   
9399           It's Pat: The Movie                     Comedy   
6875  Space Babes from Outer Space             Comedy, Sci-Fi   
5131                  Freaky Tales   Adventure, Comedy, Drama   

                                            Description  similarity  
8255  A series of comedic and nostalgic vignettes se...    0.140185  
1028  The Robinson family was going into space to fi...    0.139578  
9399  The comedic misadventures of a person of indet...    0.132818  
6875  Three space women land on Earth in search of s...    0.131187  
5131  Four interconnected stories set in 1987 Oaklan...    0.126427  
