In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Read the movie table from 'imdb_top_1000.csv'; the relative path is resolved
# against the notebook's working directory. Later cells read the Genre,
# Runtime, IMDB_Rating, Meta_score and Series_Title columns from this frame.
df = pd.read_csv(filepath_or_buffer='imdb_top_1000.csv')

In [None]:
def calculate_cosine_similarity(data):
    """Compute pairwise cosine similarity between TF-IDF vectors of `data`.

    Args:
        data: Iterable of text documents (for example a pandas Series of
            strings); each item is vectorized as one document.

    Returns:
        A square similarity matrix in which entry [i, j] is the cosine
        similarity between document i and document j.
    """
    # English stop words are removed before the TF-IDF weights are computed.
    vectorizer = TfidfVectorizer(stop_words='english')
    document_vectors = vectorizer.fit_transform(data)
    # Compare the document set against itself to get all pairwise scores.
    return cosine_similarity(document_vectors, document_vectors)

def find_top_similar(query_index, cosine_similarities, N=10):
    """Return the indices of the N highest scores in one similarity row.

    Args:
        query_index: Row of `cosine_similarities` to rank.
        cosine_similarities: Indexable collection of score rows; each row
            must provide an `argsort()` method (e.g. a 2-D numpy array).
        N: Maximum number of indices to return.

    Returns:
        Indices ordered from highest to lowest score. Nothing is filtered
        out, so the query's own index is part of the result whenever its
        score ranks within the top N.
    """
    query_row = cosine_similarities[query_index]
    # argsort is ascending: flip it so the best score comes first, then keep N.
    best_first = query_row.argsort()[::-1]
    return best_first[:N]

In [None]:
# Free-text queries; a later cell matches each one against the Genre column.
query_entities = ["Action Movies with High Ratings", "Classic Comedies", "Epic Films with Long Runtimes"]

# Columns whose values are concatenated into one text document per movie.
features_to_use = ['Runtime', 'IMDB_Rating', 'Meta_score']

# Stringify the feature columns, but blank out missing cells first-hand:
# astype(str) alone would turn every missing value into the literal text
# "nan", which TF-IDF would then count as a word shared by all such rows,
# making them look similar to each other for no real reason.
feature_values = df[features_to_use]
feature_text = feature_values.astype(str).where(feature_values.notna(), '')

# NOTE(review): TF-IDF treats these values as word tokens, so two movies are
# similar only when they share identical token strings, not when their numbers
# are close; the vectorizer's default token pattern also ignores
# single-character tokens. Confirm this is the intended notion of similarity.
df['Combined_Features'] = feature_text.agg(' '.join, axis=1)

# Pairwise movie-to-movie similarity matrix, aligned with df's row order.
cosine_similarities = calculate_cosine_similarity(df['Combined_Features'])


In [None]:
# Number of neighbours listed for each query.
top_n = 10

for query in query_entities:
    # Match the query text literally (not as a regex) against Genre, and treat
    # rows with a missing Genre as non-matching so the mask stays boolean.
    # The whole phrase must appear inside a Genre value for a row to match.
    genre_mask = df['Genre'].str.contains(query, regex=False, na=False)

    # Work with row positions rather than index labels: the similarity matrix
    # and df.iloc below are both positional.
    matching_positions = genre_mask.to_numpy().nonzero()[0]
    if len(matching_positions) == 0:
        # Previously this case crashed with IndexError on `.index[0]`.
        print(f"No movie whose Genre contains '{query}'; skipping.")
        continue
    query_index = matching_positions[0]

    # Request one extra candidate and drop the query row explicitly, so that a
    # full top_n list is printed even though the query matches itself, and so
    # that nothing relies on the query row sorting first among tied scores.
    candidate_indices = find_top_similar(query_index, cosine_similarities, N=top_n + 1)
    similar_indices = [index for index in candidate_indices if index != query_index][:top_n]

    print(f"Top {top_n} Similar Movies for '{query}':")
    for i, index in enumerate(similar_indices, 1):
        movie = df.iloc[index]
        print(f"{i}. {movie['Series_Title']} (IMDb: {movie['IMDB_Rating']}, Metascore: {movie['Meta_score']})")
