In [4]:
# importing libraries & Reading the dataset

import pandas as pd
import textwrap
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def load_data(file_path, encoding='latin1'):
    """Read the movie dataset from a CSV file into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to read.
    encoding : str, default 'latin1'
        Text encoding passed to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    # Use the caller-supplied path; previously the argument was ignored and a
    # hard-coded file name was always read.
    return pd.read_csv(file_path, encoding=encoding)

In [6]:
# pre-processing the dataset

def preprocess_data(df, genre_weight=3):
    """Return a cleaned copy of ``df`` with a ``combined_features`` text column.

    Missing ``Overview`` and ``Genre`` values are replaced by empty strings.
    ``combined_features`` is the overview followed by the genre text repeated
    ``genre_weight`` times, so genre words carry more weight when the column is
    vectorised.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Overview`` and ``Genre`` columns.
    genre_weight : int, default 3
        How many times the genre text is repeated in ``combined_features``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left unmodified so the cell can be
        re-run safely.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    cleaned = df.copy()

    # handling missing values
    cleaned['Overview'] = cleaned['Overview'].fillna("")
    cleaned['Genre'] = cleaned['Genre'].fillna("")

    # combining overview and genre text data to effectively match user inputs
    weighted_genre = (cleaned['Genre'] + " ").str.repeat(genre_weight)
    cleaned['combined_features'] = cleaned['Overview'] + " " + weighted_genre

    return cleaned

# wrapping text to ensure better readability in the output

def wrap_text(text, width=50):
    """Break ``text`` into newline-separated lines of at most ``width`` characters.

    Used to keep long cell values readable in the printed table.
    """
    wrapped = textwrap.fill(text, width)
    return wrapped

In [8]:
# extracting genre from user input by matching it with the dataset genre

def extract_genre_from_query(query, df):
    """Find a dataset genre that is mentioned in the user's query.

    The set of known genres is built from the comma-separated ``Genre`` column
    of ``df``. Matching is case-insensitive and on whole words only.

    Parameters
    ----------
    query : str
        Free-text user input.
    df : pandas.DataFrame
        Must contain a ``Genre`` column of comma-separated genre names.

    Returns
    -------
    str or None
        The lower-cased genre that appears earliest in the query (the longest
        one wins if several start at the same position), or ``None`` when no
        genre is mentioned.
    """
    if not query:
        return None
    query_lc = query.lower()

    # generating unique genres from dataset
    all_genres = {
        name.strip().lower()
        for entry in df['Genre'].dropna().astype(str).str.split(",")
        for name in entry
    }
    # Rows whose genre was blank contribute "", which would match any query
    # and hide the real genre, so it is removed.
    all_genres.discard("")

    best = None
    for genre in all_genres:
        # re.escape keeps genre text literal; the lookarounds act as word
        # boundaries that also work when a genre starts/ends with punctuation.
        match = re.search(rf'(?<!\w){re.escape(genre)}(?!\w)', query_lc)
        if match is None:
            continue
        # Rank by position, then length, then name so the result does not
        # depend on set iteration order.
        rank = (match.start(), -len(genre), genre)
        if best is None or rank < best:
            best = rank

    return best[2] if best is not None else None

In [10]:
# building a recommendation engine

# computing cosine similarity between the user input and all combined features
# returns up to top_n movies (5 by default), sorted by IMDb rating

def get_recommendations(query, vectorizer, tfidf_matrix, df, top_n=5, candidate_factor=3):
    """Recommend movies whose text features best match a free-text query.

    The query is vectorised with ``vectorizer`` and compared to every row of
    ``tfidf_matrix`` by cosine similarity. The ``top_n * candidate_factor``
    most similar rows form a candidate pool, which is optionally narrowed to a
    genre mentioned in the query, ordered by IMDb rating, and cut to ``top_n``.

    Relies on ``extract_genre_from_query`` and ``wrap_text`` defined elsewhere
    in this notebook.

    Parameters
    ----------
    query : str
        Free-text user input.
    vectorizer : fitted vectorizer
        Object exposing ``transform``; must be the one that produced
        ``tfidf_matrix``.
    tfidf_matrix : matrix
        Feature matrix whose rows line up positionally with ``df``.
    df : pandas.DataFrame
        Must contain ``Series_Title``, ``Genre``, ``Overview`` and
        ``IMDB_Rating`` columns.
    top_n : int, default 5
        Maximum number of recommendations to return.
    candidate_factor : int, default 3
        Size of the candidate pool relative to ``top_n``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Series_Title``, ``Genre``, ``Overview``, ``IMDB_Rating`` and
        ``similarity`` (rounded to 4 decimals). Empty when nothing in the
        dataset shares any term with the query.
    """
    query_vec = vectorizer.transform([query])
    cosine_sim = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # getting top N similar items; slicing after the reversal keeps a pool
    # size of 0 empty (a negative-zero slice would select every row).
    pool_size = max(int(top_n), 0) * max(int(candidate_factor), 1)
    similar_indices = cosine_sim.argsort()[::-1][:pool_size]

    # Rows with zero similarity share no term with the query; dropping them
    # stops unrelated movies from being reported as matches.
    similar_indices = similar_indices[cosine_sim[similar_indices] > 0]

    recommendations = df.iloc[similar_indices].copy()
    # rounding similarity scores
    recommendations['similarity'] = cosine_sim[similar_indices].round(4)

    # detecting genre from user input
    detected_genre = extract_genre_from_query(query, df)

    if detected_genre:
        # regex=False: the genre is plain text, not a pattern.
        genre_mask = recommendations['Genre'].str.lower().str.contains(
            detected_genre, na=False, regex=False
        )
        recommendations = recommendations[genre_mask]

    # sorting recommendations by IMDb rating; similarity breaks rating ties so
    # the order is deterministic.
    recommendations = recommendations.sort_values(
        by=['IMDB_Rating', 'similarity'], ascending=[False, False]
    )

    # limit back to top N
    recommendations = recommendations.head(max(int(top_n), 0))

    # applying text wrapping for better readability
    recommendations['Series_Title'] = recommendations['Series_Title'].apply(lambda x: wrap_text(x, width=25))
    recommendations['Genre'] = recommendations['Genre'].apply(lambda x: wrap_text(x, width=30))
    recommendations['Overview'] = recommendations['Overview'].apply(lambda x: wrap_text(x, width=50))

    return recommendations[['Series_Title', 'Genre', 'Overview', 'IMDB_Rating', 'similarity']]

In [14]:
def main():
    """Run the interactive recommender once.

    Loads and preprocesses the dataset, fits a TF-IDF model on the combined
    text features, prompts for a query on stdin, and prints either a markdown
    table of recommendations or a "no matches" message.
    """
    file_path = 'imdb_top_1000.csv'

    # loading and preprocessing the dataset
    df = load_data(file_path)
    df = preprocess_data(df)

    # building the TF-IDF matrix using the combined features (Overview + Genre)
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['combined_features'])

    # prompting user for input
    user_query = input("\nEnter your movie preferences: ").strip()

    # A blank query carries no terms to match on, so report it as "no match"
    # instead of ranking the dataset against an empty vector.
    if not user_query:
        print("\nNo matching movies found. Try refining your query.")
        return

    # generating recommendations
    recommendations = get_recommendations(user_query, vectorizer, tfidf_matrix, df, top_n=5)

    # displaying recommendations in a formatted table
    if recommendations.empty:
        print("\nNo matching movies found. Try refining your query.")
    else:
        print("\nTop Recommendations:\n")
        print(recommendations.to_markdown(index=False))

# Start the interactive recommender only when this code runs as the main
# module (not when it is imported).
if __name__ == "__main__":
    main()


Enter your movie preferences:  I want to watch Sci-Fi movies



Top Recommendations:

| Series_Title            | Genre                    | Overview                                           |   IMDB_Rating |   similarity |
|:------------------------|:-------------------------|:---------------------------------------------------|--------------:|-------------:|
| Interstellar            | Adventure, Drama, Sci-Fi | A team of explorers travel through a wormhole in   |           8.6 |       0.2434 |
|                         |                          | space in an attempt to ensure humanity's survival. |               |              |
| Eternal Sunshine of the | Drama, Romance, Sci-Fi   | When their relationship turns sour, a couple       |           8.3 |       0.2355 |
| Spotless Mind           |                          | undergoes a medical procedure to have each other   |               |              |
|                         |                          | erased from their memories.                        |               |              |
| A 