In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
favorite_movie = input("Enter your favorite movie: ")

imdb = pd.read_csv("imdb.csv")
# Load the dataset
data = imdb
tfidf_vectorizer = TfidfVectorizer()

# Preprocess the dataset by converting titles to lowercase
data['Series_Title'] = data['Series_Title'].str.lower()
def get_movie_data_by_title(movie_title, dataset):
    movie_title = movie_title.lower()
    movie_data = dataset[dataset['Series_Title'] == movie_title]
    return movie_data
def jaccard_similarity(s1, s2):
    set1 = set(s1.split(', '))
    set2 = set(s2.split(', '))
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union
def year_similarity(year1, year2):
    year1 = int(year1)
    year2 = int(year2)
    max_year = max(year1, year2)
    min_year = min(year1, year2)
    year_difference = max_year - min_year
    # Normalize to a similarity score between 0 and 1 (higher values for closer years)
    similarity = 1 - (year_difference / max_year)
    return similarity
favorite_movie_data = get_movie_data_by_title(favorite_movie, data)

def find_similar_movies(user_input, dataset):
    # Convert user input to lowercase
    user_input = user_input.lower()
    
    # Tokenize the user input
    user_input_tokens = user_input.split()
    
    # Tokenize the movie titles
    movie_titles = dataset['Series_Title'].str.split()
    dataset = dataset[dataset['Released_Year'].str.isnumeric()]
    dataset['Released_Year'] = dataset['Released_Year'].astype(int)
    dataset = dataset.dropna()

    dataset.reset_index(drop=True, inplace=True)

    # Create a CountVectorizer to convert movie titles to a binary bag-of-words matrix
    vectorizer = CountVectorizer(binary=True)
    movie_title_matrix = vectorizer.fit_transform([' '.join(title) for title in movie_titles])
    user_input_matrix = vectorizer.transform([' '.join(user_input_tokens)])
    movie_overview_matrix = tfidf_vectorizer.fit_transform(dataset['Overview'])
    # Function to find similar movies
# Tokenize and calculate TF-IDF vector for user's movie overview
    user_input_overview = favorite_movie_data['Overview'].values[0]
    user_input_overview_matrix = tfidf_vectorizer.transform([user_input_overview])
    overview_similarity_scores = cosine_similarity(user_input_overview_matrix, movie_overview_matrix)
    dataset['Overview_Similarity'] = overview_similarity_scores[0]

    # Calculate cosine similarities between user input and movie titles
    # Calculate cosine similarities between user input and movie titles for filtered dataset
    similarity_scores = cosine_similarity(user_input_matrix, movie_title_matrix[dataset.index])

# Calculate genre similarity scores for the filtered dataset
    genre_similarity_scores = dataset['Genre'].apply(lambda x: jaccard_similarity(user_input, x))

# Calculate year similarity scores for the filtered dataset
    year_similarity_scores = dataset['Released_Year'].apply(lambda year: year_similarity(int(favorite_movie_data['Released_Year'].values[0]), year))
    director_similarity_scores = dataset['Director'].apply(lambda x: jaccard_similarity(user_input, x))


    # Add similarity scores to the dataset
    dataset['Title_Similarity'] = similarity_scores[0]
    dataset['Genre_Similarity'] = genre_similarity_scores
    dataset['Year_Similarity'] = year_similarity_scores
    dataset['Director_Similarity'] = year_similarity_scores

# Define the weights for each feature
    title_weight = 0.3
    genre_weight = 0.2
    year_weight = 0.075
    director_weight = 0.075
    overview_weight = 0.35
# Calculate the final similarity score with the specified weights
    dataset['Final_Similarity'] = (
        title_weight * dataset['Title_Similarity'] +
        genre_weight * dataset['Genre_Similarity'] +
        year_weight * dataset['Year_Similarity'] + 
        director_weight * dataset['Director_Similarity'] +
    overview_weight * dataset['Overview_Similarity']
    )
    
    # Sort movies by final similarity score in descending order
    sorted_movies = dataset.sort_values(by='Final_Similarity', ascending=False)
    sorted_movies = sorted_movies[sorted_movies['Series_Title'] != user_input]

    # Return the top 5 most similar movies
    similar_movies = sorted_movies[['Series_Title', 'Released_Year', 'IMDB_Rating', 'Final_Similarity']].head(5)
    if dataset.empty:
        print("No movies found that match the criteria.")
      # or raise an exception, or return an empty result, depending on your preference
    return similar_movies

# Ask the user for their favorite movie

if favorite_movie_data.empty:
    print("Movie not found in the dataset. Please check the title.")
else:
    # Find and display similar movies
    similar_movies = find_similar_movies(favorite_movie, data)
    print("Top 5 similar movies:")
    print(similar_movies[['Series_Title', 'Released_Year', 'IMDB_Rating', 'Final_Similarity']])

                         

Enter your favorite movie: the godfather
Top 5 similar movies:
                Series_Title  Released_Year  IMDB_Rating  Final_Similarity
3     the godfather: part ii           1974          9.0          0.400818
693  the godfather: part iii           1990          7.6          0.357751
590          ordinary people           1980          7.7          0.327056
441              october sky           1999          7.8          0.318279
655     synecdoche, new york           2008          7.6          0.314978


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['Released_Year'] = dataset['Released_Year'].astype(int)


In [92]:
imdb


Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,the shawshank redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,the godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,the dark knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,the godfather: part ii,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 angry men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,https://m.media-amazon.com/images/M/MV5BNGEwMT...,breakfast at tiffany's,1961,A,115 min,"Comedy, Drama, Romance",7.6,A young New York socialite becomes interested ...,76.0,Blake Edwards,Audrey Hepburn,George Peppard,Patricia Neal,Buddy Ebsen,166544,
996,https://m.media-amazon.com/images/M/MV5BODk3Yj...,giant,1956,G,201 min,"Drama, Western",7.6,Sprawling epic covering the life of a Texas ca...,84.0,George Stevens,Elizabeth Taylor,Rock Hudson,James Dean,Carroll Baker,34075,
997,https://m.media-amazon.com/images/M/MV5BM2U3Yz...,from here to eternity,1953,Passed,118 min,"Drama, Romance, War",7.6,"In Hawaii in 1941, a private is cruelly punish...",85.0,Fred Zinnemann,Burt Lancaster,Montgomery Clift,Deborah Kerr,Donna Reed,43374,30500000
998,https://m.media-amazon.com/images/M/MV5BZTBmMj...,lifeboat,1944,,97 min,"Drama, War",7.6,Several survivors of a torpedoed merchant ship...,78.0,Alfred Hitchcock,Tallulah Bankhead,John Hodiak,Walter Slezak,William Bendix,26471,
