In [203]:
# Import libraries needed
import pandas as pd

In [204]:
# Load the movie catalogue and the user ratings from their CSV files
movies_df, ratings_df = (
    pd.read_csv(csv_name) for csv_name in ("movies.csv", "ratings.csv")
)

In [206]:
# Turn each pipe-delimited genres string into a list of genre names,
# stored in a new 'genre_list' column
movies_df = movies_df.assign(genre_list=movies_df['genres'].str.split('|'))

In [208]:
# Collect every distinct genre name across all movies. Sorting turns the set
# into a list with a fixed order: iterating a set of strings can come out in a
# different order on each kernel start (string hashing is randomised), which
# would shuffle the indicator columns built from this collection between runs.
unique_genres = sorted({genre for genre_list in movies_df['genre_list'] for genre in genre_list})

In [209]:
# Add one 0/1 column per genre: 1 when the movie's genre list contains it
for genre in unique_genres:
    movies_df[genre] = [
        int(genre in movie_genres) for movie_genres in movies_df['genre_list']
    ]

In [210]:
# The raw genres string and the intermediate list are no longer needed
# once the indicator columns exist
movies_df = movies_df.drop(columns=['genres', 'genre_list'])

In [211]:
# Mean rating per movieId, kept as a two-column frame (movieId, rating)
average_ratings = ratings_df.groupby('movieId', as_index=False)['rating'].mean()

In [212]:
# Attach each movie's mean rating; the left join keeps movies that have no ratings
movies_df = movies_df.merge(average_ratings, how='left', on='movieId')

In [213]:
# The merged 'rating' column holds per-movie means, so name it accordingly
movies_df = movies_df.rename(columns={'rating': 'average_rating'})

In [214]:
# Pull the first four-digit number in parentheses out of the title
# and store it (as text) in 'Year Published'
year_in_parentheses = r'\((\d{4})\)'
movies_df['Year Published'] = movies_df['title'].str.extract(year_in_parentheses, expand=False)

In [215]:
# Remove the "(YYYY)" year from the 'title' column. The pattern only consumes
# whitespace *before* the parenthesised year, so strip afterwards to drop any
# whitespace left at either end (e.g. a trailing space after the year);
# otherwise exact-title lookups would miss those rows.
movies_df['title'] = (
    movies_df['title']
    .str.replace(r'\s*\(\d{4}\)', '', regex = True)
    .str.strip()
)

In [216]:
def find_similar_movies(input_title, movies_df):
    """Print the ten movies whose genre profile best matches ``input_title``.

    Similarity is the Jaccard index of the 0/1 genre indicators: the number of
    genres shared with the input movie divided by the number of genres present
    in either movie. Candidates are ranked by that score, ties broken by
    ``average_rating`` (both descending), and the ``title`` and
    ``average_rating`` of the top ten are printed.

    Args:
        input_title: Title to look up. Matching ignores case and surrounding
            whitespace. If several rows share the title, the first one supplies
            the genre profile and all of them are left out of the results.
        movies_df: DataFrame with a ``title`` and an ``average_rating`` column.
            Genre indicators are taken by position: every column after the
            first two and before the last two. A ``similarity_score`` column
            left over from an earlier run is ignored. The frame itself is not
            modified.

    Returns:
        None. The ranking, or "Movie not found." when no title matches, is
        printed.
    """
    query = input_title.strip().lower()

    # Boolean mask (as a plain array) of the rows whose title equals the query
    is_query = (movies_df['title'].str.lower() == query).to_numpy(dtype=bool)
    if not is_query.any():
        print("Movie not found.")
        return

    # Work on a copy without any stale score column so the positional genre
    # slice below always sees the same layout, however often this is called.
    features = movies_df.drop(columns='similarity_score', errors='ignore')
    genre_flags = features.iloc[:, 2:-2].to_numpy(dtype=bool)

    # argmax on a boolean array gives the position of the first matching row
    target_flags = genre_flags[is_query.argmax()]

    # Jaccard index for all rows at once instead of a Python-level row loop
    shared = pd.Series((genre_flags & target_flags).sum(axis=1))
    combined = pd.Series((genre_flags | target_flags).sum(axis=1))
    # A row with no genres compared against a query with no genres has an
    # empty union; score it 0.0 rather than dividing by zero.
    scores = (shared / combined.where(combined > 0)).fillna(0.0)

    # Leave out the queried title itself and attach the scores positionally
    candidates = features.loc[~is_query].assign(
        similarity_score=scores.to_numpy()[~is_query]
    )

    # Best genre match first; the higher average rating wins ties
    similar_movies = candidates.sort_values(
        by=['similarity_score', 'average_rating'], ascending=[False, False]
    )

    # Display top 10 similar movies
    print("Top 10 similar movies to", query, ":")
    print(similar_movies[['title', 'average_rating']].head(10))

In [217]:
# Ask for user input.
# NOTE: input() waits for a title to be typed, so this cell pauses execution
# until the prompt is answered; the text is stored unchanged in input_title.
input_title = input("Enter a movie title: ")

Enter a movie title: toy story


In [218]:
# Call the function to find similar movies; it prints the top 10 matches
# (or "Movie not found.") instead of returning them.
find_similar_movies(input_title, movies_df)

Top 10 similar movies to toy story :
                            title  average_rating
52826  Tangled: Before Ever After        4.000000
43614                       Moana        3.866667
4780               Monsters, Inc.        3.854325
3021                  Toy Story 2        3.818480
3912    Emperor's New Groove, The        3.661517
22353              Boxtrolls, The        3.560606
28245              Brother Bear 2        3.428571
12969     Tale of Despereaux, The        3.350000
2203                         Antz        3.260504
59335                Missing Link        3.250000
