In [1]:
import pandas as pd

In [2]:
# Load movie-to-movie relationship graph
graph_file_path = 'data/community/jaccard_backboned.csv' 
graph_data = pd.read_csv(graph_file_path)

# Display first few rows of the graph data
graph_data.head()

Unnamed: 0,source,target,variance,nij,score
0,i242,i224,0.002625,0.139241,0.979968
1,i242,i898,0.002625,0.09182,0.962907
2,i242,i312,0.002625,0.107692,0.951371
3,i242,i221,0.002625,0.162754,0.989661
4,i242,i690,0.002625,0.128234,0.977992


In [8]:
import pandas as pd

# Load movie metadata
metadata_file_path = 'data/transformed/item_metadata.json'
metadata_data = pd.read_json(metadata_file_path, orient='index')

# Reset index to make 'movie_id' a column instead of index
metadata_data.reset_index(inplace=True)

# Rename columns
metadata_data.columns = ['movie_id', 'title', 'release_date'] + list(metadata_data.columns[3:])

# Display first few rows of the processed metadata data
metadata_data.head()


Unnamed: 0,movie_id,title,release_date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,i1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,i2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,i3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,i4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,i5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


merge the graph data with the movie metadata based on the movie IDs. This will allow us to have all the necessary information in one place for building the recommendation system.​

In [10]:
# Merge graph data with movie metadata
merged_data = pd.merge(graph_data, metadata_data, left_on='source', right_on='movie_id', how='left')

# Display first few rows of the merged data
merged_data.head()

Unnamed: 0,source,target,variance,nij,score,movie_id,title,release_date,unknown,Action,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,i242,i224,0.002625,0.139241,0.979968,i242,Kolya (1996),24-Jan-1997,0,0,...,0,0,0,0,0,0,0,0,0,0
1,i242,i898,0.002625,0.09182,0.962907,i242,Kolya (1996),24-Jan-1997,0,0,...,0,0,0,0,0,0,0,0,0,0
2,i242,i312,0.002625,0.107692,0.951371,i242,Kolya (1996),24-Jan-1997,0,0,...,0,0,0,0,0,0,0,0,0,0
3,i242,i221,0.002625,0.162754,0.989661,i242,Kolya (1996),24-Jan-1997,0,0,...,0,0,0,0,0,0,0,0,0,0
4,i242,i690,0.002625,0.128234,0.977992,i242,Kolya (1996),24-Jan-1997,0,0,...,0,0,0,0,0,0,0,0,0,0


Build Movie Similarity Matrix.

For this, we will use the score column from the movie-to-movie relationship graph as the similarity measure between movies. The similarity matrix will be a square matrix with movies as both rows and columns, and each entry representing the similarity between two movies.​

In [11]:
import numpy as np

# Create a list of unique movie IDs
unique_movies = merged_data['source'].unique()

# Initialize an empty similarity matrix
similarity_matrix = pd.DataFrame(index=unique_movies, columns=unique_movies, dtype=float)

# Populate the similarity matrix with scores from the graph data
for index, row in merged_data.iterrows():
    similarity_matrix.loc[row['source'], row['target']] = row['score']

# Fill diagonal with 1s, as a movie is perfectly similar to itself
np.fill_diagonal(similarity_matrix.values, 1)

# Display the similarity matrix
similarity_matrix.head()

Unnamed: 0,i242,i224,i898,i312,i221,i690,i272,i285,i116,i270,...,i1625,i1156,i1654,i1122,i1310,i1653,i1235,i1678,i1671,i1652
i242,1.0,0.979968,0.962907,0.951371,0.989661,0.977992,0.959098,0.995534,0.977082,0.951469,...,,,,,,,,,,
i224,,1.0,,,0.985038,,,,0.985903,,...,,,,,,,,,,
i898,,,1.0,0.996155,,0.989208,0.974886,,,,...,,,,,,,,,,
i312,,,,1.0,,0.986702,0.989317,,,0.968361,...,,,,,,,,,,
i221,,,,,1.0,,,0.98892,0.986208,,...,,,,,,,,,,


In [14]:
def recommend_movies(movie1, movie2, movie3):
    # List of movie IDs provided by the user
    user_movies = [movie1, movie2, movie3]

    # Dictionary to hold aggregated scores for movie recommendations
    recommendation_scores = {}

    # Find the most similar movies for each user-provided movie
    for movie in user_movies:
        # Check if the movie is in the similarity matrix
        if movie in similarity_matrix.index:
            # Get the row corresponding to the movie from the similarity matrix
            similar_movies = similarity_matrix.loc[movie]

            # Drop NaN values and sort by similarity score
            similar_movies = similar_movies.dropna().sort_values(ascending=False)

            # Add scores to the recommendation_scores dictionary
            for sim_movie, score in similar_movies.items():
                if sim_movie not in user_movies:  # Exclude movies already liked by the user
                    if sim_movie not in recommendation_scores:
                        recommendation_scores[sim_movie] = score
                    else:
                        recommendation_scores[sim_movie] += score

    # Sort the recommended movies by aggregated score
    sorted_recommendations = sorted(recommendation_scores.items(), key=lambda x: x[1], reverse=True)

    # Get the top 3 recommended movies
    top_3_recommendations = sorted_recommendations[:3]

    # Get movie titles for the input and recommended movies
    movie_titles = metadata_data.loc[metadata_data['movie_id'].isin(user_movies + [rec[0] for rec in top_3_recommendations]), ['movie_id', 'title']]

    # Create a table with the input and recommended movies
    table = pd.DataFrame(columns=['Input Movies', 'Recommended Movies'])
    for i in range(3):
        input_movie_title = movie_titles.loc[movie_titles['movie_id'] == user_movies[i], 'title'].values[0]
        if i < len(top_3_recommendations):
            recommended_movie_id = top_3_recommendations[i][0]
            recommended_movie_title = movie_titles.loc[movie_titles['movie_id'] == recommended_movie_id, 'title'].values[0]
        else:
            recommended_movie_title = ''
        table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)

    return table


In [15]:
# Test the recommend_movies function with movie IDs i530, i97, i59
recommendation_table = recommend_movies('i450', 'i229', 'i300')
recommendation_table

  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)
  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)
  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)


Unnamed: 0,Input Movies,Recommended Movies
0,Star Trek V: The Final Frontier (1989),Independence Day (ID4) (1996)
1,Star Trek III: The Search for Spock (1984),Mission: Impossible (1996)
2,Air Force One (1997),Escape from New York (1981)


In [16]:
# Test the recommend_movies function with movie IDs i530, i97, i59
recommendation_table = recommend_movies('i9', 'i12', 'i150')
recommendation_table

  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)
  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)
  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)


Unnamed: 0,Input Movies,Recommended Movies
0,Dead Man Walking (1995),Trainspotting (1996)
1,"Usual Suspects, The (1995)",Fargo (1996)
2,Swingers (1996),Twelve Monkeys (1995)


In [17]:
# Test the recommend_movies function with movie IDs i530, i97, i59
recommendation_table = recommend_movies('i591', 'i125', 'i328')
recommendation_table

  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)
  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)
  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)


Unnamed: 0,Input Movies,Recommended Movies
0,Primal Fear (1996),Ransom (1996)
1,Phenomenon (1996),"Time to Kill, A (1996)"
2,Conspiracy Theory (1997),Mission: Impossible (1996)


In [18]:
# Test the recommend_movies function with movie IDs i530, i97, i59
recommendation_table = recommend_movies('i250', 'i172', 'i33')
recommendation_table

  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)
  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)
  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)


Unnamed: 0,Input Movies,Recommended Movies
0,"Fifth Element, The (1997)",Army of Darkness (1993)
1,"Empire Strikes Back, The (1980)",Beavis and Butt-head Do America (1996)
2,Desperado (1995),Four Rooms (1995)
