In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load movie-to-movie relationship graph
graph_file_path = 'data/recommend/hyperbolic_backboned.csv' 

graph_data = pd.read_csv(graph_file_path)

# Display first few rows of the graph data
graph_data.head()

Unnamed: 0,source,target,score,variance,nij
0,i242,i224,0.999042,0.002625,0.835703
1,i242,i311,0.99991,0.002625,2.076414
2,i242,i273,0.979384,0.002538,1.181107
3,i242,i340,0.999861,0.002625,2.697472
4,i242,i534,0.953641,0.002625,0.135851


In [4]:
# Load movie metadata
metadata_file_path = 'data/transformed/item_metadata.json'
metadata_data = pd.read_json(metadata_file_path, orient='index')

# Reset index to make 'movie_id' a column instead of index
metadata_data.reset_index(inplace=True)

# Rename columns
metadata_data.columns = ['movie_id', 'title', 'release_date'] + list(metadata_data.columns[3:])

# Display first few rows of the processed metadata data
metadata_data.head()


Unnamed: 0,movie_id,title,release_date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,i1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,i2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,i3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,i4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,i5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


merge the graph data with the movie metadata based on the movie IDs. This will allow us to have all the necessary information in one place for building the recommendation system.​

In [5]:
# Merge graph data with movie metadata
merged_data = pd.merge(graph_data, metadata_data, left_on='source', right_on='movie_id', how='left')

# Display first few rows of the merged data
merged_data.head()

Unnamed: 0,source,target,score,variance,nij,movie_id,title,release_date,unknown,Action,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,i242,i224,0.999042,0.002625,0.835703,i242,Kolya (1996),24-Jan-1997,0,0,...,0,0,0,0,0,0,0,0,0,0
1,i242,i311,0.99991,0.002625,2.076414,i242,Kolya (1996),24-Jan-1997,0,0,...,0,0,0,0,0,0,0,0,0,0
2,i242,i273,0.979384,0.002538,1.181107,i242,Kolya (1996),24-Jan-1997,0,0,...,0,0,0,0,0,0,0,0,0,0
3,i242,i340,0.999861,0.002625,2.697472,i242,Kolya (1996),24-Jan-1997,0,0,...,0,0,0,0,0,0,0,0,0,0
4,i242,i534,0.953641,0.002625,0.135851,i242,Kolya (1996),24-Jan-1997,0,0,...,0,0,0,0,0,0,0,0,0,0


Build Movie Similarity Matrix.

For this, we will use the score column from the movie-to-movie relationship graph as the similarity measure between movies. The similarity matrix will be a square matrix with movies as both rows and columns, and each entry representing the similarity between two movies.​

In [6]:

# Create a list of unique movie IDs
unique_movies = merged_data['source'].unique()

# Initialize an empty similarity matrix
similarity_matrix = pd.DataFrame(index=unique_movies, columns=unique_movies, dtype=float)

# Populate the similarity matrix with scores from the graph data
for index, row in merged_data.iterrows():
    similarity_matrix.loc[row['source'], row['target']] = row['score']

# Fill diagonal with 1s, as a movie is perfectly similar to itself
np.fill_diagonal(similarity_matrix.values, 1)

# Display the similarity matrix
similarity_matrix.head()

  similarity_matrix.loc[row['source'], row['target']] = row['score']
  similarity_matrix.loc[row['source'], row['target']] = row['score']
  similarity_matrix.loc[row['source'], row['target']] = row['score']
  similarity_matrix.loc[row['source'], row['target']] = row['score']
  similarity_matrix.loc[row['source'], row['target']] = row['score']
  similarity_matrix.loc[row['source'], row['target']] = row['score']
  similarity_matrix.loc[row['source'], row['target']] = row['score']
  similarity_matrix.loc[row['source'], row['target']] = row['score']
  similarity_matrix.loc[row['source'], row['target']] = row['score']
  similarity_matrix.loc[row['source'], row['target']] = row['score']
  similarity_matrix.loc[row['source'], row['target']] = row['score']
  similarity_matrix.loc[row['source'], row['target']] = row['score']
  similarity_matrix.loc[row['source'], row['target']] = row['score']
  similarity_matrix.loc[row['source'], row['target']] = row['score']
  similarity_matrix.loc[row['sourc

Unnamed: 0,i242,i224,i311,i273,i340,i534,i508,i360,i748,i238,...,i598,i861,i973,i1553,i1539,i1508,i1156,i1310,i1309,i1472
i242,1.0,0.999042,0.99991,0.979384,0.999861,0.953641,0.974942,0.999671,0.999489,0.964508,...,,,,,,,,,,
i224,,1.0,,0.996241,,,,,,0.96688,...,,,,,,,,,,
i311,,,1.0,,0.999999,,,,0.999988,,...,,,,,,,,,,
i273,,,,1.0,0.988463,0.963668,0.997261,,0.99866,0.961835,...,,,,,,,,,,
i340,,,,,1.0,0.984645,0.98395,0.995058,0.999981,,...,,,,,,,,,,


In [7]:
def recommend_movies(movie1, movie2, movie3):
    # List of movie IDs provided by the user
    user_movies = [movie1, movie2, movie3]

    # Dictionary to hold aggregated scores for movie recommendations
    recommendation_scores = {}

    # Find the most similar movies for each user-provided movie
    for movie in user_movies:
        # Check if the movie is in the similarity matrix
        if movie in similarity_matrix.index:
            # Get the row corresponding to the movie from the similarity matrix
            similar_movies = similarity_matrix.loc[movie]

            # Drop NaN values and sort by similarity score
            similar_movies = similar_movies.dropna().sort_values(ascending=False)

            # Add scores to the recommendation_scores dictionary
            for sim_movie, score in similar_movies.items():
                if sim_movie not in user_movies:  # Exclude movies already liked by the user
                    if sim_movie not in recommendation_scores:
                        recommendation_scores[sim_movie] = score
                    else:
                        recommendation_scores[sim_movie] += score

    # Sort the recommended movies by aggregated score
    sorted_recommendations = sorted(recommendation_scores.items(), key=lambda x: x[1], reverse=True)

    # Get the top 3 recommended movies
    top_3_recommendations = sorted_recommendations[:3]

    # Get movie titles for the input and recommended movies
    movie_titles = metadata_data.loc[metadata_data['movie_id'].isin(user_movies + [rec[0] for rec in top_3_recommendations]), ['movie_id', 'title']]

    # Create a table with the input and recommended movies
    table = pd.DataFrame(columns=['Input Movies', 'Recommended Movies'])
    for i in range(3):
        input_movie_title = movie_titles.loc[movie_titles['movie_id'] == user_movies[i], 'title'].values[0]
        if i < len(top_3_recommendations):
            recommended_movie_id = top_3_recommendations[i][0]
            recommended_movie_title = movie_titles.loc[movie_titles['movie_id'] == recommended_movie_id, 'title'].values[0]
        else:
            recommended_movie_title = ''
        table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)

    return table


In [8]:
# Example usage (Top 3 degree centrality recommendations)
recommendation_table = recommend_movies('i50', 'i181', 'i100')
recommendation_table

  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)
  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)
  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)


Unnamed: 0,Input Movies,Recommended Movies
0,Star Wars (1977),"Godfather, The (1972)"
1,Return of the Jedi (1983),Mr. Holland's Opus (1995)
2,Fargo (1996),Twelve Monkeys (1995)


In [11]:
# Top Horror movies
recommendation_table = recommend_movies('i185', 'i183', 'i200')
recommendation_table

  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)
  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)
  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)


Unnamed: 0,Input Movies,Recommended Movies
0,Psycho (1960),Jaws (1975)
1,Alien (1979),"Omen, The (1976)"
2,"Shining, The (1980)",Copycat (1995)


In [13]:
# Top Romance movies
recommendation_table = recommend_movies('i483', 'i313', 'i498')
recommendation_table

  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)
  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)
  table = table.append({'Input Movies': input_movie_title, 'Recommended Movies': recommended_movie_title}, ignore_index=True)


Unnamed: 0,Input Movies,Recommended Movies
0,Casablanca (1942),Somewhere in Time (1980)
1,Titanic (1997),"Bridges of Madison County, The (1995)"
2,"African Queen, The (1951)",Bread and Chocolate (Pane e cioccolata) (1973)
