Graph traversal recommendation system does not seem to produce desired result

In [6]:
import pandas as pd
import numpy as np
import networkx as nx
import math
from tqdm import tqdm

In [2]:
# Load movie-to-movie relationship graph
graph_file_path = 'data/recommend/hyperbolic_backboned.csv' 

graph_data = pd.read_csv(graph_file_path)

# Display first few rows of the graph data
graph_data.head()

Unnamed: 0,source,target,score,variance,nij
0,i242,i224,0.999042,0.002625,0.835703
1,i242,i311,0.99991,0.002625,2.076414
2,i242,i273,0.979384,0.002538,1.181107
3,i242,i340,0.999861,0.002625,2.697472
4,i242,i534,0.953641,0.002625,0.135851


In [3]:
import pandas as pd

# Load movie metadata
metadata_file_path = 'data/transformed/item_metadata.json'
metadata_data = pd.read_json(metadata_file_path, orient='index')

# Reset index to make 'movie_id' a column instead of index
metadata_data.reset_index(inplace=True)

# Rename columns
metadata_data.columns = ['movie_id', 'title', 'release_date'] + list(metadata_data.columns[3:])

# Display first few rows of the processed metadata data
metadata_data.head()


Unnamed: 0,movie_id,title,release_date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,i1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,i2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,i3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,i4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,i5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [14]:
# Create the movie graph
G = nx.from_pandas_edgelist(graph_data, source="source", target="target", edge_attr=True)

In [10]:
# Function to map movie IDs to titles
def map_movie_ids_to_titles(movie_ids, metadata):
    # Ensure metadata is indexed by movie_id
    metadata = metadata.set_index('movie_id', drop=False)
    titles = []
    for movie_id in movie_ids:
        try:
            titles.append(metadata.at[movie_id, 'title'])
        except KeyError:
            titles.append('Unknown Movie')
    return titles

In [18]:
# Search depth recommendation algorithm
def search_depth_recommend(G, movie_ids, n=3, sort_by='nij', search_depth='max'):
    if search_depth == 'max':
        search_depth = math.inf

    recommendations = {}
    for movie_id in movie_ids:
        path = []
        current_search_depth = 1
        neighbors = {k: v for k, v in G[movie_id].items() if k not in movie_ids}
        if len(neighbors) > 0:
            max_neighbor = max(neighbors.items(), key=lambda x:x[1][sort_by])
        else:
            recommendations[movie_id] = []
            continue
        visited = set(neighbors)
        visited.add(movie_id)
        path.append(max_neighbor)

        while current_search_depth < search_depth:
            max_neighbors = {neighbor: weight for neighbor, weight in G[max_neighbor[0]].items()
                             if neighbor not in visited and neighbor not in movie_ids}
            if len(max_neighbors) == 0:
                break
            max_neighbor = max(max_neighbors.items(), key=lambda x:x[1][sort_by])
            path.append(max_neighbor)
            visited = visited.union(set(max_neighbors.keys()))
            visited.add(max_neighbor[0])
            current_search_depth += 1

        naive_recommend = [(key, val[sort_by]) for key, val in G[movie_id].items() if key not in movie_ids]
        dijkstra_scores = list(_dijkstra_scores(path, sort_by=sort_by).items())
        all_recommend = naive_recommend + dijkstra_scores
        sorted_recommend = sorted(all_recommend, key=lambda x:x[1], reverse=True)
        recommendations[movie_id] = [x[0] for x in sorted_recommend if x[0] not in movie_ids][:n]

    return recommendations

# Dijkstra scores calculation
def _dijkstra_scores(path, sort_by='nij', penalty='linear'):
    scores = {}
    for i, node in enumerate(path):
        if i == 0:
            continue
        node, _ = node
        scores[node] = 0
        for j in range(1, i + 2):
            scores[node] += 1 / j**2 * path[j - 1][-1][sort_by]
        scores[node] /= i + 1
    return scores

In [19]:
# Function to create an aggregate recommendation DataFrame
def create_aggregate_recommendation_dataframe(movie_ids, recommendations, metadata, total_recs=3):
    rec_list = [rec for sublist in recommendations.values() for rec in sublist if rec not in movie_ids]
    unique_recs = list(set(rec_list))[:total_recs]
    recommended_titles = map_movie_ids_to_titles(unique_recs, metadata)
    input_titles = map_movie_ids_to_titles(movie_ids, metadata)
    repeated_input_titles = np.resize(input_titles, len(recommended_titles))
    return pd.DataFrame({
        'Input Movies': repeated_input_titles,
        'Recommended Movies': recommended_titles
    })
    
# Main function to recommend movies
def recommend_movies(*movie_ids, total_recs=3):
    recommendations = search_depth_recommend(G, movie_ids)
    recommendation_df = create_aggregate_recommendation_dataframe(movie_ids, recommendations, metadata_data, total_recs)
    return recommendation_df

In [17]:
# Example usage (Top 3 degree centrality recommendations for movies 'i50', 'i181', 'i100')
recommendation_table = recommend_movies('i50', 'i181', 'i100')
recommendation_table
# Recommended movies are all top 10 degree centrality

Unnamed: 0,Input Movies,Recommended Movies
0,Star Wars (1977),Toy Story (1995)
1,Return of the Jedi (1983),"English Patient, The (1996)"
2,Fargo (1996),Contact (1997)


In [29]:
recommendation_table = recommend_movies('i185', 'i183', 'i200')
recommendation_table

Unnamed: 0,Input Movies,Recommended Movies
0,Psycho (1960),Pulp Fiction (1994)
1,Alien (1979),Star Wars (1977)
2,"Shining, The (1980)","Silence of the Lambs, The (1991)"
