In [7]:
import pandas as pd
import neo4j
from neo4j import GraphDatabase
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import numpy as np
from utils import query_db
from utils import URI, AUTH

## Fetch movie data from DB

In [8]:
# Fetch the id, info text and title of every Movie node.
movie_data = query_db("""
  MATCH (m:Movie) 
  RETURN m.id as id, m.info as info, m.title as title
  """)

# Show the first record as a sanity check of the returned fields.
print(movie_data[0])

{'id': '3', 'info': 'This was Burgess Meredith\'s last film. He died of complications of Alzheimer\'s disease on September 9, 1997. He showed symptoms at the time of this movie\'s filming, and had to be coached during each scene in which he appeared. Meredith\'s acting talents are evident despite his failing mental faculties.,In the first movie Grandpa Gustafson says he\'s 94 years old. In the second movie he says he just turned 95. So less than a year has passed but somehow Melanie\'s daughter Allie aged 3 or 4 years.,Grandpa Gustafson: What the... what the hell is this?\nJohn Gustafson: That\'s lite beer.\nGrandpa Gustafson: Gee, I weigh ninety goddamn pounds, and you bring me this sloppin\' foam?\nJohn Gustafson: Ariel\'s got me on a diet because the doc said my cholestorol\'s a little too high.\nGrandpa Gustafson: Well let me tell you something now, Johnny. Last Thursday, I turned 95 years old. And I never exercised a day in my life. Every morning, I wake up, and I smoke a cigarett

## Get transformer for creating embeddings

In [9]:
# Sentence-embedding model used below to encode each movie's `info` text.
embedder = SentenceTransformer("msmarco-distilbert-base-v4")


In [10]:
# Dimension passed to the vector index created further down. It has to equal the
# length of the vectors produced by `embedder`
# (presumably 768 for this model — TODO confirm).
embedding_dimensions = 768

### Ingest embedding data into the database

In [11]:
# Store one embedding per movie as the `embedding` property of its Movie node.
#
# The values are passed as Cypher parameters ($id, $embedding) instead of being
# spliced into the query text: no quoting problems, no JSON round-trip of the
# vector, and the same query text is reused for every movie.
# The :Movie label restricts the write to movie nodes; without it, any node of
# any label that happens to share the id would also receive an embedding.
set_embedding_query = """
MATCH (m:Movie {id: $id})
SET m.embedding = $embedding
"""

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database="neo4j") as session:
        for movie in movie_data:
            # A missing or null `info` is embedded as the empty string.
            info = movie.get("info") or ""
            # float32 -> plain Python floats so the driver can serialise the list.
            embedding = embedder.encode(info).astype(np.float32).tolist()
            session.run(
                set_embedding_query,
                {"id": movie["id"], "embedding": embedding},
            )

### TODO
Check that the embeddings are now ingested in the database.

## Create the vector index

In [12]:
# Create a cosine-similarity vector index over Movie.embedding.
#
# IF NOT EXISTS makes the cell safe to re-run: without it, a second execution
# fails because an index named `embedding_index` already exists.
# The dimension is substituted into the query text from the notebook-level
# `embedding_dimensions` constant (an int defined in this notebook, not user input).
create_vector_index_query = """
CREATE VECTOR INDEX embedding_index IF NOT EXISTS
FOR (m:Movie)
ON (m.embedding)
OPTIONS {indexConfig: {`vector.dimensions`: $embedding_dimensions, `vector.similarity_function`: 'cosine'}}
""".replace("$embedding_dimensions", str(embedding_dimensions))

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database="neo4j") as session:
        session.run(create_vector_index_query)

### Show all vector indices

In [13]:
# List the vector indexes to check that `embedding_index` was created.
query_db('SHOW INDEXES WHERE type = "VECTOR"')

[{'id': 11,
  'name': 'embedding_index',
  'state': 'POPULATING',
  'populationPercent': 0.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Movie'],
  'properties': ['embedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': None}]

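The vector index has been created. Note that its state above is still `POPULATING` (0 %), so wait until it reports `ONLINE` before running the vector searches below.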

### Create some queries to fetch movies via vector search

In [14]:
def get_similar_movies_query(movie_name: str):
  """Build a Cypher query for the 5 nearest vector-index neighbours of a movie.

  The movie is looked up by case-insensitive title match, and its own
  embedding is used as the search vector against `embedding_index`. The
  source movie is not excluded, so it can appear among the results.

  Args:
    movie_name: Title of the movie to search from (any casing).

  Returns:
    The Cypher query text. Each result row has `similarMovie.title`,
    `similarMovie.genre` and `score`.
  """
  # The title ends up inside a single-quoted Cypher string literal, so escape
  # backslashes first and then single quotes. Otherwise a title such as
  # "Schindler's List" terminates the literal early and breaks the query.
  title_literal = movie_name.lower().replace("\\", "\\\\").replace("'", "\\'")
  return f"""
  MATCH (m:Movie)
  WHERE toLower(m.title) = '{title_literal}'

  CALL db.index.vector.queryNodes('embedding_index', 5, m.embedding)
  YIELD node AS similarMovie, score
  RETURN similarMovie.title, similarMovie.genre, score
  """

def get_similar_movies_with_same_actor_query(movie_name: str):
    """Build a Cypher query for similar movies that share an actor with a movie.

    The movie is looked up by case-insensitive title match. For every person
    with an ACTED_IN relationship to it, the 100 nearest neighbours from
    `embedding_index` are fetched and kept only if they are a different movie
    in which that same person also acted.

    Args:
        movie_name: Title of the movie to search from (any casing).

    Returns:
        The Cypher query text. Each result row has `similarMovie.title` and
        `score`.
    """
    # The title ends up inside a single-quoted Cypher string literal, so escape
    # backslashes first and then single quotes. Otherwise a title containing
    # an apostrophe terminates the literal early and breaks the query.
    title_literal = movie_name.lower().replace("\\", "\\\\").replace("'", "\\'")
    return f"""
    MATCH (m:Movie)
    WHERE toLower(m.title) = '{title_literal}'

    MATCH (p: Person)-[:ACTED_IN]->(m: Movie)

    CALL db.index.vector.queryNodes('embedding_index', 100, m.embedding)
    YIELD node AS similarMovie, score
    WHERE similarMovie <> m
    AND (similarMovie)<-[:ACTED_IN]-(p)
    RETURN similarMovie.title, score
    """

In [15]:
# Nearest neighbours of "American Sniper". The query does not exclude the
# source movie itself, so it is expected among the hits.
query_db(get_similar_movies_query("American Sniper"))

[{'similarMovie.title': 'American Sniper',
  'similarMovie.genre': None,
  'score': 0.9999999403953552},
 {'similarMovie.title': 'Burlesque',
  'similarMovie.genre': None,
  'score': 0.6912174224853516},
 {'similarMovie.title': 'Against the Ropes',
  'similarMovie.genre': None,
  'score': 0.6825745105743408},
 {'similarMovie.title': 'Dance Flick',
  'similarMovie.genre': None,
  'score': 0.6817898750305176},
 {'similarMovie.title': 'Next Friday',
  'similarMovie.genre': None,
  'score': 0.6667101979255676}]

In [16]:
# Same search, restricted to movies that share at least one actor with "American Sniper".
query_db(get_similar_movies_with_same_actor_query("American Sniper"))

[{'similarMovie.title': 'Burnt', 'score': 0.6471226215362549}]

### TODO
Check that American Sniper and Burnt in fact have some actors in common.

In [20]:
# Return every person who acted in both 'American Sniper' and 'Burnt'.
query_db("""
  MATCH (m1:Movie {title: 'American Sniper'}) <- [:ACTED_IN] - (p:Person) - [:ACTED_IN] -> (m2:Movie {title: 'Burnt'})
  RETURN p""")

[{'p': {'name': 'Bradley Cooper'}}]

### TODO

Think about how you could use the viewer and rating data to produce recommendations for the viewers. Implement some queries for the recommendations.

In [30]:
# user_524 has rated the following movies
query_db("""
MATCH (m:Movie ) <- [r:RATED] - (p:Person {name: "user_524"})
RETURN m.title, p.name, r.rating
""")

[{'m.title': 'Conspiracy Theory',
  'p.name': 'user_524',
  'r.rating': 4.8965140711502695},
 {'m.title': 'Four Rooms',
  'p.name': 'user_524',
  'r.rating': 9.095388370857327},
 {'m.title': 'Dance Flick',
  'p.name': 'user_524',
  'r.rating': 2.3014017317864277}]

### Simplistic collaborative filtering example:

In [31]:
# Find all users that rated the movie Four Rooms
# (no rating threshold is applied here, so low ratings are included too)
query_db("""
MATCH (m:Movie {title: 'Four Rooms'}) <- [r:RATED] - (p:Person)
RETURN m.title, p.name, r.rating
""")

[{'m.title': 'Four Rooms',
  'p.name': 'user_206',
  'r.rating': 4.450282899298179},
 {'m.title': 'Four Rooms',
  'p.name': 'user_524',
  'r.rating': 9.095388370857327},
 {'m.title': 'Four Rooms',
  'p.name': 'user_998',
  'r.rating': 9.423343494873555}]

In [32]:
# All movies that user_998 rated, with the ratings (no rating threshold is applied)
query_db("""
MATCH (m:Movie) <- [r:RATED] - (p:Person {name: "user_998"})
RETURN m.title, p.name, r.rating
""")

[{'m.title': 'Blind Date',
  'p.name': 'user_998',
  'r.rating': 2.1697300325945443},
 {'m.title': 'Half Past Dead',
  'p.name': 'user_998',
  'r.rating': 1.1323987725701523},
 {'m.title': 'Four Rooms',
  'p.name': 'user_998',
  'r.rating': 9.423343494873555}]

### Let's put it all together

In [38]:
# Recommend movies to user_524:
#   1. take the movies user_524 rated above 8,
#   2. find the other users who rated any of those movies,
#   3. return the movies those users rated above 8 that user_524 has not rated yet.
# NOTE(review): step 2 does not filter on r2.rating, so a user who gave the shared
# movie a low rating still contributes recommendations — confirm this is intended.
query_db("""
MATCH (movieThatUserLiked:Movie ) <- [r1:RATED] - (p1:Person {name: "user_524"})
WHERE r1.rating > 8

MATCH (movieThatUserLiked) <- [r2:RATED] - (p2:Person)
WHERE p1 <> p2
         
MATCH (movieRecommended:Movie) <- [r3:RATED] - (p2)
WHERE r3.rating > 8
AND NOT (movieRecommended)<-[:RATED]-(p1)

RETURN movieRecommended.title, p2.name, r3.rating
""")


[{'movieRecommended.title': 'Blind Date',
  'p2.name': 'user_206',
  'r3.rating': 8.486462831327273},
 {'movieRecommended.title': 'American Heart',
  'p2.name': 'user_206',
  'r3.rating': 9.234823959143533}]