In [16]:
from langchain_community.graphs import Neo4jGraph
# from neo4j.debug import watch
import pandas as pd



In [17]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [18]:
HF_TOKEN=os.getenv("HF_TOKEN")
GROQ_API_KEY=os.getenv("GROQ_API_KEY")

In [19]:
NEO4J_URI=os.getenv("NEO4J_URI")
NEO4J_USERNAME=os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD=os.getenv("NEO4J_PASSWORD")

In [20]:
graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)

**Nodes:**
- `Movie`: Represents a movie. Each movie node has <u>attributes</u> such as **id** (a unique identifier for the movie), **released** (the release date of the movie), **title** (the movie's title), and **imdbRating** (the movie's rating on IMDb).
- `Person`: Represents an individual who can either be an <u>actor</u> or a <u>director</u> (or both) in movies. Each person node has a <u>single attribute</u>, **name**, which is the name of the person.
- `Genre`: Represents a movie genre. Each genre node has a <u>single attribute</u>, **name**, which is the genre type (e.g., Action, Comedy, Drama, etc.).
- `Location`: Represents the location where the movies was taken. Each location has a single attribute **name**, which is the name of the location (e.g: United States, United Kingdom)
- `SimilarMovie`: Rrpresents a similar movie to its corresponding movie. Each similar movie has a single attribute **name**, which is the name of the movie (e.g: Finding Nemo. Which is the similar movie for Toy story)

**Relationships:**
- `:DIRECTED`: A directional relationship from a Person node to a Movie node, signifying that the person directed the movie.
- `:ACTED_IN`: A directional relationship from a Person node to a Movie node, signifying that the person acted in the movie.
- `:IN_GENRE`: A directional relationship from a Movie node to a Genre node, signifying that the movie belongs to that particular genre.
- `:WAS_TAKEN_IN`: A directional relationship from a Movie node to a Location node, signifying that the movie was taken in that location.
- `:IS_SIMILAR_TO`: A directional relationship from a Movie node to a SimilarMovie node, signifying that movies is similar to the one we are looking at.

**Instructions in the script:**
- `LOAD CSV WITH HEADERS`: Loads a CSV file that contains the movie data with headers indicating each column's purpose.
- `MERGE`: Ensures that a node or relationship is created if it does not already exist; otherwise, it matches the existing node or relationship. This prevents duplication.
- `SET`: Assigns properties to the nodes after they've been created or matched.
- `FOREACH`: Executes the contained commands for each element in a list. This is used to iterate over the lists of directors, actors, and genres associated with each movie. It ensures that all the respective Person and Genre nodes are created and linked appropriately to the Movie nodes.

In [21]:
movie_data_path="E:/KnowledgeGraph+RAG/data/indian_movies_40.csv"

In [22]:
movie_data_url_github="https://raw.githubusercontent.com/debdoot9804/GRAPH_RAG-with-NEO4J/refs/heads/main/data/indian_movies_40.csv"

**Import movie information from the CSV file with tagline, construct the knowledge graph, and store the data in the Graph database**

In [23]:
graph.query("""
LOAD CSV WITH HEADERS FROM $data_url   // Load CSV data from a file specified by $movie_directory
AS row                                                      // Each row in the CSV will be represented as 'row'
            
MERGE (m:Movie {id:row.movieId})                            // Merge a Movie node with the id from the row
SET m.released = date(row.released),                        // Set the 'released' property of the Movie node to the date from the row
    m.title = row.title,                                    // Set the 'title' property of the Movie node to the title from the row
    m.tagline = row.tagline,                                // Set the 'tagline' property of the Movie node to the tagline from the row
    m.imdbRating = toFloat(row.imdbRating)                  // Convert the 'imdbRating' from string to float and set it as the property
            
FOREACH (director in split(row.director, '|') |             // For each director in the list of directors from the row (split by '|') 
    MERGE (p:Person {name:trim(director)})                  // Merge a Person node with the director's name from the row, trimming any extra spaces
    MERGE (p)-[:DIRECTED]->(m))                             // Create a DIRECTED relationship from the director to the Movie
            
FOREACH (actor in split(row.actors, '|') |                  // For each actor in the list of actors from the row (split by '|') 
    MERGE (p:Person {name:trim(actor)})                     // Merge a Person node with the actor's name from the row, trimming any extra spaces
    MERGE (p)-[:ACTED_IN]->(m))                             // Create an ACTED_IN relationship from the actor to the Movie
            
FOREACH (genre in split(row.genres, '|') |                  // For each genre in the list of genres from the row (split by '|')
    MERGE (g:Genre {name:trim(genre)})                      // Merge a Genre node with the genre's name from the row, trimming any extra spaces
    MERGE (m)-[:IN_GENRE]->(g))                             // Create an IN_GENRE relationship from the Movie to the Genre
            
MERGE (l:Location {name:trim(row.location)})
MERGE (m)-[:WAS_TAKEN_IN]->(l)

MERGE (s:SimilarMovie {name:trim(row.similar_movie)})
MERGE (m)-[:IS_SIMILAR_TO]->(s)                                            
""",
params={"data_url": movie_data_url_github})            # Pass the parameter 'movie_directory' which contains the path to the CSV file

[]

In [24]:
graph.refresh_schema()
print(graph.schema)

Node properties:
Movie {id: STRING, released: DATE, title: STRING, tagline: STRING, imdbRating: FLOAT}
Person {name: STRING}
Genre {name: STRING}
Location {name: STRING}
SimilarMovie {name: STRING}
Relationship properties:

The relationships:
(:Movie)-[:IN_GENRE]->(:Genre)
(:Movie)-[:WAS_TAKEN_IN]->(:Location)
(:Movie)-[:IS_SIMILAR_TO]->(:SimilarMovie)
(:Person)-[:DIRECTED]->(:Movie)
(:Person)-[:ACTED_IN]->(:Movie)


**Check the number of nodes that were created from the data (original shape of data: `)**

In [25]:
# Match all nodes in the graph
cypher = """
  MATCH (n) 
  RETURN count(n)
  """
result = graph.query(cypher)
result

[{'count(n)': 182}]

## Loading Embedding Model

In [26]:
from sentence_transformers import SentenceTransformer
embed_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')



In [27]:
from typing import List
def embed_text(text:str)->List:
    embeddings = embed_model.encode(text)
    return embeddings.tolist()

In [29]:
df=pd.read_csv("E:/KnowledgeGraph+RAG/data/indian_movies_40.csv")
df.head()

Unnamed: 0,movieId,released,title,actors,director,genres,imdbRating,similar_movie,tagline,location
0,1,2001-12-14,Lagaan,Aamir Khan|Gracy Singh,Ashutosh Gowariker,Drama|Sport|Musical,8.1,Swades,Once upon a time in India...,India
1,2,1994-06-15,Hum Aapke Hain Koun..!,Salman Khan|Madhuri Dixit,Sooraj R. Barjatya,Comedy|Drama|Romance,7.5,Vivah,Celebrate the joy of togetherness!,India
2,3,2019-07-26,Super 30,Hrithik Roshan|Mrunal Thakur,Vikas Bahl,Biography|Drama,8.0,Chhichhore,Ab Raja Ka Beta Raja Nahi Banega...,India
3,4,2006-06-09,Fanaa,Aamir Khan|Kajol,Kunal Kohli,Drama|Romance|Thriller,7.2,Rang De Basanti,Destroyed in love…,India
4,5,2015-12-18,Bajirao Mastani,Ranveer Singh|Deepika Padukone,Sanjay Leela Bhansali,Action|Drama|History,7.5,Padmaavat,A Romance of a Warrior.,India


In [30]:
embedding_list = [embed_text(i) for i in df["tagline"]]

In [31]:
print("Number of vectors:", len(embedding_list))
print("Embedding dimension:", len(embedding_list[0]))
embedding_list[0][:5]

Number of vectors: 40
Embedding dimension: 768


[0.024004274979233742,
 0.03278125822544098,
 -0.02010752074420452,
 -0.026515036821365356,
 -0.02058703824877739]

In [32]:
df["taglineEmbedding"] = embedding_list
df.head(3)

Unnamed: 0,movieId,released,title,actors,director,genres,imdbRating,similar_movie,tagline,location,taglineEmbedding
0,1,2001-12-14,Lagaan,Aamir Khan|Gracy Singh,Ashutosh Gowariker,Drama|Sport|Musical,8.1,Swades,Once upon a time in India...,India,"[0.024004274979233742, 0.03278125822544098, -0..."
1,2,1994-06-15,Hum Aapke Hain Koun..!,Salman Khan|Madhuri Dixit,Sooraj R. Barjatya,Comedy|Drama|Romance,7.5,Vivah,Celebrate the joy of togetherness!,India,"[-0.05529607832431793, 0.052273888140916824, -..."
2,3,2019-07-26,Super 30,Hrithik Roshan|Mrunal Thakur,Vikas Bahl,Biography|Drama,8.0,Chhichhore,Ab Raja Ka Beta Raja Nahi Banega...,India,"[0.02318468876183033, 0.0374489389359951, -0.0..."


## Creating Vector Index

In [33]:
graph.query("""
  CREATE VECTOR INDEX movie_tagline_embeddings IF NOT EXISTS      // Create a vector index named 'movie_tagline_embeddings' if it doesn't already exist  
  FOR (m:Movie) ON (m.taglineEmbedding)                           // Index the 'taglineEmbedding' property of Movie nodes 
  OPTIONS { indexConfig: {                                        // Set options for the index
    `vector.dimensions`: 768,                                    // Specify the dimensionality of the vector space (768 dimensions)
    `vector.similarity_function`: 'cosine'                        // Specify the similarity function to be cosine similarity
  }}"""
)

[]

In [34]:
graph.query("""
  SHOW VECTOR INDEXES     // Retrieves information about all vector indexes in the database
  """
)

[{'id': 2,
  'name': 'movie_tagline_embeddings',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Movie'],
  'properties': ['taglineEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0}]

In [35]:
df.head(5)

Unnamed: 0,movieId,released,title,actors,director,genres,imdbRating,similar_movie,tagline,location,taglineEmbedding
0,1,2001-12-14,Lagaan,Aamir Khan|Gracy Singh,Ashutosh Gowariker,Drama|Sport|Musical,8.1,Swades,Once upon a time in India...,India,"[0.024004274979233742, 0.03278125822544098, -0..."
1,2,1994-06-15,Hum Aapke Hain Koun..!,Salman Khan|Madhuri Dixit,Sooraj R. Barjatya,Comedy|Drama|Romance,7.5,Vivah,Celebrate the joy of togetherness!,India,"[-0.05529607832431793, 0.052273888140916824, -..."
2,3,2019-07-26,Super 30,Hrithik Roshan|Mrunal Thakur,Vikas Bahl,Biography|Drama,8.0,Chhichhore,Ab Raja Ka Beta Raja Nahi Banega...,India,"[0.02318468876183033, 0.0374489389359951, -0.0..."
3,4,2006-06-09,Fanaa,Aamir Khan|Kajol,Kunal Kohli,Drama|Romance|Thriller,7.2,Rang De Basanti,Destroyed in love…,India,"[0.04635387659072876, 0.06544550508260727, 0.0..."
4,5,2015-12-18,Bajirao Mastani,Ranveer Singh|Deepika Padukone,Sanjay Leela Bhansali,Action|Drama|History,7.5,Padmaavat,A Romance of a Warrior.,India,"[0.051126688718795776, 0.057396262884140015, 0..."


In [36]:
for index, row in df.iterrows():
    movie_id = row['movieId']
    embedding = row['taglineEmbedding']
    graph.query(f"MATCH (m:Movie {{id: '{movie_id}'}}) SET m.taglineEmbedding = {embedding}")

## Verifying the index which was created

In [38]:
graph.refresh_schema()
print(graph.schema)

Node properties:
Movie {id: STRING, released: DATE, title: STRING, tagline: STRING, imdbRating: FLOAT, taglineEmbedding: LIST}
Person {name: STRING}
Genre {name: STRING}
Location {name: STRING}
SimilarMovie {name: STRING}
Relationship properties:

The relationships:
(:Movie)-[:IN_GENRE]->(:Genre)
(:Movie)-[:WAS_TAKEN_IN]->(:Location)
(:Movie)-[:IS_SIMILAR_TO]->(:SimilarMovie)
(:Person)-[:DIRECTED]->(:Movie)
(:Person)-[:ACTED_IN]->(:Movie)


## Testing with an example

In [39]:
result = graph.query("""
    MATCH (m:Movie) 
    WHERE m.tagline IS NOT NULL
    RETURN m.tagline, m.taglineEmbedding
    LIMIT 1
    """
)

In [40]:
result[0]['m.tagline']

'Once upon a time in India...'

In [41]:
result[0]['m.taglineEmbedding']

[0.024004274979233742,
 0.03278125822544098,
 -0.02010752074420452,
 -0.026515036821365356,
 -0.02058703824877739,
 -0.016401266679167747,
 -0.0754060223698616,
 -0.026108084246516228,
 0.015569753013551235,
 0.028955576941370964,
 0.015646085143089294,
 -0.01380168180912733,
 0.0162296574562788,
 0.025638580322265625,
 -0.021246159449219704,
 -0.018568139523267746,
 0.00905848853290081,
 0.012076864950358868,
 -0.050221774727106094,
 0.055182117968797684,
 -0.007034633308649063,
 0.022018548101186752,
 -0.03362948074936867,
 0.013210928998887539,
 -0.003811721922829747,
 -0.06840339303016663,
 0.013876087963581085,
 -0.015010667033493519,
 -0.006142395082861185,
 -0.014158778823912144,
 0.011481153778731823,
 -0.016669804230332375,
 0.03006281517446041,
 -0.046799302101135254,
 1.1829189361378667e-06,
 0.027018483728170395,
 0.014652763493359089,
 -0.031553201377391815,
 0.02090509980916977,
 -0.01341201551258564,
 -0.013608782552182674,
 0.0969466120004654,
 0.03299154341220856,
 -0.