In [1]:
import kagglehub
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import os
import pandas as pd
import ast
from neo4j import GraphDatabase

from dotenv import load_dotenv
# Load variables from a local .env file (if one exists) into the process
# environment before anything below reads them.
load_dotenv()

from langchain_google_genai import GoogleGenerativeAI
# os.getenv returns None when GOOGLE_API_KEY is not set; the LLM clients
# created further down are then constructed with api_key=None.
api_key = os.getenv("GOOGLE_API_KEY")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Only the first 10 rows of the CSV are kept; every later cell that uses
# `movies` (or its copy `df`) works on this 10-row sample.
movies = pd.read_csv('data/imdb_top_1000.csv').head(10)

movies.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [3]:
# Set up Neo4j connection
class Neo4jConnection:
    """Small convenience wrapper around a Neo4j driver.

    One driver is opened in ``__init__``; every helper below runs inside a
    short-lived session obtained from that driver.
    """

    def __init__(self, uri, user, password):
        """Create the underlying driver.

        Args:
            uri: Connection URI passed to ``GraphDatabase.driver``.
            user: User name for the ``auth`` tuple.
            password: Password for the ``auth`` tuple.
        """
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        """Close the driver and print a confirmation message."""
        self.driver.close()
        print("Connection closed")

    def reset_database(self):
        """Delete every node, together with its relationships, then print a confirmation."""
        with self.driver.session() as session:
            session.run("MATCH (n) DETACH DELETE n")
        print("Database resetted successfully!")

    @staticmethod
    def _quote(identifier):
        """Backtick-quote a label or relationship type for embedding in Cypher text.

        Labels and relationship types cannot be passed as query parameters,
        so they are interpolated; embedded backticks are doubled.
        """
        return "`" + str(identifier).replace("`", "``") + "`"

    def add_document(self, documents: list):
        """Write graph documents to the database with plain Cypher.

        The driver object held by this class has no ``add_graph_documents``
        method, so the documents are written here directly.

        Args:
            documents: Iterable of objects exposing ``nodes`` (each with
                ``id``, ``type``, ``properties``) and ``relationships`` (each
                with ``source``, ``target``, ``type``, ``properties``), i.e.
                the shape printed by the LLMGraphTransformer cell.

        Nodes are merged on ``(label, id)``; relationships are merged between
        their endpoint nodes. Properties are applied with ``SET ... +=``.
        """
        with self.driver.session() as session:
            for document in documents:
                for node in document.nodes:
                    session.run(
                        f"MERGE (n:{self._quote(node.type)} {{id: $id}}) "
                        "SET n += $props",
                        {"id": node.id, "props": dict(node.properties or {})},
                    )
                for rel in document.relationships:
                    # Endpoints are MERGEd (not MATCHed) so a relationship is
                    # never silently dropped when its node was not listed.
                    session.run(
                        f"MERGE (a:{self._quote(rel.source.type)} {{id: $source_id}}) "
                        f"MERGE (b:{self._quote(rel.target.type)} {{id: $target_id}}) "
                        f"MERGE (a)-[r:{self._quote(rel.type)}]->(b) "
                        "SET r += $props",
                        {
                            "source_id": rel.source.id,
                            "target_id": rel.target.id,
                            "props": dict(rel.properties or {}),
                        },
                    )

    def execute_query(self, query, parameters=None):
        """Run one Cypher query and return all of its records as a list.

        Args:
            query: Cypher text.
            parameters: Optional dict of query parameters; ``None`` is sent
                as an empty dict.

        Returns:
            A list with every record of the result, materialised before the
            session is closed.
        """
        with self.driver.session() as session:
            result = session.run(query, parameters or {})
            return list(result)

# Connect to Neo4j
# Connection settings are read from the environment (the .env file loaded at
# the top of the notebook works too). The fallbacks are the values this
# notebook originally hard-coded for a local demo database.
uri = os.getenv("NEO4J_URI", "bolt://localhost:7687")
user = os.getenv("NEO4J_USER", "neo4j")
# NOTE(review): keep real passwords out of the notebook -- set NEO4J_PASSWORD
# rather than relying on this demo default.
password = os.getenv("NEO4J_PASSWORD", "ilovemovies")
conn = Neo4jConnection(uri, user, password)

In [4]:
# Start from an empty graph: reset_database() deletes every node and relationship.
conn.reset_database()

Database resetted successfully!


# Manual Upload

In [5]:
# Function to Load DataFrame into Neo4j
def load_movies_to_neo4j(movies_df, connection):
    """Upsert each movie row, its director and its four stars into Neo4j.

    Args:
        movies_df: DataFrame providing the columns ``Series_Title``,
            ``Released_Year``, ``IMDB_Rating``, ``Genre``, ``Runtime``,
            ``Overview``, ``Director`` and ``Star1`` .. ``Star4``.
        connection: Object exposing ``execute_query(query, parameters=...)``.

    Six queries are issued per row: one for the movie node, one for the
    director link and one per star. All of them use MERGE, so running the
    loader again does not duplicate nodes or relationships.
    """
    upsert_movie = """
            MERGE (movie:Movie {title: $title})
            SET movie.year = $year,
                movie.rating = $rating,
                movie.genre = $genre,
                movie.runtime = $runtime,
                movie.overview = $overview;
            """
    link_director = """
            MERGE (director:Director {name: $name})
            MERGE (movie:Movie {title: $title})
            MERGE (director)-[:DIRECTED]->(movie);
            """
    link_actor = """
                MERGE (actor:Actor {name: $name})
                MERGE (movie:Movie {title: $title})
                MERGE (actor)-[:ACTED_IN]->(movie);
                """
    star_columns = ("Star1", "Star2", "Star3", "Star4")

    for _, movie_row in movies_df.iterrows():
        title = movie_row["Series_Title"]

        # Movie node with its scalar properties.
        movie_params = {
            "title": title,
            "year": int(movie_row["Released_Year"]),
            "rating": float(movie_row["IMDB_Rating"]),
            "genre": movie_row["Genre"],
            "runtime": movie_row["Runtime"],
            "overview": movie_row["Overview"],
        }
        connection.execute_query(upsert_movie, parameters=movie_params)

        # Director node and DIRECTED relationship.
        connection.execute_query(
            link_director,
            parameters={"name": movie_row["Director"], "title": title},
        )

        # Actor nodes and ACTED_IN relationships; all four names are read
        # before the first actor query is sent.
        star_names = [movie_row[column] for column in star_columns]
        for star_name in star_names:
            connection.execute_query(
                link_actor,
                parameters={"name": star_name, "title": title},
            )

# Load the 10-row `movies` sample into Neo4j through the shared `conn` connection.
load_movies_to_neo4j(movies, conn)

In [7]:
# List every (movie title, actor name) pair joined by an ACTED_IN relationship.
# The pattern is undirected, so it matches the (actor)-[:ACTED_IN]->(movie)
# edges written by load_movies_to_neo4j.
query = """
MATCH (m:Movie)-[:ACTED_IN]-(a:Actor)
RETURN m.title, a.name;
"""
conn.execute_query(query)

[<Record m.title='The Shawshank Redemption' a.name='William Sadler'>,
 <Record m.title='The Shawshank Redemption' a.name='Bob Gunton'>,
 <Record m.title='The Shawshank Redemption' a.name='Morgan Freeman'>,
 <Record m.title='The Shawshank Redemption' a.name='Tim Robbins'>,
 <Record m.title='The Godfather' a.name='Diane Keaton'>,
 <Record m.title='The Godfather' a.name='James Caan'>,
 <Record m.title='The Godfather' a.name='Al Pacino'>,
 <Record m.title='The Godfather' a.name='Marlon Brando'>,
 <Record m.title='The Dark Knight' a.name='Michael Caine'>,
 <Record m.title='The Dark Knight' a.name='Aaron Eckhart'>,
 <Record m.title='The Dark Knight' a.name='Heath Ledger'>,
 <Record m.title='The Dark Knight' a.name='Christian Bale'>,
 <Record m.title='The Godfather: Part II' a.name='Diane Keaton'>,
 <Record m.title='The Godfather: Part II' a.name='Robert Duvall'>,
 <Record m.title='The Godfather: Part II' a.name='Robert De Niro'>,
 <Record m.title='The Godfather: Part II' a.name='Al Pacino'>,

In [None]:
# Wipe the manually loaded graph before the LLM-driven approach below.
conn.reset_database()

In [8]:
# Gemini text model used by the LangChain prompts in the following cells.
llm = GoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=api_key)

# Work on a copy so the `movies` frame loaded earlier is left untouched.
df = movies.copy()

# Step 1: describe the dataset to the LLM -- one line per column, showing up
# to three distinct example values followed by "...".
column_lines = []
for col in df.columns:
    examples = ", ".join(str(value) for value in df[col].unique()[:3])
    column_lines.append(f"{col}: {examples}...")
node_structure = "\n".join(column_lines)

print(node_structure)

Poster_Link: https://m.media-amazon.com/images/M/MV5BMDFkYTc0MGEtZmNhMC00ZDIzLWFmNTEtODM1ZmRlYWMwMWFmXkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_UX67_CR0,0,67,98_AL_.jpg, https://m.media-amazon.com/images/M/MV5BM2MyNjYxNmUtYTAwNi00MTYxLWJmNWYtYzZlODY3ZTk3OTFlXkEyXkFqcGdeQXVyNzkwMjQ5NzM@._V1_UY98_CR1,0,67,98_AL_.jpg, https://m.media-amazon.com/images/M/MV5BMTMxNTMwODM0NF5BMl5BanBnXkFtZTcwODAyMTk2Mw@@._V1_UX67_CR0,0,67,98_AL_.jpg...
Series_Title: The Shawshank Redemption, The Godfather, The Dark Knight...
Released_Year: 1994, 1972, 2008...
Certificate: A, UA, U...
Runtime: 142 min, 175 min, 152 min...
Genre: Drama, Crime, Drama, Action, Crime, Drama...
IMDB_Rating: 9.3, 9.2, 9.0...
Overview: Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency., An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son., When the menace known as the Joker wreaks havoc and chaos on the people of Gotha

In [9]:
# Shape of the answer we want back: {label: {property: "row['<column>']"}}.
node_example = {
    "NodeLabel1": {"property1": "row['property1']", "property2": "row['property2']"},
    "NodeLabel2": {"property1": "row['property1']", "property2": "row['property2']"},
}

define_nodes_prompt = PromptTemplate(
    input_variables=["structure", "example"],
    template=("""
        Analyze the dataset structure below and define labels for nodes and their properties.\n
        The node properties should be based on the dataset columns and their values.\n
        Return the result as a dictionary where the keys are the node labels and the values are the node properties.\n\n
        Example: {example}\n\n
        
        Dataset Structure:\n{structure}\n\n
              
        Make sure to include all the possible node labels and their properties. 
        Return only the dictionary containing node labels and properties, and don't include any other text or quotation.
        """
    ),
)


node_chain = LLMChain(llm=llm, prompt=define_nodes_prompt)
raw_node_definitions = node_chain.run(structure=node_structure, example=node_example)

# The model may wrap its answer in a Markdown code fence (``` or ```python),
# which ast.literal_eval cannot parse, so drop the fence lines first.
# literal_eval (not eval) is used because the text comes from the LLM.
reply_lines = raw_node_definitions.strip().splitlines()
if reply_lines and reply_lines[0].startswith("```"):
    reply_lines = reply_lines[1:]
if reply_lines and reply_lines[-1].strip() == "```":
    reply_lines = reply_lines[:-1]
node_definitions = ast.literal_eval("\n".join(reply_lines).strip())

print("Node Definitions:", node_definitions)

  node_chain = LLMChain(llm=llm, prompt=define_nodes_prompt)
  node_definitions = node_chain.run(structure=node_structure, example=node_example)


Node Definitions: {'Movie': {'Poster_Link': "row['Poster_Link']", 'Series_Title': "row['Series_Title']", 'Released_Year': "row['Released_Year']", 'Certificate': "row['Certificate']", 'Runtime': "row['Runtime']", 'Genre': "row['Genre']", 'IMDB_Rating': "row['IMDB_Rating']", 'Overview': "row['Overview']", 'Meta_score': "row['Meta_score']", 'Director': "row['Director']", 'No_of_Votes': "row['No_of_Votes']", 'Gross': "row['Gross']"}, 'Star': {'Name': "row['Star4']", 'Movie': "row['Series_Title']"}}


In [10]:
# Shape of the answer we want back: a list of (start, relationship, end) triples.
relationship_example = [
    ("NodeLabel1", "RelationshipLabel", "NodeLabel2"),
    ("NodeLabel1", "RelationshipLabel", "NodeLabel3"),
    ("NodeLabel2", "RelationshipLabel", "NodeLabel3"),
]

# Step 2: Identify Relationships
identify_relationships_prompt = PromptTemplate(
    input_variables=["structure", "node_definitions", "example"],
    template="""
        Consider the following Dataset Structure:\n{structure}\n\n

        Consider the following Node Definitions:\n{node_definitions}\n\n

        Based on the dataset structure and node definitions, identify relationships (edges) between nodes.\n
        Return the relationships as a list of triples where each triple contains the start node label, relationship label, and end node label, and each triple is a tuple.\n
        Please return only the list of tuples. Please do not report triple backticks to identify a code block, just return the list of tuples.\n\n

        Example:\n{example}
    """
)

relationship_chain = LLMChain(llm=llm, prompt=identify_relationships_prompt)
relationships = relationship_chain.run(structure=node_structure, node_definitions=str(node_definitions), example=str(relationship_example))

# Convert the output to a Python list of tuples.
# The text comes from the LLM, so it is parsed with ast.literal_eval, which
# only accepts Python literals, instead of eval, which would execute whatever
# expression the model returned.
relationships = ast.literal_eval(relationships.strip())
print("Relationships:", relationships)

Relationships: [('Movie', 'DIRECTED_BY', 'Director'), ('Movie', 'ACTED_IN', 'Star'), ('Movie', 'ACTED_IN', 'Star'), ('Movie', 'ACTED_IN', 'Star'), ('Movie', 'ACTED_IN', 'Star')]


In [11]:
# Example shown to the LLM. Property keys are unquoted (valid Cypher) and each
# value is a quoted "row['<column>']" placeholder -- the exact token that the
# row loop later replaces with real data.
example_cypher = """
CREATE (n1:NodeLabel1 {property1: "row['property1']", property2: "row['property2']"})
CREATE (n2:NodeLabel2 {property1: "row['property1']", property2: "row['property2']"})
CREATE (n1)-[:RelationshipLabel]->(n2);
"""

# Step 3: Build Cypher Queries
# NOTE: "structure" is declared and passed below, but the template text does
# not reference it.
build_cypher_prompt = PromptTemplate(
    input_variables=["structure", "node_definitions", "relationships", "example"],
    template="""
        Consider the following Node Definitions:\n{node_definitions}\n\n
        Consider the following Relationships:\n{relationships}\n\n
        Generate Cypher queries to create nodes and relationships using the node definitions and relationships below. Remember to replace the placeholder values with actual data from the dataset.\n
        Include all the properties in the Node Definitions for each node as defined and create relationships.\n
        Return a single string with each query separated by a semicolon.\n
        Don't include any other text or quotation marks in the response.\n
        Please return only the string containing Cypher queries. Please do not report triple backticks to identify a code block.\n\n


        Example:\n{example}
    """
)

cypher_chain = LLMChain(llm=llm, prompt=build_cypher_prompt)
cypher_queries_str = cypher_chain.run(structure=node_structure, node_definitions=str(node_definitions), relationships=str(relationships), example=example_cypher)

# Flatten the reply onto one line. It stays a single string (it is not split
# on ';') because the row loop sends it to Neo4j as one statement. Newlines
# become spaces rather than being removed, so tokens on adjacent lines are
# never glued together.
cypher_queries = cypher_queries_str.replace('\n', ' ').strip()
print("Cypher Queries:", cypher_queries)

Cypher Queries: CREATE (m:Movie {Poster_Link: "row['Poster_Link']", Series_Title: "row['Series_Title']", Released_Year: "row['Released_Year']", Certificate: "row['Certificate']", Runtime: "row['Runtime']", Genre: "row['Genre']", IMDB_Rating: "row['IMDB_Rating']", Overview: "row['Overview']", Meta_score: "row['Meta_score']", Director: "row['Director']", No_of_Votes: "row['No_of_Votes']", Gross: "row['Gross']"})CREATE (s:Star {Name: "row['Star4']", Movie: "row['Series_Title']"})CREATE (m)-[:DIRECTED_BY]->(:Director {Name: "row['Director']"})CREATE (m)-[:ACTED_IN]->(s)CREATE (m)-[:ACTED_IN]->(s)CREATE (m)-[:ACTED_IN]->(s)CREATE (m)-[:ACTED_IN]->(s);


In [13]:
# Iterate over each row of the dataframe
for index, row in df.iterrows():
    # Fill the LLM-generated template: every "row['<column>']" placeholder is
    # swapped for this row's value.
    cypher_query = cypher_queries
    for column in df.columns:
        # The placeholders sit inside quoted Cypher string literals, so
        # backslashes and both quote characters are escaped first. Otherwise
        # a value containing a quote (e.g. an apostrophe in an overview) can
        # end the literal early and break or alter the statement.
        value = (
            str(row[column])
            .replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("'", "\\'")
        )
        cypher_query = cypher_query.replace(f"row['{column}']", value)

    # Execute the Cypher query
    conn.execute_query(cypher_query)
    print(f"Executed query: {cypher_query}")

Executed query: CREATE (m:Movie {Poster_Link: "https://m.media-amazon.com/images/M/MV5BMDFkYTc0MGEtZmNhMC00ZDIzLWFmNTEtODM1ZmRlYWMwMWFmXkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_UX67_CR0,0,67,98_AL_.jpg", Series_Title: "The Shawshank Redemption", Released_Year: "1994", Certificate: "A", Runtime: "142 min", Genre: "Drama", IMDB_Rating: "9.3", Overview: "Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.", Meta_score: "80.0", Director: "Frank Darabont", No_of_Votes: "2343110", Gross: "28,341,469"})CREATE (s:Star {Name: "William Sadler", Movie: "The Shawshank Redemption"})CREATE (m)-[:DIRECTED_BY]->(:Director {Name: "Frank Darabont"})CREATE (m)-[:ACTED_IN]->(s)CREATE (m)-[:ACTED_IN]->(s)CREATE (m)-[:ACTED_IN]->(s)CREATE (m)-[:ACTED_IN]->(s);
Executed query: CREATE (m:Movie {Poster_Link: "https://m.media-amazon.com/images/M/MV5BM2MyNjYxNmUtYTAwNi00MTYxLWJmNWYtYzZlODY3ZTk3OTFlXkEyXkFqcGdeQXVyNzkwMjQ5NzM@._V1_UY98_CR1,0,67,98_AL_.jpg"

In [14]:
# NOTE(review): this is the same query used after the manual upload. In the
# recorded run, the LLM-generated statements create `Star` nodes with
# `Name`/`Series_Title` properties rather than `Actor` nodes with
# `name`/`title` -- confirm this query still matches the graph built by the
# preceding cell.
query = """
MATCH (m:Movie)-[:ACTED_IN]-(a:Actor)
RETURN m.title, a.name;
"""
conn.execute_query(query)

[<Record m.title='The Shawshank Redemption' a.name='William Sadler'>,
 <Record m.title='The Shawshank Redemption' a.name='Bob Gunton'>,
 <Record m.title='The Shawshank Redemption' a.name='Morgan Freeman'>,
 <Record m.title='The Shawshank Redemption' a.name='Tim Robbins'>,
 <Record m.title='The Godfather' a.name='Diane Keaton'>,
 <Record m.title='The Godfather' a.name='James Caan'>,
 <Record m.title='The Godfather' a.name='Al Pacino'>,
 <Record m.title='The Godfather' a.name='Marlon Brando'>,
 <Record m.title='The Dark Knight' a.name='Michael Caine'>,
 <Record m.title='The Dark Knight' a.name='Aaron Eckhart'>,
 <Record m.title='The Dark Knight' a.name='Heath Ledger'>,
 <Record m.title='The Dark Knight' a.name='Christian Bale'>,
 <Record m.title='The Godfather: Part II' a.name='Diane Keaton'>,
 <Record m.title='The Godfather: Part II' a.name='Robert Duvall'>,
 <Record m.title='The Godfather: Part II' a.name='Robert De Niro'>,
 <Record m.title='The Godfather: Part II' a.name='Al Pacino'>,

In [15]:
# Wipe the graph created from the LLM-generated Cypher before the next approach.
conn.reset_database()

Database resetted successfully!


In [16]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document

# Let the LLM extract nodes and relationships directly from free text.
llm_transformer = LLMGraphTransformer(llm=llm)

# One plain-text record per movie row: title, director, the four stars,
# genre and overview, each on its own line.
documents = [
    Document(
        page_content=(
            f"Title: {row['Series_Title']}\n"
            f"Director: {row['Director']}\n"
            f"Stars: {row['Star1']}, {row['Star2']}, {row['Star3']}, {row['Star4']}\n"
            f"Genre: {row['Genre']}\n"
            f"Overview: {row['Overview']}"
        )
    )
    for _, row in df.iterrows()
]

graph_documents = llm_transformer.convert_to_graph_documents(documents)

# Show what was extracted for the first movie only.
first_graph_document = graph_documents[0]
print(f"Nodes:{first_graph_document.nodes}")
print(f"Relationships:{first_graph_document.relationships}")

Nodes:[Node(id='Frank Darabont', type='Person', properties={}), Node(id='Drama', type='Genre', properties={}), Node(id='Bob Gunton', type='Person', properties={}), Node(id='Tim Robbins', type='Person', properties={}), Node(id='Morgan Freeman', type='Person', properties={}), Node(id='William Sadler', type='Person', properties={}), Node(id='The Shawshank Redemption', type='Movie', properties={})]
Relationships:[Relationship(source=Node(id='The Shawshank Redemption', type='Movie', properties={}), target=Node(id='Frank Darabont', type='Person', properties={}), type='DIRECTED_BY', properties={}), Relationship(source=Node(id='The Shawshank Redemption', type='Movie', properties={}), target=Node(id='Tim Robbins', type='Person', properties={}), type='STARRING', properties={}), Relationship(source=Node(id='The Shawshank Redemption', type='Movie', properties={}), target=Node(id='Morgan Freeman', type='Person', properties={}), type='STARRING', properties={}), Relationship(source=Node(id='The Shaws

In [17]:
from langchain_community.graphs import Neo4jGraph

# Neo4jGraph is created with the same URI and credentials as `conn`;
# the graph documents produced by the transformer are then handed to its
# add_graph_documents method.
graph = Neo4jGraph(url=uri, username=user, password=password)
graph.add_graph_documents(graph_documents)

In [150]:
# Wipe the graph written through Neo4jGraph before the llama_index approach.
conn.reset_database()

Database resetted successfully!


In [28]:
from llama_index.core import KnowledgeGraphIndex, SimpleDirectoryReader
from llama_index.core import StorageContext
from llama_index.graph_stores.neo4j import Neo4jGraphStore
from llama_index.llms.gemini import Gemini
from llama_index.core import Settings
from llama_index.embeddings.gemini import GeminiEmbedding



# NOTE(review): this rebinds `llm`, which the LangChain cells above use for
# their GoogleGenerativeAI model. Re-running those cells after this one would
# hand them this llama_index Gemini object instead.
llm = Gemini(
    model="models/gemini-1.5-flash",
    api_key=api_key,
    temperature=0,
)

embed_model = GeminiEmbedding(
    model_name="models/text-embedding-004",
    task_type="retrieval_document",
    api_key=api_key,
)

# Register the model, the embedder and the chunk size on llama_index's
# global Settings object.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = 512

from IPython.display import Markdown, display

In [29]:
# NOTE(review): `documents` previously held the LangChain Document list built
# from `df`; it is rebound here. The reader is pointed at the full CSV file,
# not at the 10-row `movies` sample.
documents = SimpleDirectoryReader(
    input_files=["data/imdb_top_1000.csv"]
).load_data()

In [31]:
# Graph store for llama_index, created with the same Neo4j settings as `conn`.
graph_store = Neo4jGraphStore(
    username=user,
    password=password,
    url=uri,
)

storage_context = StorageContext.from_defaults(graph_store=graph_store)

# NOTE: can take a while!
# NOTE(review): the recorded run of this cell ended with
# "ResourceExhausted: 429" (API quota). Presumably the full CSV produces too
# many LLM calls -- TODO confirm and consider indexing a smaller input.
index = KnowledgeGraphIndex.from_documents(
    documents,
    storage_context=storage_context,
    max_triplets_per_chunk=2,
)

ResourceExhausted: 429 Resource has been exhausted (e.g. check quota).

In [None]:
# NOTE(review): unexecuted scratch cell. It constructs StorageContext,
# Neo4jGraphStore and KnowledgeGraphIndex with different arguments than the
# cell above (no credentials, no from_defaults / from_documents) -- verify
# against the llama_index API or remove.
context = StorageContext()
graph_store = Neo4jGraphStore()
index = KnowledgeGraphIndex(context=context, graph_store=graph_store)

index.add_documents(documents)