# Lesson 3: Preparing Text Data for RAG

In [None]:
# Restart kernel after installs so that your environment can access the new packages.
# NOTE: this cell shuts the running kernel down, so any variables defined before it
# was run are gone afterwards; run it before the rest of the notebook.
import IPython

# Application object that owns the running kernel.
app = IPython.Application.instance()
# NOTE(review): the True argument presumably requests a restart rather than a plain
# shutdown (matching the "Restart kernel" intent above) - confirm against ipykernel.
app.kernel.do_shutdown(True)

### Environment variables
Sets environment variables. If the cell below asks you to set the project ID manually, assign your own project ID to `PROJECT_ID` and run it again.

In [1]:
# get project ID
PROJECT_ID = ! gcloud config get project
PROJECT_ID = PROJECT_ID[0]

ACCESS_TOKEN = !gcloud auth print-access-token
ACCESS_TOKEN = ACCESS_TOKEN[0]

EMBEDDING_MODEL = "textembedding-gecko@002"

LOCATION = "us-central1"

if PROJECT_ID == "(unset)":
    print(f"Please set the project ID manually below")

In [2]:
# Display the resolved project ID to confirm it is not "(unset)"
PROJECT_ID

'gpeg-oe-platform'

In [3]:
# Confirm that a token was retrieved WITHOUT echoing the secret into the notebook:
# cell outputs are saved with the file and would leak the credential when shared.
f"{ACCESS_TOKEN[:4]}...<redacted> ({len(ACCESS_TOKEN)} chars)"

'<access token redacted>'

### Authentication (Colab only)
If you are running this notebook on Colab, you will need to run the following cell to authenticate. This step is not required if you are using Vertex AI Workbench, as it is pre-authenticated.

In [None]:
import sys

# if it's Colab runtime, authenticate the user with Google Cloud
# (the cell does nothing when the google.colab module has not been loaded)
if "google.colab" in sys.modules:
    # Imported only inside the guard, so other runtimes never attempt this import.
    from google.colab import auth

    auth.authenticate_user()

<p style="background-color:#fd4a6180; padding:15px; margin-left:20px"> ⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️ <b>Note:</b> This notebook takes about 30 seconds to be ready to use. Please wait until the "Kernel starting, please wait..." message clears before running any cells.</p>


In [4]:
# init the vertexai package with the project ID and region set in the
# environment-variables cell
import vertexai

vertexai.init(project=PROJECT_ID, location=LOCATION)

### Import packages and set up Neo4j

In [5]:
from dotenv import load_dotenv
import os

from langchain_community.graphs import Neo4jGraph

# Warning control: silence every Python warning for the rest of the session.
# Note that this also hides deprecation notices raised by the libraries used here.
import warnings
warnings.filterwarnings("ignore")

In [6]:
# Load from environment: read the local .env file first; override=True lets values
# from the file replace variables that are already set in the process environment.
load_dotenv(".env", override=True)

# Each setting is None when the corresponding variable is not defined.
NEO4J_URI = os.environ.get("NEO4J_URI")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [7]:
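# Note the code below is unique to this course environment, and not a 
# standard part of Neo4j's integration with OpenAI. Remove if running 
# in your own environment.
# NOTE(review): the queries in this notebook call the "VertexAI" provider, so this
# OpenAI endpoint appears to be unused here.
# OPENAI_ENDPOINT = os.getenv('OPENAI_BASE_URL') + '/embeddings'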

In [8]:
# Open the LangChain connection to the knowledge graph, using the Neo4j settings
# read from the environment.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

### Create a vector index 

In [9]:
# Create a vector index named movie_tagline_embeddings over the taglineEmbedding
# property of Movie nodes. IF NOT EXISTS keeps the cell safe to re-run.
# The index is configured for 768-dimensional vectors compared by cosine similarity.
kg.query("""
    CREATE VECTOR INDEX movie_tagline_embeddings IF NOT EXISTS
    FOR (m:Movie) ON (m.taglineEmbedding) 
    OPTIONS { indexConfig: {
        `vector.dimensions`: 768,
        `vector.similarity_function`: 'cosine'    
    }}
""")

[]

In [10]:
# List the vector indexes in the database to check that movie_tagline_embeddings exists
kg.query("""
  SHOW VECTOR INDEXES
  """
)

[{'id': 9,
  'name': 'form_10k_chunks',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Chunk'],
  'properties': ['textEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2024, 5, 10, 9, 42, 41, 727000000, tzinfo=<UTC>),
  'readCount': 44},
 {'id': 6,
  'name': 'movie_tagline_embeddings',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Movie'],
  'properties': ['taglineEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2024, 5, 8, 9, 10, 22, 26000000, tzinfo=<UTC>),
  'readCount': 2}]

### Populate the vector index
- Calculate vector representation for each movie tagline using Vertex AI
- Add vector to the `Movie` node as `taglineEmbedding` property

In [11]:
## POPULATE INDEX
# For every Movie node that has a tagline, compute an embedding of the tagline with
# genai.vector.encode (provider "VertexAI") and store it on the node as
# taglineEmbedding. The token, project ID and model name are sent as query parameters.

kg.query(
    """
    MATCH (movie:Movie) WHERE movie.tagline IS NOT NULL
    WITH movie, genai.vector.encode(
      movie.tagline, 
      "VertexAI",
      {
        token: $token,      
        projectId: $projectId,
        model: $embedding_model
      })AS vector
    CALL db.create.setNodeVectorProperty(movie, "taglineEmbedding", vector)
    """,
    params={
        "token": ACCESS_TOKEN,
        "projectId": PROJECT_ID,
        "embedding_model": EMBEDDING_MODEL,
    },
)

[]

In [12]:
# Fetch one movie that has a tagline, together with its stored embedding, so the
# following cells can inspect what was written to the graph.
result = kg.query("""
    MATCH (m:Movie) 
    WHERE m.tagline IS NOT NULL
    RETURN m.tagline, m.taglineEmbedding
    LIMIT 1
    """
)

In [13]:
# Tagline text of the single movie returned by the previous query
result[0]['m.tagline']

'Free your mind'

In [14]:
# Peek at the first 10 components only; the full vector is too long to display
result[0]['m.taglineEmbedding'][:10]

[0.017995720729231834,
 0.010906558483839035,
 -0.012228431180119514,
 0.016306528821587563,
 0.017122313380241394,
 0.024987483397126198,
 -0.013222266919910908,
 0.018318042159080505,
 -0.034899428486824036,
 0.021264759823679924]

In [15]:
# Vector length; it should equal the `vector.dimensions` value set on the index (768)
len(result[0]['m.taglineEmbedding'])

768

### Similarity search
- Calculate embedding for question
- Identify matching movies based on similarity of question and `taglineEmbedding` vectors

In [16]:
# Check that the token is still available WITHOUT printing the secret itself:
# cell outputs are saved with the notebook and would leak the credential when shared.
f"{ACCESS_TOKEN[:4]}...<redacted> ({len(ACCESS_TOKEN)} chars)"

'<access token redacted>'

In [18]:
# Natural-language question that will be embedded and matched against the taglines
question = "What movies are about love?"

In [19]:
# Embed the question with genai.vector.encode (provider "VertexAI"), then query the
# movie_tagline_embeddings index for the $top_k closest Movie nodes and return
# each movie's title, tagline and similarity score.
kg.query(
    """
    WITH genai.vector.encode(
        $question, 
        "VertexAI",
          {
            token: $token,      
            projectId: $projectId,
            model: $embedding_model
          }) AS question_embedding
    CALL db.index.vector.queryNodes(
        'movie_tagline_embeddings', 
        $top_k, 
        question_embedding
        ) YIELD node AS movie, score
    RETURN movie.title, movie.tagline, score
    """,
    params={
        "token": ACCESS_TOKEN,
        "projectId": PROJECT_ID,
        "embedding_model": EMBEDDING_MODEL,
        "question": question,
        "top_k": 5,
    },
)

[{'movie.title': 'Joe Versus the Volcano',
  'movie.tagline': 'A story of love, lava and burning desire.',
  'score': 0.9402147531509399},
 {'movie.title': "You've Got Mail",
  'movie.tagline': 'At odds in life... in love on-line.',
  'score': 0.9159703850746155},
 {'movie.title': 'As Good as It Gets',
  'movie.tagline': 'A comedy from the heart that goes for the throat.',
  'score': 0.9155163764953613},
 {'movie.title': 'Apollo 13',
  'movie.tagline': 'Houston, we have a problem.',
  'score': 0.9103593230247498},
 {'movie.title': 'Snow Falling on Cedars',
  'movie.tagline': 'First loves last. Forever.',
  'score': 0.9100793600082397}]

### Try for yourself: ask your own question!
- Change the question below and run the graph query to find different movies

In [20]:
# Edit this question and re-run the query cell that follows to search for other movies
question = "What movies are about adventure?"

In [21]:
# Run the similarity search for the current value of `question`: the question is
# embedded with genai.vector.encode, and the movie_tagline_embeddings index returns
# the $top_k closest Movie nodes with their scores.
kg.query(
    """
    WITH genai.vector.encode(
        $question, 
        "VertexAI",
          {
            token: $token,      
            projectId: $projectId,
            model: $embedding_model
          }) AS question_embedding
    CALL db.index.vector.queryNodes(
        'movie_tagline_embeddings', 
        $top_k, 
        question_embedding
        ) YIELD node AS movie, score
    RETURN movie.title, movie.tagline, score
    """,
    params={
        "token": ACCESS_TOKEN,
        "projectId": PROJECT_ID,
        "embedding_model": EMBEDDING_MODEL,
        "question": question,
        "top_k": 5,
    },
)

[{'movie.title': 'Ninja Assassin',
  'movie.tagline': 'Prepare to enter a secret world of assassins',
  'score': 0.9230484962463379},
 {'movie.title': 'Cloud Atlas',
  'movie.tagline': 'Everything is connected',
  'score': 0.9175840616226196},
 {'movie.title': 'Apollo 13',
  'movie.tagline': 'Houston, we have a problem.',
  'score': 0.9151338934898376},
 {'movie.title': 'The Green Mile',
  'movie.tagline': "Walk a mile you'll never forget.",
  'score': 0.9148365259170532},
 {'movie.title': 'Cast Away',
  'movie.tagline': 'At the edge of the world, his journey begins.',
  'score': 0.914813756942749}]

In [None]:
# List the vector indexes again to review their current state
kg.query("""
  SHOW VECTOR INDEXES
  """
)

In [None]:
# Optional cleanup: uncomment the lines below to drop the movie_tagline_embeddings
# vector index. Left disabled so that running every cell does not remove the index.
# kg.query("""
#   DROP INDEX movie_tagline_embeddings
#   """
# )