In [None]:
# Install the packages this notebook uses; shapely is held below 2.0.0.
# NOTE(review): `%pip install` targets the running kernel's environment and may be
# preferable to `!pip`; apart from shapely, no versions are pinned — confirm intent.
!pip install "shapely<2.0.0"
!pip install google-cloud-aiplatform --upgrade
!pip install langchain
!pip install ipympl plot-utils matplotlib seaborn

In [None]:
# Restart kernel after installs so that your environment can access the new packages
# NOTE(review): shutting the kernel down here presumably interrupts a "Run All";
# continue from the next cell once the kernel is back — confirm in your environment.
import IPython

# Fetch the running application object; its `kernel` attribute is used below.
app = IPython.Application.instance()
# The positional True is assumed to be do_shutdown's restart flag — TODO confirm.
app.kernel.do_shutdown(True)

In [None]:
import vertexai
from google.cloud import aiplatform

# Capture the output lines of the gcloud command (IPython `!` assignment) and
# keep the first line as the project ID.
PROJECT_ID = ! gcloud config get-value project
PROJECT_ID = PROJECT_ID[0]
LOCATION = "us-central1" # @param {type:"string"}

# define project information manually if the above code didn't work
# (the first captured line is compared against "(unset)"; replace the
# placeholder below with a real project ID before running)
if PROJECT_ID == "(unset)":
  PROJECT_ID = "[your-project-id]" # @param {type:"string"}

# Initialise the Vertex AI SDK with the project/location chosen above and
# print the installed SDK version.
vertexai.init(project=PROJECT_ID, location=LOCATION)
print(f"Vertex AI SDK version: {aiplatform.__version__}")


In [None]:
import vertexai
from vertexai.language_models import TextEmbeddingModel

# Load the text-embedding model by its versioned name.
embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

# Request an embedding for a single word (the API takes a list of texts)
# and pull the numeric vector out of the first result.
embeddings = embedding_model.get_embeddings(["Python"])
vector = embeddings[0].values

# Show the vector's length first, then the vector itself on the next line.
print(f"Length = {len(vector)}", vector, sep="\n")

In [None]:
# Embed six programming-language names with one get_embeddings call.
embeddings = embedding_model.get_embeddings(
    ["Python", "Java", "BASIC", "COBOL", "JavaScript", "Lisp"]
)

# Print each returned embedding's vector on its own output line.
for item in embeddings:
    print(item.values)

In [None]:
# Embed a multi-line paragraph passed as a single text; the triple-quoted
# literal keeps its embedded newlines and indentation exactly as written.
embeddings = embedding_model.get_embeddings(
    [
        """Text embedding is an important NLP
      technique that converts textual data into numerical vectors that can be processed
      by machine learning algorithms, especially large models. These vector
      representations are designed to capture the semantic meaning and context
      of the words they represent."""
    ]
)

# Print the vector of every returned embedding.
for item in embeddings:
    print(item.values)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Embed three sentences, one request each.
emb_1 = embedding_model.get_embeddings(['Python is a great programming language..'])
emb_2 = embedding_model.get_embeddings(['JavaScript is my favorite great programming.'])
emb_3 = embedding_model.get_embeddings(['The dog chased that car.'])

# Print the cosine similarity for the pairs (1, 2), (2, 3) and (1, 3), in that order.
# cosine_similarity is given each vector wrapped in a list.
for first, second in ((emb_1, emb_2), (emb_2, emb_3), (emb_1, emb_3)):
    print(cosine_similarity([first[0].values], [second[0].values]))

In [None]:
# Sample sentences used for the visualisation further down.
in_1 = "Missing flamingo discovered at swimming pool"
in_2 = "Sea otter spotted on surfboard by beach"
in_3 = "Baby panda enjoys boat ride"
in_4 = "Breakfast themed food truck beloved by all!"
in_5 = "New curry restaurant aims to please!"
in_6 = "Python developers are wonderful people"
in_7 = "TypeScript, C++ or Java? All are great!"

input_text_lst_news = [in_1, in_2, in_3, in_4, in_5, in_6, in_7]

# One request per sentence; keep only the numeric vector of each (single) result.
embeddings = [
    embedding_model.get_embeddings([headline])[0].values
    for headline in input_text_lst_news
]


In [None]:
import numpy as np
# Turn the collected embedding vectors into a NumPy array, then show its
# shape followed by its contents.
embeddings_array = np.array(embeddings)
print("Shape: ", embeddings_array.shape, sep="")
print(embeddings_array)

In [None]:
from sklearn.decomposition import PCA

# Perform PCA for 2D visualization
# random_state is fixed so that re-running the notebook yields the same
# projection: with the default svd_solver="auto", scikit-learn may pick a
# randomized SVD for inputs like this one (few rows, many columns), and that
# solver is stochastic unless seeded.
PCA_model = PCA(n_components=2, random_state=42)
PCA_model.fit(embeddings_array)
new_values = PCA_model.transform(embeddings_array)

# Show the shape of the projected array, then the 2-component coordinates.
print("Shape: " + str(new_values.shape))
print(new_values)

In [None]:
import seaborn as sns
import pandas as pd

# Assemble the two PCA components and their sentences into one frame for plotting.
data = pd.DataFrame({ 'x':new_values[:,0],
                      'y':new_values[:,1],
                      'sentences': input_text_lst_news})

# Create a visualization
# The frame is passed as `data=` rather than positionally: seaborn releases
# before 0.12 do not accept the dataset as the first positional argument of
# relplot, while the keyword form works on both API generations.
sns.relplot(data=data, x='x', y='y',
    kind='scatter', hue='sentences'
)

# Generating Embeddings with LangChain

In [None]:
from langchain.embeddings import VertexAIEmbeddings

# Sentences to embed through LangChain's Vertex AI wrapper.
input_array = [
    "Missing flamingo discovered at swimming pool",
    "Sea otter spotted on surfboard by beach",
    "Baby panda enjoys boat ride",
    "Breakfast themed food truck beloved by all!",
    "Hello World!",
    "New curry restaurant aims to please!",
    "Python developers are wonderful people",
    "TypeScript, C++ or Java? All are great!",
]

# Build the wrapper with its default settings and embed every sentence.
embedding_langchain_model = VertexAIEmbeddings()
embeddings = embedding_langchain_model.embed_documents(input_array)

# Number of embeddings and length of the first one, then the first embedding itself.
print(len(embeddings), len(embeddings[0]))
print(embeddings[0])


# Rate-Limited Embeddings Class

In [None]:
from langchain.embeddings import VertexAIEmbeddings
from langchain.llms import VertexAI
from langchain.pydantic_v1 import BaseModel
from typing import List
import time

# Utility functions for Embeddings API with rate limiting
def rate_limit(max_per_minute):
    """Create a generator that paces its caller to ``max_per_minute`` steps per minute.

    Advance the returned generator with ``next()`` once per unit of work.
    The first ``next()`` prints "Waiting" and returns immediately. Every later
    ``next()`` measures the time since the previous ``next()`` returned; if
    that is shorter than ``60 / max_per_minute`` seconds it prints "." and
    sleeps for the remainder.

    Args:
        max_per_minute: Maximum number of steps allowed per minute; must be
            greater than zero.

    Returns:
        A generator that yields ``None`` on every step and never finishes.

    Raises:
        ValueError: If ``max_per_minute`` is zero or negative. The check runs
            when ``rate_limit`` is called, before any work is started.
    """
    if max_per_minute <= 0:
        raise ValueError(
            f"max_per_minute must be greater than zero, got {max_per_minute!r}"
        )
    return _paced_steps(60 / max_per_minute)


def _paced_steps(period):
    """Yield forever, keeping consecutive resumptions at least ``period`` seconds apart."""
    print("Waiting")
    while True:
        # Monotonic clock: elapsed intervals must not be affected by
        # adjustments to the system's wall-clock time.
        before = time.monotonic()
        yield
        sleep_time = period - (time.monotonic() - before)
        if sleep_time > 0:
            print(".", end="")
            time.sleep(sleep_time)

class CustomVertexAIEmbeddings(VertexAIEmbeddings, BaseModel):
    """VertexAIEmbeddings variant that sends texts in batches and throttles requests."""

    # Maximum number of embedding requests to issue per minute.
    requests_per_minute: int
    # Number of texts sent in a single embedding request.
    num_instances_per_batch: int

    # Overriding embed_documents method
    def embed_documents(self, texts: List[str]):
        """Embed ``texts`` in batches while respecting the request rate limit.

        Args:
            texts: The texts to embed.

        Returns:
            A list holding the ``values`` of every embedding returned by the
            client, in the order the batches were requested. Empty when
            ``texts`` is empty (no request is made).

        Raises:
            ValueError: If ``num_instances_per_batch`` is smaller than 1, since
                such a batch size can never consume the input.
        """
        batch_size = self.num_instances_per_batch
        if batch_size < 1:
            raise ValueError(
                f"num_instances_per_batch must be at least 1, got {batch_size!r}"
            )

        limiter = rate_limit(self.requests_per_minute)
        results = []
        docs = list(texts)

        # Working in batches because the API accepts maximum 5
        # documents per request to get embeddings
        for start in range(0, len(docs), batch_size):
            # Step the limiter before each request: its first step does not
            # wait, and each later step waits out the remainder of the
            # per-request period measured from the previous step. Consecutive
            # requests therefore start at least one period apart, and no wait
            # is spent after the final batch.
            next(limiter)
            chunk = self.client.get_embeddings(docs[start : start + batch_size])
            results.extend(chunk)

        return [r.values for r in results]


In [None]:
# Throttling configuration for the custom embeddings wrapper:
# requests allowed per minute, and texts sent per request.
EMBEDDING_QPM = 100
EMBEDDING_NUM_BATCH = 5

embedding_langchain_model = CustomVertexAIEmbeddings(
    num_instances_per_batch=EMBEDDING_NUM_BATCH,
    requests_per_minute=EMBEDDING_QPM,
)

# Embed the same sentences as before, this time through the rate-limited class,
# and show the number of embeddings, the length of the first, and the first itself.
embeddings = embedding_langchain_model.embed_documents(input_array)
print(len(embeddings), len(embeddings[0]))
print(embeddings[0])