# Libraries



In [28]:
import json
import scann
import base64
import vertexai
import numpy as np
import pandas as pd
from google.cloud import storage
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
from vertexai.generative_models import GenerativeModel, Part, FinishReason
import vertexai.preview.generative_models as generative_models

# Variables

In [146]:
project_id = "vtxdemos"
model_id = "gemini-1.5-pro-001"
emb_model_id = "text-embedding-004"
bucket_id = "vtxdemos-vsearch-datasets"
bucket_folder = "profile_synthetic_data"

# Synthetic Data

Because we are creating synthetic data, this could take long time.

In [117]:
generation_config = {
    "max_output_tokens": 8192,
    "temperature": 2,
    "top_p": 0.95,
    "response_mime_type": "application/json"
}

safety_settings = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
}

model = GenerativeModel(
    model_id,
)
prompt = """
Your mission is to create false user profiles with the following schema:
['id', 'name', 'first_name', 'last_name', 'email', 'job_title', 'company', 'location', 'experiences', 'education', 'skills', 'languages', 'created_at', 'updated_at']

<rules>
The output should be a JSON format where the key name is the same as the schema.
Create 20 rows with key, value pairs.
Use random synthetic data for the value.
</rules>


"""

response = model.generate_content(
    [prompt],
    generation_config=generation_config,
    safety_settings=safety_settings,
)

print(response.text)

{"id": ["e8c5d4a4-3fc7-41bd-92d5-878947a43224", "39d6f10a-084c-4374-b11c-1e97b14584b9", "d2003255-8e20-45b6-9f0f-0985c8c5e488", "7a187e0e-e127-4892-a74c-a230e7d40d43", "13268b3a-9f44-4400-a63f-241a8a35028c", "4864a703-703d-4b4b-bc63-94b32c92457b", "82686d18-3602-4c1b-a058-88d7d0b29f77", "944a776a-3811-4508-9857-c11d2c0260b3", "82f0518c-4e4e-4890-a7d7-c95c4ff9e4c9", "0b40371c-925c-485c-a42e-393e1086f95b", "b8713743-607b-4797-8a48-c715a762d564", "802f919b-774e-4c8c-a999-9b8c301b6703", "e59b93f7-6553-4208-b43f-6a094d3e0c4c", "3a417d72-7778-4250-8250-c6962d678609", "d297710a-c4a4-4c15-b528-4d54a78e327b", "14d5803a-033b-4679-81f3-899b88e5b626", "38886f49-b051-470b-a89b-3439397b9546", "1291db7b-1592-4986-a04f-c832a5824d1d", "525c52e3-865e-4445-a87f-7c06946c3547", "db48168e-1a3c-4276-8904-1231a4115a9f"], "name": ["Benjamin Harris", "Tyler Lawson", "Austin Martin", "Charles Thompson", "Melissa Davis", "Michael White", "Emily Thomas", "Elizabeth Anderson", "Kevin Garcia", "Thomas Wilson", "John

In [120]:
output = json.loads(response.text)

# From Structured Data to Natural Language



> This is for when you have real large dataset, if is a demo and the data is synthetic this step can be skipped.



In [143]:
vertexai.init(project=project_id, location="us-central1")
model = GenerativeModel(
    model_id,
    system_instruction=[
    """
    Without missing any word/details transform the following dictionary as a 500 token paragraph (chunks) separated by breaklines.
    """
    ]
)
emb_model = TextEmbeddingModel.from_pretrained(emb_model_id)

In [144]:
generation_config = {
    "max_output_tokens": 8192,
    "temperature": 2,
    "top_p": 0.95,
}

def generate_structure(dictionary: str):
  prompt = f"""
  <dictionary>
  {dictionary}
  <dictionary>

  """

  responses = model.generate_content(
      [prompt],
      generation_config=generation_config,
      stream=True,
  )
  gemini_response = []
  try:
    for response in responses:
      print(response.text)
      gemini_response.append(response.text)
  except:
    print(responses)
    return "error"

  return "".join(gemini_response)


In [137]:
_dataset = []
# Iterate through each "row" (index)
for i in range(len(output['id'])):
    row_data = {}
    # Iterate through each key (column)
    for key in output:
        row_data[key] = output[key][i]

    # Now row_data contains the key-value pairs for the current row
    _dataset.append(row_data)

In [145]:
for n, profile in enumerate(_dataset):
  _re = generate_structure(str(profile))
  if _re == "error":
    break
  _re_for_emb = _re.split("\n\n")
  inputs = [TextEmbeddingInput(text, "RETRIEVAL_DOCUMENT") for text in _re_for_emb]
  embeddings = emb_model.get_embeddings(inputs)
  _dataset[n]["description"] = _re
  _dataset[n]["embeddings"] = embeddings[0].values

Benjamin
 Harris is a Software Engineer at Google, based in New York City. His journey
 includes a Software Engineer Intern role at Microsoft in 2019. 


Benjamin holds a Master of Science in Computer Science from Stanford University, which he earned in 2019, and a Bachelor of Science in Computer Science
 from the University of California, Berkeley, obtained in 2017. His skillset boasts proficiency in Python, Java, C++, and SQL. While
 he primarily communicates in English, Benjamin's expertise shines through in his technical abilities.

His professional identity is further encapsulated in his record, bearing the unique identifier 'e8c5d4a4-3fc7-41
bd-92d5-878947a43224'. 

This information, accurate as of its last update on 2023-11-16T10:2
1:34.567Z, provides a glimpse into Benjamin's qualifications. Notably, a set of embeddings, represented by a lengthy string of numerical values, further enriches his profile. This numerical fingerprint, capturing the essence of his skills and experien

# Store In Google Cloud Storage



In [147]:
client = storage.Client()
bucket = client.get_bucket(bucket_id)
bucket.blob(f"{bucket_folder}/dataset.json").upload_from_string(json.dumps(_dataset), content_type="application/json")