# Bloque 1: creación de un índice de Vertex AI a partir de los embeddings almacenados en BigQuery

In [1]:
# Notebook-wide settings used to name and locate the cloud resources created below.
USUARIO = "civica"  # user suffix appended to the bucket and deployed-index names
PROJECT_ID = "master-424910"  # project passed to gsutil, BigQuery, Storage and Vertex AI
REGION = "europe-west1"  # location used for the bucket and for aiplatform.init

### Desde BigQuery agregamos una nueva conexión a una fuente de datos externa:
- `Tipo de conexión`: Modelos remotos de Vertex AI, funciones remotas y BigLake (Cloud Resource)
- `Id de la conexión`: cnx_vertex
- `Región`: la misma que usamos para el resto de recursos (el valor de `REGION`, en este cuaderno `europe-west1`)

### Creamos una tabla en BigQuery que contenga los embeddings
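Para ello ejecutamos el fichero `.sql` incluido en el repositorio. La tabla resultante debe llamarse `ecommerce.embeddings` y contener las columnas `id` y `text_embedding`, que son las que se exportan más adelante.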

### Creamos un bucket para guardar los embeddings que indexaremos después

In [2]:
# gs:// URI of the bucket used in this notebook, built from the project ID and the user suffix.
BUCKET_URI = f"gs://bucket-{PROJECT_ID}-{USUARIO}"

In [3]:
! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://bucket-master-424910-civica/...


### Exportamos los embeddings de la tabla de BigQuery al bucket que hemos creado, en formato JSON (un objeto por línea)

In [7]:
DATASET = "ecommerce"  # BigQuery dataset queried by the export cell
TABLE_ID = "embeddings"  # table inside DATASET that is read by the export query
# Bucket name without the gs:// scheme; it repeats the pattern used for BUCKET_URI,
# so both must be kept in sync if the naming scheme changes.
BUCKET_NAME = f"bucket-{PROJECT_ID}-{USUARIO}"
OUTPUT = "embeddings.json"  # object name of the exported file inside the bucket

In [8]:
from google.cloud import bigquery
from google.cloud import storage
import json
import os
import tempfile

# Export step: read (id, text_embedding) rows from BigQuery, write them as one JSON
# object per line to a temporary local file, and upload that file to the bucket.

# Configuration: lower-case aliases of the notebook-level constants.
project_id = PROJECT_ID
dataset = DATASET
table_id = TABLE_ID
bucket_name = BUCKET_NAME
output = OUTPUT

# Initialise the BigQuery and Cloud Storage clients for the same project.
bq_client = bigquery.Client(project=project_id)
storage_client = storage.Client(project=project_id)

# Query: only the two columns needed for the exported records.
query = f"""
SELECT id, text_embedding
FROM `{dataset}.{table_id}`
"""

# Run the query and wait for the result iterator.
query_job = bq_client.query(query)
results = query_job.result()

# Write to a uniquely named temporary file instead of a fixed /tmp path, so the cell
# works on any OS and concurrent runs cannot overwrite each other's file.
temp_fd, temp_file_path = tempfile.mkstemp(prefix="temp_embeddings_", suffix=".json")
try:
    with os.fdopen(temp_fd, "w") as f:
        for row in results:
            record = {
                "id": str(row["id"]),
                # Each component is serialised as a string, exactly as before.
                "embedding": [str(value) for value in row["text_embedding"]],
            }
            f.write(json.dumps(record) + "\n")

    # Upload the file to the bucket created earlier, under the name given by OUTPUT.
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(output)
    blob.upload_from_filename(temp_file_path)
finally:
    # Always remove the temporary file, even when the query or the upload fails.
    os.remove(temp_file_path)

print("Export completed successfully")

Export completed successfully


## Creamos el índice

In [9]:
DISPLAY_NAME = "ecommerce_index"  # display name given to the index and to the index endpoint
DESCRIPTION = "products data"  # free-text description passed when creating the index

In [10]:
from google.cloud import aiplatform

# Initialise the Vertex AI SDK for this project and region. BUCKET_URI already holds
# the gs:// form of the bucket, so it is reused here rather than rebuilt from BUCKET_NAME.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

In [11]:
# Length of every embedding vector stored in the index.
# NOTE(review): assumes the exported `text_embedding` vectors have 768 components — confirm
# against the model that produced them.
DIMENSIONS = 768

# Create a Tree-AH index from the embedding files found under BUCKET_URI.
# This starts a long-running operation and creates a new index each time the cell runs.
tree_ah_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=DISPLAY_NAME,
    description=DESCRIPTION,
    contents_delta_uri=BUCKET_URI,  # already a string; no f-string wrapper needed
    dimensions=DIMENSIONS,
    distance_measure_type="DOT_PRODUCT_DISTANCE",
    approximate_neighbors_count=5,
    leaf_node_embedding_count=500,
    leaf_nodes_to_search_percent=80,
)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/1017560508533/locations/europe-west1/indexes/6615515173723570176/operations/1446547734071869440
MatchingEngineIndex created. Resource name: projects/1017560508533/locations/europe-west1/indexes/6615515173723570176
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/1017560508533/locations/europe-west1/indexes/6615515173723570176')


In [12]:
# Keep the full resource name of the index so a handle to it can be rebuilt from the name.
INDEX_RESOURCE_NAME = tree_ah_index.resource_name
INDEX_RESOURCE_NAME  # bare expression: displayed as the cell output

'projects/1017560508533/locations/europe-west1/indexes/6615515173723570176'

In [13]:
# Build a handle to the existing index from its resource name; the creation log above
# suggests this form for using the index in another session.
tree_ah_index = aiplatform.MatchingEngineIndex(index_name=INDEX_RESOURCE_NAME)

## Creamos el endpoint y lo desplegamos

In [14]:
# Create a public index endpoint. Its description reuses DESCRIPTION, the same text
# given to the index, rather than repeating the display name.
# Every run of this cell creates a new endpoint.
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=DISPLAY_NAME,
    description=DESCRIPTION,
    public_endpoint_enabled=True,
)

Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/1017560508533/locations/europe-west1/indexEndpoints/3571398484969914368/operations/8839769472353959936
MatchingEngineIndexEndpoint created. Resource name: projects/1017560508533/locations/europe-west1/indexEndpoints/3571398484969914368
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/1017560508533/locations/europe-west1/indexEndpoints/3571398484969914368')


In [15]:
# ID under which the index is deployed on the endpoint; ends with the user suffix.
DEPLOYED_INDEX_ID = f"products_data_index_{USUARIO}"
DEPLOYED_INDEX_ID  # bare expression: displayed as the cell output

'products_data_index_civica'

In [16]:
# Deploy the index on the endpoint under DEPLOYED_INDEX_ID. The call's result is bound
# back to `my_index_endpoint`, whose `deployed_indexes` is read below and in later cells.
my_index_endpoint = my_index_endpoint.deploy_index(
    deployed_index_id=DEPLOYED_INDEX_ID,
    index=tree_ah_index,
)
my_index_endpoint.deployed_indexes  # last expression: displayed as the cell output

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/1017560508533/locations/europe-west1/indexEndpoints/3571398484969914368
Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/1017560508533/locations/europe-west1/indexEndpoints/3571398484969914368/operations/4045124719064645632
MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/1017560508533/locations/europe-west1/indexEndpoints/3571398484969914368


[id: "products_data_index_civica"
index: "projects/1017560508533/locations/europe-west1/indexes/6615515173723570176"
create_time {
  seconds: 1717862910
  nanos: 408840000
}
index_sync_time {
  seconds: 1717864484
  nanos: 649755000
}
automatic_resources {
  min_replica_count: 2
  max_replica_count: 2
}
deployment_group: "default"
]

## Verificamos que el número de vectores del índice coincide con el número de embeddings exportados

In [17]:
# Number of vectors the index is expected to hold.
# NOTE(review): presumably the row count of the embeddings table — update it if the table changes.
EXPECTED_VECTORS = 3890

# Add up the vector counts reported by every index deployed on the endpoint.
# `gca_resource` is the public accessor for the underlying resource message, used here
# in place of the private `_gca_resource` attribute.
number_of_vectors = sum(
    aiplatform.MatchingEngineIndex(deployed_index.index).gca_resource.index_stats.vectors_count
    for deployed_index in my_index_endpoint.deployed_indexes
)

print(f"Expected: {EXPECTED_VECTORS}, Actual: {number_of_vectors}")

Expected: 3890, Actual: 3890


## Para borrar el endpoint, el índice y el bucket...

In [None]:
# Cleanup cell, left commented out: uncomment the lines below to run it.
# NOTE(review): presumably disabled so that "Run All" does not delete the resources — confirm.
# Order: the endpoint is deleted first (force=True also undeploys its indexes), then the
# index; the bucket is removed only if `delete_bucket` is True or IS_TESTING is set.
# import os

# delete_bucket = False

# # Force undeployment of indexes and delete endpoint
# my_index_endpoint.delete(force=True)

# # Delete indexes
# tree_ah_index.delete()

# if delete_bucket or os.getenv("IS_TESTING"):
#     ! gsutil rm -rf {BUCKET_URI}