In [None]:
'''!pip install --upgrade astrapy
!pip install langchain langchain-openai google-cloud-secret-manager'''

# Este notebook se utiliza para crear la base de datos de embeddings de Astrapy, generados con GPT4 para realizar recomendaciones.

In [1]:
from astrapy.db import AstraDB
import langchain
from langchain_openai import OpenAIEmbeddings
import json
import os

from astrapy.db import AstraDB
from dotenv import load_dotenv
# Initialize the client
load_dotenv()
db = AstraDB(
    token=os.getenv("ASTRA_DB_APPLICATION_TOKEN"),
    api_endpoint=os.getenv("ASTRA_DB_API_ENDPOINT"),
)


secret_string = os.getenv("OPENAI_API_KEY")

# Configurar el motor de OpenAI
engine = "gpt-4"
embeddings = OpenAIEmbeddings(api_key=secret_string, model="text-embedding-3-large")

def get_embedding(text):
    query_result = embeddings.embed_query(text)
    print(query_result)
    return query_result


print(f"Connected to Astra DB: {db.get_collections()}")

# Create a collection
collection = db.create_collection("vector_movies", dimension=3072, metric="cosine")
# Procesar cada línea y obtener los embeddings
# Leer el documento de texto

with open('titleoverview_comb.csv', 'r') as file:
    lines = file.readlines()

documents = []

for index, line in enumerate(lines):
    # Extraer el texto de cada línea

    # Obtener el embedding para el texto
    vector = get_embedding(line)

    # Crear el documento
    document = {
        "_id": str(index + 1),
        "text": line,
        "$vector": vector
    }

    # Insertar el documento en la base de datos
    # Reemplaza 'collection' con tu objeto de colección de la base de datos
    # res = collection.insert_one(document)  # Usamos insert_one para insertar un solo documento
    res = collection.upsert(document)  # Aqui usamos upsert si existe se actualiza si no, se crea
    # Añadir al documento JSON
    documents.append(document)

# Guardar los documentos en un archivo JSON
with open('documentos.json', 'w') as file:
    json.dump(documents, file)
print(documents)



# Perform a similarity search
#query = [0.15, 0.1, 0.1, 0.35, 0.55]
#results = collection.vector_find(query, limit=2, fields={"text", "$vector"})

#for document in results:
#    print(document)



Connected to Astra DB: {'message': 'Operation not allowed'}
[-0.004561377991030559, -0.007065282880750717, -0.02142877054715033, -0.011655776342724184, -0.01979832093815325, -0.0054299806156529445, -0.03101736940768526, 0.04864175442943394, -0.059472603449425815, -0.021448181171553112, 0.011248163474813644, -0.04188703595119254, -0.005400865610371308, -0.04557495773017747, -0.014305256724513797, 0.0066770806372429626, -0.01440230798388264, 0.017255594333966253, 0.02564076614649489, -0.06673198698626914, 0.04739950799526716, -0.02259337820899613, -0.010170901760135291, 0.002666464910972685, -0.0313473407093072, 0.013276520639519866, 0.00041003875940344653, 0.02251573757403007, 0.011995452956547515, -0.023233912050482308, 0.008239594923475372, 0.008142544595429069, 0.0013878234396353648, 0.004998105398561464, -0.03323012098496016, -0.0006472061272093155, 0.029697480475907336, -0.02880461596776528, -0.009287741632872087, 0.014431422989164278, 0.002233376646271349, 0.008317235558441431, -0

HTTPStatusError: Client error '401 Unauthorized' for url 'https://7178a332-b133-42d9-a020-9499e20f09f0-us-east-1.apps.astra.datastax.com/api/json/v1/default_keyspace/vector_movies'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/401

In [36]:
#Ejemplo de la función get similarity, para calcular la similaridad coseno de los embeddings.
def get_similarity(movie_description,N):
    db = AstraDB(token=os.getenv("ASTRA_DB_APPLICATION_TOKEN"), api_endpoint=os.getenv("ASTRA_DB_API_ENDPOINT"))
    collection = db.collection(collection_name="vector_movies")
    vector_embedding = get_embedding(movie_description)
    list_vec = collection.vector_find(
        vector=vector_embedding,
        limit=10
    )

    # Extract the movie titles from the 'text' values in the list
    titles = [item['text'].split(':')[0].strip('\"') for item in list_vec]

    return titles[1:(N+1)]

result = get_similarity('Toy Story')
print(result)





[-0.03298373886492768, 0.03965032465500274, 0.0018729937692091802, 0.01631726880180457, 0.02919013233770049, -0.011483991496296945, 0.027602849474545908, 0.016253776816726136, 0.03361865126513145, 0.032729774649904224, 0.003245993511030473, -0.011547482550052807, -0.01947596106618284, 0.013142702144172836, 0.031174234985321143, 0.031380582539842196, -0.026523496904083375, 0.04225347182883171, -0.00958718823268335, -0.069776952131078, 0.02817427175231639, -0.013428413283058079, 0.010349084603043996, 0.004797562481824399, -0.0007127892444957974, 0.002200370883017877, -0.0025297321795679354, -0.00843640880882208, -0.029825044737904264, 0.007984032838920447, -0.0029344891606607423, 0.0138093614682384, -0.004138840354385568, 0.016182348100682253, 0.0006845157463769454, 0.018952157311559897, -0.0030892493608902483, -0.006523732548938878, -0.015118869923473189, 0.04311060152019715, 0.000945425357694754, -0.04047571487308697, 0.007245946195794858, 0.009047512878774658, 0.01717440035581515, 0.0

In [34]:
print(np.shape(result[1:2]))
print(result[1:7])

(1,)
['Pinocchio', 'Small Soldiers', 'The Adventures of Pinocchio', 'The Fox and the Hound', 'The Love Bug']
