# Bases de datos vectoriales

## Codificar datos a embeddings

In [None]:
# Install the third-party packages used in this notebook (shell commands run via `!`).
# sentence-transformers provides the embedding model; -U upgrades an existing install.
!pip install -U sentence-transformers
# chromadb is the vector store used in the Chroma section.
!pip install chromadb
# openai is installed for the OpenAI embedding function configured in the Chroma section.
!pip install openai

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

In [None]:
# Load the movie dataset CSV (imdb_top_1000.csv) into a DataFrame.
df = pd.read_csv('/content/imdb_top_1000.csv')

In [None]:
# Preview the first rows to check the columns that were loaded.
df.head()

In [None]:
# Build the text that will be embedded for each movie: the overview followed by
# the director and the four lead actors, separated by single spaces.
# Series.str.cat concatenates column-wise in one vectorized call instead of a
# Python-level lambda per row, and na_rep='' keeps a missing value from raising
# (the row-wise `+` version fails with TypeError on NaN).
df['text'] = df['Overview'].str.cat(
    df[['Director', 'Star1', 'Star2', 'Star3', 'Star4']],
    sep=' ',
    na_rep='',
)

In [None]:
# Load the pretrained all-MiniLM-L6-v2 sentence-embedding model.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Encode every row's text into an embedding vector, 64 texts per forward pass.
# encode() is documented to take a string or a list of strings, so hand it a
# plain list: a pandas Series is indexed by label, which only happens to work
# while df keeps its default 0..n-1 index.
embeddings = model.encode(
    df['text'].tolist(),
    batch_size=64,
    show_progress_bar=True,
)

In [None]:
# Store the encoder output on the DataFrame; tolist() turns it into nested
# Python lists so each row holds its own vector.
df['embeddings'] = embeddings.tolist()

In [None]:
# Record ids are the row index labels rendered as strings (used later as the
# vector ids when upserting).
df['ids'] = df.index.astype('str')

In [None]:
# Display the DataFrame with the added text, embeddings and ids columns.
df

## Chroma

In [None]:
import chromadb
from chromadb.utils import embedding_functions

In [None]:
import os
from getpass import getpass

# Never keep the OpenAI key in the notebook source: take it from the
# OPENAI_API_KEY environment variable, or prompt for it without echoing.
openai_api_key = os.environ.get('OPENAI_API_KEY') or getpass('Enter the OpenAI API key: ')

# Chroma embedding function backed by OpenAI's text-embedding-ada-002 model.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_api_key,
    model_name='text-embedding-ada-002',
)

# Chroma embedding function backed by the local all-MiniLM-L6-v2 model.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name='all-MiniLM-L6-v2',
)

## Chroma embeddings

### Chroma Query

### Where
1. Estructura

```
{
    "metadata_field": {
        <Operator>: <Value>
    }
}
```
2. Operadores

$eq - equal to (string, int, float)

$ne - not equal to (string, int, float)

$gt - greater than (int, float)

$gte - greater than or equal to (int, float)

$lt - less than (int, float)

$lte - less than or equal to (int, float)

### Cargar índice de Chroma previamente creado

In [None]:
# Open the on-disk Chroma store located at this path.
client_persistent_2 = chromadb.PersistentClient(path="/content/data_embeddings")

In [None]:
# Fetch an existing collection by name.
# NOTE(review): the name is spelled 'embeddigs'; it must match the name used
# when the collection was created (not visible here) — confirm before changing.
db_2 = client_persistent_2.get_collection('movies_db_no_embeddigs')

In [None]:
# Look at one stored record to confirm the collection loaded.
db_2.peek(1)

# Pinecone

In [None]:
!pip install pinecone-client

In [None]:
import pinecone
from getpass import getpass

In [None]:
# Prompt for the Pinecone API key without echoing it.
# The variable name is spelled `pincone_api`; the pinecone.init() cells use this exact name.
pincone_api = getpass('Enter the secret value: ')

Enter the secret value: ··········


In [None]:
# Initialise the Pinecone client with the key entered above.
# The environment value is a blank placeholder and must be filled in before running.
pinecone.init(api_key=pincone_api, environment="______________")

In [None]:
from tqdm.auto import tqdm

# Number of rows sent to Pinecone per upsert call.
batch_size = 64

# NOTE(review): `index` is not defined in any visible cell; it is presumably a
# pinecone.Index created earlier — confirm before running.
for start in tqdm(range(0, len(df), batch_size)):
    # Positional slice; it clamps at the end of the frame, so the final chunk
    # may hold fewer than batch_size rows.
    chunk = df.iloc[start:start + batch_size]

    # Every column except the id, the vector and the raw text is sent as metadata.
    chunk_metadata = chunk.drop(columns=['ids', 'embeddings', 'text']).to_dict('records')

    # One (id, precomputed embedding, metadata) tuple per row.
    records = list(zip(chunk['ids'], chunk['embeddings'], chunk_metadata))

    # Insert the records, or update them if the ids already exist.
    index.upsert(records)

# Show the index statistics to check that all vectors were stored.
index.describe_index_stats()

### Pinecone query

In [None]:
# Load the all-MiniLM-L6-v2 model again (same model name as in the encoding section).
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Free-text search query.
query = 'a history of time travel'


### Filter

```
filter={
        "genre": {"$eq": "documentary"},
        "year": 2019
    }
```

Metadata filters can be combined with `$and` and `$or`. The supported comparison operators are:

$eq - Equal to (number, string, boolean)

$ne - Not equal to (number, string, boolean)

$gt - Greater than (number)

$gte - Greater than or equal to (number)

$lt - Less than (number)

$lte - Less than or equal to (number)

$in - In array (string or number)

$nin - Not in array (string or number)


In [None]:
# Free-text search query (same text as in the unfiltered query cell).
query = 'a history of time travel'


In [None]:
# Display the query results.
# NOTE(review): `responses` is not assigned in any visible cell; the cell that
# runs the query appears to be missing — confirm.
responses

### Load Index

In [None]:
# Re-initialise the Pinecone client before loading an existing index.
# The environment value is a blank placeholder and must be filled in before running.
pinecone.init(api_key=pincone_api, environment="_________")

In [None]:
# Get a handle to an existing index by name.
# NOTE(review): the name is spelled 'emebeddings'; it must match the index as
# it was created (not visible here) — confirm before changing.
index_2 = pinecone.Index('movies-emebeddings')

In [None]:
# Free-text search query for the reloaded index.
query = 'a history of an space journey'



In [None]:
# Display the query results.
# NOTE(review): `responses` is not assigned in any visible cell; the cell that
# queries index_2 appears to be missing — confirm.
responses