# Convert BGE texts to vectors and save them in a Pinecone vector DB

## load config

In [6]:
import yaml

# Read the shared project configuration (model name, similarity metric, ...)
# from the YAML file; safe_load avoids constructing arbitrary Python objects.
with open("../../src/config/cfg.yaml") as cfg_file:
    config = yaml.safe_load(cfg_file)

## load file

In [2]:
import pandas as pd

# The raw export is pipe-separated, not comma-separated.
df = pd.read_csv('../../data/01_raw/bge/id_bge_data.csv', sep='|')

# Preview the first rows as the cell's rich output.
df.head(5)

Unnamed: 0,id,bge
0,97 II 21631,. Urteil der I. Zivilabteilung vom 11. Mai 197...
1,99 IV 22552,. Urteil des Kassationshofes vom 30. November ...
2,97 IV 4213,. Urteil des Kassationshofes vom 19. Februar 1...
3,92 IV 299,. Urteil des Kassationshofes vom 22. April 196...
4,90 IV 328,. Urteil des Kassationshofes vom 21. Februar 1...


## vectorize

In [3]:
from sentence_transformers import SentenceTransformer

# The model name comes from the project config so it can be swapped
# without editing the notebook.
model = SentenceTransformer(config['sentence_transformer']['model_name'])

# Convert text to vectors.
# Encode the whole column in one call: the model batches the inputs internally,
# which is much faster than invoking model.encode() once per row via .apply().
embeddings = model.encode(df['bge'].tolist(), show_progress_bar=True)

# Store one 1-D numpy array per row (list() splits the 2-D result row-wise),
# so later cells can still call .tolist() on each entry.
df['vectors'] = list(embeddings)


  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)e9125/.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 4.00MB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 2.09MB/s]
Downloading (…)7e55de9125/README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 50.8MB/s]
Downloading (…)55de9125/config.json: 100%|██████████| 612/612 [00:00<00:00, 3.20MB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 880kB/s]
Downloading (…)125/data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 453kB/s]
Downloading pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:04<00:00, 21.8MB/s]
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 162kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 1.27MB/s]
Downloading (…)e9125/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.70MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 350/350 [00:00<00:00, 3.6

In [5]:
# Embedding size, taken from the first stored vector. Use positional access
# (.iloc) so this keeps working even if the frame's index no longer starts at 0
# (e.g. after filtering or re-indexing).
amount_dimensions = len(df['vectors'].iloc[0])
print(f'Amount of dimensions: {amount_dimensions}')

Amount of dimensions: 384


## setup pinecone

In [4]:
import os
import pinecone

# The API key is read from the environment so that no secret is stored in the
# notebook; a missing variable raises KeyError right here.
api_key = os.environ["PINECONE_API_KEY"]

pinecone.init(
    api_key=api_key,
    environment="gcp-starter",
)

## create index

In [10]:
index_name = 'bge'

# Creating an index that already exists fails, which would break a
# "Restart & Run All". Only create it when it is not there yet.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=amount_dimensions,  # must match the embedding size
        metric=config['vectorization']['metric'],
    )

# Handle used by the following cells to talk to the index.
index = pinecone.Index(index_name)

# Keep this as the last expression so the index description is rendered
# as the cell output.
pinecone.describe_index(index_name)

## insert data (upsert)

In [11]:
# Convert DataFrame to a list of tuples (id, vector)
to_upsert = df.apply(lambda x: (x['id'], x['vectors'].tolist()), axis=1).tolist()
index.upsert(vectors=to_upsert)

{'upserted_count': 884}