# Convert BGE texts to vectors and store them in a Pinecone vector database

## load config

In [12]:
import yaml

# Parse the project configuration with safe_load.
# The encoding is pinned to UTF-8 so that decoding the file does not depend on
# the platform's locale default.
with open("../../src/config/cfg.yaml", 'r', encoding="utf-8") as stream:
    config = yaml.safe_load(stream)

## load file

In [13]:
import pandas as pd

# Load the interim BGE file; its fields are separated by '|'.
df = pd.read_csv(
    '../../data/02_interim/bge/id_bge_data_preprocessed.csv',
    sep='|',
)
# Preview the first five rows.
df.head(5)

Unnamed: 0,id,bge
0,97 II 21631,. Urteil I. Zivilabteilung 11 . Mai 1971 i.S ....
1,99 IV 22552,. Urteil Kassationshofes 30 . November 1973 i....
2,97 IV 4213,. Urteil Kassationshofes 19 . Februar 1971 i.S...
3,92 IV 299,. Urteil Kassationshofes 22 . April 1966 i.S ....
4,90 IV 328,. Urteil Kassationshofes 21 . Februar 1964 i.S...


## vectorize

In [14]:
from sentence_transformers import SentenceTransformer

# Load the embedding model whose name is given in the config file.
model = SentenceTransformer(config['sentence_transformer']['model_name'])

# Convert text to vectors.
# All texts are passed to encode() in one call so the model can batch them
# internally instead of being invoked once per row.
embeddings = model.encode(df['bge'].tolist())
# Store one embedding per row; list() splits the result into per-row vectors.
df['vectors'] = list(embeddings)


In [15]:
# Read the embedding width from the first row by position, so the lookup does
# not depend on a row labelled 0 existing in the DataFrame index.
amount_dimensions = len(df['vectors'].iloc[0])
print(f'Amount of dimensions: {amount_dimensions}')

Amount of dimensions: 384


## setup pinecone

In [16]:
import os
import pinecone

# The API key comes from the environment, so no secret is stored in the
# notebook; a missing PINECONE_API_KEY raises KeyError on this line.
api_key = os.environ["PINECONE_API_KEY"]
pinecone.init(
    environment="gcp-starter",
    api_key=api_key,
)

## create index

In [17]:
index_name = 'bge'

# Create the index only when it is missing, so re-running this cell does not
# fail because an index with this name already exists.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=amount_dimensions,
        metric=config['vectorization']['metric'],
    )

# This is not the cell's last expression, so the returned description is not
# displayed.
pinecone.describe_index(index_name)

# Handle used by the following cells to write to the index.
index = pinecone.Index(index_name)

## insert data (upsert)

In [18]:
# Convert DataFrame to a list of tuples (id, vector)
to_upsert = df.apply(lambda x: (x['id'], x['vectors'].tolist()), axis=1).tolist()
index.upsert(vectors=to_upsert)

{'upserted_count': 884}