# Convert law texts to vectors and store them in a Pinecone vector database

## load config

In [21]:
import yaml

# Load the project configuration from YAML. The encoding is pinned to UTF-8 so
# that parsing does not depend on the platform's locale default.
with open("../../src/config/cfg.yaml", "r", encoding="utf-8") as stream:
    config = yaml.safe_load(stream)

## load file

In [22]:
import pandas as pd

# Read the pipe-separated law table (one row per law / article / paragraph).
df = pd.read_csv('../../data/02_interim/law/law_art_abs_text.csv', sep='|')

# Build a string key of the form <Gesetz>_<Artikel>_<Absatz> for every row.
# Each part is stringified first, so a missing Absatz ends up as the literal
# text "nan" in the key (see the SVG_28_nan row in the preview).
df['id'] = (
    df['Gesetz'].astype(str)
    + '_'
    + df['Artikel'].astype(str)
    + '_'
    + df['Absatz'].astype(str)
)

# Preview the first rows.
df.head()

Unnamed: 0,Gesetz,Artikel,Absatz,Text,id
0,SVG,26,1.0,jedermann Verkehr verhalten ordnungsgemäss Ben...,SVG_26_1
1,SVG,26,2.0,besonderer Vorsicht gebieten gegenüber Kind Ge...,SVG_26_2
2,SVG,27,1.0,Signal Markierung sowie Weisung Polizei befolg...,SVG_27_1
3,SVG,27,2.0,Feuerwehr Sanitäts Polizei Zollfahrzeug Wahrne...,SVG_27_2
4,SVG,28,,Bahnübergang anhalten Schranke schliessen Sign...,SVG_28_nan


## vectorize

In [23]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model named in the config.
model = SentenceTransformer(config['sentence_transformer']['model_name'])

# Convert text to vectors. All texts are handed to `encode` in a single call so
# the model can batch them internally instead of running one forward pass per
# row; the resulting 2-D array is then split back into one vector per row.
df['vectors'] = list(model.encode(df['Text'].tolist()))


In [24]:
# Read the embedding size from the first stored vector. `.iloc[0]` selects by
# position, so this does not depend on the frame having a row labelled 0.
amount_dimensions = len(df['vectors'].iloc[0])
print(f'Amount of dimensions: {amount_dimensions}')

Amount of dimensions: 384


## setup pinecone

In [25]:
import os
import pinecone

# Authenticate against Pinecone. The API key is read from the environment
# (a missing PINECONE_API_KEY raises KeyError); the environment name comes
# from the `vectorization` section of the config.
# NOTE(review): `pinecone.init` looks like the module-level API of the older
# pinecone client - confirm the pinned client version still provides it.
api_key = os.environ["PINECONE_API_KEY"]
pinecone.init(
    api_key=api_key,
    environment=config['vectorization']['environment'],
)

## create index

In [26]:
index_name = 'law'

# Create the index only when it does not exist yet: `create_index` fails for an
# existing name, which would break every re-run of this notebook.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=amount_dimensions,
        metric=config['vectorization']['metric'],
    )

# Handle used by the upsert cells below.
index = pinecone.Index(index_name)

# Last expression of the cell, so the index description is displayed.
pinecone.describe_index(index_name)

## insert data (upsert)

In [None]:
# Build the upsert payload: one (id, vector, metadata) tuple per row.
# Iterating the three columns in lockstep avoids the per-row Series that
# `df.apply(axis=1)` constructs, and also works when the frame is empty.
to_upsert = [
    (row_id, vector.tolist(), {"text": text})
    for row_id, vector, text in zip(df['id'], df['vectors'], df['Text'])
]

In [None]:
import itertools

def chunks(iterable, batch_size=100):
    """Yield successive tuples of at most ``batch_size`` items from ``iterable``.

    Every tuple except possibly the last holds exactly ``batch_size`` items;
    iteration ends once the input is exhausted, so no empty tuple is yielded.
    """
    source = iter(iterable)

    def next_batch():
        return tuple(itertools.islice(source, batch_size))

    # Two-argument iter() keeps calling next_batch until it returns the
    # sentinel (), i.e. until the source has no items left.
    yield from iter(next_batch, ())

# Upsert the data in batches; the batch size is the number of vectors sent
# per upsert request.
upsert_batch_size = 10
for ids_vectors_chunk in chunks(to_upsert, batch_size=upsert_batch_size):
    index.upsert(vectors=ids_vectors_chunk)  # `index` is created in the "create index" cell