# Load Law Text into InMemory Vectorstore

In [1]:
import os
import openai
import pinecone

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.document_loaders import CSVLoader

api_key = os.environ["PINECONE_API_KEY"]
pinecone.init(api_key=api_key, environment="eu-west4-gcp")
openai.api_key  = os.environ['OPENAI_API_KEY']

  from tqdm.autonotebook import tqdm


In [3]:
import yaml

with open("../../src/config/cfg.yaml", 'r') as stream:
    config = yaml.safe_load(stream)

In [4]:
import pandas as pd

df = pd.read_csv('../../data/01_raw/law/law_art_abs_text.csv', delimiter='|')
df.head(n=5)

Unnamed: 0,Gesetz,Artikel,Absatz,Text
0,SVG,26,1.0,"Jedermann muss sich im Verkehr so verhalten, d..."
1,SVG,26,2.0,Besondere Vorsicht ist geboten gegenüber Kinde...
2,SVG,27,1.0,Signale und Markierungen sowie die Weisungen ...
3,SVG,27,2.0,"Den Feuerwehr-, Sanitäts-, Polizei- und Zollfa..."
4,SVG,28,,"Vor Bahnübergängen ist anzuhalten, wenn Schran..."


In [5]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(config['sentence_transformer']['model_name'])

df['vectors'] = df['Text'].apply(lambda x: model.encode(x))

In [6]:
amount_dimensions = len(df['vectors'][0])
print(f'Amount of dimensions: {amount_dimensions}')

Amount of dimensions: 384


In [7]:
# combine first 3 columns to unique ID column
def combine_columns(row):
    gesetz = row['Gesetz']
    artikel = row['Artikel']
    absatz = row['Absatz'] if not pd.isna(row['Absatz']) else ''  # Replace NaN with an empty string
    return f"{gesetz} {artikel} {absatz}"

df['id'] = df.apply(combine_columns, axis=1)


In [8]:
df.head()

Unnamed: 0,Gesetz,Artikel,Absatz,Text,vectors,id
0,SVG,26,1.0,"Jedermann muss sich im Verkehr so verhalten, d...","[-0.047865324, 0.08606049, -0.02556109, -0.009...",SVG 26 1
1,SVG,26,2.0,Besondere Vorsicht ist geboten gegenüber Kinde...,"[-0.023096276, 0.08544042, 0.019380467, 0.0371...",SVG 26 2
2,SVG,27,1.0,Signale und Markierungen sowie die Weisungen ...,"[-0.09046197, 0.027646037, -0.005517484, -0.02...",SVG 27 1
3,SVG,27,2.0,"Den Feuerwehr-, Sanitäts-, Polizei- und Zollfa...","[-0.017185686, 0.13891844, -0.0925548, 0.02409...",SVG 27 2
4,SVG,28,,"Vor Bahnübergängen ist anzuhalten, wenn Schran...","[-0.08830286, 0.0122225145, -0.02661782, 0.041...",SVG 28


In [10]:
index_name = 'law'

pinecone.create_index(index_name, dimension=amount_dimensions, metric=config['vectorization']['metric'])
pinecone.describe_index(index_name)



ApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=UTF-8', 'date': 'Tue, 07 Nov 2023 12:47:57 GMT', 'x-envoy-upstream-service-time': '238', 'content-length': '24', 'server': 'envoy'})
HTTP response body: index law already exists


In [12]:
index = pinecone.Index(index_name)

In [13]:
to_upsert = df.apply(lambda x: (x['id'], x['vectors'].tolist()), axis=1).tolist()
index.upsert(vectors=to_upsert)

{'upserted_count': 275}