# Identify relevant law articles for each case

In [93]:
import yaml

# Load the project configuration from YAML. The parsed result is bound to
# `config`, which later cells index (e.g. config['vectorization']).
with open("../../src/config/cfg.yaml", 'r') as stream:
    # safe_load builds only plain Python objects from the YAML document.
    config = yaml.safe_load(stream)

## Validate schema of received JSON

In [94]:
import json

# Load the raw case description ("Sachverhalt") from disk into `data`.
# The context manager closes the handle as soon as parsing finishes; the
# explicit UTF-8 encoding keeps decoding independent of the platform's
# default locale (JSON interchange files are UTF-8).
with open('../../data/01_raw/dev/sachverhalt.json', encoding='utf-8') as f:
    data = json.load(f)

In [95]:
import sys
# add path to sys.path in order to access schemas
# (the import below is resolved through this entry, so it must run first)
sys.path.append('../../src/schemas')

from sachverhalt import schema_sachverhalt

# Check the loaded JSON against the project schema; the return value is
# discarded here.
# NOTE(review): presumably raises when `data` does not conform - confirm
# against the schema library used in src/schemas/sachverhalt.py.
schema_sachverhalt.validate(data)

### set up Pinecone connection

In [96]:
import os
import pinecone

# Read the Pinecone API key from the environment; indexing os.environ raises
# KeyError when PINECONE_API_KEY is not set.
api_key = os.environ["PINECONE_API_KEY"]
# The Pinecone environment name is taken from the project configuration.
pinecone.init(api_key=api_key, environment=config['vectorization']['environment'])

# Handle to the index named "law"; `index_name` is reused when the vector
# store is built further down.
index_name = "law"
index = pinecone.Index(index_name)

### load embedding

In [97]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Optional OpenAI key: None when the variable is unset. It is not consumed by
# the HuggingFace embedding created below.
openai_api_key = os.getenv('OPENAI_API_KEY')

# The sentence-transformer model name is taken from the project configuration
# and handed to the HuggingFace embedding wrapper.
model_name = config['sentence_transformer']['model_name']
embed = HuggingFaceEmbeddings(model_name=model_name)

### load vectorstore

In [98]:
from langchain.vectorstores import Pinecone

# Name passed to the vector store as `text_key`.
# NOTE(review): presumably the metadata field that holds each article's raw
# text - confirm against the ingestion code.
text_field = "text"

# switch back to normal index for langchain
# NOTE(review): nothing visible in this section switches away from the handle
# created during connection setup, so this re-creation looks redundant -
# confirm before removing.
index = pinecone.Index(index_name)

# Wrap the index in a LangChain vector store; query strings are embedded with
# embed.embed_query.
vectorstore = Pinecone(
    index, embed.embed_query, text_key=text_field
)

### query vectorstore

In [99]:
# Free-text description of the incident. Stop words are removed first and the
# remainder is lemmatized (both helpers are defined outside this section).
query = lemmatize(
    remove_stopwords(
        "Missachtung des Vortritts und Kollision mit rechter Fahrzeugseite von G"
    )
)

# Retrieve the 10 most similar documents for the pre-processed query.
documents = vectorstore.similarity_search(query, k=10)

# Text of the first hit in the returned list; used to look up the article id.
content = documents[0].page_content

### get id of best matching law article

In [100]:
import pandas as pd

# Map the text of the best-matching document back to its law article.
# The interim CSV is pipe-delimited; the columns read here are 'Gesetz',
# 'Artikel', 'Absatz' and 'Text'.
df_interim = pd.read_csv('../../data/02_interim/law/law_art_abs_text.csv', delimiter='|')
# Exact string equality: the retrieved text must match a CSV row verbatim.
filtered_df = df_interim[df_interim['Text'] == content]

if filtered_df.empty:
    # Without this guard the `.values[0]` below fails with an opaque
    # "index 0 is out of bounds" error. The exception type stays IndexError;
    # only the message becomes descriptive.
    raise IndexError(
        "no row of law_art_abs_text.csv has a 'Text' equal to the retrieved document"
    )

# Build the composite key "<Gesetz>|<Artikel>|<Absatz>" for every matching row.
merged_column = filtered_df['Gesetz'] + '|' + filtered_df['Artikel'] + '|' + filtered_df['Absatz']

# Assign the result to a variable (first matching row wins).
# NOTE: `id` shadows the builtin of the same name; the name is kept because
# later cells may read it.
id = merged_column.values[0]
