In [None]:
import os
import json
import pandas as pd

In [None]:
file_path = 'labor_code_structure.json'

# Read the article -> references structure of the code.
# The encoding is given explicitly: JSON is UTF-8 by specification and the
# keys looked up later are Cyrillic, so relying on the platform default
# encoding would break on systems whose locale is not UTF-8.
with open(file_path, 'r', encoding='utf-8') as file:
    jsonData = json.load(file)

In [None]:
# Neo4jGraph is imported from langchain_community, the same package the
# vector-store cell uses, instead of the deprecated `langchain.graphs` shim.
from langchain_community.graphs import Neo4jGraph
from google.colab import userdata

# Connection settings are read from Colab secrets, never hard-coded.
graph = Neo4jGraph(
    url=userdata.get("NEO4J_URI"),
    username=userdata.get("NEO4J_USERNAME"),
    password=userdata.get("NEO4J_PASSWORD")
)

# Importing data

In [None]:
def sanitize(text):
    """Return ``text`` as a string with characters unsafe for inline Cypher removed.

    The value is converted with ``str()`` and every single quote, double
    quote, curly brace and backslash is dropped. Backslashes are removed as
    well as quotes because a backslash starts an escape sequence inside a
    Cypher string literal, so a stray or trailing one would corrupt a query
    that interpolates the result between double quotes.

    Args:
        text: Any value; non-strings are stringified first.

    Returns:
        The cleaned string (possibly empty).
    """
    unsafe_characters = str.maketrans('', '', '\'"{}\\')
    return str(text).translate(unsafe_characters)

In [None]:
# Root node: also the top-level key of the JSON structure.
root_node = "Кодекс законів про працю України"

# Create the root node. The name is passed as a query parameter rather than
# interpolated into the Cypher text, so its content can never change the
# shape of the query.
query = '''
    MERGE (root:Root {name: $name})
'''
graph.query(query, params={"name": sanitize(root_node)})

In [None]:
# Add every article under the root node and link it to the documents it
# references. All values are sent as query parameters instead of being
# formatted into the Cypher text; names are still passed through sanitize()
# so the stored values match the ones other cells look up.
for article, references in jsonData[root_node].items():
    article = sanitize(article)

    # Create the article node and attach it to the root.
    graph.query(
        '''
        MERGE (root:Root {name: $root})
        MERGE (article:Article {name: $article})
        MERGE (root)-[:CONTAINS]->(article)
        ''',
        params={"root": sanitize(root_node), "article": article},
    )

    # Create all reference nodes of this article in one round trip:
    # UNWIND turns the list parameter into one row per reference id.
    reference_ids = [sanitize(ref) for ref in references]
    if reference_ids:
        graph.query(
            '''
            UNWIND $reference_ids AS reference_id
            MERGE (article:Article {name: $article})
            MERGE (reference:Reference {id: reference_id})
            MERGE (article)-[:REFERS_TO]->(reference)
            ''',
            params={"article": article, "reference_ids": reference_ids},
        )

In [None]:
# Gather the distinct document IDs referenced by any article.
related_document_ids = {
    ref
    for references in jsonData[root_node].values()
    for ref in references
}

# Report how many distinct IDs were found.
unique_count = len(related_document_ids)
print(f"Unique related document IDs: {unique_count}")

In [None]:
# Read the whole text of the code into memory as a single UTF-8 string.
file_path = 'the_labour_code_of_ukraine.txt'
with open(file_path, mode='r', encoding='utf-8') as file:
    data = file.read()

In [None]:
import re

# An article header looks like "Стаття 12." or "Стаття 5-1."; whitespace
# around the dash and before the dot is tolerated. Group 1 is the number.
ARTICLE_HEADER = r'Стаття (\d+\s*(?:-\s*\d+)?)\s*\.'

# An article runs from its header up to the next header that starts a line,
# or to the end of the text. The look-ahead deliberately uses the *same*
# header pattern as the start of the match: a looser terminator would cut an
# article at any line merely beginning with "Стаття <n>", and the text after
# that line would then belong to no article at all.
ARTICLE_PATTERN = re.compile(
    ARTICLE_HEADER + r'.*?(?=\n' + ARTICLE_HEADER + r'|$)',
    re.DOTALL,
)

# Maps "Стаття <number>" to the full article text (header included).
# If a number occurs more than once, the last occurrence wins.
article_data = {}

for match in ARTICLE_PATTERN.finditer(data):
    # The header pattern allows any whitespace inside the number
    # ("5 - 1"), so remove all of it, not only plain spaces.
    article_number = re.sub(r'\s+', '', match.group(1))
    article_data[f"Стаття {article_number}"] = match.group()

In [None]:
# Attach the parsed text to the matching Article nodes.
# The values are sent as query parameters, so the text is stored verbatim:
# it no longer has to be stripped of quotes/apostrophes (which are part of
# Ukrainian spelling) just to fit inside a Cypher string literal. Names are
# still passed through sanitize() because that is how the Article nodes were
# named when they were created.
article_rows = [
    {"name": sanitize(name), "text": str(text)}
    for name, text in article_data.items()
]

if article_rows:
    # One round trip: UNWIND yields one row per article.
    query = '''
        UNWIND $rows AS row
        MATCH (a:Article {name: row.name})
        SET a.text = row.text
    '''
    graph.query(query, params={"rows": article_rows})

# Embedding

In [None]:
from sentence_transformers import SentenceTransformer
from typing import List
import numpy as np

class MyEmbeddings:
    """Wrapper around a SentenceTransformer model that returns unit-length vectors.

    Exposes ``embed_documents`` and ``embed_query``; both return plain Python
    lists of floats, scaled to L2 norm 1 (all-zero vectors are left as is).
    """

    def __init__(self, model):
        """Load the sentence-transformers model identified by ``model``.

        NOTE(review): ``trust_remote_code=True`` lets code shipped with the
        model repository run locally - confirm the model source is trusted.
        """
        self.model = SentenceTransformer(model, trust_remote_code=True)

    def normalize_vector(self, vector):
        """Return ``vector`` divided by its L2 norm.

        A vector whose norm is 0 is returned unchanged to avoid dividing by zero.
        """
        norm = np.linalg.norm(vector)
        if norm == 0:
            return vector
        return vector / norm

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed several texts and return one normalized vector per text.

        All texts are handed to the model in a single ``encode`` call so the
        model can batch them, instead of one forward pass per text.

        Args:
            texts: Texts to embed; may be empty.

        Returns:
            A list with one list of floats per input text, in input order.
        """
        texts = list(texts)
        if not texts:
            return []
        embeddings = np.asarray(self.model.encode(texts))
        return [self.normalize_vector(embedding).tolist() for embedding in embeddings]

    def embed_query(self, text: str) -> List[float]:
        """Embed a single text and return its normalized vector as a list of floats."""
        embedding = self.model.encode([text])[0]
        normalized_embedding = self.normalize_vector(embedding)
        return normalized_embedding.tolist()

In [None]:
# Embedding wrapper built on the 'lang-uk/ukr-paraphrase-multilingual-mpnet-base'
# model. The variable name (spelled "embeding") is referenced by the
# vector-index cell, so it is kept as is.
embeding_model = MyEmbeddings('lang-uk/ukr-paraphrase-multilingual-mpnet-base')

In [None]:
from langchain_community.vectorstores import Neo4jVector

# Neo4j connection settings, read from Colab secrets.
neo4j_connection = {
    "url": userdata.get("NEO4J_URI"),
    "username": userdata.get("NEO4J_USERNAME"),
    "password": userdata.get("NEO4J_PASSWORD"),
}

# Vector store over the existing Article nodes: the `text` property is the
# source text, vectors live in the `embedding` property, and searches use
# the "hybrid" search type.
vector_index = Neo4jVector.from_existing_graph(
    embeding_model,
    node_label="Article",
    text_node_properties=["text"],
    embedding_node_property="embedding",
    search_type="hybrid",
    **neo4j_connection,
)