In [21]:
!pip install --quiet langchain langchain-community langchain-groq neo4j wikipedia tiktoken json-repair langchain_openai sentence-transformers yfiles_jupyter_graphs

In [22]:
from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget

import pandas as pd
import json
import os
import json_repair
from typing import Tuple, List, Optional
from langchain_community.graphs import Neo4jGraph
from langchain_community.chat_models import ChatOllama
from langchain.document_loaders import WikipediaLoader
from langchain_community.llms import Ollama
from langchain.chains import LLMChain
from langchain.prompts.chat import (ChatPromptTemplate,HumanMessagePromptTemplate,SystemMessagePromptTemplate)
from langchain import PromptTemplate
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.schema import (SystemMessage,HumanMessage,AIMessage)
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_groq import ChatGroq
from langchain_community.graphs.graph_document import Node, Relationship, GraphDocument
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_community.vectorstores import Neo4jVector
from langchain_community.embeddings import HuggingFaceEmbeddings

In [23]:
# SECURITY: secrets must never be hardcoded in a notebook. Any password or API
# key that was ever committed in this cell should be treated as leaked and
# rotated.
#
# Non-secret connection settings keep their previous values as defaults, but an
# already-exported environment variable now takes precedence. Secrets are read
# from the environment when present; otherwise the user is prompted without
# echo and the value lives only in this kernel process.
from getpass import getpass

os.environ.setdefault("NEO4J_URI", "neo4j+s://fac518af.databases.neo4j.io")
os.environ.setdefault("NEO4J_USERNAME", "neo4j")

for _secret_name in ("NEO4J_PASSWORD", "GROQ_API_KEY"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"{_secret_name}: ")

In [24]:
# Open the Neo4j graph handle used by the retriever for Cypher queries.
# No arguments are passed, so the connection settings are presumably picked up
# from the NEO4J_* environment variables set in the previous cell — confirm.
graph = Neo4jGraph()

In [25]:
# Embedding model: multilingual E5 (large), executed on CPU, with embedding
# normalisation switched off.
_embedding_model_kwargs = dict(device="cpu")
_embedding_encode_kwargs = dict(normalize_embeddings=False)
hf = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-large",
    model_kwargs=_embedding_model_kwargs,
    encode_kwargs=_embedding_encode_kwargs,
)

# Vector index over nodes already present in the graph: nodes labelled `Node`
# are embedded from their `id` property, the vector is kept in the
# `ent_embedding` property, and the index is named `entity_index`.
# Search type is "hybrid".
_entity_index_settings = {
    "search_type": "hybrid",
    "node_label": "Node",
    "text_node_properties": ["id"],
    "embedding_node_property": "ent_embedding",
    "index_name": "entity_index",
}
vector_index_ent = Neo4jVector.from_existing_graph(embedding=hf, **_entity_index_settings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/160k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

In [26]:
class Entities(BaseModel):
    """Identifying information about entities."""
    # Structured-output schema for the entity-extraction step; it is handed to
    # `model.with_structured_output(Entities)` further down.
    # NOTE(review): the docstring above and the Field description below are
    # likely forwarded to the LLM as part of the schema, i.e. they act as
    # prompt text — confirm before rewording either of them.

    # `names` is a required field (`...` means "no default"). Its Russian
    # description reads roughly: "All financial entities, e.g. persons,
    # organisations, company metrics, industries and areas of the company's
    # activity, etc., that appear in the text".
    names: List[str] = Field(
        ...,
        description="Все финансовые сущности, например: лица, организации, показатели компании, отрасли и сферы деятельности компании и т.д., которые фигурируют в тексте",
    )

# System turn (Russian): states the extraction task and gives two worked
# examples of the form "input: <question>" / "answer: [<entities>]".
# The literal is kept exactly as-is, including the indentation inside it.
_entity_system_message = """Вы извлекаете все финансовые сущности, например: лица, организации, показатели компании, отрасли и сферы деятельности компании и т.д., которые фигурируют в тексте
            Примеры извлечения сущностей:

            input: Какие технологические разработки есть у компании
            answer: [технологические разработки, компания]

            input: Какая валовая стоимость у продуктов компании
            answer: [валовая стоимость, компания]"""

# Human turn: an instruction line followed by the user's question, which is
# substituted for the `{question}` placeholder. The two adjacent literals are
# concatenated by the parser into a single string.
_entity_human_message = (
    """Используйте заданный формат для извлечения информации из следующих текстов.
            """
    "input: {question}"
)

entity_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _entity_system_message),
        ("human", _entity_human_message),
    ]
)

# Groq-hosted chat model with temperature 0; it is reused later for answer
# generation as well.
model = ChatGroq(model_name="llama3-70b-8192", temperature=0)

# Prompt -> model constrained to return an `Entities` object.
entity_chain = entity_prompt | model.with_structured_output(Entities)

In [27]:
import re
def generate_full_text_query(input: str) -> str:
    """
    Generate a full-text search query for a given input string.

    The input is stripped of Lucene special characters, split on whitespace,
    and every resulting word gets a fuzzy-match suffix (``~2``, i.e. up to two
    changed characters). The words are then combined with the ``AND`` operator.
    Useful for mapping entities from user questions to database values while
    tolerating some misspellings.

    Args:
        input: Free text to turn into a query (for example an entity name).

    Returns:
        A query such as ``"foo~2 AND bar~2"``, or an empty string when the
        input contains no searchable words (empty, whitespace-only, or made up
        entirely of characters that get stripped). Callers should skip the
        full-text lookup in that case.
    """
    # str.split() with no arguments never yields empty strings, so no extra
    # filtering is needed; joining also handles the zero-word case that used
    # to raise IndexError on ``words[-1]``.
    words = remove_lucene_chars(input).split()
    return " AND ".join(f"{word}~2" for word in words)

# Fulltext index query
# Cypher: look up at most two nodes in the 'keyword' full-text index, then list
# every non-MENTIONS relationship of each hit in both directions. Node and
# relationship properties are rendered as JSON with `ent_embedding` and `id`
# removed; at most 50 rows are returned.
_NEIGHBORHOOD_QUERY = """
CALL db.index.fulltext.queryNodes('keyword', $query, {limit:2})
YIELD node,score
CALL {
  WITH node
  MATCH (node)-[r:!MENTIONS]->(neighbor)
  WITH node, r, neighbor,
        apoc.map.removeKeys(properties(node), ['ent_embedding', 'id']) AS nodeProps,
        apoc.map.removeKeys(properties(r), ['ent_embedding', 'id']) AS relProps,
        apoc.map.removeKeys(properties(neighbor), ['ent_embedding', 'id']) AS neighborProps
  RETURN node.id +  ' (' + apoc.convert.toJson(nodeProps) + ')'  +' -> ' + type(r) + ' (' + apoc.convert.toJson(relProps) + ') -> ' + neighbor.id  +  ' (' + apoc.convert.toJson(neighborProps) + ')' AS output
  UNION ALL
  WITH node
  MATCH (node)<-[r:!MENTIONS]-(neighbor)
  WITH node, r, neighbor,
        apoc.map.removeKeys(properties(node), ['ent_embedding', 'id']) AS nodeProps,
        apoc.map.removeKeys(properties(r), ['ent_embedding', 'id']) AS relProps,
        apoc.map.removeKeys(properties(neighbor), ['ent_embedding', 'id']) AS neighborProps
  RETURN neighbor.id + ' (' + apoc.convert.toJson(neighborProps) + ')' + ' <- ' + type(r) + ' (' + apoc.convert.toJson(relProps) + ') <- ' +  node.id + ' (' + apoc.convert.toJson(nodeProps) + ')' AS output
}
RETURN output LIMIT 50
"""


def structured_retriever(question: str) -> str:
    """
    Collect the graph neighborhood of the entities mentioned in the question.

    Steps:
      1. Extract entity names from the question with ``entity_chain``.
      2. For each entity, run a similarity search on ``vector_index_ent`` and
         read the matched node id from the ``id: ...`` line of each hit's
         ``page_content``.
      3. For each distinct node id, run the full-text neighborhood query and
         collect the formatted relationship lines.

    Args:
        question: The user's question in natural language.

    Returns:
        All relationship lines joined with newlines (one relationship per
        line), or an empty string when nothing was found. The extracted
        entities and the final result are also printed for debugging.
    """
    entities = entity_chain.invoke({"question": question})
    print("entities")
    print(entities)

    # Resolve entities to graph node ids. Several entities can map to the same
    # node, so ids are de-duplicated (first-seen order kept) to avoid running
    # the same graph query, and repeating the same context, more than once.
    node_ids = []
    seen_ids = set()
    for entity in entities.names:
        for doc in vector_index_ent.similarity_search(entity):
            id_match = re.search("\nid: (.*)", doc.page_content)
            if id_match is None:
                # Hit without an "id:" line: skip it instead of raising IndexError.
                continue
            node_id = id_match.group(1)
            if node_id not in seen_ids:
                seen_ids.add(node_id)
                node_ids.append(node_id)

    output_lines = []
    for node_id in node_ids:
        full_text_query = generate_full_text_query(node_id)
        if not full_text_query:
            # Nothing searchable is left after sanitising the id.
            continue
        response = graph.query(_NEIGHBORHOOD_QUERY, {"query": full_text_query})
        # A null id on either side makes the concatenated Cypher string null;
        # drop such rows so the join below cannot fail.
        output_lines.extend(row["output"] for row in response if row["output"] is not None)

    # Join once at the end: appending per-response joins back to back used to
    # glue the last line of one response to the first line of the next.
    result = "\n".join(output_lines)
    print("result")
    print(result)
    return result

In [28]:
# English answer prompt: the model must answer from the supplied {context}
# only, concisely and in natural language.
# NOTE: `template` and `prompt` are both reassigned by the Russian-language
# cell further down before `prompt` is used in the chain, so the values
# created here are never consumed.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "Use natural language and be concise.\n"
    "Answer:"
)

prompt = ChatPromptTemplate.from_template(template)

In [29]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import ConfigurableField, RunnableParallel, RunnablePassthrough, RunnableLambda

In [30]:
# Runnable that pulls the "question" value out of the chain's input dict, so
# downstream steps receive the question itself rather than the whole payload.
_search_query = RunnableLambda(lambda x : x["question"])

In [31]:
# Russian answer prompt: answer using only the supplied {context}, concisely
# and in natural language.
template = """Отвечайте на вопрос, основываясь только на следующем контексте:
{context}

Вопрос: {question}
Используйте естественный язык и будьте краткими.
Отвечать:"""
prompt = ChatPromptTemplate.from_template(template)

# Question-answering pipeline. Input: {"question": "<text>"}; output: the
# model's answer as a plain string.
chain = (
    RunnableParallel(
        {
            # Graph context retrieved for the question text.
            "context": _search_query | structured_retriever,
            # Bug fix: RunnablePassthrough() forwarded the whole input dict,
            # so the prompt rendered "{'question': '...'}" in place of the
            # question. Extract the text with the same runnable that feeds
            # the retriever.
            "question": _search_query,
        }
    )
    | prompt
    | model
    | StrOutputParser()
)

In [41]:
# Run the full pipeline on a sample question (Russian for "What was the
# revenue over the 12-month period?"). The retriever prints the extracted
# entities and the retrieved graph context as debug output.
answer= chain.invoke({"question": "Сколько составила выручка в период 12 месяцев?"})

entities
names=['выручка', 'период']
result
Выручка ({}) -> ИМЕЕТЗНАЧЕНИЕ ({"единица":"млн. руб."}) -> 164 778 ({})
Выручка ({}) -> ИМЕЕТЗНАЧЕНИЕ ({"единица":"млн. руб."}) -> 249 586 ({})
Выручка ({}) -> ИМЕЕТЗНАЧЕНИЕ ({"единица":"млн. руб."}) -> 521 699 ({})
Выручка ({}) -> ИМЕЕТЗНАЧЕНИЕ ({"единица":"млн. руб."}) -> 800 125 ({})
Выручка ({}) -> ИМЕЕТЗНАЧЕНИЕ ({}) -> 69 859 ({})
Выручка ({}) -> ИМЕЕТЗНАЧЕНИЕ ({}) -> 101 111 ({})
Выручка ({}) -> ИМЕЕТЗНАЧЕНИЕ ({}) -> 226 022 ({})
Выручка ({}) -> ИМЕЕТЗНАЧЕНИЕ ({}) -> 337 514 ({})
Выручка ({}) -> ИМЕЕТЗНАЧЕНИЕ ({"единица":"млн. руб."}) -> 11 984 ({})
Выручка ({}) -> ИМЕЕТЗНАЧЕНИЕ ({"единица":"млн. руб."}) -> 20 638 ({})
Выручка ({}) -> ИМЕЕТЗНАЧЕНИЕ ({"единица":"млн. руб."}) -> 31 782 ({})
Выручка ({}) -> ИМЕЕТЗНАЧЕНИЕ ({"единица":"млн. руб."}) -> 66 899 ({})
Выручка ({}) -> ИМЕЕТЗНАЧЕНИЕ ({"единица":"млн рублей"}) -> 3938 ({})
Выручка ({}) -> ИМЕЕТЗНАЧЕНИЕ ({"единица":"млн рублей"}) -> 7038 ({})
Выручка ({}) -> ИМЕЕТЗНАЧЕНИЕ ({"единица"

In [42]:
# Show the final answer string returned by the chain.
print(answer)

Выручка за 12 месяцев составила 800 125 млн рублей.
