# Recuperación de información mediante Self Query Retriever

In [1]:
from langchain import hub
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.schema import Document
from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
from langchain.prompts import PromptTemplate

In [2]:
local_llm = "llama3:70b"
url_llm = "http://172.17.30.133:11434" # ordenados con 2 gpus
#url_llm = "http://172.17.30.172:11434" #COMPUTACION

from langchain_community.llms import Ollama
llm = Ollama(model=local_llm, base_url=url_llm, temperature=0) 

In [3]:
import csv
from typing import Dict, List, Optional
from langchain.document_loaders.base import BaseLoader
from langchain.docstore.document import Document

class CSVLoader(BaseLoader):
    def __init__(
        self,
        file_path: str,
        source_column: Optional[str] = None,
        metadata_columns: Optional[List[str]] = None,
        csv_args: Optional[Dict] = None,
        encoding: Optional[str] = None,
    ):
        self.file_path = file_path
        self.source_column = source_column
        self.encoding = encoding
        self.csv_args = csv_args or {}
        self.metadata_columns = metadata_columns or []

    def load(self) -> List[Document]:
        docs = []
        with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
            csv_reader = csv.DictReader(csvfile, **self.csv_args)
            for i, row in enumerate(csv_reader):
                metadata = {"row": i}
                for col in self.metadata_columns:
                    if col in row:
                        metadata[col] = row[col].strip()
                content = []
                for k, v in row.items():
                    if k != self.source_column and k not in self.metadata_columns:
                        content.append(f"{k.strip()}: {v.strip()}")
                doc_content = "\n".join(content)
                doc = Document(page_content=doc_content, metadata=metadata)
                docs.append(doc)

        return docs

In [4]:
metadata_columns = ["row", "grant agreement", "project acronym", "organisation PIC", "organisation name", "Small or Medium-sized Enterprise",
                   "organisation activity type", "organisation country", "organisation role", "funding for the organisation", 
                    "organisation total participation cost", "project title", "project total cost", "funding for the project",
                   "master call", "subcall", "type of proposal", "project legal basis", "project topic"]

# Instancia el CSVLoader con el archivo CSV y las columnas de metadatos
loader = CSVLoader(
    file_path="cordis_data_processed_29052024.csv",
    source_column= None,  # Opcional: columna para establecer como origen
    metadata_columns=metadata_columns,
    encoding="latin1"
)

# Carga los documentos del CSV
raw_documents = loader.load()

In [5]:
raw_documents[0]

Document(page_content='project objective: LINkS will change the paradigm of the self-organization of the intracellular living matter by demonstrating the existence of Long-range ElectroDynamic Interactions (LEDIs) between proteins. LEDIs may act as a long distance protein-protein attractive mechanism expanding above several hundred of AngstrÃ¶ms that could explain the high spatial organization and coordination of biomolecular reactions; responsible for the transmission of information in cells. LEDIs result from condensation phenomenon, characterized by the emerging of the mode of lowest frequency; expected in the TeraHertz (THz) frequency band. However, to date, LEDIs have eluded detection, partly because previous theoretical predictions were incorrect, but also because performing THz spectroscopy in aqueous media is a well-known technological roadblock not yet overcome. LINkS will develop a breakthrough lab-on-chip THz biosensor technology to investigate LEDIs between proteins, from i

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=400)
documents = text_splitter.split_documents(raw_documents)

In [7]:
# SE USA UNA DE LAS GPU
from langchain.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-large-en"
model_kwargs = {'device': 'cuda:0'}
encode_kwargs = {"normalize_embeddings": True}

emb = HuggingFaceBgeEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

  from tqdm.autonotebook import tqdm, trange


In [8]:
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

metadata_field_info = [
    AttributeInfo(
        name="row",
        description="The row in which the data is located.",
        type="int",
    ),
    AttributeInfo(
        name="grant agreement",
        description="The grant agreement number of the project, which is a identificator of the project.",
        type="int",
    ),
    AttributeInfo(
        name="project acronym",
        description="The acronym of the proyect, which is its shorted name.",
        type="string",
    ),
    AttributeInfo(
        name="organisation PIC",
        description="The Participant Identification Code, which is a identificator of the organisation.",
        type="int",
    ),
    AttributeInfo(
        name="organisation name", 
        description="The name of the orgnanisation.", 
        type="string"
    ),
    AttributeInfo(
        name="Small or Medium-sized Enterprise", 
        description="Characteristic of the organisation related to the number of employees it has. It can be: True, False or Unknown.", 
        type="string"
    ),
    AttributeInfo(
        name="organisation activity type", 
        description="The kind of activity that the organisation carries out. It can be: PRC, REC, HES, OTH, PUB or Unknown.", 
        type="string"
    ),
    AttributeInfo(
        name="organisation country", 
        description="The country in which the organisation is established. It is codified in ISO 3166-1. It can also take the value Unknown.", 
        type="string"
    ),
    AttributeInfo(
        name="organisation role", 
        description="The role that the organisation has played in the project. It can be: participan', thirdParty, coordinator, partner or internationalPartner",
        type="string"
    ),
    AttributeInfo(
        name="funding for the organisation", 
        description="The amount of money that the organisation has received from the European Union in order to conduct the project.", 
        type="string"
    ),
    AttributeInfo(
        name="organisation total participation cost", 
        description="The amount of money that the organisation has invested in carring out the project.", 
        type="string"
    ),
    AttributeInfo(
        name="project title", 
        description="The name of the project.", 
        type="string"
    ),
    AttributeInfo(
        name="project total cost", 
        description="The total amount of money that all the participants in the project have spent to carry out the project.", 
        type="string"
    ),
    AttributeInfo(
        name="funding for the project", 
        description="The total amount of money that the European Union has provided to carry out the project.", 
        type="string"
    ),
    AttributeInfo(
        name="funding for the project", 
        description="The total amount of money that the European Union has provided to carry out the project.", 
        type="string"
    ),
    AttributeInfo(
        name="master call", 
        description="The master call within which the project has been framed.", 
        type="string"
    ),
    AttributeInfo(
        name="subcall", 
        description="The subcall within which the project has been framed.", 
        type="string"
    ),
    AttributeInfo(
        name="type of proposal", 
        description="The kind of proposal that constitutes the project.", 
        type="string"
    ),
    AttributeInfo(
        name="project legal basis", 
        description="The kind of proposal that constitutes the project.", 
        type="string"
    ),
    AttributeInfo(
        name="project topic", 
        description="The topic within which the project has been framed.", 
        type="string"
    )
]
document_content_description = "Brief summary of aspects corresponding to the participation of an organisation in a determined research project funded by the European Union within the Horison 2020 programme."

In [9]:
from langchain.vectorstores import Chroma
vectorstore_load = Chroma(persist_directory="./chroma_db_5", embedding_function= emb)

In [10]:
#!pip install lark

In [11]:
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore_load,
    document_content_description,
    metadata_field_info,
    verbose = True
)

In [12]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate

template = """You are an assistant for question-answering tasks and an expert in research projects funded by the European Union under the Horizon 2020 programme.
Use the following context from Horizon 2020 projects to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

Question: {question} 

Context: {context} 

Answer:

"""
prompt = ChatPromptTemplate.from_template(template)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [13]:
query_1 = "What is the objective of the project with grant agreement 740934?"
query_2 = "What is the total cost of the project with the acronym HYPERGRYD?"
query_3 = "How much funding was allocated for the project titled Transforming Research through Innovative Practices for Linked interdisciplinary Exploration?"
query_4 = "Which organisation played the role of coordinator in the grant agreement 777998?"
query_5 = "What topic does the project with the acronym INTERRFACE belong to?"
query_6 = "What legal basis was the project titled European Joint Programme on Radioactive Waste Management framed within?"
query_7 = "What type of proposal was the grant agreement 814416?"
query_8 = "To which master call was the project with the acronym G9NIGHT submitted?"
query_9 = "To which sub call was the project titled Electron Nanocrystallography submitted?"
query_10 = "Provide the grant agreement of 1 project which objective is related to artificial intelligence."
query_11 = "Provide the acronym of 1 project which objective is related to robotics."
query_12 = "Provide the title of 1 project which objective is related to geolocation."
query_13 = "Provide the objective of 1 project related to digital twin."
query_14 = "Provide the objective of 3 different projects related to corrosion."
query_15 = "Provide the title of 3 different projects which objective is related to offshore structures."
query_16 = "Provide the acronym of 3 different projects which objective is related to materials engineering."
query_17 = "Provide the grant agreement of 3 different projects which objective is related to nanocomposites."
query_18 = "Provide the name of an organisation that has participated in projects which objective is related to artificial intelligence."
query_19 = "Provide the name of an organisation which activity type is PRC and that has participated in projects which objective is related to robotics."
query_20 = "Provide the PIC of an organisation that is a small or medium enterprise and has participated in projects which objective is related to geolocation."
query_21 = "Provide the name of an organisation that has played the role of coordinator in projects which objective is related to digital twin."
query_22 = "Provide the PIC of a Spanish organisation that has participated in projects which objective is related to corrosion."
query_23 = "Provide the name of an european organisation that has participated in projects which objective is related to offshore structures."
query_24 = "Provide the PIC of an european small or medium enterprise that has participated in projects which objective is related to materials engineering."
query_25 = "Provide the name of an european small or medium enterprise that has played the role of coordinator in projects which objective is related to nanocomposites."

In [15]:
import time

inicio = time.time()

#retrieved_docs_1 = retriever.get_relevant_documents(query_1)
answer_1 = rag_chain.invoke(query_1)

fin = time.time()  # Marca de tiempo al final

tiempo_ejecucion = fin - inicio  # Calcular el tiempo de ejecución

print(f"El tiempo de ejecución fue: {tiempo_ejecucion} segundos")

El tiempo de ejecución fue: 15.948180437088013 segundos


In [16]:
answer_1

"I'm not familiar with a specific project with grant agreement 740934. Can you please provide more context or information about this project? I'll do my best to help you find the answer."

In [14]:
import langchain

langchain.debug = False

retriever.invoke(query_1)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


[]

In [15]:
retriever.invoke(query_2)

[]

In [16]:
retriever.invoke(query_3)

[]

In [17]:
retriever.invoke(query_4)

[]

In [18]:
retriever.invoke(query_5)

[]

In [14]:
retriever.invoke(query_6)

OutputParserException: Parsing text
```json
{
    "query": "European Joint Programme on Radioactive Waste Management",
    "filter": "eq(\"project title\", \"European Joint Programme on Radioactive Waste Management\") and eq(\"project legal basis\", val)"
}
```

Please note that the `val` in the filter should be replaced with the actual value of the project's legal basis, which is not provided in the user query.
 raised following error:
Unexpected token Token('CNAME', 'and') at line 1, column 81.
Expected one of: 
	* $END
Previous tokens: [Token('RPAR', ')')]


In [15]:
retriever.invoke(query_7)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


[]

In [16]:
retriever.invoke(query_8)

KeyboardInterrupt: 

In [17]:
retriever.invoke(query_9)

[]

In [18]:
retriever.invoke(query_10)

[]

In [19]:
retriever.invoke(query_11)

[]

In [20]:
retriever.invoke(query_12)

KeyboardInterrupt: 

In [21]:
retriever.invoke(query_13)

[]

In [22]:
retriever.invoke(query_14)

KeyboardInterrupt: 

In [25]:
import langchain

langchain.debug = True

retriever.invoke(query_20)

[32;1m[1;3m[chain/start][0m [1m[retriever:Retriever > chain:query_constructor] Entering Chain run with input:
[0m{
  "query": "Provide the PIC of an organisation that is a small or medium enterprise and has participated in projects which objective is related to geolocation."
}
[32;1m[1;3m[chain/start][0m [1m[retriever:Retriever > chain:query_constructor > prompt:FewShotPromptTemplate] Entering Prompt run with input:
[0m{
  "query": "Provide the PIC of an organisation that is a small or medium enterprise and has participated in projects which objective is related to geolocation."
}
[36;1m[1;3m[chain/end][0m [1m[retriever:Retriever > chain:query_constructor > prompt:FewShotPromptTemplate] [0ms] Exiting Prompt run with output:
[0m[outputs]
[32;1m[1;3m[llm/start][0m [1m[retriever:Retriever > chain:query_constructor > llm:Ollama] Entering LLM run with input:
[0m{
  "prompts": [
    "Your goal is to structure the user's query to match the request schema provided below.\n\

[]