In [1]:
# pip install -r /work/requirements.txt

In [2]:
from faq_chatbot.utils import load_faqs
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [3]:
def load_pdfs(path: str):
    """Load every PDF found directly inside a directory.

    Args:
        path: Directory to scan. Only entries whose name ends with ".pdf"
            are loaded; sub-directories are not traversed.

    Returns:
        A flat list of the documents returned by ``PyPDFLoader.load()`` for
        each PDF, concatenated in file-name order. Empty when the directory
        contains no PDF.
    """
    pdfs = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document order (and everything derived from it) stable between runs.
    for filename in sorted(os.listdir(path)):
        if filename.endswith(".pdf"):
            loader = PyPDFLoader(os.path.join(path, filename))
            pdfs.extend(loader.load())
    return pdfs

# Directory scanned for PDF files
path = "/work/docs"
docs = load_pdfs(path)

In [4]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=200)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_docs = text_splitter.split_documents(docs)

# Print the first chunk as a quick sanity check
print(split_docs[0])

page_content='REGUL A TION (EU) 2024/1689 OF THE EUR OPEAN P ARLIAMENT AND OF THE CO UNCIL
of 13 June 2024
laying do wn har monised r ules on ar tif icial intelligence and amending Regulations (EC) No 300/2008, 
(EU) No 167/2013, (EU) No 168/2013, (EU) 2018/858, (EU) 2018/1139 and (EU) 2019/2144 and 
Directiv es 2014/90/EU, (EU) 2016/797 and (EU) 2020/1828 (Ar tif icial Intelligence A ct)
(T ext with EEA relevance)
THE EUR OPEAN P ARLIAMENT AND THE COUNCIL OF THE EUR OPEAN UNION,
Having regard to the T reaty on the Functioning of the European Union, and in par ticular Ar ticles 16 and 114 thereof,
Having regard to the proposal from the European Commission,
Af ter transmission of the draf t legislative act to the national parliaments,
Having regard to the opinion of the European Economic and Social Committe e (
1
),
Having regard to the opinion of the European Central Bank (
2
),
Having regard to the opinion of the Committee of the Regions (
3
),' metadata={'producer': 'PDFlib+PDI 9.0.7

In [5]:
# Reduce each chunk's metadata to two fields: the source path and a
# "category" taken from the name of the directory that contains the file.
for doc in split_docs:
    source = doc.metadata["source"]
    # os.path handles the platform's path separator and yields "" instead of
    # raising IndexError when the source has no parent directory.
    category = os.path.basename(os.path.dirname(source))
    doc.metadata = {
        "source": source,
        "category": category
    }

In [6]:
# Display the first chunk to check its rewritten metadata
split_docs[0]

Document(metadata={'source': '/work/docs/AI ACT.pdf', 'category': 'docs'}, page_content='REGUL A TION (EU) 2024/1689 OF THE EUR OPEAN P ARLIAMENT AND OF THE CO UNCIL\nof 13 June 2024\nlaying do wn har monised r ules on ar tif icial intelligence and amending Regulations (EC) No 300/2008, \n(EU) No 167/2013, (EU) No 168/2013, (EU) 2018/858, (EU) 2018/1139 and (EU) 2019/2144 and \nDirectiv es 2014/90/EU, (EU) 2016/797 and (EU) 2020/1828 (Ar tif icial Intelligence A ct)\n(T ext with EEA relevance)\nTHE EUR OPEAN P ARLIAMENT AND THE COUNCIL OF THE EUR OPEAN UNION,\nHaving regard to the T reaty on the Functioning of the European Union, and in par ticular Ar ticles 16 and 114 thereof,\nHaving regard to the proposal from the European Commission,\nAf ter transmission of the draf t legislative act to the national parliaments,\nHaving regard to the opinion of the European Economic and Social Committe e (\n1\n),\nHaving regard to the opinion of the European Central Bank (\n2\n),\nHaving regard to 

In [7]:
EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

def store_docs_with_embeddings(docs, collection_name=None):
    """Embed documents and index them in a Chroma vector store.

    Args:
        docs: Documents to embed and store.
        collection_name: Name of the Chroma collection to create. When
            omitted, a fresh ``docs_<32 hex chars>`` name is generated on
            every call (presumably so that repeated calls never write into
            a collection created by an earlier call -- confirm).

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    import uuid  # local import keeps the name generation self-contained

    if collection_name is None:
        # uuid4 gives a practically collision-free identifier made only of
        # hex digits, unlike the textual form of a random float.
        collection_name = f"docs_{uuid.uuid4().hex}"

    # Embeddings are computed on CPU with the model named by EMB_MODEL
    model = HuggingFaceEmbeddings(
        model_name=EMB_MODEL,
        model_kwargs={"device": "cpu"}
    )
    return Chroma.from_documents(docs, embedding=model, collection_name=collection_name)

In [8]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import chromadb
import random

# Embed the chunks and index them in the vector store
vectorstore = store_docs_with_embeddings(split_docs)

In [9]:
# Retriever over the vector store, configured with search_kwargs k=5
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})


In [10]:
query = "Can we use personal data with AI?"
# Stored under its own name so the PDF pages held in `docs` (the input of the
# splitting cell) are not overwritten when this cell runs.
retrieved_docs = retriever.invoke(query)
retrieved_docs

[Document(metadata={'source': '/work/docs/GDPR.pdf', 'category': 'docs'}, page_content="personal data of the data subject's request. \n(67)  Methods by whic h to restr ict the processing of personal data could include, inter alia, te mporar ily mo ving the \nselect ed data to another processing system, making the selecte d personal data unavai lable to users, or te mporar ily \nremo ving published data from a website . In autom ated fi ling syste ms, the restr iction of processing should in \npr inciple be ensured by tec hnical means in such a manner that the personal data are not subject to fur ther \nprocessing operations and cannot be chang ed. The fa ct that the processing of personal data is restr icted should \nbe clearly indicated in the system. \n(68)  T o fur ther strengthen the control o ver his or her o wn data, where the processing of personal data is car r ied out \nby automat ed means, the data subject should also be allo wed to receive personal data concer ning him or he

In [11]:

class test:
    """Minimal stand-in object that exposes a ``name`` attribute.

    Args:
        name: Initial value of the instance's ``name``; defaults to ''.
    """

    def __init__(self, name=''):
        # Set the attribute on the instance: assigning to `test.name` would
        # create a class attribute shared by every instance.
        self.name = name

# Build three stand-in objects and name them "a", "b" and "c"
example1, example2, example3 = test(), test(), test()
example1.name, example2.name, example3.name = "a", "b", "c"

tools = [example1, example2, example3]

# Index the objects by their name and show the resulting mapping
result = dict((tool.name, tool) for tool in tools)
print(result)

{'a': <__main__.test object at 0x70d50eb5cb50>, 'b': <__main__.test object at 0x70d50ee37050>, 'c': <__main__.test object at 0x70d50ee354d0>}


In [12]:
import torch
# Report whether torch can see a CUDA device
print("CUDA dispo ?", torch.cuda.is_available())

CUDA dispo ? False


In [13]:
from langchain.agents import Tool

def create_rag_tool(retriever):
    """Create a RAG tool for document retrieval.

    Args:
        retriever: Object whose ``invoke(query)`` returns the documents
            matching a query; each document is expected to expose
            ``page_content`` and a ``metadata`` dict.

    Returns:
        A ``Tool`` named "tool_rag" whose function returns the retrieved
        passages as one string, each passage preceded by its source.
    """
    def search(query):
        # Call the retriever directly so the tool does not depend on any
        # helper defined outside this function.
        hits = retriever.invoke(query)
        return "\n\n".join(
            f"[{hit.metadata.get('source', 'unknown')}]\n{hit.page_content}"
            for hit in hits
        )

    return Tool(
        name="tool_rag",
        description="""Tool to retrieve the k closest documents answering a question on AI regulation.""",
        func=search
    )
# Wrap the retriever as a tool
tool_rag = create_rag_tool(retriever)

In [14]:
import os
from dotenv import load_dotenv
from pathlib import Path
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

# Load the .env file located at the project root
ROOT = Path("/work")
load_dotenv(ROOT / ".env")

HF_MODEL = os.environ.get("HF_MODEL", "google/gemma-2-2b-it")
HF_TOKEN = os.environ.get("HF_TOKEN")

# An unset or empty token is treated as missing
if HF_TOKEN is None or HF_TOKEN == "":
    raise EnvironmentError("Set HF_TOKEN in environment (see .env.example)")


def create_huggingface_client():
    """Build the chat model used by the notebook.

    A ``HuggingFaceEndpoint`` is configured from the module-level
    ``HF_MODEL`` and ``HF_TOKEN`` and then wrapped in ``ChatHuggingFace``.

    Returns:
        The ``ChatHuggingFace`` instance wrapping the endpoint.
    """
    # Endpoint settings: sampling enabled, temperature 0.7, 512 new tokens max
    endpoint_options = {
        "repo_id": HF_MODEL,
        "huggingfacehub_api_token": HF_TOKEN,
        "temperature": 0.7,
        "max_new_tokens": 512,
        "do_sample": True,
    }
    endpoint = HuggingFaceEndpoint(**endpoint_options)

    # Chat wrapper around the raw endpoint
    return ChatHuggingFace(llm=endpoint, verbose=True)
    
# Chat model instance used by the cells below
chat_model = create_huggingface_client()

In [21]:
from langchain_core.messages import ToolMessage
from langchain_core.prompts import ChatPromptTemplate
from langgraph.graph import START, END, MessagesState, StateGraph
from langchain_huggingface import ChatHuggingFace


class BasicToolNode:
    """Graph node that runs the tool calls requested by the last message."""

    def __init__(self, tools: list) -> None:
        # Index the available tools by name for lookup at call time
        self.tools_by_name = dict((tool.name, tool) for tool in tools)

    def __call__(self, inputs: dict):
        """Execute every tool call carried by the last input message.

        Args:
            inputs: Mapping whose "messages" entry is the message list.

        Returns:
            ``{"messages": [...]}`` with one ``ToolMessage`` per tool call,
            in call order, each holding ``str()`` of the tool's result.

        Raises:
            ValueError: If "messages" is missing or empty.
        """
        history = inputs.get("messages", [])
        if not history:
            raise ValueError("No message found in input")
        last_message = history[-1]

        # Run each requested tool and wrap its result in a ToolMessage
        return {
            "messages": [
                ToolMessage(
                    content=str(
                        self.tools_by_name[request["name"]].invoke(request["args"])
                    ),
                    name=request["name"],
                    tool_call_id=request["id"],
                )
                for request in last_message.tool_calls
            ]
        }


def create_prompt():
    """Build the agent prompt: fixed system instructions, then the history.

    Returns:
        A ``ChatPromptTemplate`` made of one system message followed by a
        placeholder filled from the ``history`` variable.
    """
    system_instructions = "".join([
        "You are a helpful assistant answering user questions using only the provided context. ",
        "Answer the question directly if no context is needed. ",
        "If necessary, call the RAG tool to perform an initial search for relevant information. ",
        "When you no longer call a tool, your last response will be considered the final answer.",
    ])
    return ChatPromptTemplate.from_messages([
        ("system", system_instructions),
        ("placeholder", "{history}"),
    ])


def call_model(state: MessagesState, chat_model, tools):
    """Run the chat model, with the tools bound, on the conversation so far.

    Args:
        state: Graph state; its "messages" entry is passed to the prompt as
            ``history``.
        chat_model: Chat model exposing ``bind_tools``.
        tools: Tools to bind to the model.

    Returns:
        ``{"messages": response}`` where ``response`` is the model output.
    """
    # Prompt piped into the tool-aware model
    pipeline = create_prompt() | chat_model.bind_tools(tools)
    reply = pipeline.invoke({"history": state["messages"]})
    return {"messages": reply}


def route_tools(state: MessagesState):
    """Pick the next node: "tools" when the last message requests tool calls.

    Args:
        state: Either a list of messages or a mapping with a "messages" list.

    Returns:
        "tools" if the last message has a non-empty ``tool_calls``,
        otherwise ``END``.

    Raises:
        ValueError: If ``state`` is a mapping without any message.
    """
    if isinstance(state, list):
        last_message = state[-1]
    else:
        history = state.get("messages", [])
        if not history:
            raise ValueError(f"No messages found in input state to tool_edge: {state}")
        last_message = history[-1]

    # A missing attribute and an empty list both mean "no tool requested"
    pending_calls = getattr(last_message, "tool_calls", None)
    wants_tool = bool(pending_calls) and len(pending_calls) > 0
    return "tools" if wants_tool else END


def define_graph(chat_model, tools):
    """Assemble and compile the agent workflow graph.

    The graph has two nodes, "model" and "tools". Execution starts at
    "model"; ``route_tools`` then sends it either to "tools" or to ``END``,
    and "tools" always hands control back to "model".

    Args:
        chat_model: Chat model forwarded to ``call_model``.
        tools: Tools bound to the model and executed by the "tools" node.

    Returns:
        The compiled graph.
    """
    def model_node(state):
        # Closure that supplies the chat model and tools to call_model
        return call_model(state, chat_model, tools)

    graph = StateGraph(MessagesState)
    graph.add_node("model", model_node)
    graph.add_node("tools", BasicToolNode(tools=tools))

    graph.add_conditional_edges("model", route_tools, {"tools": "tools", END: END})
    graph.add_edge(START, "model")
    graph.add_edge("tools", "model")

    return graph.compile()
    
# Compile the agent with the retrieval tool as its only tool
agent = define_graph(chat_model, [tool_rag])

In [22]:
question = "Is it legal to use real resumes for chatbot training?"
# Send the question as a single user message in role/content form
messages = [{"role": "user", "content": question}]
response = agent.invoke({"messages": messages})

In [23]:
# Content of the last message in the returned state
print(response["messages"][-1].content)



Response: The legality of using real resumes for chatbot training ultimately depends on the source of the resumes and the intended use of the chatbot's output. If the resumes were obtained legally and the intended use of the chatbot's output is for internal personnel recruitment and hiring purposes, it is generally acceptable. However, if the resumes were obtained without consent or were gathered from external sources for other purposes, it may violate privacy rights and potentially be considered a breach of GDPR (General Data Protection Regulation) in the European Union or other privacy laws, depending on the specifics of the use case. It is best practice to obtain consent from individuals for any sensitive information used in training AI or to modify such information for anonymization or aggregation to mitigate privacy concerns. When in doubt, seeking legal counsel is advised to ensure compliance with privacy laws.


In [24]:
# Same question, passed this time as a bare string instead of a message list
result = agent.invoke({"messages": question})


In [25]:
# Content of the last message of this second run
print(result["messages"][-1].content)



Yes, it is legal to use real resumes for chatbot training purposes as long as you have obtained explicit consent from the individuals whose resumes you are using. However, there are certain data protection laws that must be followed, and you should ensure that the information is used in compliance with privacy policies and agreements with the individuals and any regulatory requirements in your specific industry and country. You should also consider any confidentiality obligations and any other legal requirements, such as GDPR or HIPAA for healthcare organizations. It's crucial to protect the sensitive information and anonymize the resumes where necessary to ensure compliance and protect privacy. The resumes can be used for training natural language processing (NLP) models for chatbots or for generating human-like responses in recruiting-related tasks such as interview scheduling, job matching, or candidate screening. Real resumes are beneficial because they provide a realistic and di