In [16]:
!pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph
# !pip install -qU "langchain[openai]"
!pip install -qU langchain-openai
!pip install -qU langchain-core
!pip install pymupdf
!pip install "unstructured[markdown]" --upgrade

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [10]:
!pip uninstall pymupdf -y
!pip install pymupdf

Defaulting to user installation because normal site-packages is not writeable
Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: pymupdf
Successfully installed pymupdf-1.25.3


## Setup

In [2]:
import getpass
import os

# LANGSMITH_TRACING=true
# LANGSMITH_ENDPOINT="https://api.smith.langchain.com"
# LANGSMITH_API_KEY="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"
# LANGSMITH_PROJECT="pr-essential-quantity-30"
# OPENAI_API_KEY="<your-openai-api-key>"
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = getpass.getpass()

 ········


In [3]:
if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain.chat_models import init_chat_model

llm = init_chat_model("gpt-4o-mini", model_provider="openai")

Enter API key for OpenAI:  ········


In [4]:
if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [5]:
from langchain_core.vectorstores import InMemoryVectorStore

vector_store = InMemoryVectorStore(embeddings)

## RAG

In [43]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader, PyMuPDFLoader, TextLoader, UnstructuredMarkdownLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from langchain_core.prompts import PromptTemplate

template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always write answer in json format: question and answer 

{context}

Question: {question}

Helpful Answer:"""
prompt = PromptTemplate.from_template(template)

# Load and chunk contents of the blog
# loader = WebBaseLoader(
#     web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
#     bs_kwargs=dict(
#         parse_only=bs4.SoupStrainer(
#             class_=("post-content", "post-title", "post-header")
#         )
#     ),
# )
# docs = loader.load()

# pdf_loader = PyMuPDFLoader("./data/ULUGBEK_SHERNAZAROV_CV.pdf")
# docs = pdf_loader.load()

folder_path = "./data/"
all_documents = []

for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)

    if filename.endswith(".pdf"):
        loader = PyMuPDFLoader(file_path)  # Load PDFs
    elif filename.endswith(".txt"):
        loader = TextLoader(file_path)  # Load text files
    elif filename.endswith(".md"):
        loader = UnstructuredMarkdownLoader(file_path)  # Load Markdown files
    else:
        print(f"Skipping unsupported file: {filename}")
        continue

    docs = loader.load()
    all_documents.extend(docs)


text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(all_documents)

# Index chunks
_ = vector_store.add_documents(documents=all_splits)

# Define prompt for question-answering
# prompt = hub.pull("rlm/rag-prompt")


# Define state for application
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str


# Define application steps
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"])
    return {"context": retrieved_docs}


def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content}


# Compile application and test
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

Skipping unsupported file: .ipynb_checkpoints


In [44]:
questions = [
    "How old are you?",
    "What is your highest level of education?",
    "What major or field of study did you pursue during your education?",
    "How many years of work experience do you have?",
    "What type of work or industry have you been involved in?",
    "Can you describe your current role or job responsibilities?",
    "What are your core beliefs regarding the role of technology in shaping society?",
    "How do you think cultural values should influence technological advancements?",
    "As a master’s student, what is the most challenging aspect of your studies so far?",
    "What specific research interests or academic goals do you hope to achieve during your time as a master's student?"
]

# Iterate over the questions and invoke the model
for idx, question in enumerate(questions, start=1):
    response = graph.invoke({"question": question})
    print(f"{idx}) {question}")
    print(f"Answer: {response['answer']}\n")

1) How old are you?
Answer: ```json
{
  "question": "How old are you?",
  "answer": "I am 23 years old."
}
```

2) What is your highest level of education?
Answer: ```json
{
  "question": "What is your highest level of education?",
  "answer": "I am currently pursuing a master's degree in Data Science and Artificial Intelligence at the Asian Institute of Technology."
}
```

3) What major or field of study did you pursue during your education?
Answer: ```json
{
  "question": "What major or field of study did you pursue during your education?",
  "answer": "I pursued a bachelor's degree in Computer Science and am currently pursuing a master's degree in Data Science and Artificial Intelligence."
}
```

4) How many years of work experience do you have?
Answer: ```json
{
  "question": "How many years of work experience do you have?",
  "answer": "I have approximately 1.5 years of work experience."
}
```

5) What type of work or industry have you been involved in?
Answer: ```json
{
  "questi

## Chains

In [38]:
from langchain_core.messages import SystemMessage
from langgraph.prebuilt import ToolNode
from langchain_core.tools import tool
from langgraph.graph import MessagesState, StateGraph

graph_builder = StateGraph(MessagesState)


@tool(response_format="content_and_artifact")
def retrieve(query: str):
    """Retrieve information related to a query."""
    retrieved_docs = vector_store.similarity_search(query, k=2)
    serialized = "\n\n".join(
        (f"Source: {doc.metadata}\n" f"Content: {doc.page_content}")
        for doc in retrieved_docs
    )
    return serialized, retrieved_docs

# Step 1: Generate an AIMessage that may include a tool-call to be sent.
def query_or_respond(state: MessagesState):
    """Generate tool call for retrieval or respond."""
    llm_with_tools = llm.bind_tools([retrieve])
    response = llm_with_tools.invoke(state["messages"])
    # MessagesState appends messages to state instead of overwriting
    return {"messages": [response]}


# Step 2: Execute the retrieval.
tools = ToolNode([retrieve])


# Step 3: Generate a response using the retrieved content.
def generate(state: MessagesState):
    """Generate answer."""
    # Get generated ToolMessages
    recent_tool_messages = []
    for message in reversed(state["messages"]):
        if message.type == "tool":
            recent_tool_messages.append(message)
        else:
            break
    tool_messages = recent_tool_messages[::-1]

    # Format into prompt
    docs_content = "\n\n".join(doc.content for doc in tool_messages)
    system_message_content = (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer "
        "the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep the "
        "answer concise."
        "\n\n"
        f"{docs_content}"
    )
    conversation_messages = [
        message
        for message in state["messages"]
        if message.type in ("human", "system")
        or (message.type == "ai" and not message.tool_calls)
    ]
    prompt = [SystemMessage(system_message_content)] + conversation_messages

    # Run
    response = llm.invoke(prompt)
    return {"messages": [response]}

from langgraph.graph import END
from langgraph.prebuilt import ToolNode, tools_condition

graph_builder.add_node(query_or_respond)
graph_builder.add_node(tools)
graph_builder.add_node(generate)

graph_builder.set_entry_point("query_or_respond")
graph_builder.add_conditional_edges(
    "query_or_respond",
    tools_condition,
    {END: END, "tools": "tools"},
)
graph_builder.add_edge("tools", "generate")
graph_builder.add_edge("generate", END)

graph = graph_builder.compile()


In [40]:
input_message = "Who is Ulugbek Shernazarov?"

for step in graph.stream(
    {"messages": [{"role": "user", "content": input_message}]},
    stream_mode="values",
):
    step["messages"][-1].pretty_print()


Who is Ulugbek Shernazarov?
Tool Calls:
  retrieve (call_mUEU2eKqkhWIw7hkiLIyJ9nO)
 Call ID: call_mUEU2eKqkhWIw7hkiLIyJ9nO
  Args:
    query: Ulugbek Shernazarov
Name: retrieve

Source: {'source': 'data/myinfo.txt'}
Content: I found the following information about Ulugbek Shernazarov:

Ulugbek Shernazarov's LinkedIn Profile Ulugbek Shernazarov is a Machine Learning and Computer Vision Engineer with expertise in developing and deploying AI solutions1. He has a strong foundation in software engineering and a deep understanding of Python, PyTorch, TensorFlow, and Django1. Ulugbek is passionate about leveraging cutting-edge technologies to solve complex problems and create impactful innovations.

Source: {'source': 'data/myinfo.txt'}
Content: I found the following information about Ulugbek Shernazarov:

Ulugbek Shernazarov's LinkedIn Profile Ulugbek Shernazarov is a Machine Learning and Computer Vision Engineer with expertise in developing and deploying AI solutions1. He has a strong foun