In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import TokenTextSplitter
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from langchain.llms import HuggingFacePipeline
from langchain_core.messages import SystemMessage
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_core.runnables import RunnableLambda


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
loader = PyPDFLoader("Introduction_to_Tableau.pdf")
docs = loader.load()
splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=50)
docs_split = splitter.split_documents(docs)


In [None]:
class SentenceTransformerEmbeddings(Embeddings):
    def __init__(self, model_name: str):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        return self.model.encode(texts, convert_to_tensor=False).tolist()

    def embed_query(self, text):
        return self.model.encode(text, convert_to_tensor=False).tolist()

embedding = SentenceTransformerEmbeddings("all-MiniLM-L6-v2")

vectorstore = Chroma.from_documents(
    documents=docs_split,
    embedding=embedding,
    persist_directory="./chroma_db"
)

retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 2, "lambda": 0.7})


In [4]:
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

hf_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=100,
    truncation=True,
    do_sample=False
)

chat = HuggingFacePipeline(pipeline=hf_pipeline)


Device set to use cpu
  chat = HuggingFacePipeline(pipeline=hf_pipeline)


In [5]:
def truncate_prompt(prompt: str, max_tokens: int = 900):
    tokens = tokenizer.encode(prompt)
    if len(tokens) > max_tokens:
        tokens = tokens[:max_tokens]
        return tokenizer.decode(tokens, skip_special_tokens=True)
    return prompt


In [6]:
PROMPT_CREATING_QUESTION = '''Lecture: {question_lecture}
Title: {question_title}
Body: {question_body}'''

PROMPT_RETRIEVING_S = '''You will receive a question from a student taking a Tableau course, which includes a title and a body. 
The corresponding lecture will also be provided.

Answer the question using only the provided context.

At the end of your response, include the section and lecture names where the context was drawn from, formatted as follows: 
Resources: 
Section: *Section Title*, Lecture: *Lecture Title*'''

PROMPT_TEMPLATE_RETRIEVING_H = '''This is the question:
{question}

This is the context:
{context}'''

prompt_creating_question = PromptTemplate.from_template(PROMPT_CREATING_QUESTION)
prompt_retrieving_s = SystemMessage(content=PROMPT_RETRIEVING_S)
prompt_template_retrieving_h = HumanMessagePromptTemplate.from_template(PROMPT_TEMPLATE_RETRIEVING_H)
chat_prompt_template_retrieving = ChatPromptTemplate(messages=[prompt_retrieving_s, prompt_template_retrieving_h])


In [7]:
def format_context(dictionary):
    formatted_string = ""
    retrieved_list = dictionary['context']

    for i, doc in enumerate(retrieved_list):
        section = doc.metadata.get("section_title", "Unknown Section")
        lecture = doc.metadata.get("lecture_title", "Unknown Lecture")
        formatted_string += f'''
Document {i + 1}
Section Title: {section}
Lecture Title: {lecture}
Content: {doc.page_content.strip()}

-------------------
'''
    new_dictionary = dictionary.copy()
    new_dictionary['context'] = formatted_string
    return new_dictionary

format_context_runnable = RunnableLambda(format_context)


In [8]:
get_text = RunnableLambda(lambda x: x.to_string())
combine_with_context = RunnableLambda(lambda q: {
    "context": retriever.invoke(q),
    "question": q
})
apply_truncation = RunnableLambda(lambda d: {
    "question": d["question"],
    "context": truncate_prompt(d["context"])
})
final_prompt_truncation = RunnableLambda(lambda messages: truncate_prompt(str(messages)))

chain_retrieving_improved = (
    prompt_creating_question
    | get_text
    | combine_with_context
    | format_context_runnable
    | apply_truncation
    | chat_prompt_template_retrieving
    | final_prompt_truncation
    | chat
)


In [9]:
question_input = {
    "question_lecture": "Adding a custom calculation",
    "question_title": "Why are we using SUM here? It's unclear to me.",
    "question_body": "This question refers to calculating the GM%."
}

result = chain_retrieving_improved.invoke(question_input)
print(result)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


messages=[SystemMessage(content='You will receive a question from a student taking a Tableau course, which includes a title and a body. \nThe corresponding lecture will also be provided.\n\nAnswer the question using only the provided context.\n\nAt the end of your response, include the section and lecture names where the context was drawn from, formatted as follows: \nResources: \nSection: *Section Title*, Lecture: *Lecture Title*', additional_kwargs={}, response_metadata={}), HumanMessage(content="This is the question:\nLecture: Adding a custom calculation\nTitle: Why are we using SUM here? It's unclear to me.\nBody: This question refers to calculating the GM%.\n\nThis is the context:\n\nDocument 1\nSection Title: Unknown Section\nLecture Title: Unknown Lecture\nContent: ## Adding a custom calculation \nOk, excellent. \nWe're doing good. \nWe've seen quite a few interesting tableau \ntools so far and we'll continue to do so during \nthis lesson as well. \nOur table is almost ready. \n

In [10]:
# Clean and format the final output
def format_response_text(text: str) -> str:
    return text.replace("\\n", "\n").replace("  ", " ").strip()

formatted_result = format_response_text(str(result))
print(formatted_result)


messages=[SystemMessage(content='You will receive a question from a student taking a Tableau course, which includes a title and a body. 
The corresponding lecture will also be provided.

Answer the question using only the provided context.

At the end of your response, include the section and lecture names where the context was drawn from, formatted as follows: 
Resources: 
Section: *Section Title*, Lecture: *Lecture Title*', additional_kwargs={}, response_metadata={}), HumanMessage(content="This is the question:
Lecture: Adding a custom calculation
Title: Why are we using SUM here? It's unclear to me.
Body: This question refers to calculating the GM%.

This is the context:

Document 1
Section Title: Unknown Section
Lecture Title: Unknown Lecture
Content: ## Adding a custom calculation 
Ok, excellent. 
We're doing good. 
We've seen quite a few interesting tableau 
tools so far and we'll continue to do so during 
this lesson as well. 
Our table is almost ready. 
We have revenue cogs and