# Load epub book

In [25]:
# Import libraries
import os
from langchain_community.document_loaders import UnstructuredEPubLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

import chromadb
from uuid import uuid4
from chromadb.utils import embedding_functions

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [64]:
# TODO: Load document
chunk_size = 1024
chunk_overlap = 30

# Create a text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

In [65]:
# TODO Split document
# Load the document
epub_loader = UnstructuredEPubLoader(file_path="/content/docs/charles-dickens_a-christmas-carol.epub")
chunks = epub_loader.load_and_split(text_splitter)

print(len(chunks))



201


In [52]:
# TODO Examine chunk
print(chunks[100].page_content)
print(chunks[100].metadata)



Scrooge closed the window, and examined the door by which the Ghost had entered. It was double locked, as he had locked it with his own hands, and the bolts were undisturbed. He tried to say “Humbug!” but stopped at the first syllable. And being, from the emotions he had undergone, or the fatigues of the day, or his glimpse of the invisible world, or the dull conversation of the Ghost, or the lateness of the hour, much in need of repose, went straight to bed without undressing, and fell asleep upon the
{'source': '/content/docs/charles-dickens_a-christmas-carol.epub'}


# Create embeddings

In [125]:
# TODO: Create embedding model
embed_model_name = "BAAI/bge-small-en-v1.5" # 384
#embed_model_name = "sentence-transformers/all-MiniLM-L6-v2" # 384
#embed_model_name = "sentence-transformers/all-mpnet-base-v2" # 768

embed_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=embed_model_name
)

In [126]:
# TODO: Explore embedding model
text = 'big black bug bleeds black blood'
emb_text = embed_func([ chunks[50].page_content ])

print(emb_text[0].shape)
#print(emb_text[0])



(384,)


In [127]:
# TODO: Prepare the chunks for inserting into Chroma
# Extract the texts
texts = [ d.page_content for d in chunks ]

# Create a PK for every texts
text_ids = [ str(uuid4())[:8] for _ in range(len(texts)) ]

print(len(texts), len(text_ids))

idx = 10
print(text_ids[idx])
print(texts[idx])


201 201
1fda8ea3
“What else can I be,” returned the uncle, “when I live in such a world of fools as this? Merry Christmas! Out upon merry Christmas! What’s Christmastime to you but a time for paying bills without money; a time for finding yourself a year older, and not an hour richer; a time for balancing your books, and having every item in ’em through a round dozen of months presented dead against you? If I could work my will,” said Scrooge indignantly, “every idiot who goes about with ‘Merry Christmas’ on his lips should be boiled with his own pudding, and buried with a stake of holly through his heart. He should!”

“Uncle!” pleaded the nephew.

“Nephew!” returned the uncle sternly, “keep Christmas in your own way, and let me keep it in mine.”

“Keep it!” repeated Scrooge’s nephew. “But you don’t keep it.”

“Let me leave it alone, then,” said Scrooge. “Much good may it do you! Much good it has ever done you!”


In [128]:
# TODO: Create ephemeral Chroma client and save chunks
col_name = 'epub'

# Create the chroma client
ch_client = chromadb.Client()

# Delete the collection if it exists
try:
    ch_client.delete_collection(name=col_name)
except:
    pass


In [129]:
# Create the collection epub, use "BAAI/bge-small-en-v1.5" embed function
epub_col = ch_client.create_collection(
    name=col_name,
    embedding_function=embed_func
)

In [130]:
# TODO: Print number of documents in collection
epub_col.add(
    documents = texts,
    ids = text_ids
)


In [102]:
print(epub_col.count())

201


In [131]:
# TODO: Query collection
query = "Who is Scrooge?"

results = epub_col.query(
    query_texts=query,
    n_results=5
)

for k, v in results.items():
    print('>> ', k)
    print(v)

>>  ids
[['25494123', 'd04cc012', '59f99c4d', 'f11db712', '2c8df139']]
>>  embeddings
None
>>  documents
[['“I don’t know what day of the month it is,” said Scrooge. “I don’t know how long I have been among the Spirits. I don’t know anything. I’m quite a baby. Never mind. I don’t care. I’d rather be a baby. Hallo! Whoop! Hallo here!”\n\nHe was checked in his transports by the churches ringing out the lustiest peals he had ever heard. Clash, clash, hammer; ding, dong, bell! Bell, dong, ding; hammer, clash, clash! Oh, glorious, glorious!\n\nRunning to the window, he opened it, and put out his head. No fog, no mist; clear, bright, jovial, stirring, cold; cold, piping for the blood to dance to; golden sunlight; heavenly sky; sweet fresh air; merry bells. Oh, glorious! Glorious!\n\n“What’s today?” cried Scrooge, calling downward to a boy in Sunday clothes, who perhaps had loitered in to look about him.\n\n“Eh?” returned the boy with all his might of wonder.\n\n“What’s today, my fine fellow?

In [None]:
for id in results['ids'][0]:
  print(id)
  result = epub_col.get(id)
  print('>>>> ', result['documents'][0])

2f1e226b
>>>>  “I don’t know what day of the month it is,” said Scrooge. “I don’t know how long I have been among the Spirits. I don’t know anything. I’m quite a baby. Never mind. I don’t care. I’d rather be a baby. Hallo! Whoop! Hallo here!”

He was checked in his transports by the churches ringing out the lustiest peals he had ever heard. Clash, clash, hammer; ding, dong, bell! Bell, dong, ding; hammer, clash, clash! Oh, glorious, glorious!

Running to the window, he opened it, and put out his head. No fog, no mist; clear, bright, jovial, stirring, cold; cold, piping for the blood to dance to; golden sunlight; heavenly sky; sweet fresh air; merry bells. Oh, glorious! Glorious!

“What’s today?” cried Scrooge, calling downward to a boy in Sunday clothes, who perhaps had loitered in to look about him.

“Eh?” returned the boy with all his might of wonder.

“What’s today, my fine fellow?” said Scrooge.

“Today!” replied the boy. “Why, Christmas Day.”
1c2cc646
>>>>  “I don’t know what to d

# Question and Answer LLM
In this exercise you will implement a question and answer LLM for the 'A Christmas Carol' book that you have chunked and saved.

The workflow is as follows:
1. Assume you ask the following question regarding the book eg. `"Who is Scrooge?"`?
2. Query the relevant context from Chroma with the question or facts from the question.
3. Combine the question and the top 5 context return by Chroma into a prompt
4. Use `google/flan-t5-base` to answer the question.

Look through the FLAN templates in [Github](https://github.com/google-research/FLAN/blob/main/flan/templates.py) and select an appropriate template for this workshop.

Do not worry about the accuracy of the result. Focus on implementing the solution. We will discuss the nuances of the solution at the end of the workshop.

Use your RAG workflow to answer the provided questions in `questions_for_rag.txt` file.

In [132]:
# TODO Your code
model_name = "google/flan-t5-base"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [149]:
# TODO Your code

question = "What is the name of Scrooge's underpaid clerk?"
question = "What is the name of Bob Cratchit's youngest son who is ill?"
question = "Who was Scrooge engaged to in his youth, and why did she leave him?"
#question = "What does Scrooge see written on the gravestone that frightens him into changing his ways?"
question = "What is Scrooge's response when his nephew Fred invites him to Christmas dinner at the beginning of the story?"
#question = "What does Scrooge do on Christmas morning after his transformation?"
question = "What specific, generous act does Scrooge perform for the Cratchit family on Christmas morning?"

prompt = f'{question}\n\nWhat is the sentence that verbalizes this data?'

enc_prompt = tokenizer(prompt, return_tensors='pt').input_ids
enc_answer = model.generate(enc_prompt)
answer = tokenizer.decode(enc_answer[0], skip_special_tokens=True)
print(answer)

Scrooge performs a Christmas jolly act for the Cratchit family on


In [150]:
# TODO Your code
n = 5
results = epub_col.query(
    query_texts = [ answer ],
    n_results = n
)

contexts = ""

for i in results['ids'][0]:
  contexts += epub_col.get(i)['documents'][0] + "\n\n"

print('distances: ', results['distances'][0])


distances:  [0.3020668625831604, 0.314846396446228, 0.3149707317352295, 0.31559789180755615, 0.3187754154205322]


In [151]:
question_prompt = f"{contexts}\nAnswer this question: {question}"

enc_question_prompt = tokenizer(question_prompt, return_tensors='pt').input_ids
enc_answer_enc = model.generate(enc_question_prompt)
answer = tokenizer.decode(enc_answer_enc[0], skip_special_tokens=True)

print()
print(question)
print(answer)



What specific, generous act does Scrooge perform for the Cratchit family on Christmas morning?
a merry Christmas


# Discussion

1. How did your solution perform?
2. Where do you think are the issues?
3. How can you improve it?