# Load epub book

In [1]:
# Import libraries
import os
from langchain_community.document_loaders import UnstructuredEPubLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

import chromadb
from uuid import uuid4
from chromadb.utils import embedding_functions

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [25]:
# TODO: Load document
chunk_size = 1024
chunk_overlap = 128
chunk_overlap = 50

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)


In [26]:
# TODO Split document
epub_loader = UnstructuredEPubLoader(file_path='/content/docs/charles-dickens_a-christmas-carol.epub')
chunks = epub_loader.load_and_split(text_splitter)



In [None]:
# TODO Examine chunk
print(len(chunks))

idx = 50
print(chunks[idx].page_content)
print(chunks[idx].metadata)

207
“Ding, dong!”

“Half past,” said Scrooge.

“Ding, dong!”

“A quarter to it,” said Scrooge.

“Ding, dong!”

“The hour itself,” said Scrooge triumphantly, “and nothing else!”

He spoke before the hour bell sounded, which it now did with a deep, dull, hollow, melancholy One. Light flashed up in the room upon the instant, and the curtains of his bed were drawn.

The curtains of his bed were drawn aside, I tell you, by a hand. Not the curtains at his feet, nor the curtains at his back, but those to which his face was addressed. The curtains of his bed were drawn aside; and Scrooge, starting up into a half-recumbent attitude, found himself face to face with the unearthly visitor who drew them: as close to it as I am now to you, and I am standing in the spirit at your elbow.
{'source': '/content/docs/charles-dickens_a-christmas-carol.epub'}


# Create embeddings

In [27]:
# TODO: Create embedding model
embed_model_name = "BAAI/bge-small-en-v1.5"
#embed_model_name = "all-MiniLM-L6-v2"

# Create an embedding function
# sentence -> f(x) -> embedding / fixed size vector
embed_func = embedding_functions.SentenceTransformerEmbeddingFunction(embed_model_name)


In [None]:
# TODO: Explore embedding model
text = 'hello, world'
enc_text = embed_func([ text, chunks[idx].page_content ])

print(f'len text = {len(text)}')
print(f'len chunk = {len(chunks[idx].page_content)}')

print(len(enc_text[0]))
print(len(enc_text[1]))
#print(enc_text[0])


len text = 12
len chunk = 778
384
384


In [28]:
# TODO: Prepare the chunks for inserting into Chroma

# Extract all .page_content/chunks into an array
texts = [ d.page_content for d in chunks ]

# Generate UUID for every single text, take the first 8 characters
text_ids = [ str(uuid4())[:8] for _ in range(len(texts)) ]

print(text_ids)
print(len(text_ids))
print(len(texts))

['c0d3cc70', '50c13013', '61a24762', 'cc5f77c2', '2b79f8c4', '0768ddd5', 'f9b8b169', '52992307', '1264bc9c', 'bb990c7d', '79247d5c', 'd0ad0c28', 'e2bc9155', 'a165516e', 'e3f4b1a3', 'ab65f1b3', 'fbe56147', '43feac95', '52a9a12b', '12d28a6c', '95fc0fc2', '014c6887', '7e7f4384', 'a3545c80', '80b2398d', '58427fa2', '9d4ca031', '093117d4', 'e728dd37', 'db7bfb24', 'ec4284f8', '5845d168', '312c07c1', 'fc7338d9', 'dfa1dfcc', 'cd7db4f9', '0061a36f', 'ec15e543', 'f73f6ae3', '7db0ea73', 'c51d71ef', 'c5b3ec61', '5051915b', 'f1003862', '848fdb29', 'd072e01c', 'c95742c4', '1f89eefc', 'a171a942', 'bb961473', 'c2abc445', '08e0df37', 'dc02436a', '78dda0c7', '2317e706', '940f4951', 'a5c9f5c1', 'a6d58f14', '8854aef5', '465b9c4d', 'a3e94ca9', '3c5e0fe9', 'a26e586a', '2ffaacfa', '507ebf62', '8a01e0c2', '40f6ecd2', 'b2ec4299', 'd684187d', '396244bf', 'ca4db78c', 'bed6c365', '8280e584', '8c58dc14', '033eecce', '536998de', 'b6a79c92', '668a77b1', '4be2faeb', 'e2752a37', 'c8b89543', '500b884c', '1d529150', '56

In [29]:
# TODO: Create ephemeral Chroma client and save chunks
col_name = 'epub'

# Create Chroma client
ch_client = chromadb.Client()

In [30]:
# clean up
try:
  ch_client.delete_collection(col_name)
except:
  pass

# Create the collection
epub_col = ch_client.create_collection(
    name=col_name,
    embedding_function=embed_func
)


In [31]:
# TODO: Print number of documents in collection
# number of records in collection
print(epub_col.count())

# insert docs into collection
epub_col.add(
    documents=texts,
    ids = text_ids
)

print(epub_col.count())

0
203


In [9]:
# TODO: Query collection
query = "How many ghosts were there?"

results = epub_col.query(
    query_texts = [ query ],
    n_results = 5
)

for k, v in results.items():
  print(f'{k}: {v}')


ids: [['e744279f', '1043b2ef', '9a40864f', 'a91ba614', 'a4315838']]
embeddings: None
documents: [['Scrooge trembled more and more.\n\n“Or would you know,” pursued the Ghost, “the weight and length of the strong coil you bear yourself? It was full as heavy and as long as this seven Christmas Eves ago. You have laboured on it since. It is a ponderous chain!”\n\nScrooge glanced about him on the floor, in the expectation of finding himself surrounded by some fifty or sixty fathoms of iron cable; but he could see nothing.\n\n“Jacob!” he said imploringly. “Old Jacob Marley, tell me more! Speak comfort to me, Jacob!”\n\n“I have none to give,” the Ghost replied. “It comes from other regions, Ebenezer Scrooge, and is conveyed by other ministers, to other kinds of men. Nor can I tell you what I would. A very little more is all permitted to me. I cannot rest, I cannot stay, I cannot linger anywhere. My spirit never walked beyond our counting house\ufeff—mark me;\ufeff—in life my spirit never rove

In [None]:
print(results['documents'][0])
print(len(results['documents']))

['“How now!” said Scrooge, caustic and cold as ever. “What do you want with me?”\n\n“Much!”\ufeff—Marley’s voice; no doubt about it.\n\n“Who are you?”\n\n“Ask me who I was.”\n\n“Who were you, then?” said Scrooge, raising his voice. “You’re particular, for a shade.” He was going to say “to a shade,” but substituted this, as more appropriate.\n\n“In life I was your partner, Jacob Marley.”\n\n“Can you\ufeff—can you sit down?” asked Scrooge, looking doubtfully at him.\n\n“I can.”\n\n“Do it, then.”\n\nScrooge asked the question, because he didn’t know whether a ghost so transparent might find himself in a condition to take a chair; and felt that in the event of its being impossible, it might involve the necessity of an embarrassing explanation. But the Ghost sat down on the opposite side of the fireplace, as if he were quite used to it.\n\n“You don’t believe in me,” observed the Ghost.\n\n“I don’t,” said Scrooge.\n\n“What evidence would you have of my reality beyond that of your own senses?

In [None]:
# loop thru the ids, and retrieve the document
for id in results['ids'][0]:
  # get document by id
  doc = epub_col.get(id)
  print(f'{id}: {doc}')

39f8ce93: {'ids': ['39f8ce93'], 'embeddings': None, 'documents': ['“How now!” said Scrooge, caustic and cold as ever. “What do you want with me?”\n\n“Much!”\ufeff—Marley’s voice; no doubt about it.\n\n“Who are you?”\n\n“Ask me who I was.”\n\n“Who were you, then?” said Scrooge, raising his voice. “You’re particular, for a shade.” He was going to say “to a shade,” but substituted this, as more appropriate.\n\n“In life I was your partner, Jacob Marley.”\n\n“Can you\ufeff—can you sit down?” asked Scrooge, looking doubtfully at him.\n\n“I can.”\n\n“Do it, then.”\n\nScrooge asked the question, because he didn’t know whether a ghost so transparent might find himself in a condition to take a chair; and felt that in the event of its being impossible, it might involve the necessity of an embarrassing explanation. But the Ghost sat down on the opposite side of the fireplace, as if he were quite used to it.\n\n“You don’t believe in me,” observed the Ghost.\n\n“I don’t,” said Scrooge.\n\n“What evid

# Question and Answer LLM
In this exercise you will implement a question and answer LLM for the 'A Christmas Carol' book that you have chunked and saved.

The workflow is as follows:
1. Assume you ask the following question regarding the book eg. `"Who is Scrooge?"`?
2. Query the relevant context from Chroma with the question or facts from the question.
3. Combine the question and the top 5 context return by Chroma into a prompt
4. Use `google/flan-t5-base` to answer the question.

Look through the FLAN templates in [Github](https://github.com/google-research/FLAN/blob/main/flan/templates.py) and select an appropriate template for this workshop.

Do not worry about the accuracy of the result. Focus on implementing the solution. We will discuss the nuances of the solution at the end of the workshop.

Use your RAG workflow to answer the provided questions in `questions_for_rag.txt` file.

In [32]:
# TODO Your code
# Load model and tokenizer
model_name = "google/flan-t5-base"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [33]:
# TODO Your code
# Extract core ideas from the question

question = "What is the name of Scrooge's underpaid clerk?"
#question = "Who was Scrooge's deceased business partner?"
#question = "Who was Scrooge engaged to in his youth, and why did she leave him?"
question = "What is the name of Bob Cratchit's youngest son who is ill?"
#question = "What does Scrooge see written on the gravestone that frightens him into changing his ways?"
#question = " What is Scrooge's response when his nephew Fred invites him to Christmas dinner at the beginning of the story?"
question = " What specific, generous act does Scrooge perform for the Cratchit family on Christmas morning?"

prompt = f"{question}\n\nWhat is sentence that verbalizes this data?"
#prompt = f"{question}\n\nWhat data can be extracted from this sentence?"
#prompt = f"Generate an approximately fifteen-word sentence that describes all this data: {question}"

# convert to a statement
enc_prompt = tokenizer(prompt, return_tensors='pt')
enc_answer = model.generate(enc_prompt.input_ids)
answer = tokenizer.decode(enc_answer[0], skip_special_tokens=True)

print(answer)

Scrooge performs a Christmas morning act for the Cratchit family.


In [34]:
# TODO Your code
# Query chroma based on the description of the question
n_results = 3
context = ""
results = epub_col.query(
   query_texts=[ answer ],
   n_results=n_results
)
print(results['distances'])
for id in results['ids'][0]:
   result = epub_col.get(id)
   context += result['documents'][0]

print(context)

[[0.33702629804611206, 0.3377726078033447, 0.33886241912841797]]
“A merry Christmas, Bob!” said Scrooge, with an earnestness that could not be mistaken, as he clapped him on the back. “A merrier Christmas, Bob, my good fellow, than I have given you for many a year! I’ll raise your salary, and endeavour to assist your struggling family, and we will discuss your affairs this very afternoon, over a Christmas bowl of smoking bishop, Bob! Make up the fires and buy another coal scuttle before you dot another i, Bob Cratchit!”Scrooge did as he was told, and held it fast.

Holly, mistletoe, red berries, ivy, turkeys, geese, game, poultry, brawn, meat, pigs, sausages, oysters, pies, puddings, fruit, and punch, all vanished instantly. So did the room, the fire, the ruddy glow, the hour of night, and they stood in the city streets on Christmas morning, where (for the weather was severe) the people made a rough, but brisk and not unpleasant kind of music, in scraping the snow from the pavement in 

In [35]:
# TODO Create a prompt with the question and context
question_prompt = f"Answer based on context:\n\n{context}\n\n{question}"
#print(question_prompt)

enc_query_prompt = tokenizer(question_prompt, return_tensors='pt')

enc_query_answer = model.generate(enc_query_prompt.input_ids)

query_answer = tokenizer.decode(enc_query_answer[0], skip_special_tokens=True)

print(question)
print(query_answer)

Token indices sequence length is longer than the specified maximum sequence length for this model (573 > 512). Running this sequence through the model will result in indexing errors


 What specific, generous act does Scrooge perform for the Cratchit family on Christmas morning?
raise his salary


# Discussion

1. How did your solution perform?
2. Where do you think are the issues?
3. How can you improve it?