# Load epub book

In [1]:
# Import libraries
import os
from langchain_community.document_loaders import UnstructuredEPubLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

import chromadb
from uuid import uuid4
from chromadb.utils import embedding_functions

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [2]:
# Splitter settings: the target size of each chunk and how much neighbouring
# chunks overlap. Both values are handed straight to the splitter below.
chunk_size, chunk_overlap = 1024, 128

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)


In [3]:
# Read the EPUB from disk, then cut the loaded documents into chunks with
# `text_splitter`.
# `load()` + `split_documents()` is used instead of `load_and_split()`: the
# latter is flagged as deprecated in LangChain's BaseLoader and is documented
# as doing exactly these two steps, so the resulting `chunks` are the same.
epub_loader = UnstructuredEPubLoader(file_path='/content/docs/charles-dickens_a-christmas-carol.epub')
chunks = text_splitter.split_documents(epub_loader.load())



In [8]:
# How many chunks did the split produce?
print(len(chunks))

# Pick one chunk to inspect; `idx` is reused further down when exploring
# the embedding model.
idx = 50
sample_chunk = chunks[idx]
print(sample_chunk.page_content)  # the chunk's text
print(sample_chunk.metadata)      # metadata attached to the chunk

207
“Ding, dong!”

“Half past,” said Scrooge.

“Ding, dong!”

“A quarter to it,” said Scrooge.

“Ding, dong!”

“The hour itself,” said Scrooge triumphantly, “and nothing else!”

He spoke before the hour bell sounded, which it now did with a deep, dull, hollow, melancholy One. Light flashed up in the room upon the instant, and the curtains of his bed were drawn.

The curtains of his bed were drawn aside, I tell you, by a hand. Not the curtains at his feet, nor the curtains at his back, but those to which his face was addressed. The curtains of his bed were drawn aside; and Scrooge, starting up into a half-recumbent attitude, found himself face to face with the unearthly visitor who drew them: as close to it as I am now to you, and I am standing in the spirit at your elbow.
{'source': '/content/docs/charles-dickens_a-christmas-carol.epub'}


# Create embeddings

In [9]:
# Model handed to the SentenceTransformer embedding function.
# ("all-MiniLM-L6-v2" is the alternative that was tried here.)
embed_model_name = "BAAI/bge-small-en-v1.5"

# Embedding function: sentence(s) in -> one fixed-size vector per sentence out.
embed_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=embed_model_name
)


In [12]:
# Embed a very short string and a whole book chunk in a single call.
text = 'hello, world'
enc_text = embed_func([text, chunks[idx].page_content])

# Character counts of the two inputs ...
print(f'len text = {len(text)}')
print(f'len chunk = {len(chunks[idx].page_content)}')

# ... followed by the vector length of each of the two embeddings.
print(len(enc_text[0]), len(enc_text[1]), sep='\n')


len text = 12
len chunk = 778
384
384


In [18]:
# Get the chunks ready for Chroma: one plain string per chunk plus one ID per string.

# Plain text of every chunk, in chunk order.
texts = [chunk.page_content for chunk in chunks]

# One short ID per text: the first 8 characters of a freshly generated UUID.
text_ids = [uuid4().hex[:8] for _ in texts]

# Show the IDs, then confirm there are as many IDs as texts.
print(text_ids, len(text_ids), len(texts), sep='\n')

['9b1b4cda', '1a16aa98', '74fd8b71', 'acea34a2', 'dd5e8d87', '29fc479f', 'a3a23790', '8bbc7401', 'b9521d03', '35789e02', '816fa914', '000d2789', '6227f906', '116461c9', '5ae45267', 'aa1ac4b5', '979d84aa', 'cdc8abfd', 'e4dce4d0', 'c4eaf3a2', '6f326577', '877eb9cf', '3e4932e6', 'd7b35dec', '8016fa80', 'efefcbb6', 'fa07dba4', 'b3c10eb4', '3a97ff6a', '81f8de90', '720527a0', '7b9afde9', '24bb5c6f', '39f8ce93', '21c5b49c', '51390ed1', '7e703fab', 'e5ab9f75', '787cdf1c', '5a3167dd', 'd5a0ca80', '1c817554', '3744e57b', '042a2787', '7601bd0c', 'b795b79b', '2f1f6a84', 'e16b2345', 'e9994895', '07cf2426', '73b587f1', '5cedf52c', 'f37b5491', '094c8c7c', 'b57cc387', '2b6ceed2', '0ad59076', '2e6b6267', 'dd1b0cc6', '9544d767', '250f481b', 'e8a5d858', '1a946cae', 'aa2ca8aa', '20fd4373', '48eb1801', 'f2cc4016', 'aebc8706', '48b6fcf1', '18883953', 'd3561a57', '3c89aa14', '18de68ae', 'e766f690', 'c023298a', '8e5172bc', '2a4f00fe', '4b412def', '9ae5a66c', '819a1a77', 'dd5b7a16', '8f464dbc', 'eb41bb9f', '7d

In [None]:
# Chroma client; `chromadb.Client()` is used here as the ephemeral client,
# i.e. nothing is expected to be written to disk.
ch_client = chromadb.Client()

# Name of the collection that will hold the book chunks.
col_name = 'epub'

In [20]:
# Start from a clean slate: drop any collection with this name left over from
# an earlier run of the notebook, presumably so that create_collection() below
# does not run into an already-existing collection.
try:
    ch_client.delete_collection(col_name)
except Exception:
    # Best effort: most likely the collection simply does not exist yet
    # (e.g. first run). The concrete exception type Chroma raises for that has
    # differed between releases, hence the broad `Exception` -- but not a bare
    # `except:`, which would also swallow KeyboardInterrupt and SystemExit.
    pass

# (Re)create the collection, wiring in embed_func as its embedding function.
epub_col = ch_client.create_collection(
    name=col_name,
    embedding_function=embed_func,
)


In [23]:
# Size of the collection before anything is inserted.
print(epub_col.count())

# Store every chunk text under its generated ID.
epub_col.add(ids=text_ids, documents=texts)

# Size of the collection after the insert.
print(epub_col.count())

0
207


In [31]:
# Question to look up in the book.
query = "How many ghosts were there?"

# Ask Chroma for the 5 best-matching chunks for that single question.
results = epub_col.query(n_results=5, query_texts=[query])

# Dump every field of the result on its own line.
for field, value in results.items():
    print(f'{field}: {value}')


ids: [['787cdf1c', 'dd1b0cc6', 'b795b79b', '239445ca', '48b6fcf1']]
embeddings: None
documents: [['Scrooge trembled more and more.\n\n“Or would you know,” pursued the Ghost, “the weight and length of the strong coil you bear yourself? It was full as heavy and as long as this seven Christmas Eves ago. You have laboured on it since. It is a ponderous chain!”\n\nScrooge glanced about him on the floor, in the expectation of finding himself surrounded by some fifty or sixty fathoms of iron cable; but he could see nothing.\n\n“Jacob!” he said imploringly. “Old Jacob Marley, tell me more! Speak comfort to me, Jacob!”\n\n“I have none to give,” the Ghost replied. “It comes from other regions, Ebenezer Scrooge, and is conveyed by other ministers, to other kinds of men. Nor can I tell you what I would. A very little more is all permitted to me. I cannot rest, I cannot stay, I cannot linger anywhere. My spirit never walked beyond our counting house\ufeff—mark me;\ufeff—in life my spirit never rove

In [28]:
# `results['documents']` is a list of lists; take a closer look at it.
docs_per_query = results['documents']

# Documents returned for the first (and here only) query text ...
print(docs_per_query[0])
# ... and the length of the outer list.
print(len(docs_per_query))

['“How now!” said Scrooge, caustic and cold as ever. “What do you want with me?”\n\n“Much!”\ufeff—Marley’s voice; no doubt about it.\n\n“Who are you?”\n\n“Ask me who I was.”\n\n“Who were you, then?” said Scrooge, raising his voice. “You’re particular, for a shade.” He was going to say “to a shade,” but substituted this, as more appropriate.\n\n“In life I was your partner, Jacob Marley.”\n\n“Can you\ufeff—can you sit down?” asked Scrooge, looking doubtfully at him.\n\n“I can.”\n\n“Do it, then.”\n\nScrooge asked the question, because he didn’t know whether a ghost so transparent might find himself in a condition to take a chair; and felt that in the event of its being impossible, it might involve the necessity of an embarrassing explanation. But the Ghost sat down on the opposite side of the fireplace, as if he were quite used to it.\n\n“You don’t believe in me,” observed the Ghost.\n\n“I don’t,” said Scrooge.\n\n“What evidence would you have of my reality beyond that of your own senses?

In [30]:
# Walk through the IDs returned for the first query and fetch each record
# from the collection by its ID.
# The loop variable is `doc_id` rather than `id`: at notebook top level the
# name would stay bound after the loop and shadow the builtin id() for every
# cell executed afterwards.
for doc_id in results['ids'][0]:
    # get document by id
    doc = epub_col.get(ids=doc_id)
    print(f'{doc_id}: {doc}')

39f8ce93: {'ids': ['39f8ce93'], 'embeddings': None, 'documents': ['“How now!” said Scrooge, caustic and cold as ever. “What do you want with me?”\n\n“Much!”\ufeff—Marley’s voice; no doubt about it.\n\n“Who are you?”\n\n“Ask me who I was.”\n\n“Who were you, then?” said Scrooge, raising his voice. “You’re particular, for a shade.” He was going to say “to a shade,” but substituted this, as more appropriate.\n\n“In life I was your partner, Jacob Marley.”\n\n“Can you\ufeff—can you sit down?” asked Scrooge, looking doubtfully at him.\n\n“I can.”\n\n“Do it, then.”\n\nScrooge asked the question, because he didn’t know whether a ghost so transparent might find himself in a condition to take a chair; and felt that in the event of its being impossible, it might involve the necessity of an embarrassing explanation. But the Ghost sat down on the opposite side of the fireplace, as if he were quite used to it.\n\n“You don’t believe in me,” observed the Ghost.\n\n“I don’t,” said Scrooge.\n\n“What evid

# Question and Answer LLM
In this exercise you will implement a question and answer LLM for the 'A Christmas Carol' book that you have chunked and saved.

The workflow is as follows:
1. Assume you ask a question about the book, e.g. `"Who is Scrooge?"`.
2. Query the relevant context from Chroma with the question or facts from the question.
3. Combine the question and the top 5 contexts returned by Chroma into a prompt.
4. Use `google/flan-t5-base` to answer the question.

Look through the FLAN templates in [Github](https://github.com/google-research/FLAN/blob/main/flan/templates.py) and select an appropriate template for this workshop.

Do not worry about the accuracy of the result. Focus on implementing the solution. We will discuss the nuances of the solution at the end of the workshop.

Use your RAG workflow to answer the questions provided in the `questions_for_rag.txt` file.

In [None]:
# TODO Your code

In [None]:
# TODO Your code

In [None]:
# TODO Your code

# Discussion

1. How did your solution perform?
2. Where do you think the issues are?
3. How can you improve it?