# Load epub book

In [1]:
# Import libraries
import os
from langchain_community.document_loaders import UnstructuredEPubLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

import chromadb
from uuid import uuid4
from chromadb.utils import embedding_functions

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [2]:
# TODO: Load document
chunk_size = 1024
chunk_overlap = 50

# Create a text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

In [3]:
# TODO Split document
# Load the document
epub_loader = UnstructuredEPubLoader(file_path="/content/docs/charles-dickens_a-christmas-carol.epub")
chunks = epub_loader.load_and_split(text_splitter)

print(len(chunks))



203


In [5]:
# TODO Examine chunk
print(chunks[100].page_content)
print(chunks[100].metadata)



For the people who were shovelling away on the housetops were jovial and full of glee; calling out to one another from the parapets, and now and then exchanging a facetious snowball﻿—better-natured missile far than many a wordy jest﻿—laughing heartily if it went right, and not less heartily if it went wrong. The poulterers’ shops were still half open, and the fruiterers’ were radiant in their glory. There were great, round, potbellied baskets of chestnuts, shaped like the waistcoats of jolly old gentlemen, lolling at the doors, and tumbling out into the street in their apoplectic opulence: There were ruddy, brown-faced, broad-girthed Spanish onions, shining in the fatness of their growth like Spanish friars, and winking from their shelves in wanton slyness at the girls as they went by, and glanced demurely at the hung-up mistletoe. There were pears and apples clustered high in blooming pyramids; there were bunches of grapes, made, in the shopkeepers’ benevolence, to dangle from conspic

# Create embeddings

In [6]:
# TODO: Create embedding model
embed_model_name = "BAAI/bge-small-en-v1.5"
#embed_model_name = "all-MiniLM-L6-v2"

embed_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=embed_model_name
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [11]:
# TODO: Explore embedding model
text = 'big black bug bleeds black blood'
emb_text = embed_func([ chunks[100].page_content ])

print(emb_text[0].shape)
#print(emb_text[0])



(384,)


In [14]:
# TODO: Prepare the chunks for inserting into Chroma
# Extract the texts
texts = [ d.page_content for d in chunks ]

# Create a PK for every texts
text_ids = [ str(uuid4())[:8] for _ in range(len(texts)) ]

print(len(texts), len(text_ids))

idx = 10
print(text_ids[idx])
print(texts[idx])


203 203
ea7b8682
“Don’t be cross, uncle!” said the nephew.

“What else can I be,” returned the uncle, “when I live in such a world of fools as this? Merry Christmas! Out upon merry Christmas! What’s Christmastime to you but a time for paying bills without money; a time for finding yourself a year older, and not an hour richer; a time for balancing your books, and having every item in ’em through a round dozen of months presented dead against you? If I could work my will,” said Scrooge indignantly, “every idiot who goes about with ‘Merry Christmas’ on his lips should be boiled with his own pudding, and buried with a stake of holly through his heart. He should!”

“Uncle!” pleaded the nephew.

“Nephew!” returned the uncle sternly, “keep Christmas in your own way, and let me keep it in mine.”

“Keep it!” repeated Scrooge’s nephew. “But you don’t keep it.”

“Let me leave it alone, then,” said Scrooge. “Much good may it do you! Much good it has ever done you!”


In [15]:
# TODO: Create ephemeral Chroma client and save chunks
col_name = 'epub'

# Create the chroma client
ch_client = chromadb.Client()

# Delete the collection if it exists
try:
    ch_client.delete_collection(name=col_name)
except:
    pass


In [16]:
# Create the collection epub, use "BAAI/bge-small-en-v1.5" embed function
epub_col = ch_client.create_collection(
    name=col_name,
    embedding_function=embed_func
)

In [17]:
# TODO: Print number of documents in collection
epub_col.add(
    documents = texts,
    ids = text_ids
)


In [18]:
print(epub_col.count())

203


In [21]:
# TODO: Query collection
query = "Who is Scrooge?"

results = epub_col.query(
    query_texts=query,
    n_results=5
)

for k, v in results.items():
    print('>> ', k)
    print(v)

>>  ids
[['2f1e226b', '1c2cc646', '5ce65a15', 'faa77059', '3e1d1d72']]
>>  embeddings
None
>>  documents
[['“I don’t know what day of the month it is,” said Scrooge. “I don’t know how long I have been among the Spirits. I don’t know anything. I’m quite a baby. Never mind. I don’t care. I’d rather be a baby. Hallo! Whoop! Hallo here!”\n\nHe was checked in his transports by the churches ringing out the lustiest peals he had ever heard. Clash, clash, hammer; ding, dong, bell! Bell, dong, ding; hammer, clash, clash! Oh, glorious, glorious!\n\nRunning to the window, he opened it, and put out his head. No fog, no mist; clear, bright, jovial, stirring, cold; cold, piping for the blood to dance to; golden sunlight; heavenly sky; sweet fresh air; merry bells. Oh, glorious! Glorious!\n\n“What’s today?” cried Scrooge, calling downward to a boy in Sunday clothes, who perhaps had loitered in to look about him.\n\n“Eh?” returned the boy with all his might of wonder.\n\n“What’s today, my fine fellow?

In [24]:
for id in results['ids'][0]:
  print(id)
  result = epub_col.get(id)
  print('>>>> ', result['documents'][0])

2f1e226b
>>>>  “I don’t know what day of the month it is,” said Scrooge. “I don’t know how long I have been among the Spirits. I don’t know anything. I’m quite a baby. Never mind. I don’t care. I’d rather be a baby. Hallo! Whoop! Hallo here!”

He was checked in his transports by the churches ringing out the lustiest peals he had ever heard. Clash, clash, hammer; ding, dong, bell! Bell, dong, ding; hammer, clash, clash! Oh, glorious, glorious!

Running to the window, he opened it, and put out his head. No fog, no mist; clear, bright, jovial, stirring, cold; cold, piping for the blood to dance to; golden sunlight; heavenly sky; sweet fresh air; merry bells. Oh, glorious! Glorious!

“What’s today?” cried Scrooge, calling downward to a boy in Sunday clothes, who perhaps had loitered in to look about him.

“Eh?” returned the boy with all his might of wonder.

“What’s today, my fine fellow?” said Scrooge.

“Today!” replied the boy. “Why, Christmas Day.”
1c2cc646
>>>>  “I don’t know what to d

# Question and Answer LLM
In this exercise you will implement a question and answer LLM for the 'A Christmas Carol' book that you have chunked and saved.

The workflow is as follows:
1. Assume you ask the following question regarding the book eg. `"Who is Scrooge?"`?
2. Query the relevant context from Chroma with the question or facts from the question.
3. Combine the question and the top 5 context return by Chroma into a prompt
4. Use `google/flan-t5-base` to answer the question.

Look through the FLAN templates in [Github](https://github.com/google-research/FLAN/blob/main/flan/templates.py) and select an appropriate template for this workshop.

Do not worry about the accuracy of the result. Focus on implementing the solution. We will discuss the nuances of the solution at the end of the workshop.

Use your RAG workflow to answer the provided questions in `questions_for_rag.txt` file.

In [None]:
# TODO Your code

In [None]:
# TODO Your code

In [None]:
# TODO Your code

# Discussion

1. How did your solution perform?
2. Where do you think are the issues?
3. How can you improve it?