# Load epub book

In [1]:
# Import libraries
import os
from langchain_community.document_loaders import UnstructuredEPubLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

import chromadb
from uuid import uuid4
from chromadb.utils import embedding_functions

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [2]:
# TODO: Load document 
chunk_size = 1024
chunk_overlap = 50
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

epub_loader = UnstructuredEPubLoader('./docs/charles-dickens_a-christmas-carol.epub')

In [3]:
# TODO Split document
chunks = epub_loader.load_and_split(text_splitter)

  data file translations/en.yaml not found



In [4]:
# TODO Examine chunk
print(len(chunks))
print(chunks[100])

203
page_content='For the people who were shovelling away on the housetops were jovial and full of glee; calling out to one another from the parapets, and now and then exchanging a facetious snowball﻿—better-natured missile far than many a wordy jest﻿—laughing heartily if it went right, and not less heartily if it went wrong. The poulterers’ shops were still half open, and the fruiterers’ were radiant in their glory. There were great, round, potbellied baskets of chestnuts, shaped like the waistcoats of jolly old gentlemen, lolling at the doors, and tumbling out into the street in their apoplectic opulence: There were ruddy, brown-faced, broad-girthed Spanish onions, shining in the fatness of their growth like Spanish friars, and winking from their shelves in wanton slyness at the girls as they went by, and glanced demurely at the hung-up mistletoe. There were pears and apples clustered high in blooming pyramids; there were bunches of grapes, made, in the shopkeepers’ benevolence, to d

# Create embeddings

In [5]:
# TODO: Create embedding model
embed_model_name = "BAAI/bge-small-en-v1.5"
#embed_model_name = "all-MiniLM-L6-v2"

chroma_embed_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=embed_model_name)

In [6]:
# TODO: Explore embedding model
text = 'hello world'
emb_text = chroma_embed_func([ 'hello, world', 'big black bug bleeds black blood' ])


In [7]:
print(len(emb_text))
print(len(emb_text[0]))
print(len(emb_text[1]))
print(emb_text[0])

2
384
384
[-3.15818675e-02 -4.86476496e-02  3.21324095e-02 -6.57483190e-02
 -1.12417666e-03  1.14272060e-02 -1.62244460e-03  5.49600683e-02
  4.48704362e-02 -2.09960667e-03  7.87414052e-03 -2.20074598e-02
  3.43550555e-02  6.57045916e-02  2.98711844e-02 -2.77335406e-04
  1.02015398e-03 -3.47685143e-02 -1.21079251e-01 -1.47990324e-02
  9.72587019e-02  3.53695117e-02 -1.68968774e-02 -4.28635813e-02
 -2.48042475e-02  5.63809928e-03  6.80471864e-03  1.35493753e-02
  6.07592007e-03 -9.83635634e-02 -6.45543709e-02 -1.15323812e-02
  3.96090671e-02  2.41095200e-02  4.54739295e-02 -2.10404973e-02
  2.52140928e-02 -1.03885606e-02 -7.94328749e-02  3.64228617e-03
  4.60232161e-02 -5.09504005e-02  1.40664512e-02 -3.41335894e-03
  1.36136133e-02 -4.93411645e-02  1.70672331e-02  5.47222309e-02
 -2.78037973e-02  4.88183287e-04 -5.45995012e-02 -8.51241872e-03
 -1.97877828e-02 -2.24600383e-03  2.84831394e-02  9.09864828e-02
  7.97384828e-02  2.93898419e-03  4.68927287e-02  8.69192462e-03
  1.88648663e-0

In [8]:
# TODO: Prepare the chunks for inserting into Chroma
# Extract the text
texts = [ c.page_content for c in chunks ]
print(texts[100])
print(len(texts))


For the people who were shovelling away on the housetops were jovial and full of glee; calling out to one another from the parapets, and now and then exchanging a facetious snowball﻿—better-natured missile far than many a wordy jest﻿—laughing heartily if it went right, and not less heartily if it went wrong. The poulterers’ shops were still half open, and the fruiterers’ were radiant in their glory. There were great, round, potbellied baskets of chestnuts, shaped like the waistcoats of jolly old gentlemen, lolling at the doors, and tumbling out into the street in their apoplectic opulence: There were ruddy, brown-faced, broad-girthed Spanish onions, shining in the fatness of their growth like Spanish friars, and winking from their shelves in wanton slyness at the girls as they went by, and glanced demurely at the hung-up mistletoe. There were pears and apples clustered high in blooming pyramids; there were bunches of grapes, made, in the shopkeepers’ benevolence, to dangle from conspic

In [9]:
text_ids = [  str(uuid4())[:8] for _ in range(len(texts))]
print(text_ids)
print(len(text_ids))

['6890c8da', 'ca9b2006', '30b1a943', 'ceafae48', 'ec17a6f9', '22f73b50', '65aa926a', '682da151', '259de1fa', '39c74947', 'b87c8c66', '7299775a', '2c69b756', '1aa90dee', '09a098fb', '61fa3f3f', 'c6c639d8', '744bdba0', 'a3485827', 'e75a6d78', 'c1f7a76a', '5e8c5e66', '2053fa81', 'eb59b2f3', 'aa7ae8ea', 'f85a7a37', '06b81901', '1c3e408e', 'ed0f4412', '6c6b20ca', 'bb976b7d', '110dd213', 'cb626ece', '98e56afb', 'c539ec27', '1c2be7ab', '9b0efcc6', 'c5eb7659', '26c025cb', 'bfa261d7', '0c5d0b8b', '8625c2e9', '7d4ac536', '4be21a92', '2b109ec3', '352b806c', '02d7d450', '9f38440e', '5fdcca01', 'c85f8bc4', 'bebd2856', '8bf262f3', '1bc41980', 'b09ac48c', '1f0fc5e5', 'f8a10ab2', 'f9d04497', '79dacaaa', '3a5f7006', '0d0fe73e', '7e654ba6', '709b80ab', '0de9e38e', '88ba6f19', 'a10fd76a', '8aced0a4', 'f57c9db3', 'c66a69cc', 'aca8400d', '9daba227', '73e4e13b', 'b71663d4', '24452506', '548c31e8', '49780e88', 'caa24087', '828d01f6', '59497421', '57964a46', 'aa7a687d', '9f8078b0', 'c88ad541', '68873b90', 'ef

In [10]:
# TODO: Create ephemeral Chroma client and save chunks
col_name = 'carol'

# Create a the chromadb client
ch_client = chromadb.Client()

# drop the table
try:
   ch_client.delete_collection(col_name)
except:
   pass

# Insert the texts into the database
carol_col = ch_client.create_collection(
   name = col_name,
   embedding_function=chroma_embed_func
)


In [11]:
#Insert the docs into the collection
carol_col.add(
   documents = texts,
   ids = text_ids
)

In [12]:
# TODO: Print number of documents in collection 
print(carol_col.count())

203


In [13]:
# TODO: Query collection 
query = "What happened Marley?"


results = carol_col.query(
   query_texts=[ query ],
   n_results=5
)

print(results)

{'ids': [['9f38440e', 'ceafae48', 'eb59b2f3', '6c6b20ca', '110dd213']], 'embeddings': None, 'documents': [['Marley’s Ghost bothered him exceedingly. Every time he resolved within himself, after mature inquiry that it was all a dream, his mind flew back again, like a strong spring released, to its first position, and presented the same problem to be worked all through, “Was it a dream or not?”\n\nScrooge lay in this state until the chime had gone three-quarters more, when he remembered, on a sudden, that the Ghost had warned him of a visitation when the bell tolled one. He resolved to lie awake until the hour was passed; and, considering that he could no more go to sleep than go to heaven, this was, perhaps, the wisest resolution in his power.\n\nThe quarter was so long, that he was more than once convinced he must have sunk into a doze unconsciously, and missed the clock. At length it broke upon his listening ear.\n\n“Ding, dong!”\n\n“A quarter past,” said Scrooge, counting.\n\n“Ding, 

In [14]:
for id in results['ids'][0]:
   result = carol_col.get(id)
   print(result['documents'])

['Marley’s Ghost bothered him exceedingly. Every time he resolved within himself, after mature inquiry that it was all a dream, his mind flew back again, like a strong spring released, to its first position, and presented the same problem to be worked all through, “Was it a dream or not?”\n\nScrooge lay in this state until the chime had gone three-quarters more, when he remembered, on a sudden, that the Ghost had warned him of a visitation when the bell tolled one. He resolved to lie awake until the hour was passed; and, considering that he could no more go to sleep than go to heaven, this was, perhaps, the wisest resolution in his power.\n\nThe quarter was so long, that he was more than once convinced he must have sunk into a doze unconsciously, and missed the clock. At length it broke upon his listening ear.\n\n“Ding, dong!”\n\n“A quarter past,” said Scrooge, counting.\n\n“Ding, dong!”\n\n“Half past,” said Scrooge.\n\n“Ding, dong!”\n\n“A quarter to it,” said Scrooge.\n\n“Ding, dong!”

# Question and Answer LLM
In this exercise you will implement a question and answer LLM for the 'A Christmas Carol' book that you have chunked and saved. 

The workflow is as follows:
1. Assume you ask the following question regarding the book eg. `"Who is Scrooge?"`?
2. Query the relevant context from Chroma with the question or facts from the question.
3. Combine the question and the top 5 context return by Chroma into a prompt 
4. Use `google/flan-t5-base` to answer the question.

Look through the FLAN templates in [Github](https://github.com/google-research/FLAN/blob/main/flan/templates.py) and select an appropriate template for this workshop.

Do not worry about the accuracy of the result. Focus on implementing the solution. We will discuss the nuances of the solution at the end of the workshop.

Use your RAG workflow to answer the provided questions in `questions_for_rag.txt` file. 

In [15]:
# TODO Your code 
model_name = "google/flan-t5-base"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [43]:
# Extract the core ideas of the question 
question = "What is the name of Scrooge's underpaid clerk?"
#question = "Who was Scrooge's deceased business partner?"
#question = "Who was Scrooge engaged to in his youth, and why did she leave him?"
question = "What is the name of Bob Cratchit's youngest son who is ill?"
#question = "What does Scrooge see written on the gravestone that frightens him into changing his ways?"
#question = " What is Scrooge's response when his nephew Fred invites him to Christmas dinner at the beginning of the story?"
question = " What specific, generous act does Scrooge perform for the Cratchit family on Christmas morning?"
question = "Who is Voldermort?"

prompt = f"{question}\n\nWhat is sentence that verbalizes this data?"
#prompt = f"{question}\n\nWhat data can be extracted from this sentence?"
#prompt = f"Generate an approximately fifteen-word sentence that describes all this data: {question}"

# convert to a statement
enc_prompt = tokenizer(prompt, return_tensors='pt')
enc_answer = model.generate(enc_prompt.input_ids)
answer = tokenizer.decode(enc_answer[0], skip_special_tokens=True)

print(answer)

Voldermort is a character in the film "The Last of Us".


In [44]:
# TODO Your code
context = ""
results = carol_col.query(
   query_texts=[ answer ],
   n_results=3
)
print(results['distances'])
for id in results['ids'][0]:
   result = carol_col.get(id)
   context += result['documents'][0]

print(context)

[[0.47243183851242065, 0.474808931350708, 0.4823372960090637]]
The kind hand trembled.

“I will honour Christmas in my heart, and try to keep it all the year. I will live in the Past, the Present, and the Future. The Spirits of all Three shall strive within me. I will not shut out the lessons that they teach. Oh, tell me I may sponge away the writing on this stone!”

In his agony he caught the spectral hand. It sought to free itself, but he was strong in his entreaty, and detained it. The Spirit stronger yet, repulsed him.

Holding up his hands in a last prayer to have his fate reversed, he saw an alteration in the Phantom’s hood and dress. It shrunk, collapsed, and dwindled down into a bedpost.

Stave V

The End of It

Yes! and the bedpost was his own. The bed was his own, the room was his own. Best and happiest of all, the time before him was his own, to make amends in!The bell struck twelve.

Scrooge looked about him for the Ghost, and saw it not. As the last stroke ceased to vibrat

In [45]:
question_prompt = f"Answer based on context:\n\n{context}\n\n{question}"
print(question_prompt)

Answer based on context:

The kind hand trembled.

“I will honour Christmas in my heart, and try to keep it all the year. I will live in the Past, the Present, and the Future. The Spirits of all Three shall strive within me. I will not shut out the lessons that they teach. Oh, tell me I may sponge away the writing on this stone!”

In his agony he caught the spectral hand. It sought to free itself, but he was strong in his entreaty, and detained it. The Spirit stronger yet, repulsed him.

Holding up his hands in a last prayer to have his fate reversed, he saw an alteration in the Phantom’s hood and dress. It shrunk, collapsed, and dwindled down into a bedpost.

Stave V

The End of It

Yes! and the bedpost was his own. The bed was his own, the room was his own. Best and happiest of all, the time before him was his own, to make amends in!The bell struck twelve.

Scrooge looked about him for the Ghost, and saw it not. As the last stroke ceased to vibrate, he remembered the prediction of ol

In [46]:
# TODO Your code
enc_query_prompt = tokenizer(question_prompt, return_tensors='pt')

enc_query_answer = model.generate(enc_query_prompt.input_ids)

query_answer = tokenizer.decode(enc_query_answer[0], skip_special_tokens=True)

print(question)
print(query_answer)

Who is Voldermort?
a creditor


# Discussion

1. How did your solution perform?
2. Where do you think are the issues?
3. How can you improve it?