# Load epub book

In [1]:
# Import libraries
import os
from langchain_community.document_loaders import UnstructuredEPubLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

import chromadb
from uuid import uuid4
from chromadb.utils import embedding_functions

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [2]:
# TODO: Load document 
chunk_size = 300
chunk_overlap = 30
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)


In [3]:
# TODOL Load document 
loader = UnstructuredEPubLoader('docs/charles-dickens_a-christmas-carol.epub')


In [4]:
# TODO Split document
chunks = loader.load_and_split(text_splitter)

  data file translations/en.yaml not found



In [6]:
# TODO Examine chunk
print(len(chunks))
print(chunks[100])

744
page_content='You may talk vaguely about driving a coach and six up a good old flight of stairs, or through a bad young act of Parliament; but I mean to say you might have got a hearse up that staircase, and taken it broadwise, with the splinter-bar towards the wall, and the door towards the balustrades: and' metadata={'source': 'docs/charles-dickens_a-christmas-carol.epub'}


# Create embeddings

In [7]:
# TODO: Create embedding model
embed_model_name = "BAAI/bge-small-en-v1.5"
#embed_model_name = "all-MiniLM-L6-v2"

embed_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=embed_model_name)


In [16]:
# TODO: Explore embedding model
text = 'hello, world'
vec = embed_func([ text, chunks[100].page_content ])

print(len(vec))
print(len(vec[0]))
print(len(vec[1]))
print(vec[1])


2
384
384
[-0.01406609 -0.05661467  0.07830305 -0.04875449  0.00596363 -0.01706757
  0.08130031 -0.0210314  -0.01871066 -0.01363143 -0.02491988  0.04526637
 -0.02644806  0.00555288  0.05561635  0.03250337  0.06175813  0.01218211
 -0.0480439  -0.02590557 -0.05691833  0.03213967 -0.03054944  0.00851408
 -0.02802141  0.01222622  0.05819888 -0.02450805 -0.06763179 -0.02908871
  0.03240736 -0.05694105 -0.01163537 -0.09179083  0.00310931  0.03893713
 -0.03298819  0.04430678  0.02069351 -0.02949887  0.07331575 -0.01143006
 -0.05255749 -0.03126996  0.02452269  0.00557517  0.0615153   0.01905978
 -0.03911695 -0.02892844 -0.10031246 -0.03796116 -0.01743324 -0.03871292
  0.02596189  0.02467587  0.1008152   0.02152361  0.0202664   0.07833607
 -0.04237329 -0.02769306 -0.07793275 -0.05570993  0.04013534 -0.03672814
 -0.05398371  0.00429994 -0.0053317   0.04662592  0.01837179 -0.07751473
 -0.02993473  0.00075232 -0.02814893 -0.05050031 -0.01127363  0.05091122
 -0.02370237  0.04509485  0.02788643 -0.0

In [None]:
# TODO: Prepare the chunks for inserting into Chroma
# extract page_content from the chunks into an array
texts = [ d.page_content for d in chunks ]
print(len(texts))

# for every text, generate a unique id for the text
ids = [ str(uuid4())[:8] for _ in range(len(texts)) ]
print(ids)
   


744
['2ea84307', '244480ee', '13449d75', 'd8f422a4', 'bde58654', 'ae491370', '0b0c404c', 'ead66caa', 'a9e82b47', '9d4f164b', 'f6df14eb', '65ff7bc5', '5f838865', '675e9e89', '5d4bbc82', '77c48d97', '06e2b7a6', '45d2055a', '0b14f976', '69ff95a4', '547d0dab', '53ef472a', 'fc2057a5', 'ab6abdd4', 'c9aa4c6f', '7f38fa6b', 'c68b3f48', '9884fad9', '2c217c3e', '643f8e0d', '63fd5cbf', 'ba4f42a3', '71e6e3fa', 'a0be20f2', 'e222765c', '5f0877b2', '91f58589', 'a83c41a9', 'bf2478c8', '863da780', '304912f2', '140c82aa', '0d548ce5', 'c9ebf769', '5ad6c03e', '2a7e19b2', '0df24c0c', '2a98c601', 'b10f21d0', '058bec20', '04ca2d8a', '54bb286c', '27324029', 'ff6ea196', '5308b893', '70741075', '15bf4dfd', 'd1c0f896', 'cbac158a', '1172bcf3', 'b171dda2', '432fb930', 'fe8be2bd', '1c716c91', 'd1f53445', '6c04404e', '9de35d9a', '43f8e689', '68f58d62', 'ea63f45e', '4fc087bb', '5ddc174d', '3206963e', '77be50bd', '42d5ec71', '50e04e5a', '59865330', '11feb298', '77ff8349', '3b112ea3', '24005c14', 'c88d494d', 'ae5059dd',

In [19]:
# TODO: Create ephemeral Chroma client and save chunks
col_name = 'epub'
embed_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=embed_model_name)

# create the chroma client
ch_client = chromadb.Client()
try:
   ch_client.delete_collection(col_name)
except:
   pass

# create the collection
epub_col = ch_client.create_collection(name=col_name, embedding_function=embed_func)


In [None]:
# display the number of documents in the collection 
print(epub_col.count())

0


In [21]:
# Add text into collection 
epub_col.add(documents=texts, ids=ids)

In [22]:
# TODO: Print number of documents in collection 
print('after inserting: ', epub_col.count())

after inserting:  744


In [None]:
# TODO: Query collection 
query = 'Who is Marley'
results = epub_col.query(
   query_texts=[ query ],
   n_results=5,
)

print(results)
print(results['documents'])
print(len(results['documents']))

{'ids': [['7c283f49', '9d4f164b', 'f6df14eb', '0f4e23f2', '15bf4dfd']], 'embeddings': None, 'documents': [['“How now!” said Scrooge, caustic and cold as ever. “What do you want with me?”\n\n“Much!”\ufeff—Marley’s voice; no doubt about it.\n\n“Who are you?”\n\n“Ask me who I was.”', 'Marley was dead, to begin with. There is no doubt whatever about that. The register of his burial was signed by the clergyman, the clerk, the undertaker, and the chief mourner. Scrooge signed it. And Scrooge’s name was good upon ’Change for anything he chose to put his hand to. Old Marley was as', 'hand to. Old Marley was as dead as a doornail.', '“Ask me who I was.”\n\n“Who were you, then?” said Scrooge, raising his voice. “You’re particular, for a shade.” He was going to say “to a shade,” but substituted this, as more appropriate.\n\n“In life I was your partner, Jacob Marley.”', '“Scrooge and Marley’s, I believe,” said one of the gentlemen, referring to his list. “Have I the pleasure of addressing Mr. Scro

In [31]:
# Get doc by id
result = epub_col.get(ids='7c283f49')
print(result)

{'ids': ['7c283f49'], 'embeddings': None, 'documents': ['“How now!” said Scrooge, caustic and cold as ever. “What do you want with me?”\n\n“Much!”\ufeff—Marley’s voice; no doubt about it.\n\n“Who are you?”\n\n“Ask me who I was.”'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [None]}


# Question and Answer
Implement a question and answer LLM with the vector database. You can use the Question and Answer LLM from day 1's workshop.

Do not worry about the accuracy of the result. Focus on implementing the solution. We will discuss the nuances of the solution at the end of the workshop

In [None]:
# TODO Your code 

In [None]:
# TODO Your code

In [None]:
# TODO Your code

# Discussion

1. How does your solution perform?
2. Where do you think are the issues?
3. How can you improve it?