In [32]:
# Load args
model_name = 'LegolasTheElf/Long-T5-Booksum'
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer       = AutoTokenizer.from_pretrained(model_name)
model           = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [2]:
# Load vectordatabase
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from pcg_loader import Loader
from langchain.text_splitter import RecursiveCharacterTextSplitter

if load_database:
    vectordb = Chroma(persist_directory=os.path.join(directory,version),
                  embedding_function=HuggingFaceEmbeddings(model_name=model_name))
else:
    docs = Loader(directory=directory,version=version).load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    vectordb = Chroma.from_documents(persist_directory=os.path.join(directory,version),
                                        documents=splits, 
                                        embedding=HuggingFaceEmbeddings(model_name=model_name))



  from tqdm.autonotebook import tqdm, trange
No sentence-transformers model found with name fnando1995/t5-small-ft-bookSum. Creating a new one with mean pooling.


In [40]:
# Query
# query_text = "Summarize the book about incidents in the life of a slave girl"

query_text = "Summarize the book about the strand magazine, vol. 05, issue 25"

In [41]:
# Retrieve using retriever
retriever = vectordb.as_retriever()
retriever.invoke(query_text)

[Document(page_content='the strand magazine, vol. 05, issue 28, april 1893 produced by jonathan ingram, janet blenkinship and the online distributed proofreading team at the strand an illustrated monthly vol. 5, issue. 28. april 1893 illustration sandringham from a photo. by bedford lemere. the prince of wales at sandringham. the prince of wales is, of course, precluded by his position from granting interviews like private persons, but his royal highness has been so good as to give us special permission to insert the following extremely interesting article, which we are happy to be able to present to our readers in place of the illustrated interview for the present month. the next of the series of illustrated interviews, by mr. harry how, will appear next month. sir robert rawlinson, the celebrated engineer, whose work saved so many lives in the crimea, has given mr. how a most interesting interview, with special illustrations. "far from the busy haunt of man" might be fitly applied to

In [42]:
# Retrieve using directly database
vectordb.similarity_search(query_text,k=4)

[Document(page_content='the strand magazine, vol. 05, issue 28, april 1893 produced by jonathan ingram, janet blenkinship and the online distributed proofreading team at the strand an illustrated monthly vol. 5, issue. 28. april 1893 illustration sandringham from a photo. by bedford lemere. the prince of wales at sandringham. the prince of wales is, of course, precluded by his position from granting interviews like private persons, but his royal highness has been so good as to give us special permission to insert the following extremely interesting article, which we are happy to be able to present to our readers in place of the illustrated interview for the present month. the next of the series of illustrated interviews, by mr. harry how, will appear next month. sir robert rawlinson, the celebrated engineer, whose work saved so many lives in the crimea, has given mr. how a most interesting interview, with special illustrations. "far from the busy haunt of man" might be fitly applied to

In [33]:
# Load model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer       = AutoTokenizer.from_pretrained(model_name)
model           = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [7]:
# Post-processing prompt (join)
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [43]:
# Define RAG prompt

from langchain_core.prompts.chat import PromptTemplate
prompt_template = """
You are a great book summarizer. Below is a context followed by a question. The context is about a book. Please provide a detailed and accurate summarization based on the context for the question.

Context:
{context}

Question:
{question}

Answer:
"""
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template
)

# from langchain.prompts import ChatPromptTemplate
# prompt_template = """
# You are a great book summarizer. Below is a context followed by a question. The context is about a book. Please provide a detailed and accurate summarization based on the context for the question.

# Context:
# {context}

# Question:
# {question}

# Answer:
# """
# prompt = ChatPromptTemplate.from_template(prompt_template)


In [9]:
prompt

PromptTemplate(input_variables=['context', 'question'], template='\nYou are a great book summarizer. Below is a context followed by a question. The context is about a book. Please provide a detailed and accurate summarization based on the context for the question.\n\nContext:\n{context}\n\nQuestion:\n{question}\n\nAnswer:\n')

In [56]:
# chain using ChatPromptTemplate
# from langchain_core.output_parsers import StrOutputParser
# from langchain_core.runnables import RunnablePassthrough
# # Chain
# chain = (
#     {"context": retriever | format_docs, "question": RunnablePassthrough()}
#     | prompt
#     | llm
#     | StrOutputParser()
# )
# chain.invoke(query_text)

In [58]:
# Chain using PromptTemplate
# from langchain_core.output_parsers import StrOutputParser
# from langchain_core.runnables import RunnablePassthrough
# # Chain
# chain = (
#     {"context": retriever | format_docs, "question": RunnablePassthrough()}
#     | prompt
#     | llm
#     | StrOutputParser()
# )
# chain.invoke(query_text)

In [48]:
# Custom Rag
class CustomRag():
    def __init__(self,promptTemplate,model,tokenizer,vectordb):
        self.prompt_template = promptTemplate
        self.model = model
        # self.retriever = retriever
        self.tokenizer = tokenizer
        self.vectordb = vectordb    
    def query_rag(self,query):
        results = self.vectordb.similarity_search_with_score(query, k=5)
        context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
        prompt = self.prompt_template.format(context=context_text, question=query)
        print('prompt>>',prompt)
        print("len prompt>>",len(prompt))
        inputs_ids = self.tokenizer(prompt, return_tensors='pt',max_length=512, truncation=True, padding="max_length").input_ids
        print("shape prompt tokenized>>",inputs_ids.shape)
        print("decoded input>>",self.tokenizer.decode(inputs_ids[0], skip_special_tokens=True))
        response = self.model.generate(inputs_ids,max_new_tokens=150)
        print("decoded response>>",self.tokenizer.decode(response[0], skip_special_tokens=True))

        return response

tokenizer_config.json:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [49]:
rag = CustomRag(prompt,model,tokenizer,vectordb)
r = rag.query_rag(query_text)

prompt>> 
You are a great book summarizer. Below is a context followed by a question. The context is about a book. Please provide a detailed and accurate summarization based on the context for the question.

Context:
the strand magazine, vol. 05, issue 28, april 1893 produced by jonathan ingram, janet blenkinship and the online distributed proofreading team at the strand an illustrated monthly vol. 5, issue. 28. april 1893 illustration sandringham from a photo. by bedford lemere. the prince of wales at sandringham. the prince of wales is, of course, precluded by his position from granting interviews like private persons, but his royal highness has been so good as to give us special permission to insert the following extremely interesting article, which we are happy to be able to present to our readers in place of the illustrated interview for the present month. the next of the series of illustrated interviews, by mr. harry how, will appear next month. sir robert rawlinson, the celebrat



decoded response>> This is a book summary. It's about a novel, and it's been published in the Strand magazine from 1893 to 1893. The prince of Wales has asked us to put an interview on his site for the next month. We're going to have to wait until then, but we'll give you the details.


In [31]:
r.shape

torch.Size([1, 150])

In [51]:
from datasets import load_dataset
data=load_dataset('ubaada/booksum-complete-cleaned','books')

Downloading data:   0%|          | 0.00/78.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/151 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/17 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/19 [00:00<?, ? examples/s]

In [52]:
data

DatasetDict({
    train: Dataset({
        features: ['bid', 'title', 'text', 'summary'],
        num_rows: 151
    })
    test: Dataset({
        features: ['bid', 'title', 'text', 'summary'],
        num_rows: 17
    })
    validation: Dataset({
        features: ['bid', 'title', 'text', 'summary'],
        num_rows: 19
    })
})

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("pszemraj/led-base-book-summary")
model = AutoModelForSeq2SeqLM.from_pretrained("pszemraj/led-base-book-summary")