In [None]:
import os
from dotenv import find_dotenv, load_dotenv

# Locate a .env file and load its variables into the process environment.
# override=True lets values from the file replace variables that are already set.
# NOTE(review): presumably this supplies the OpenAI API key used by ChatOpenAI — confirm.
load_dotenv(dotenv_path=find_dotenv(), override=True)

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
# The loader package is `document_loaders` (plural); `langchain.document_loader` does not exist.
from langchain.document_loaders import UnstructuredPDFLoader
# Import PromptTemplate from its canonical module rather than the top-level package.
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
!pip install unstructured -q

In [None]:
!pip install pdf2image

In [None]:
# Read the PDF from the working directory and load it as LangChain document(s).
pdf_path = 'attention_is_all_you_need.pdf'
loader = UnstructuredPDFLoader(pdf_path)
data = loader.load()

In [None]:
# Optional sanity check: uncomment to print the text of the first loaded document.
#print(data[0].page_content)

In [None]:
# Split the loaded document into chunks of at most 10,000 characters, with a
# 100-character overlap between neighbouring chunks.
# The keyword names are `chunk_size` / `chunk_overlap` (singular); the misspelled
# `chunks_size` / `chunks_overlap` are rejected as unexpected keyword arguments.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=100)
chunks = text_splitter.split_documents(data)

In [None]:
# Number of chunks produced by the splitter.
len(chunks)

In [None]:
# Chat model used by the summarization chains; temperature=0 keeps sampling
# randomness to a minimum.
llm = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0,
)

In [None]:
def print_embedding_cost(texts, model='gpt-3.5-turbo', price_per_1k_tokens=0.002):
    """Print the total token count of ``texts`` and an estimated cost in USD.

    Only the tokens contained in ``texts`` are counted; tokens generated by a
    model in response are not part of the estimate.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string
            (for example the chunks produced by the text splitter).
        model: Model name used to select the tiktoken encoding.
        price_per_1k_tokens: Price in USD per 1,000 tokens.

    Returns:
        None. The token total and the cost are printed.
    """
    # Imported lazily so tiktoken is only needed when an estimate is requested.
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # Generator expression: no intermediate list is needed just to sum the lengths.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens:{total_tokens}')
    print(f'Embedding Cost in USD:{total_tokens / 1000 * price_per_1k_tokens:.6f}')
    
# Report the token count and estimated cost for the chunks before summarizing them.
print_embedding_cost(chunks)

In [None]:
# Build a 'refine' summarization chain (no custom prompts passed) and run it
# over all chunks; the resulting summary text is kept in `output_summary`.
chain = load_summarize_chain(llm=llm, chain_type='refine', verbose=False)
output_summary = chain.run(chunks)

In [None]:
# Show the summary produced by the refine chain.
# (The variable is `output_summary`; the misspelled `output_sumamry` raised NameError.)
print(output_summary)

# Refine With Custom Prompts

In [1]:
# Prompt used for the first chunk: asks for a concise summary of `{text}`.
prompt_template = """
Write a concise summary of the following extracting the key information:
Text:'{text}'
CONCISE SUMMARY"""

# The keyword is `input_variables` (plural); `input_variable` is not a
# PromptTemplate field and is rejected when the template is constructed.
initial_prompt = PromptTemplate(template=prompt_template, input_variables=['text'])

# Prompt used for every subsequent chunk: receives the summary so far
# (`{existing_answer}`) plus the next chunk (`{text}`) and asks for a refined
# summary with an introduction, bullet points and a closing phrase.
# Spelling of "summary" fixed in the prompt text that is sent to the model.
refine_template = '''
Your job is to produce a final summary.
I have provided an existing summary up to a certain point: {existing_answer}.
Please refine the existing summary with some more context below.
--------
{text}
--------
Start the final summary with an INTRODUCTION PARAGRAPH that gives an overview of the topic FOLLOWED
by BULLET POINTS if possible AND end the summary with a CONCLUSION PHRASE.
'''
refine_prompt = PromptTemplate(
    template=refine_template,
    input_variables=['existing_answer', 'text'],
)


In [None]:
# Rebuild the refine chain with the custom prompts: `initial_prompt` handles the
# first chunk and `refine_prompt` folds each later chunk into the summary.
# This rebinds `chain` and `output_summary` from the earlier default-prompt run.
refine_options = {
    'chain_type': 'refine',
    'question_prompt': initial_prompt,
    'refine_prompt': refine_prompt,
    'return_intermediate_steps': False,
}
chain = load_summarize_chain(llm=llm, **refine_options)
output_summary = chain.run(chunks)

In [None]:
# Show the summary produced with the custom refine prompts.
print(output_summary)