In [1]:
from dotenv import load_dotenv
import os 
from langchain_openai.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS,DocArrayInMemorySearch
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

# A simple notebook for RAG (retrieval-augmented generation) experiments

Code from: https://github.com/Tech-Watt/RAG-Application-Using-OpenAI-Model/blob/main/main.ipynb 

In [2]:
# Load variables from a .env file (if one is present) into the process environment;
# OPENAI_API_KEY is read from the environment just below.
load_dotenv()

# Chat model used by the rest of the notebook. 'gpt-3.5-turbo' is an alternative;
# model list: https://platform.openai.com/docs/models/gpt-4o
model = ChatOpenAI(
    api_key=os.environ.get('OPENAI_API_KEY'),
    model='gpt-4o',
)

# Smoke test: pass a question straight to the model; the cell displays the returned message object.
model.invoke('what is the meaning of life?')

AIMessage(content='The question "What is the meaning of life?" has been a central philosophical inquiry for centuries and can be approached from various perspectives:\n\n1. **Philosophical Perspective**: Different philosophical traditions offer varied answers. Existentialists, like Jean-Paul Sartre, suggest that life has no inherent meaning and it is up to each individual to create their own purpose. In contrast, some religious philosophies, like those in many forms of Christianity, Islam, and Hinduism, propose that life\'s meaning is derived from a higher power or divine plan.\n\n2. **Scientific Perspective**: From a biological standpoint, the meaning of life could be seen as survival and reproduction. Evolutionary theory suggests that life exists to perpetuate itself through natural selection.\n\n3. **Psychological Perspective**: Psychologists such as Viktor Frankl argue that finding meaning is crucial for mental health and well-being. According to Frankl\'s logotherapy, meaning can 

In [72]:
# Get the plain response text without the surrounding message wrapper:
# pipe the model into a string output parser.
parser = StrOutputParser()
chain = model | parser  # "chaining": the model's output is fed into the parser

# Invoking the chain displays a bare string, which is less verbose than the raw model output.
chain.invoke('what is Chalmers in Sweden?')

"Chalmers University of Technology, commonly known simply as Chalmers, is a prestigious technical university located in Gothenburg, Sweden. It was founded in 1829 and is named after its benefactor, William Chalmers, a director of the Swedish East India Company who bequeathed part of his fortune to establish the school. \n\nChalmers is renowned for its research and education in technology, natural sciences, architecture, maritime management, and other engineering disciplines. The university emphasizes innovation, sustainability, and collaboration with industry. It offers a range of undergraduate, master's, and doctoral programs and is known for its strong international presence and partnerships.\n\nThe university is also involved in various cutting-edge research projects and has a number of specialized research centers and labs. It operates two campuses in Gothenburg: Johanneberg and Lindholmen."

# Loading PDFs

In [73]:
# TODO: point this at a directory and loop through every PDF in it.

file_loader = PyPDFLoader('Riktlinje för inköp och upphandling vid Chalmers tekniska högskola 211008.pdf')
# load() returns one Document per PDF page, so the count below is the number of pages.
# load_and_split() would also run a default text splitter, turning the count into a chunk
# count; the text is chunked explicitly in a later cell anyway.
# NOTE: `page` is a list of Documents despite the singular name; the name is kept because
# a later cell reads it.
page = file_loader.load()
len(page)  # number of pages read

7

In [None]:
# Load entire directories: OCR every PDF found and save one transcription file per PDF.
# NOTE(review): load_pdfs, extract_images_from_pdf, apply_ocr_to_images, transcribe_texts and
# tqdm are neither defined nor imported in the visible part of this notebook — confirm they
# are provided elsewhere before running this cell.
# NOTE(review): machine-specific absolute path; consider deriving it from a configurable data directory.
directory_path = '/Users/kailashdejesushornig/Documents/GitHub/Stipendier/data/pdfs/testSubset'  # Replace with your directory path
pdf_files = load_pdfs(directory_path)

# Progress bar: wrapping the iterable lets tqdm take the total from len(pdf_files) and
# advance once per PDF, so no manual pbar.update() bookkeeping is needed.
for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
    images = extract_images_from_pdf(pdf_path)
    ocr_texts = apply_ocr_to_images(images)
    transcriptions = transcribe_texts(ocr_texts)

    # Save the transcriptions to a text file named after the PDF file
    # (relative path, so it lands in the current working directory).
    output_filename = f"{os.path.splitext(os.path.basename(pdf_path))[0]}.txt"
    with open(output_filename, 'w', encoding='utf-8') as f:
        # Images are numbered from 1 in the output file.
        for idx, transcription in enumerate(transcriptions, start=1):
            f.write(f"Transcription for image {idx}:\n{transcription}\n\n")

    print(f"Transcriptions saved to {output_filename}")


In [74]:
# The document is still too large, so cut it into small overlapping chunks.
# Worth experimenting with these values: e.g. 10 destroyed the understanding, 50+ worked
# (author's note; presumably about the overlap — TODO confirm).
spliter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=200,
)

# TODO: write a script that optimizes the chunk sizes (note: this has been solved before).

pages = spliter.split_documents(page)
pages[0]  # inspect the first chunk

Document(page_content='STYRDOKUMENT: Riktlinje för inköp och upphandling vid Chalmers tekniska högskola . Dnr C 20 2 1 -\n1529 . Beslut av CPO Tommy Bothin , 20 2 1 - 10 - 08 .Dokumentets metadata:\nBeslut av:', metadata={'source': 'Riktlinje för inköp och upphandling vid Chalmers tekniska högskola 211008.pdf', 'page': 0})

In [75]:
# Index the chunks in FAISS (the vector index library from Facebook) using OpenAI embeddings,
# then expose the store through the retriever interface used by the chain below.
vector_storage = FAISS.from_documents(documents=pages, embedding=OpenAIEmbeddings())
retriever = vector_storage.as_retriever()

# TODO: switch the vector store to Chroma.

In [77]:
# Prompt skeleton with two slots: {context} for the retrieved text and {question} for the
# user's question. The wording tells the model to answer from the context only.
question_template = """
You're a smart bot that answers questions based on the context given to you only.
You don't make things up, this is important.
context:{context}
question:{question}

"""

prompt = PromptTemplate.from_template(template=question_template)

# Render the template once with placeholder values to check the final prompt layout.
print(
    prompt.format(
        context=' Here is the context to use',
        question=' Answer this question based on the context',
    )
)


your a smart bot that answers questions based on the context given to you only.
You don't make things up.
context: Here is the context to use
question: Answer this question based on the context




In [80]:
def _format_docs(docs):
    """Join the text of the retrieved documents into one plain-text context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string (the retriever's hits).

    Returns:
        The ``page_content`` values separated by blank lines; an empty string when
        there are no hits.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Fan the incoming question out to two branches that together fill the prompt's variables:
#   context  -> the retriever fetches matching chunks from the vector store, then they are
#               reduced to plain text so the prompt receives the chunk text rather than the
#               repr of a Document list (metadata, escaped newlines)
#   question -> RunnablePassthrough forwards the question unchanged
result = RunnableParallel(context=retriever | _format_docs, question=RunnablePassthrough())
chain = result | prompt | model | parser  # new chain: retrieve -> prompt -> model -> plain string

# A question the indexed document cannot answer; the model is expected to decline.
chain.invoke('What is a LLMs?')

'The provided context does not contain any information about what "LLM" or "LLMs" refers to. Therefore, based on the given context, I cannot provide an answer to your question.'

In [81]:
# A question the indexed document does answer
# (Swedish: "Who may enter into agreements on Chalmers' behalf?").
chain.invoke('Vem får ingå avtal för Chalmers räkning?')

'Enligt den givna kontexten får endast den som har delegation ingå avtal för Chalmers räkning.'

In [83]:
# To see which source chunks (and page metadata) back an answer, query the retriever directly
# with the same question.
retriever.invoke('Vem får ingå avtal för Chalmers räkning?')

# The first hit is the correct one: Document(page_content='Chalmers, 201 8 - 06 - 25 , C201 8 - 0 262 i dess senaste version) får företräda Chalmers. Endast den som\nhar delegation kan ingå avtal för Chalmers räkning.', metadata={'source': 'Riktlinje för inköp och upphandling vid Chalmers tekniska högskola 211008.pdf', 'page': 1}),

[Document(page_content='Chalmers, 201 8 - 06 - 25 , C201 8 - 0 262 i dess senaste version) får företräda Chalmers. Endast den som\nhar delegation kan ingå avtal för Chalmers räkning.', metadata={'source': 'Riktlinje för inköp och upphandling vid Chalmers tekniska högskola 211008.pdf', 'page': 1}),
 Document(page_content='3 Grundläggande förutsättningar\nChalmers är en upphandlande enhet och omfattas av lagstiftningen avseende offentlig upphandling.\nVid all upphandling ska särskilt följande principer beaktas:', metadata={'source': 'Riktlinje för inköp och upphandling vid Chalmers tekniska högskola 211008.pdf', 'page': 0}),
 Document(page_content='– Styrdokument vid Chalmers , den 31 oktober 201 6, C 2016 - 1759 .\n2 Mål\nMål et är att visa att Chalmers gör goda affärer .\n3 Grundläggande förutsättningar', metadata={'source': 'Riktlinje för inköp och upphandling vid Chalmers tekniska högskola 211008.pdf', 'page': 0}),
 Document(page_content='Institutionerna ansvarar utifrån sin