In [2]:
# Import libraries
import os
from langchain_community.document_loaders import UnstructuredEPubLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, JSONLoader, UnstructuredXMLLoader


import chromadb
from uuid import uuid4
from chromadb.utils import embedding_functions

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [7]:
import random

In [8]:
# Helper functions
def print_chunk_info(chunks):
   print(f'No of chunks: {len(chunks)}')
   idx = random.randrange(0, len(chunks))
   print(f'Chunk index: {idx}')
   print('Chunk details')
   for k, v in enumerate(chunks[idx]):
      print(f'\t{k} = {v}')

In [9]:
# TODO: Create a text splitter
chunk_size = 300
chunk_overlap = 30

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [10]:
# Load Notes (PDF)
chunks = []
folder_path = "./notes"
for filename in os.listdir(folder_path):
    full_path = os.path.join(folder_path, filename)
    pdf_loader = PyPDFLoader(file_path=full_path, mode="page", extract_images=False)
    local_chunks = pdf_loader.load_and_split(text_splitter)
    chunks += local_chunks

# Print chunk info
print_chunk_info(chunks)

No of chunks: 2316
Chunk index: 1204
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '2025-04-17T15:08:22+08:00', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_enabled': 'true', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_setdate': '2025-04-17T07:07:39Z', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_method': 'Privileged', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_name': 'OFFICIAL (OPEN)', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_siteid': '6590cdd4-8337-4198-bacc-47645c4a4d4d', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_actionid': 'ecd1b5c1-6303-42e3-b754-f273c6ea1b62', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_contentbits': '0', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_tag': '10, 0, 1, 1', 'moddate': '2025-04-17T15:08:22+08:00', 'source': './notes\\RVHS 16. Inheritance 9477.pdf', 'total_pages': 70, 'page': 34, 'page_label': '35'})
	2 = ('page_content', 'e.g. a yellow-round seed m

In [None]:
# Create embeddings
embed_model_name = "BAAI/bge-small-en-v1.5"
#embed_model_name = "all-MiniLM-L6-v2"

chroma_embed_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=embed_model_name)

# Extract the text
texts = [ c.page_content for c in chunks ]
print(texts[100])
print(len(texts))

text_ids = [  str(uuid4())[:8] for _ in range(len(texts))]
print(text_ids)
print(len(text_ids))

2. Synthesis of secretory polysaccharides (e.g. mucus, cell wall materials such as pectin) 
 
3. Synthesis of lysosomes – via budding from the trans face
2316
['3b7c0302', '11adbbcb', 'aebe6079', '456f61f4', 'e67948e4', '6b37da87', 'e44254b1', '08dafb8d', '51037015', 'd8eb171e', '116a4f52', '7a07b6bd', '81841560', '165c67db', 'e409a149', 'cad5f978', '818b052e', '5462d623', 'ea97cea6', 'ac0a8f17', 'faa1a8ff', '2e6dd3a3', 'edf3a62a', '9834c193', 'fcac78e6', 'e70b37b1', 'b5dd8db5', 'c3628e49', 'cc077257', '0098b109', '526652fc', '2c864800', '3799718a', '881e6680', '48a44233', 'e17c07af', '236dfd69', '77cf963a', 'f3425cb4', 'd45c70fe', '8ac045dc', '6cc4ae94', 'ff5769e6', '29cb733e', '9ba0d3f4', '17ab913c', '7399f709', '9d2420d4', '686c912a', 'f2ef707d', 'fbfa2b54', 'ed28e7b2', '25d48974', '2ccef9f9', '9c3cbd4d', 'fe1be003', '08296984', 'a1739fde', '160705cc', '1e72caed', '6fde5e80', '2e0bf7f1', '59da0392', '7475ed0f', '4c1d13e4', '76930d7a', '2800172f', '93e9d665', 'b975a469', '9381e503', 

In [12]:
# Insert the chunks embdedings into ChromaDB
col_name = 'biology'

# Create a the chromadb client
ch_client = chromadb.Client()

# drop the table
try:
   ch_client.delete_collection(col_name)
except:
   pass

# Insert the texts into the database
carol_col = ch_client.create_collection(
   name = col_name,
   embedding_function=chroma_embed_func
)

#Insert the docs into the collection
carol_col.add(
   documents = texts,
   ids = text_ids
)

In [15]:
# Import the questions
# questions is an array of text

delimiter = "==[DELIMITER]=="

with open("./questions.txt", "r", encoding="utf-8") as f:
    content = f.read()

questions = [part.strip() for part in content.split(delimiter)]
print(questions[0])
print('\n')

# Import the answers
# answers is an array of letters (A, B, C, D)
with open("./answer.txt", "r", encoding="utf-8") as f:
    content = f.read()

answers = [part.strip() for part in content.split()]
print(len(answers))
print(answers[0])


1. An unknown organism has a linear double-stranded DNA genome like that in a
eukaryote. When its DNA replication was examined, it was revealed that although the
process is semi-conservative, no Okazaki fragments were observed in the multiple
replication forks. In addition, the end-replication problem of shortened daughter strands
was not observed.
Which statement correctly explains this phenomenon?
A The organism’s DNA is antiparallel.
B DNA replication only starts at the 3’ end of each template strand.
C DNA polymerases synthesise DNA in both 5’ to 3’ and 3’ to 5’ direction.
D DNA ligases are not involved in the DNA replication process.


31
C


In [19]:
# RAG retrieval (Individual question)
model_name = "google/flan-t5-base"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Extract the core ideas of the question 
question = questions[0]

question = '''
1. An unknown organism has a linear double-stranded DNA genome like that in a
eukaryote. When its DNA replication was examined, it was revealed that although the
process is semi-conservative, no Okazaki fragments were observed in the multiple
replication forks. In addition, the end-replication problem of shortened daughter strands
was not observed. What correctly explains this phenomenon?
'''
#question = "Who was Scrooge's deceased business partner?"
#question = "Who was Scrooge engaged to in his youth, and why did she leave him?"
# question = "What is the name of Bob Cratchit's youngest son who is ill?"
#question = "What does Scrooge see written on the gravestone that frightens him into changing his ways?"
#question = " What is Scrooge's response when his nephew Fred invites him to Christmas dinner at the beginning of the story?"
# question = " What specific, generous act does Scrooge perform for the Cratchit family on Christmas morning?"
# question = "Who is Voldermort?"

prompt = f"{question}\n\nWhat is the sentence that verbalizes this data?"
#prompt = f"{question}\n\nWhat data can be extracted from this sentence?"
#prompt = f"Generate an approximately fifteen-word sentence that describes all this data: {question}"

# convert to a statement
enc_prompt = tokenizer(prompt, return_tensors='pt')
enc_answer = model.generate(enc_prompt.input_ids)
answer = tokenizer.decode(enc_answer[0], skip_special_tokens=True)

print(answer)

An unknown organism has a linear double-stranded DNA genome like that in a 


In [None]:
# TODO Your code
context = ""
results = carol_col.query(
   query_texts=[ answer ],
   n_results=3
)
print(results['distances'])
for id in results['ids'][0]:
   result = carol_col.get(id)
   context += result['documents'][0]

print(context)

In [None]:
question_prompt = f"Answer based on context:\n\n{context}\n\n{question}"
print(question_prompt)

In [None]:
# TODO Your code
enc_query_prompt = tokenizer(question_prompt, return_tensors='pt')

enc_query_answer = model.generate(enc_query_prompt.input_ids)

query_answer = tokenizer.decode(enc_query_answer[0], skip_special_tokens=True)

print(question)
print(query_answer)

In [None]:
# Answer all 31 questions and calculate metrics (Base Model)





In [None]:
# Answer all 31 questions and calculate metrics (RAG Model)



