In [1]:
# Import libraries
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

import chromadb
from uuid import uuid4
from chromadb.utils import embedding_functions

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import random

In [2]:
# Helper functions
def print_chunk_info(chunks):
   print(f'No of chunks: {len(chunks)}')
   idx = random.randrange(0, len(chunks))
   print(f'Chunk index: {idx}')
   print('Chunk details')
   for k, v in enumerate(chunks[idx]):
      print(f'\t{k} = {v}')

In [3]:
# TODO: Create a text splitter
chunk_size = 500
chunk_overlap = 100

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [4]:
# Load Notes (PDF)
chunks = []
folder_path = "./notes"
for filename in os.listdir(folder_path):
    full_path = os.path.join(folder_path, filename)
    pdf_loader = PyPDFLoader(file_path=full_path, mode="page", extract_images=False)
    local_chunks = pdf_loader.load_and_split(text_splitter)
    chunks += local_chunks

# Print chunk info
print_chunk_info(chunks)

No of chunks: 1497
Chunk index: 1084
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '2025-02-11T09:41:15+08:00', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_enabled': 'true', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_setdate': '2023-12-13T02:31:28Z', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_method': 'Privileged', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_name': 'OFFICIAL (OPEN)', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_siteid': '6590cdd4-8337-4198-bacc-47645c4a4d4d', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_actionid': '940d517b-da46-4635-916e-8e37c1e708a4', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_contentbits': '0', 'moddate': '2025-02-11T09:41:15+08:00', 'source': './notes\\RVHS 3. Carbohydrates.pdf', 'total_pages': 19, 'page': 10, 'page_label': '11'})
	2 = ('page_content', 'a carboxylic (-COOH) group. \n• Both tests make use of an alkaline solution of copper (II) sulphate,

In [5]:
# Create embeddings
embed_model_name = "BAAI/bge-small-en-v1.5"
#embed_model_name = "all-MiniLM-L6-v2"

chroma_embed_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=embed_model_name)

# Extract the text
texts = [ c.page_content for c in chunks ]
print(texts[100])
print(len(texts))

text_ids = [  str(uuid4())[:8] for _ in range(len(texts))]
print(text_ids)
print(len(text_ids))

3. Transverse section reveals 9 triplets of microtubules arranged in a ring 
 
Function 
1. Key role in nuclear division in animal cells by acting as microtubules organising centres (MTOCs).  
o Centrioles produce a system of microtubules called spindle fibres that radiates towards the equator 
of the cell.  
o Spindle fibres attach to kinetochore proteins found in centromere region of chromosomes .  
 
 
Cilia 
 
 
 
 
 
 
 
 
Structure
1497
['b301b448', '21e71b65', '9c9dbeb8', 'db6c225c', '04c003b0', '423f8511', 'ae75c7c2', 'e4ceb963', '5a5cba20', 'fbd7c89b', 'f7a64f5b', 'c52e34f8', '457982ce', 'b7289a43', '0cae1e3c', '182d01ed', '8d058d8a', '73708abe', '234f4857', '53540b73', 'fc8947ed', '8be4f154', 'e78ad794', '5222e600', 'c909dbad', '58cee58c', 'e927a4da', 'd513a6f8', '9ab9787d', '5a39a08f', '23ab18e5', 'd25afbec', 'd2590d0c', '98bfdb3e', '19830b4f', '53e6f817', '7ebb1180', '5fcc5185', 'fe2b71fa', '8cb2ed76', '47731fc3', '73dc9ec5', '8663453d', '5d252374', '3aa9577b', '0bf4303a', 

In [6]:
# Insert the chunks embdedings into ChromaDB
col_name = 'biology'

# Create a the chromadb client
ch_client = chromadb.Client()

# drop the table
try:
   ch_client.delete_collection(col_name)
except:
   pass

# Insert the texts into the database
carol_col = ch_client.create_collection(
   name = col_name,
   embedding_function=chroma_embed_func
)

#Insert the docs into the collection
carol_col.add(
   documents = texts,
   ids = text_ids
)

In [26]:
OPTIONS_DELIMITER = '[OPTION]'
STATEMENTS_DELIMITER = '[STATEMENT]'
TRUTHY_DELIMITER = '[TRUTH]'

'''
questions now have the following format
{
'isStatement': True/False
'truthyValue': True/False
'question': 'The question text'
'statements': [ 'statement 1', 'statement 2', ... ]
'options': [ 'option A', 'option B', ... ]
}

'''
def split_questions(question_string):
    question = {}
    question_and_options = question_string.strip().split(OPTIONS_DELIMITER)
    question_part = question_and_options[0]

    if STATEMENTS_DELIMITER in question_part:
        question['isStatement'] = True

        question_and_statements = question_part.split(STATEMENTS_DELIMITER)
        question_and_truthy = question_and_statements[0].strip().split(TRUTHY_DELIMITER)

        question['question'] = question_and_truthy[0].strip()
        question['truthyValue'] = (question_and_truthy[1].strip().lower() == 'true')
        question['statements'] = [ s.strip() for s in question_and_statements[1:]]
        question['options'] = [option.strip() for option in question_and_options[1:]]
    else:
        question['isStatement'] = False
        question['truthyValue'] = False 
        question['question'] = question_part.strip()
        question['statements'] = []
        question['options'] = [option.strip() for option in question_and_options[1:]]
    
    return question



In [30]:
# Import the questions
# questions is an array of text

delimiter = "==[DELIMITER]=="

with open("./questions.txt", "r", encoding="utf-8") as f:
    content = f.read()

questions = [part.strip() for part in content.split(delimiter)]
print(questions[0])
print('\n')

with open("./questions2.txt", "r", encoding="utf-8") as f:
    content = f.read()

modified_questions = [split_questions(part.strip()) for part in content.split(delimiter)]
print(modified_questions[0])
print('\n')
print(modified_questions[1])
print('\n')


# Import the answers
# answers is an array of letters (A, B, C, D)
with open("./answer.txt", "r", encoding="utf-8") as f:
    content = f.read()

answers = [part.strip() for part in content.split()]
print(answers[0])


1. An unknown organism has a linear double-stranded DNA genome like that in a
eukaryote. When its DNA replication was examined, it was revealed that although the
process is semi-conservative, no Okazaki fragments were observed in the multiple
replication forks. In addition, the end-replication problem of shortened daughter strands
was not observed.
Which statement correctly explains this phenomenon?
A The organism’s DNA is antiparallel.
B DNA replication only starts at the 3’ end of each template strand.
C DNA polymerases synthesise DNA in both 5’ to 3’ and 3’ to 5’ direction.
D DNA ligases are not involved in the DNA replication process.


{'isStatement': False, 'truthyValue': False, 'question': 'An unknown organism has a linear double-stranded DNA genome like that in a\neukaryote. When its DNA replication was examined, it was revealed that although the\nprocess is semi-conservative, no Okazaki fragments were observed in the multiple\nreplication forks. In addition, the end-replicatio

In [32]:
# function to retrieve context
def retrieve_context(question: str) -> any:
    context = ""
    results = carol_col.query(
       query_texts=[ question ],
       n_results=3
    )
    for doc in results['documents'][0]:
        context += doc + "\n"
    return context

def retrieve_statement_context(question_object: dict) -> any:
    context = ""
    query_text = question_object['question']
    for statement in question_object['statements']:
        query_text += " " + statement
    results = carol_col.query(
       query_texts=[ query_text ],
       n_results=3
    )
    for doc in results['documents'][0]:
        context += doc + "\n"
    return context

In [40]:
print("Generated | Correct")

generated_answers_context = []

questions = modified_questions


for i in range(len(questions)):
    question_object = questions[i]

    if question_object['isStatement']:
        generated_answers_context += 'Y'

    else:

        question_text = question_object['question']
        context_prompt = retrieve_context(question_text)
        prompt = f"""
given this context:
{context_prompt}

Answer the following question using A, B, C or D.

Question:
{question_text}

Options:
A. {question_object['options'][0]}
B. {question_object['options'][1]}
C. {question_object['options'][2]}
D. {question_object['options'][3]}

If you do not know the answer, please respond with "X".
        """

        # print(prompt)

        # convert to a statement
        enc_prompt = tokenizer(prompt, return_tensors='pt')
        enc_answer = model.generate(enc_prompt.input_ids, max_new_tokens=500)
        answer = tokenizer.decode(enc_answer[0], skip_special_tokens=True)
        
        # print(answer)

        if len(answer) > 1:
            answer = 'X'
        generated_answers_context += answer

        print(f'{i+1}: {generated_answers_context[i]} | {answers[i]}')


answers = answers[:len(generated_answers_context)]

num_of_correct_context = 0
for i in range(len(answers)):
    if (generated_answers_context[i] == answers[i]):
        num_of_correct_context += 1
print(f'Correct: {num_of_correct_context}/{len(answers)}')


Generated | Correct
1: A | C
3: B | D
Correct: 0/5


In [13]:
# Retrieve context from ChromaDB
context = ""
results = carol_col.query(
   query_texts=[ questions[0] ],
   n_results=3
)
print(results['distances'])
for id in results['ids'][0]:
   result = carol_col.get(id)
   context += result['documents'][0]

print(context)

[[0.1736697554588318, 0.1759093999862671, 0.19135987758636475]]
River Valley High School 21     2025 JC1 H2 Biology 
Lecture Topic 7: DNA Replication   
 
7. DNA ligase catalyses the formation of a phosphoester bond between the 3’ end of each new Okazaki 
fragment and the 5’ end of the growing strand to form a continuous strand. 
8. Each daughter DNA molecule now consists of a newly synthesised strand and a parental strand. 
 
End-Replication Problem 
Upon the completion of DNA replication of linear DNA in eukaryotes, the RNA primers complementary to4. Each growing new DNA strand is antiparallel to its parental template strand. 
5. The leading strand is synthesised continuously as a single polymer along the template strand.  
o The leading strand is polymerised in the mandatory 5’ to 3’ manner towards replication fork. 
6. The lagging strand  is synthesised discontinuously as a series of short fragments called Okazaki 
fragments along the template strand. 
o Each Okazaki fragment requi

In [14]:
# RAG retrieval (Individual question)

# Extract the core ideas of the question 

#question = "Who was Scrooge's deceased business partner?"
#question = "Who was Scrooge engaged to in his youth, and why did she leave him?"
# question = "What is the name of Bob Cratchit's youngest son who is ill?"
#question = "What does Scrooge see written on the gravestone that frightens him into changing his ways?"
#question = " What is Scrooge's response when his nephew Fred invites him to Christmas dinner at the beginning of the story?"
# question = " What specific, generous act does Scrooge perform for the Cratchit family on Christmas morning?"
# question = "Who is Voldermort?"

#prompt = f"{question}\n\nWhat is the sentence that verbalizes this text?"

final_prompt = f"""
given this context:
{context}

Answer the following question using A, B, C or D.

Question:
{questions[0]}

If you do not know the answer, please respond with "X".
"""
#prompt = f"{question}\n\nWhat data can be extracted from this sentence?"
#prompt = f"Generate an approximately fifteen-word sentence that describes all this data: {question}"

# convert to a statement
enc_prompt = tokenizer(final_prompt, return_tensors='pt')
enc_answer = model.generate(enc_prompt.input_ids, max_new_tokens=500)
answer = tokenizer.decode(enc_answer[0], skip_special_tokens=True)

print(answer)

A


In [15]:
# Answer all 31 questions and calculate metrics (Base Model)

num_of_correct = 0
for i in range(len(answers)):
    if (generated_answers[i] == answers[i]):
        num_of_correct += 1
print(f'Correct: {num_of_correct}/{len(answers)}')




Correct: 10/31


In [16]:
# Answer all 31 questions and calculate metrics (RAG Model)

num_of_correct_context = 0
for i in range(len(answers)):
    if (generated_answers_context[i] == answers[i]):
        num_of_correct_context += 1
print(f'Correct: {num_of_correct_context}/{len(answers)}')




Correct: 14/31


In [None]:
# Generate answers using FLAN-T5 without RAG
model_name = "google/flan-t5-base"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

generated_answers = []

print("Generated | Correct")

for i in range(len(questions)):
  prompt = f"""
  Answer the following question using A, B, C or D.

  Question:
  {questions[i]}

  If you do not know the answer, please respond with "X".
  """

  # convert to a statement
  enc_prompt = tokenizer(prompt, return_tensors='pt')
  enc_answer = model.generate(enc_prompt.input_ids, max_new_tokens=500)
  answer = tokenizer.decode(enc_answer[0], skip_special_tokens=True)
  generated_answers += answer

  print(f'{i+1}: {generated_answers[i]} | {answers[i]}')



# Calculate correctness
num_of_correct = 0
for i in range(len(answers)):
    if (generated_answers[i] == answers[i]):
        num_of_correct += 1
print(f'Correctness: {num_of_correct}/{len(answers)}')

In [None]:
# Summarize the questions for query to ChhromaDB

prompt = f"""
Summarize the following question:

{questions[0]}
"""

# convert to a statement
enc_prompt = tokenizer(prompt, return_tensors='pt')
enc_answer = model.generate(enc_prompt.input_ids, max_new_tokens=500)
answer = tokenizer.decode(enc_answer[0], skip_special_tokens=True)
generated_answers += answer

print(f'{answer}')
