In [17]:
# Import libraries
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

import chromadb
from uuid import uuid4
from chromadb.utils import embedding_functions

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import random

In [18]:
# Helper functions
def print_chunk_info(chunks):
   print(f'No of chunks: {len(chunks)}')
   idx = random.randrange(0, len(chunks))
   print(f'Chunk index: {idx}')
   print('Chunk details')
   for k, v in enumerate(chunks[idx]):
      print(f'\t{k} = {v}')

In [123]:
# TODO: Create a text splitter
chunk_size = 500
chunk_overlap = 100

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [124]:
# Load Notes (PDF)
chunks = []
folder_path = "./notes"
for filename in os.listdir(folder_path):
    full_path = os.path.join(folder_path, filename)
    pdf_loader = PyPDFLoader(file_path=full_path, mode="page", extract_images=False)
    local_chunks = pdf_loader.load_and_split(text_splitter)
    chunks += local_chunks

# Print chunk info
print_chunk_info(chunks)

No of chunks: 1497
Chunk index: 788
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '2025-02-11T09:38:47+08:00', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_enabled': 'true', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_setdate': '2023-12-13T02:42:03Z', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_method': 'Privileged', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_name': 'OFFICIAL (OPEN)', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_siteid': '6590cdd4-8337-4198-bacc-47645c4a4d4d', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_actionid': 'aa4e5100-65e3-4b4c-a57b-4d81900ff324', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_contentbits': '0', 'moddate': '2025-02-11T09:38:47+08:00', 'source': './notes/RVHS 1. Organelles and Cellular Structures.pdf', 'total_pages': 30, 'page': 14, 'page_label': '15'})
	2 = ('page_content', 'River Valley High School 15 2025 JC1 H2 Biology \nLecture Topic 1: Organelles and

In [125]:
# Create embeddings
embed_model_name = "BAAI/bge-small-en-v1.5"
#embed_model_name = "all-MiniLM-L6-v2"

chroma_embed_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=embed_model_name)

# Extract the text
texts = [ c.page_content for c in chunks ]
print(texts[100])
print(len(texts))

text_ids = [  str(uuid4())[:8] for _ in range(len(texts))]
print(text_ids)
print(len(text_ids))

with the restriction enzyme that recognises the site will produce a different mixture of fra gments 
from each allele.  Each mixture will give its own band pattern in gel electrophoresis.   
 
 
 
 
 
 
 
 
 
 
 
 
Analysis of the normal and sickle-cell alleles of the β-globin gene.  
 
 
 
Figure shows the restriction site for the restriction 
enzyme EcoRI. Restriction enzymes always cut DNA at 
a specific sequence of DNA. 
 
Source:
1497
['c047fa12', 'e4d8081e', 'd9deb782', '633420c1', 'f6620dda', 'fe382e98', '425c7f64', '12cc8501', '229c4a05', '547f8f14', 'e8f37308', 'cf59f36d', 'e97a2fe4', '28ccdc35', '0360457e', '0c47cb1a', '365d5289', 'bd097fdb', '00f6635d', 'eb780872', '887b44ee', 'b56c07a9', '3dfaaec6', 'ba4ec3e5', '14037f10', '22c63e38', '9157400e', '85e8f709', '5e566fef', '36c4b91c', 'f18f62cc', 'cf3b06a5', 'afb1256b', '63161cd0', 'd0be6b96', '99a049ce', 'c3bab7c1', 'ba0f5e3e', 'd5ad4365', '8c2e84b4', 'c23911d8', '8ec3b56e', '1ca428f3', 'dfcf6de5', '41ee44fc', '9dfc86c1', 'a8

In [126]:
# Insert the chunks embdedings into ChromaDB
col_name = 'biology'

# Create a the chromadb client
ch_client = chromadb.Client()

# drop the table
try:
   ch_client.delete_collection(col_name)
except:
   pass

# Insert the texts into the database
carol_col = ch_client.create_collection(
   name = col_name,
   embedding_function=chroma_embed_func
)

#Insert the docs into the collection
carol_col.add(
   documents = texts,
   ids = text_ids
)

In [120]:
# Import the questions
# questions is an array of text

delimiter = "==[DELIMITER]=="

with open("./questions.txt", "r", encoding="utf-8") as f:
    content = f.read()

questions = [part.strip() for part in content.split(delimiter)]
print(questions[0])
print('\n')

# Import the answers
# answers is an array of letters (A, B, C, D)
with open("./answer.txt", "r", encoding="utf-8") as f:
    content = f.read()

answers = [part.strip() for part in content.split()]
print(answers[0])


1. An unknown organism has a linear double-stranded DNA genome like that in a
eukaryote. When its DNA replication was examined, it was revealed that although the
process is semi-conservative, no Okazaki fragments were observed in the multiple
replication forks. In addition, the end-replication problem of shortened daughter strands
was not observed.
Which statement correctly explains this phenomenon?
A The organism’s DNA is antiparallel.
B DNA replication only starts at the 3’ end of each template strand.
C DNA polymerases synthesise DNA in both 5’ to 3’ and 3’ to 5’ direction.
D DNA ligases are not involved in the DNA replication process.


C


In [103]:
# Generate answers using FLAN-T5 without RAG
model_name = "google/flan-t5-base"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

generated_answers = []

print("Generated | Correct")

for i in range(len(questions)):
  prompt = f"""
  Answer the following question using A, B, C or D.

  Question:
  {questions[i]}

  If you do not know the answer, please respond with "X".
  """

  # convert to a statement
  enc_prompt = tokenizer(prompt, return_tensors='pt')
  enc_answer = model.generate(enc_prompt.input_ids, max_new_tokens=500)
  answer = tokenizer.decode(enc_answer[0], skip_special_tokens=True)
  generated_answers += answer

  print(f'{i+1}: {generated_answers[i]} | {answers[i]}')



Generated | Correct
1: A | C
2: D | D
3: A | D
4: D | D
5: B | C
6: A | C
7: A | B
8: D | B
9: B | C
10: D | A
11: B | B
12: D | B
13: D | B
14: D | D
15: A | D
16: B | B
17: D | D
18: A | C
19: C | A
20: B | C
21: A | B
22: A | C
23: A | A
24: B | A
25: A | A
26: A | A
27: B | C
28: D | B
29: D | B
30: A | A
31: A | B


In [104]:
# Calculate correctness
num_of_correct = 0
for i in range(len(answers)):
    if (generated_answers[i] == answers[i]):
        num_of_correct += 1
print(f'Correctness: {num_of_correct}/{len(answers)}')

Correctness: 10/31


In [127]:
# Summarize the questions for query to ChhromaDB

prompt = f"""
Summarize the following question:

{questions[0]}
"""

# convert to a statement
enc_prompt = tokenizer(prompt, return_tensors='pt')
enc_answer = model.generate(enc_prompt.input_ids, max_new_tokens=500)
answer = tokenizer.decode(enc_answer[0], skip_special_tokens=True)
generated_answers += answer

print(f'{answer}')


A.


In [None]:
# function to retrieve context
def retrieve_context(question: str) -> any:
    context = ""
    results = carol_col.query(
       query_texts=[ question ],
       n_results=3
    )
    for doc in results['documents'][0]:
        context += doc + "\n"
    return context

print("Generated | Correct")

generated_answers_context = []
for i in range(len(questions)):
  context_prompt = retrieve_context(questions[i])
  prompt = f"""
  given this context:
  {context_prompt}

  Answer the following question using A, B, C or D.

  Question:
  {questions[i]}

  If you do not know the answer, please respond with "X".
  """

  # convert to a statement
  enc_prompt = tokenizer(prompt, return_tensors='pt')
  enc_answer = model.generate(enc_prompt.input_ids, max_new_tokens=500)
  answer = tokenizer.decode(enc_answer[0], skip_special_tokens=True)
  if len(answer) > 1:
      answer = 'X'
  generated_answers_context += answer

  print(f'{i+1}: {generated_answers_context[i]} | {answers[i]}')


num_of_correct_context = 0
for i in range(len(answers)):
    if (generated_answers_context[i] == answers[i]):
        num_of_correct_context += 1
print(f'Correct: {num_of_correct_context}/{len(answers)}')


Generated | Correct
1: A | C
2: D | D
3: A | D
4: D | D
5: A | C
6: A | C
7: B | B
8: C | B
9: C | C
10: C | A
11: D | B
12: D | B
13: D | B
14: C | D
15: A | D
16: B | B
17: B | D
18: A | C
19: B | A
20: A | C
21: A | B
22: D | C
23: A | A
24: B | A
25: A | A
26: A | A
27: D | C
28: A | B
29: B | B
30: D | A
31: A | B
Correct: 9/31


In [71]:
# Retrieve context from ChromaDB
context = ""
results = carol_col.query(
   query_texts=[ questions[0] ],
   n_results=3
)
print(results['distances'])
for id in results['ids'][0]:
   result = carol_col.get(id)
   context += result['documents'][0]

print(context)

[[0.1736699938774109, 0.1759093999862671, 0.1913599967956543]]
River Valley High School 21     2025 JC1 H2 Biology 
Lecture Topic 7: DNA Replication   
 
7. DNA ligase catalyses the formation of a phosphoester bond between the 3’ end of each new Okazaki 
fragment and the 5’ end of the growing strand to form a continuous strand. 
8. Each daughter DNA molecule now consists of a newly synthesised strand and a parental strand. 
 
End-Replication Problem 
Upon the completion of DNA replication of linear DNA in eukaryotes, the RNA primers complementary to4. Each growing new DNA strand is antiparallel to its parental template strand. 
5. The leading strand is synthesised continuously as a single polymer along the template strand.  
o The leading strand is polymerised in the mandatory 5’ to 3’ manner towards replication fork. 
6. The lagging strand  is synthesised discontinuously as a series of short fragments called Okazaki 
fragments along the template strand. 
o Each Okazaki fragment requir

In [62]:
# RAG retrieval (Individual question)

# Extract the core ideas of the question 

#question = "Who was Scrooge's deceased business partner?"
#question = "Who was Scrooge engaged to in his youth, and why did she leave him?"
# question = "What is the name of Bob Cratchit's youngest son who is ill?"
#question = "What does Scrooge see written on the gravestone that frightens him into changing his ways?"
#question = " What is Scrooge's response when his nephew Fred invites him to Christmas dinner at the beginning of the story?"
# question = " What specific, generous act does Scrooge perform for the Cratchit family on Christmas morning?"
# question = "Who is Voldermort?"

#prompt = f"{question}\n\nWhat is the sentence that verbalizes this text?"

final_prompt = f"""
given this context:
{context}

Answer the following question using A, B, C or D.

Question:
{questions[0]}

If you do not know the answer, please respond with "X".
"""
#prompt = f"{question}\n\nWhat data can be extracted from this sentence?"
#prompt = f"Generate an approximately fifteen-word sentence that describes all this data: {question}"

# convert to a statement
enc_prompt = tokenizer(final_prompt, return_tensors='pt')
enc_answer = model.generate(enc_prompt.input_ids, max_new_tokens=500)
answer = tokenizer.decode(enc_answer[0], skip_special_tokens=True)

print(answer)

C


In [63]:
# Answer all 31 questions and calculate metrics (Base Model)

num_of_correct = 0
for i in range(len(answers)):
    if (generated_answers[i] == answers[i]):
        num_of_correct += 1
print(f'Correct: {num_of_correct}/{len(answers)}')




Correct: 14/31


In [76]:
# Answer all 31 questions and calculate metrics (RAG Model)

num_of_correct_context = 0
for i in range(len(answers)):
    if (generated_answers_context[i] == answers[i]):
        num_of_correct_context += 1
print(f'Correct: {num_of_correct_context}/{len(answers)}')




Correct: 12/31
