# Biology AI-assisted notes

This project aims to help A level Biology students do MCQ Questions and understand how the answers are dervived from the notes using RAG

In [66]:
# Import libraries
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

import chromadb
from uuid import uuid4
from chromadb.utils import embedding_functions

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import random

In [67]:
# Helper functions
def print_chunk_info(chunks):
   print(f'No of chunks: {len(chunks)}')
   idx = random.randrange(0, len(chunks))
   print(f'Chunk index: {idx}')
   print('Chunk details')
   for k, v in enumerate(chunks[idx]):
      print(f'\t{k} = {v}')

# Load and chunk PDF notes

In [68]:
# TODO: Create a text splitter
chunk_size = 500
chunk_overlap = 100

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [69]:
# Load Notes (PDF)
chunks = []
folder_path = "./notes"
for filename in os.listdir(folder_path):
    full_path = os.path.join(folder_path, filename)
    pdf_loader = PyPDFLoader(file_path=full_path, mode="page", extract_images=False)
    local_chunks = pdf_loader.load_and_split(text_splitter)
    chunks += local_chunks

# Print chunk info
print_chunk_info(chunks)

No of chunks: 1497
Chunk index: 66
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '2025-02-11T09:38:47+08:00', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_enabled': 'true', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_setdate': '2023-12-13T02:42:03Z', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_method': 'Privileged', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_name': 'OFFICIAL (OPEN)', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_siteid': '6590cdd4-8337-4198-bacc-47645c4a4d4d', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_actionid': 'aa4e5100-65e3-4b4c-a57b-4d81900ff324', 'msip_label_5fb06199-574a-4a73-a110-6e35d3b9f147_contentbits': '0', 'moddate': '2025-02-11T09:38:47+08:00', 'source': './notes\\RVHS 1. Organelles and Cellular Structures.pdf', 'total_pages': 30, 'page': 17, 'page_label': '18'})
	2 = ('page_content', 'River Valley High School 18 2025 JC1 H2 Biology \nLecture Topic 1: Organelles and

# Create embeddings

In [None]:
embed_model_name = "BAAI/bge-small-en-v1.5"
#embed_model_name = "all-MiniLM-L6-v2"

chroma_embed_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=embed_model_name)

# Extract the text
texts = [ c.page_content for c in chunks ]
print(texts[100])
print(len(texts))

text_ids = [  str(uuid4())[:8] for _ in range(len(texts))]
print(text_ids)
print(len(text_ids))

3. Transverse section reveals 9 triplets of microtubules arranged in a ring 
 
Function 
1. Key role in nuclear division in animal cells by acting as microtubules organising centres (MTOCs).  
o Centrioles produce a system of microtubules called spindle fibres that radiates towards the equator 
of the cell.  
o Spindle fibres attach to kinetochore proteins found in centromere region of chromosomes .  
 
 
Cilia 
 
 
 
 
 
 
 
 
Structure
1497
['41ac65be', '7bd02a9a', 'b9d392ea', '6b7a2aeb', 'e0991cd3', 'a47e04f2', 'e19beb91', 'b177bb73', 'c79b45c6', '0b1817c4', 'deafb180', '4f1a0503', 'c73f49dc', 'a9e00165', '173f959c', '0e8460e2', 'e6369925', '2d48f6b3', '73af7451', '11c029f3', 'd3e3d39d', '6cc40c47', 'ea0cc6a7', '46871872', 'f16b07ab', '715b77d5', 'b4f1ef39', 'c4429b2b', '1c61b81d', '2e6f46ee', 'e0a1671d', 'b2759829', 'ef8ebf79', 'f813f22c', 'a71e4488', '904c47d3', '5ab8292f', '582cae43', 'b632c5be', 'bc94d395', 'd76c48cb', '0ee36f2d', 'f1b65f58', '6be5b528', '050c6885', '8677f252', 

In [71]:
# Insert the chunks embdedings into ChromaDB
col_name = 'biology'

# Create a the chromadb client
ch_client = chromadb.Client()

# drop the table
try:
   ch_client.delete_collection(col_name)
except:
   pass

# Insert the texts into the database
carol_col = ch_client.create_collection(
   name = col_name,
   embedding_function=chroma_embed_func
)

#Insert the docs into the collection
carol_col.add(
   documents = texts,
   ids = text_ids
)

# Import question and answers

In [73]:
OPTIONS_DELIMITER = '[OPTION]'
STATEMENTS_DELIMITER = '[STATEMENT]'
TRUTHY_DELIMITER = '[TRUTH]'
QUESTION_DELIMITER = '[QUESTION]'


'''
questions now have the following format
{
'isStatement': True/False
'truthyValue': True/False
'question': 'The question text'
'statements': [ 'statement 1', 'statement 2', ... ]
'options': [ 'option A', 'option B', ... ]
}

'''
def split_questions(question_string):
    question = {}
    question_and_options = question_string.strip().split(OPTIONS_DELIMITER)
    question_part = question_and_options[0]

    if STATEMENTS_DELIMITER in question_part:
        question['isStatement'] = True

        question_and_statements = question_part.split(STATEMENTS_DELIMITER)
        question_and_truthy = question_and_statements[0].strip().split(TRUTHY_DELIMITER)

        original_question_and_modified_question = question_and_truthy[0].strip().split(QUESTION_DELIMITER)

        # print(original_question_and_modified_question)


        question['question'] = original_question_and_modified_question[0].strip()
        question['modifiedQuestion'] = original_question_and_modified_question[1].strip()

        question['truthyValue'] = (question_and_truthy[1].strip().lower() == 'true')
        question['statements'] = [ s.strip() for s in question_and_statements[1:]]
        question['options'] = [option.strip() for option in question_and_options[1:]]
    else:
        question['isStatement'] = False
        question['truthyValue'] = False 
        question['question'] = question_part.strip()
        question['modifiedQuestion'] = ""
        question['statements'] = []
        question['options'] = [option.strip() for option in question_and_options[1:]]
    
    return question



In [74]:
# Import the questions
# questions is an array of text

delimiter = "==[DELIMITER]=="

with open("./questions.txt", "r", encoding="utf-8") as f:
    content = f.read()

questions_original = [part.strip() for part in content.split(delimiter)]
print(questions_original[0])
print('\n')

with open("./questions2.txt", "r", encoding="utf-8") as f:
    content = f.read()

modified_questions = [split_questions(part.strip()) for part in content.split(delimiter)]
print(modified_questions[0])
print('\n')
print(modified_questions[1])
print('\n')


# Import the answers
# answers is an array of letters (A, B, C, D)
with open("./answer.txt", "r", encoding="utf-8") as f:
    content = f.read()

answers = [part.strip() for part in content.split()]
print(answers[0])


1. An unknown organism has a linear double-stranded DNA genome like that in a
eukaryote. When its DNA replication was examined, it was revealed that although the
process is semi-conservative, no Okazaki fragments were observed in the multiple
replication forks. In addition, the end-replication problem of shortened daughter strands
was not observed.
Which statement correctly explains this phenomenon?
A The organism’s DNA is antiparallel.
B DNA replication only starts at the 3’ end of each template strand.
C DNA polymerases synthesise DNA in both 5’ to 3’ and 3’ to 5’ direction.
D DNA ligases are not involved in the DNA replication process.


{'isStatement': False, 'truthyValue': False, 'question': 'An unknown organism has a linear double-stranded DNA genome like that in a eukaryote. When its DNA replication was examined, it was revealed that although the process is semi-conservative, no Okazaki fragments were observed in the multiple replication forks. In addition, the end-replication p

In [None]:
# function to retrieve context
def retrieve_context(question: str) -> any:
    context = ""
    results = carol_col.query(
       query_texts=[ question ],
       n_results=3
    )
    for doc in results['documents'][0]:
        context += doc + "\n"
    return context

# Generate answers using FLAN-T5 without RAG

In [None]:
model_name = "google/flan-t5-base"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

generated_answers = []

print("Generated | Correct")

for i in range(len(questions_original)):
  prompt = f"""
  Answer the following question using A, B, C or D.

  Question:
  {questions_original[i]}

  If you do not know the answer, please respond with "X".
  """

  # convert to a statement
  enc_prompt = tokenizer(prompt, return_tensors='pt')
  enc_answer = model.generate(enc_prompt.input_ids, max_new_tokens=500)
  answer = tokenizer.decode(enc_answer[0], skip_special_tokens=True)
  generated_answers += answer

  print(f'{i+1}: {generated_answers[i]} | {answers[i]}')



# Calculate correctness
num_of_correct = 0
for i in range(len(answers)):
    if (generated_answers[i] == answers[i]):
        num_of_correct += 1
print(f'Correctness: {num_of_correct}/{len(answers)}')

Generated | Correct
1: A | C
2: D | D
3: A | D
4: D | D
5: B | C
6: A | C
7: A | B
8: D | B
9: B | C
10: D | A
11: B | B
12: D | B
13: D | B
14: D | D
15: A | D
16: B | B
17: D | D
18: A | C
19: C | A
20: B | C
21: A | B
22: A | C
23: A | A
24: B | A
25: A | A
26: A | A
27: B | C
28: D | B
29: D | B
30: A | A
31: A | B
Correctness: 10/31


# Generate answers using FLAN-T5 with simple RAG

In [None]:
rint("Generated | Correct")

generated_answers_context = []

for i in range(len(questions_original)):
  context_prompt = retrieve_context(questions_original[i])
  prompt = f"""
  given this context:
  {context_prompt}

  Answer the following question using A, B, C or D.

  Question:
  {questions_original[i]}

  If you do not know the answer, please respond with "X".
  """

  # convert to a statement
  enc_prompt = tokenizer(prompt, return_tensors='pt')
  enc_answer = model.generate(enc_prompt.input_ids, max_new_tokens=500)
  answer = tokenizer.decode(enc_answer[0], skip_special_tokens=True)
  if len(answer) > 1:
      answer = 'X'
  generated_answers_context += answer

  print(f'{i+1}: {generated_answers_context[i]} | {answers[i]}')


num_of_correct_context = 0
for i in range(len(answers)):
    if (generated_answers_context[i] == answers[i]):
        num_of_correct_context += 1
print(f'Correct: {num_of_correct_context}/{len(answers)}')

Generated | Correct
1: A | C
2: D | D
3: A | D
4: D | D
5: A | C
6: A | C
7: A | B
8: D | B
9: B | C
10: A | A
11: B | B
12: D | B
13: B | B
14: D | D
15: A | D
16: B | B
17: D | D
18: A | C
19: C | A
20: A | C
21: A | B
22: A | C
23: A | A
24: B | A
25: A | A
26: A | A
27: B | C
28: B | B
29: B | B
30: A | A
31: A | B
Correct: 14/31


# Generate answers using FLAN-T5 with better processing

In [None]:
print("Generated | Correct")

generated_answers_context = []

questions = modified_questions


for i in range(len(questions)):
    question_object = questions[i]

    if question_object['isStatement']:
        question_text = question_object['question']

        stmt_txt = ""

        for idx, stmt in enumerate(question_object['statements']):
            stmt_txt += f'{idx + 1} {stmt}\n'


        question_with_statements = f'''{question_text}
        Statements:
        {stmt_txt}
        '''

        context_prompt = retrieve_context(question_with_statements)


        
        prompt = f"""
given this context:
{context_prompt}

Answer the following question using A, B, C or D.

Question:
{question_text}

Statements:
{stmt_txt}

Options:
A. {question_object['options'][0]}
B. {question_object['options'][1]}
C. {question_object['options'][2]}
D. {question_object['options'][3]}
If you do not know the answer, please respond with "X".
        """

        # print(prompt)

        enc_prompt = tokenizer(prompt, return_tensors='pt')
        enc_answer = model.generate(enc_prompt.input_ids, max_new_tokens=500)
        answer = tokenizer.decode(enc_answer[0], skip_special_tokens=True)

        # print(answer)

        if len(answer) > 1:
            answer = 'X'

        generated_answers_context += answer
        print(f'{i+1}: {generated_answers_context[i]} | {answers[i]}')


    else:
        question_text = question_object['question']
        context_prompt = retrieve_context(question_text)
        prompt = f"""
given this context:
{context_prompt}

Answer the following question using A, B, C or D.

Question:
{question_text}

Options:
A. {question_object['options'][0]}
B. {question_object['options'][1]}
C. {question_object['options'][2]}
D. {question_object['options'][3]}

If you do not know the answer, please respond with "X".
        """

        # print(prompt)

        # convert to a statement
        enc_prompt = tokenizer(prompt, return_tensors='pt')
        enc_answer = model.generate(enc_prompt.input_ids, max_new_tokens=500)
        answer = tokenizer.decode(enc_answer[0], skip_special_tokens=True)

        # print(answer)

        if len(answer) > 1:
            answer = 'X'
        generated_answers_context += answer

        print(f'{i+1}: {generated_answers_context[i]} | {answers[i]}')


answers = answers[:len(generated_answers_context)]

num_of_correct_context = 0
for i in range(len(answers)):
    if (generated_answers_context[i] == answers[i]):
        num_of_correct_context += 1
print(f'Correct: {num_of_correct_context}/{len(answers)}')


Generated | Correct
1: A | C
2: D | D
3: B | D
4: D | D
5: B | C
6: A | C
7: B | B
8: D | B
9: B | C
10: C | A
11: B | B
12: D | B
13: D | B
14: D | D
15: A | D
16: B | B
17: B | D
18: A | C
19: C | A
20: B | C
21: A | B
22: A | C
23: A | A
24: B | A
25: A | A
26: A | A
27: A | C
28: B | B
29: D | B
30: A | A
31: A | B
Correct: 11/31


# Limitations 
- Biology notes (Data) are not clean
- Need to extract diagrams from PDF
- Need to include questions with diagrams
- Poor performance due to the complexity of biology -- it requires critical thinking
