In [8]:
from kedro.framework.context import KedroContext
from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project
from pathlib import Path
import pandas as pd
import re
import hashlib
from datetime import datetime

# Bootstrap the Kedro project
project_path = Path.cwd().parents[0]

bootstrap_project(project_path)

# Start a session and access the context and catalog
with KedroSession.create(project_path=project_path) as session:
    context = session.load_context()
    catalog = context.catalog

    # Load the cleaned Grade 12 exam data
    df = catalog.load("cleaned_grade12_exam_data")

# Preview the DataFrame
# df.head(20)


In [9]:
import re

def clean_raw_text(text: str) -> str:
    """Removes boilerplate, numbers, and special characters from exam PDFs."""
    boilerplate_patterns = [
        r"Akavumelekanga ukufotokopa eli phepha.*",
        r"This question paper.*",
        r"SECTION [A-Z]",
        r"INSTRUCTIONS",
        r"QUESTION \d+",
        r"\(Total: \d+ marks\)",
    ]
    for pattern in boilerplate_patterns:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE)

    # Remove question numbers like 1.1, 2.3.4
    text = re.sub(r"\b\d+(\.\d+)*\b", "", text)

    # Remove non-isiXhosa characters but keep standard punctuation
    text = re.sub(r"[^\w\s’\-.,!?]", " ", text)
    text = re.sub(r"\s{2,}", " ", text)  # collapse spaces
    return text.strip()


def split_into_sentences(text: str) -> list[str]:
    """Splits cleaned text into sentences using basic punctuation rules."""
    # Split on punctuation followed by space and capital letter or number
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z0-9])', text)
    # Strip and filter too-short sentences
    return [s.strip() for s in sentences if len(s.strip()) > 10]


def extract_unique_sentences_from_df(df, text_col="raw_text") -> list[str]:
    """Cleans, splits, and deduplicates all raw_text entries in the DataFrame."""
    seen = set()
    unique_sentences = []

    for raw_text in df[text_col]:
        cleaned = clean_raw_text(raw_text)
        for sentence in split_into_sentences(cleaned):
            key = sentence.lower()
            if key not in seen:
                unique_sentences.append(sentence)
                seen.add(key)

    return unique_sentences


In [12]:
sentences = extract_unique_sentences_from_df(df)
sentences



[1m[[0m
    [32m'Akuvumelekanga ukufotokopa eli phepha Tyhila iphepha AMANQAKU IXESHA iiyure Eli phepha linamaphepha ali-.'[0m,
    [32m'ISIXHOSA ULWIMI LOKUQALA OLONGEZELELWEYO FAL IPHEPHA LOKUQALA P1 NOVEMBA NATIONAL SENIOR CERTIFICATE IBANGA IsiXhosa Ulwimi Lokuqala Olongezelelweyo FAL P1 DoE Novemba NSC Akuvumelekanga ukufotokopa eli phepha Tyhila iphepha IMIYALELO KUNYE NENGCOMBOLO YOLWAZI .'[0m,
    [32m'Eli phepha lemibuzo linamacandelo AMATHATHU, angala ICANDELO A Uvavanyo lokuqonda ICANDELO B Ushwankathelo ICANDELO C Usetyenziso kolwimi . . . . . . .'[0m,
    [32m'Phendula YONKE imibuzo.'[0m,
    [32m'Bhala icandelo ngalinye kwiphepha elitsha uze ukrwele umgca ekupheleni kwecandelo ngalinye.'[0m,
    [32m'Shiya umgca emva kwempendulo nganye.'[0m,
    [32m'Bhala ngokucocekileyo nangokucacileyo.'[0m,
    [32m'Landela imiyalelo ngocoselelo.'[0m,
    [32m'Bhala iimpendulo zakho ngokuchanekileyo ngokwendlela yokunombola esetyenzisiweyo kwiphepha lemibuzo.'[0m,


In [13]:
with open("data/03_primary/isiXhosa_sentences_2008.txt", "w", encoding="utf-8") as f:
    for s in sentences:
        f.write(s + "\n")
