In [36]:
import re
import os
import pandas as pd
import json 
from pathlib import Path
import string

In [37]:
# Input CSVs read by the load cell below.
RAW_RAW_CSV = Path('../data/extracted/papers_raw.csv')
RAW_EXTRACTED_CSV = Path('../data/extracted/papers_extracted.csv')

# Output directory for every cleaned artefact; created up front (with any
# missing parents) so the later write cells never fail on a missing folder.
PROCESSED_DIR = Path('../data/processed')
os.makedirs(PROCESSED_DIR, exist_ok=True)

In [38]:
# load the csvs
# df_raw comes from RAW_RAW_CSV, df_sections from RAW_EXTRACTED_CSV.
df_raw, df_sections = (
    pd.read_csv(csv_path) for csv_path in (RAW_RAW_CSV, RAW_EXTRACTED_CSV)
)

# Report the row counts as a quick sanity check on what was loaded.
print(f'loaded {len(df_raw)} raw documents')
print(f'loaded {len(df_sections)} sectioned rows')

loaded 31 raw documents
loaded 73 sectioned rows


In [39]:
# clean text function

# Heading that starts the reference list: the word alone on its own line,
# optionally numbered ("7. References") and optionally followed by a colon.
# Anchoring to a whole line keeps a mid-sentence "references" or a
# table-of-contents entry from truncating the document.
_REFERENCES_HEADING = re.compile(
    r'^[ \t]*(?:\d{1,2}(?:\.\d{1,2})*[.)]?[ \t]+)?(?:references|bibliography)[ \t]*:?[ \t]*$',
    re.IGNORECASE | re.MULTILINE,
)

# Boilerplate patterns. Each one relies on the original case, digits,
# punctuation or line breaks, so they all have to run BEFORE normalisation.
# They are applied in order and every match is deleted.
_BOILERPLATE_PATTERNS = (
    # web links: drop only the URL token, keep the rest of the line
    re.compile(r'https?://\S+'),
    # page markers: "Page 3", "P a g e | 3", "– 3 –", or a line holding only a number
    re.compile(r'\bPage\s*\d+', re.IGNORECASE),
    re.compile(r'P a g e \W \d+'),
    re.compile(r'–\s*\d+\s*–'),
    re.compile(r'^[ \t]*\d+[ \t]*$', re.MULTILINE),
    # numbered headings / list items at the start of a line:
    # "1) ...", "1. ...", "1.1 ...", "1.1.1 ..."
    re.compile(
        r'^[ \t]*(?:\d{1,2}\)|\d{1,2}\.|\d{1,2}(?:\.\d{1,2}){1,2}\.?)[ \t]+.*$',
        re.MULTILINE,
    ),
    # dates such as 01/02/2024 or a 2024/01/ prefix
    re.compile(r'\d{2}/\d{2}/\d+'),
    re.compile(r'\d{4}/\d{2}/'),
    # author identification lines
    # NOTE(review): dataset-specific, contains a hard-coded author name --
    # consider moving this to configuration.
    re.compile(r'(Name:|(S|s) number:|Registration Number:) (L.B.D.M.A. Wijesundara|(S|s)\d+|\d+)'),
    re.compile(r'(\S+|\S+ \S+) - s\d+'),
    # table-of-contents leader dots
    re.compile(r'\w+\.{5,}\d+'),
    re.compile(r'(\w+) \.{5,}'),
    re.compile(r'\w+\.{10,} \d'),
    # other things to remove
    re.compile(r'Student \d\(\w+ \w+\) .{10,}'),
)

# Deletes every ASCII punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text: str) -> str:
    '''
    cleans extracted academic answers.
    adjust rules based on your dataset

    the work is done in two phases so that no rule destroys the input of
    another one:

    1. structural clean-up on the untouched text: cut everything from the
       references / bibliography heading onwards, then delete links, page
       markers, numbered headings, dates, author id lines and table-of-contents
       leaders.
    2. normalisation: lowercase, strip digits and ASCII punctuation, replace
       non-ASCII runs by a space and collapse whitespace. words stay separated
       by single spaces and paragraph breaks are kept as one blank line.

    parameters
        text: raw extracted text. anything that is not a str (e.g. the NaN
              pandas uses for a missing cell) is treated as empty.

    returns
        the cleaned text, or '' when nothing usable is left.
    '''
    # missing cells arrive as float NaN; there is nothing to clean
    if not isinstance(text, str):
        return ''

    # unify line endings so the line-anchored patterns see plain "\n"
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    # --- phase 1: structural clean-up ---------------------------------------
    # Remove references section if present. This runs first, because the
    # numbered-heading rule below would otherwise swallow "7. References".
    heading = _REFERENCES_HEADING.search(text)
    if heading:
        text = text[:heading.start()]

    for pattern in _BOILERPLATE_PATTERNS:
        text = pattern.sub('', text)

    # --- phase 2: normalisation ---------------------------------------------
    # normalize the quotes and dashes to ASCII first so they are stripped like
    # their ASCII counterparts instead of turning into spaces inside a word
    text = text.replace("“", '"').replace("”", '"')
    text = text.replace("’", "'").replace("–", "-")

    # lowercase
    text = text.lower()

    # remove numbers and punctuation
    text = re.sub(r'\d+', '', text)
    text = text.translate(_PUNCTUATION_TABLE)

    # remove non-ASCII junk
    text = re.sub(r"[^\x00-\x7F]+", " ", text)

    # remove extra spaces: any run of non-newline whitespace -> one space
    text = re.sub(r'[^\S\n]+', ' ', text)
    # drop the spaces left hanging around line breaks
    text = re.sub(r' ?\n ?', '\n', text)
    # multiple blank lines -> a single blank line
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()

In [40]:
# apply cleaning to raw documents
# Drop incomplete rows BEFORE cleaning: clean_text expects a str, so a missing
# raw_text (NaN) has to be removed first rather than after the apply.
# The surviving rows are the same as with a dropna after the apply, because
# the cleaned column itself never holds NaN.
df_raw = df_raw.dropna()  # remove rows with NaN/None in any column

# assign returns a new frame, so no column is written into a filtered view
df_raw = df_raw.assign(cleaned_text=df_raw['raw_text'].apply(clean_text))

# Filter out rows whose cleaned text is empty or whitespace-only
df_raw = df_raw[df_raw['cleaned_text'].str.strip() != '']
df_raw.to_csv(os.path.join(PROCESSED_DIR, 'cleaned_raw.csv'), index=False)



In [41]:
# apply cleaning to sectioned documents
# Drop incomplete rows BEFORE cleaning: clean_text expects a str, so a missing
# content cell (NaN) has to be removed first rather than after the apply.
# The surviving rows are the same as with a dropna after the apply, because
# the cleaned column itself never holds NaN.
df_sections = df_sections.dropna()  # remove rows with NaN/None in any column

# assign returns a new frame, so no column is written into a filtered view
df_sections = df_sections.assign(
    cleaned_content=df_sections['content'].apply(clean_text)
)

# Filter out rows whose cleaned content is empty or whitespace-only
df_sections = df_sections[df_sections['cleaned_content'].str.strip() != '']
df_sections.to_csv(os.path.join(PROCESSED_DIR, 'cleaned_sections.csv'), index=False)


In [42]:
# save jsons for embeddings pipeline

# raw texts json: {filename: cleaned_text}
# (a filename that appears more than once keeps its last row)
raw_dict = dict(zip(df_raw['filename'], df_raw['cleaned_text']))

raw_json_path = PROCESSED_DIR / 'cleaned_raw.json'
with raw_json_path.open('w', encoding='utf-8') as f:
    json.dump(raw_dict, f, ensure_ascii=False, indent=2)

# sectioned texts json: {filename: {section: cleaned_content}}
section_dict = {}
section_columns = zip(
    df_sections['filename'],
    df_sections['section'],
    df_sections['cleaned_content'],
)
for fname, section_name, section_text in section_columns:
    # create the per-file mapping on first sight, then store this section
    section_dict.setdefault(fname, {})[section_name] = section_text

sections_json_path = PROCESSED_DIR / 'cleaned_sections.json'
with sections_json_path.open('w', encoding='utf-8') as f:
    json.dump(section_dict, f, ensure_ascii=False, indent=2)