In [3]:
import fitz
import docx
import os 
import pandas as pd
from tqdm import tqdm 
from pathlib import Path

In [4]:
# Data locations, given relative to the notebook's working directory.
RAW_DIR = Path('../data/raw')            # source papers (.pdf / .docx) are read from here
OUTPUT_DIR = Path('../data/extracted')   # the generated CSV files are written here

# Create the output folder together with any missing parents; an existing folder is fine.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [5]:
# function to extract text from a PDF file
def extract_text_from_pdf(file_path):
    """Return the plain text of every page of a PDF as one string.

    Parameters
    ----------
    file_path : str
        Path of the PDF file; handed to ``fitz.open`` unchanged.

    Returns
    -------
    str
        The text of each page, in page order, joined with newline characters.
    """
    # Open the document in a ``with`` block so its file handle is released
    # as soon as extraction finishes, even if a page raises.
    with fitz.open(file_path) as doc:
        # The join consumes the generator while the document is still open.
        return '\n'.join(page.get_text('text') for page in doc)

# function to extract text from Word (.docx) files
def extract_from_docx(file_path):
    """Return the text of all non-blank paragraphs of a .docx file.

    Parameters
    ----------
    file_path : str
        Path of the Word document; handed to ``docx.Document`` unchanged.

    Returns
    -------
    str
        Paragraph texts in document order, joined with newline characters.
        Paragraphs that are empty or whitespace-only are left out.
    """
    document = docx.Document(file_path)

    kept_paragraphs = []
    for paragraph in document.paragraphs:
        paragraph_text = paragraph.text
        # skip paragraphs that contain nothing but whitespace
        if paragraph_text.strip():
            kept_paragraphs.append(paragraph_text)

    return '\n'.join(kept_paragraphs)

In [None]:
# sample section headers for splitting
SECTION_HEADERS = [
    "abstract", "introduction", "related work", "methodology",
    "methods", "experiments", "results", "discussion",
    "conclusion", "references", "acknowledgements"
]

# split into section function
def split_into_section(text): 
    sections = {} 
    current_section = 'unknown'
    buffer = [] 
    
    for line in text.split('\n'): 
        clean_line = line.strip().lower() 
        if any(clean_line.startswith(h) for h in SECTION_HEADERS): 
            if buffer: 
                sections[current_section] = '\n'.join(buffer).strip() 
                buffer = []
            current_section = clean_line
        buffer.append(line)
    
    if buffer: 
        sections[current_section] = '\n'.join(buffer).strip() 
    
    return sections     
        

In [8]:
# process all the files
section_records = []   # one row per (file, section)
raw_records = []       # one row per file, holding the full extracted text

# File types this notebook can extract text from.
supported_suffixes = ('.pdf', '.docx')

# os.listdir returns names in arbitrary order, so sort them to make the
# output reproducible between runs. Filtering before the loop also makes the
# progress bar count only the files that are actually processed. The suffix
# comparison is case-insensitive so files such as "PAPER.PDF" are not skipped.
paper_files = sorted(
    name for name in os.listdir(RAW_DIR)
    if name.lower().endswith(supported_suffixes)
)

for file in tqdm(paper_files):
    file_path = os.path.join(RAW_DIR, file)

    # extract text depending on type (only .pdf and .docx reach this point)
    if file.lower().endswith('.pdf'):
        raw_text = extract_text_from_pdf(file_path)
    else:
        raw_text = extract_from_docx(file_path)

    # save raw version
    raw_records.append({
        'filename': file,
        'raw_text': raw_text
    })

    # save section split version
    sections = split_into_section(raw_text)
    for section, content in sections.items():
        section_records.append({
            'filename': file,
            'section': section,
            'content': content
        })



  0%|          | 0/31 [00:00<?, ?it/s]

100%|██████████| 31/31 [00:02<00:00, 10.67it/s]


In [9]:
# Turn the collected records into tables: one row per (file, section) and one row per file.
df_sections = pd.DataFrame(section_records)
df_raw = pd.DataFrame(raw_records)

# Write both tables into OUTPUT_DIR as CSV, without the index column.
for frame, csv_name in (
    (df_sections, "papers_extracted.csv"),
    (df_raw, "papers_raw.csv"),
):
    frame.to_csv(os.path.join(OUTPUT_DIR, csv_name), index=False)