In [None]:
# Cell 1: Setup & Imports
import os
import sys
import asyncio
import tempfile
import logging

# Add the backend directory to sys.path so the `app.*` modules can be imported.
# The directory is taken as two levels above the current working directory, so
# this assumes the kernel runs from backend/app/processors (as the recorded
# output of this cell shows).
backend_dir = os.path.dirname(os.path.dirname(os.path.abspath('.')))

# Guard the append so that re-running this cell does not pile up duplicate
# sys.path entries.
if backend_dir not in sys.path:
    sys.path.append(backend_dir)

print("✅ Setup completed!")
print(f"Current working directory: {os.getcwd()}")
# Report the computed directory directly rather than sys.path[-1], which is
# only the backend directory when the append above actually happened.
print(f"Python path: {backend_dir}")

✅ Setup completed!
Current working directory: /Users/vominhthinh/Workspace/Exam-hub/backend/app/processors
Python path: /Users/vominhthinh/Workspace/Exam-hub/backend


In [3]:
# Cell 2: Import Processors
# Only the import statements live inside the `try`, so an ImportError raised
# while constructing DocumentProcessor is not misreported as a wrong-directory
# problem.
try:
    from app.processors import DocumentProcessor, ProcessingResult
    from app.processors.pdf_processor import PDFProcessor
    from app.processors.docx_processor import DOCXProcessor
    from app.processors.txt_processor import TXTProcessor
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("Make sure you're running from the correct directory")
    # Re-raise after printing the hint: the following cells use these names
    # unconditionally, so continuing would only surface as a NameError later.
    raise
else:
    print("✅ All processors imported successfully!")

    # Initialize main processor
    doc_processor = DocumentProcessor()
    print(f"📁 Supported extensions: {doc_processor.get_supported_extensions()}")

✅ All processors imported successfully!
📁 Supported extensions: {'.pdf', '.docx', '.txt'}


In [11]:
# Cell: Test PDF Simple
pdf_path = "/Users/vominhthinh/Workspace/Exam-hub/backend/app/processors/s41586-024-07421-0.pdf"

pdf_processor = PDFProcessor()
result = await pdf_processor.extract_text(pdf_path)

if result.success:
    print(f"✅ PDF OK: {len(result.content):,} characters")
    print(f"Preview: {result.content[:200]}...")
else:
    print(f"❌ PDF Error: {result.error_message}")

INFO - Extracted 75796 chars from PDF


✅ PDF OK: 75,796 characters
Preview: Nature  |  Vol 630  |  20 June 2024  |  625
Article
Detecting hallucinations in large language 
models using semantic entropy
Sebastian Farquhar1,2 ✉, Jannik Kossen1,2, Lorenz Kuhn1,2 & Yarin Gal1
Lar...


In [13]:
# Cell: Test DOCX Simple
docx_path = "/Users/vominhthinh/Workspace/Exam-hub/backend/app/processors/Đơn đăng ký Seminar chuyên ngành - thịnh võ.docx"

docx_processor = DOCXProcessor()
result = await docx_processor.extract_text(docx_path)

if result.success:
    print(f"✅ DOCX OK: {len(result.content):,} characters")
    print(f"Preview: {result.content[:200]}...")
else:
    print(f"❌ DOCX Error: {result.error_message}")

INFO - Extracted 1178 chars from DOCX


✅ DOCX OK: 1,178 characters
Preview: ĐƠN ĐĂNG KÝ HỌC PHẦN SEMINAR CHUYÊN NGÀNH 
		      		Kính gửi:  Ban chủ nhiệm Khoa Toán – Tin học
Họ và tên sinh viên: …Võ Minh Thịnh……………………MSSV: …22280087………..…
Thuộc chương trình: Đại trà          ...


In [None]:
# Cell: Test TXT Simple
txt_path = "/Users/vominhthinh/Workspace/Exam-hub/backend/app/processors/test.txt"

# Create test file
with open(txt_path, 'w', encoding='utf-8') as f:
    f.write("Test content\nMultiple lines\nFor testing")

txt_processor = TXTProcessor()
result = await txt_processor.extract_text(txt_path)

if result.success:
    print(f"✅ TXT OK: {len(result.content):,} characters")
    print(f"Content: {result.content}")
else:
    print(f"❌ TXT Error: {result.error_message}")