In [2]:
from bs4 import BeautifulSoup
import os 
import pytesseract
from PIL import Image
import pandas as pd
from paddleocr import PaddleOCR
from docling.document_converter import DocumentConverter




In [3]:
# Initialize PaddleOCR once (supports French, English, etc.)
# The engine is created a single time at module level and reused by
# extract_text_with_docling_or_ocr() for every image it falls back on.
# NOTE(review): judging by the cell output, the first construction downloads
# the detection / recognition / angle-classifier model archives into
# ~/.paddleocr, so this cell may need network access on a fresh machine.
ocr = PaddleOCR(use_angle_cls=True, lang='en')  # Use 'fr' for French if needed

def extract_text_from_html(file_path):
    """Return the visible text of an HTML file as a single string.

    The file is opened in binary mode so that BeautifulSoup can detect the
    document's encoding itself (declared charset / byte sniffing) instead of
    assuming UTF-8, which made legacy-encoded pages fail and yield "".
    ``<script>`` and ``<style>`` elements are dropped before extraction so
    that JavaScript and CSS source do not end up in the text.

    Args:
        file_path: Path to the ``.html`` / ``.htm`` file to read.

    Returns:
        str: The text fragments of the document, each stripped and joined by
        a single space. An empty string is returned (and a message printed)
        if the file cannot be opened or parsed; this function is best-effort
        and never raises.
    """
    try:
        # Binary handle: let the parser work out the charset.
        with open(file_path, "rb") as f:
            soup = BeautifulSoup(f, "lxml")

        # Script/style bodies are code, not document text.
        for non_content_tag in soup(["script", "style"]):
            non_content_tag.decompose()

        return soup.get_text(separator=" ", strip=True)
    except Exception as e:
        print(f"❌ BeautifulSoup failed for {file_path}: {e}")
        return ""

# Shared Docling converter, created lazily on first use so that it is not
# rebuilt for every single file.
_DOCLING_CONVERTER = None


def _get_docling_converter():
    """Return the shared DocumentConverter, constructing it on first call."""
    global _DOCLING_CONVERTER
    if _DOCLING_CONVERTER is None:
        _DOCLING_CONVERTER = DocumentConverter()
    return _DOCLING_CONVERTER


def _docling_result_to_text(result):
    """Extract plain text from a Docling conversion result ("" if none)."""
    # Docling exposes the parsed document under `.document`; the text is
    # obtained through its export helpers rather than a `.text` attribute.
    document = getattr(result, "document", None)
    if document is not None:
        for exporter_name in ("export_to_text", "export_to_markdown"):
            exporter = getattr(document, exporter_name, None)
            if callable(exporter):
                return exporter() or ""

    # Fallback for result objects that do carry a plain `.text` attribute
    # (the shape the original code looked for).
    legacy_text = getattr(result, "text", "")
    return legacy_text if isinstance(legacy_text, str) else ""


def _paddle_result_to_text(result):
    """Join the recognised strings of a PaddleOCR result, one per line."""
    recognised = []
    for page in result or []:
        # PaddleOCR can yield None for a page on which no text was detected.
        if not page:
            continue
        for box in page:
            # box is (bounding_box, (text, confidence)); keep the text only.
            recognised.append(box[1][0])
    return "\n".join(recognised).strip()


def extract_text_with_docling_or_ocr(file_path):
    """Extract text from a document, trying progressively heavier backends.

    Strategy:
      1. ``.html`` / ``.htm`` files are delegated to extract_text_from_html().
      2. Everything else is first converted with Docling; if that produces
         non-blank text it is returned.
      3. If Docling fails or yields nothing and the file is a
         ``.jpg`` / ``.jpeg`` / ``.png`` image, the module-level PaddleOCR
         engine ``ocr`` is used as a fallback.

    Args:
        file_path: Path to the file to process.

    Returns:
        str: The extracted text, or "" when no backend produced any.
        Backend errors are printed and swallowed (best-effort); this function
        does not raise for extraction failures.
    """
    lowered_path = file_path.lower()

    # Handle HTML separately with BeautifulSoup
    if lowered_path.endswith(('.html', '.htm')):
        return extract_text_from_html(file_path)

    # Try Docling
    try:
        conversion = _get_docling_converter().convert(file_path)
        text = _docling_result_to_text(conversion)
        if text.strip():
            return text
    except Exception as e:
        print(f"❌ Docling failed for {file_path}: {e}")

    # Fallback to PaddleOCR for images
    if lowered_path.endswith(('.jpg', '.jpeg', '.png')):
        print(f"🔁 Falling back to PaddleOCR for {file_path}")
        try:
            result = ocr.ocr(file_path, cls=True)
            return _paddle_result_to_text(result)
        except Exception as e:
            print(f"❌ PaddleOCR also failed for {file_path}: {e}")

    return ""



download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /home/nonso/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 3910/3910 [00:41<00:00, 94.13it/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /home/nonso/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10000/10000 [00:56<00:00, 175.59it/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /home/nonso/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2138/2138 [00:25<00:00, 83.20it/s] 

[2025/05/11 22:56:44] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/home/nonso/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/home/nonso/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6




In [4]:
def process_documents_to_text(root_dir):
    """Build a labelled text dataset from a directory of class folders.

    ``root_dir`` is expected to contain one sub-directory per class; the
    sub-directory name is used as the label for every file inside it.
    Only ``.html``, ``.htm``, ``.jpg``, ``.jpeg`` and ``.png`` files are
    processed (case-insensitive); each is passed to
    extract_text_with_docling_or_ocr() and kept only if text was extracted.

    Folders and files are visited in sorted order so that the resulting row
    order is reproducible between runs (``os.listdir`` order is arbitrary).

    Args:
        root_dir: Directory whose immediate sub-directories are the classes.

    Returns:
        pandas.DataFrame: One row per successfully extracted file, with
        columns ``text`` and ``label``. The two columns are present even
        when no file yielded any text, so downstream code and the saved CSV
        always have a consistent schema.
    """
    supported_suffixes = ('.html', '.htm', '.jpg', '.jpeg', '.png')
    rows = []

    for folder_name in sorted(os.listdir(root_dir)):
        folder_path = os.path.join(root_dir, folder_name)

        # Loose files directly under root_dir carry no class label; skip them.
        if not os.path.isdir(folder_path):
            continue

        for filename in sorted(os.listdir(folder_path)):
            if not filename.lower().endswith(supported_suffixes):
                continue

            file_path = os.path.join(folder_path, filename)
            text = extract_text_with_docling_or_ocr(file_path)
            if text:
                rows.append({
                    "text": text,
                    "label": folder_name
                })

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(rows, columns=["text", "label"])

In [None]:

# Folder holding one sub-directory per class, and the CSV the dataset is written to.
# NOTE(review): the input path is absolute and machine-specific; adjust it when
# running this notebook elsewhere.
DATASET_ROOT = "/home/nonso/ai-multimodal-learning-project/Finance-Ai-Project/data/processed/sujet_images_by_class"
OUTPUT_CSV = "finance_text_dataset.csv"

# Extract text for every supported file and persist the labelled result.
df = process_documents_to_text(DATASET_ROOT)
df.to_csv(OUTPUT_CSV, index=False)

# Report how many rows were written and preview the first few.
entry_count = len(df)
print("✅ Dataset saved with", entry_count, "entries.")
print(df.head())

<function posix.listdir(path=None)>