In [None]:
import os

import PyPDF2
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
class PdfTextChunker:
    """Read a PDF page by page and split its text into overlapping chunks."""

    @staticmethod
    def _clean_page_text(text) -> str:
        """Normalize the raw text extracted from a single PDF page.

        Leading/trailing whitespace is stripped, newlines become spaces and
        runs of six spaces are removed. An empty or missing extraction result
        yields an empty string instead of raising.
        """
        if not text:
            return ""

        # NOTE(review): "\\0" matches a literal backslash followed by "0". If
        # NUL characters were the intended target the pattern would be "\x00";
        # kept unchanged until that is confirmed.
        return (
            text.strip()
            .replace("\n", " ")
            .replace("\\0", " ")
            .replace("      ", "")
        )

    def read_pdf(self, file_path: str) -> list[str]:
        """Return the cleaned text of every page in the PDF at ``file_path``.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            One cleaned string per page, in page order.
        """
        reader = PyPDF2.PdfReader(file_path)
        # Prints the page count as a quick progress indicator.
        print(len(reader.pages))

        return [self._clean_page_text(page.extract_text()) for page in reader.pages]

    def process(self, file_url: str):
        """Yield the text chunks of a PDF as one-element tuples.

        Args:
            file_url: Path of the PDF file to read (passed to ``read_pdf``).

        Yields:
            ``(chunk,)`` tuples, one per chunk produced by the text splitter.
        """
        pages = self.read_pdf(file_url)
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=4000,  # Adjust this as you see fit
            chunk_overlap=400,  # Lets consecutive chunks overlap, which keeps them contextual
            length_function=len,
        )

        # split_text() works on a single string, so the per-page texts are
        # joined into one document before splitting.
        chunks = text_splitter.split_text(" ".join(pages))
        df = pd.DataFrame(chunks, columns=["chunks"])

        yield from df.itertuples(index=False, name=None)

In [None]:
# Try the reader on a single sample PDF; the bare `texts` expression displays
# the returned list of page texts when this runs as a notebook cell.
chunker = PdfTextChunker()

texts = chunker.read_pdf("./data/PDF/sample.pdf")
texts

In [None]:
# Collect the cleaned page texts of every PDF in the folder into one flat list.
chunker = PdfTextChunker()
path_folder = "./data/PDF"

texts = []

# os.listdir() returns entries in arbitrary order; sorting keeps the result
# reproducible between runs.
for file_name in sorted(os.listdir(path_folder)):
    # Only hand real PDF files to the reader. This skips ".DS_Store" as well
    # as any other stray file that would make the PDF parser fail.
    if not file_name.lower().endswith(".pdf"):
        continue

    file_path = os.path.join(path_folder, file_name)
    # A directory that happens to be named "*.pdf" cannot be parsed either.
    if not os.path.isfile(file_path):
        continue

    texts.extend(chunker.read_pdf(file_path=file_path))

texts

In [None]:
import pptx
from pptx import Presentation


def extract_text_from_graphic_frame(graphic_frame):
    """Return the text held by a graphic frame as one space-separated string.

    Args:
        graphic_frame: Shape exposing ``has_table`` / ``table`` and
            ``has_text_frame`` / ``text_frame``.

    Returns:
        The stripped cell texts (for a table) or paragraph texts (for a text
        frame) joined by single spaces, or ``""`` when the frame has neither.
    """
    pieces = []

    # Extract the text of a table, cell by cell.
    if graphic_frame.has_table:
        for row in graphic_frame.table.rows:
            pieces.extend(cell.text.strip() for cell in row.cells)

    # Extract SmartArt text (this may not be directly supported by python-pptx).
    elif graphic_frame.has_text_frame:
        pieces.extend(
            paragraph.text.strip()
            for paragraph in graphic_frame.text_frame.paragraphs
        )

    # Newlines inside a piece are dropped, as before.
    text = " ".join(pieces).replace("\n", "")

    # Collapse runs of spaces to a single space. Deleting them outright would
    # fuse the neighbours of an empty cell ("a", "", "b" -> "ab").
    return " ".join(word for word in text.split(" ") if word)


def extract_text_from_pptx(file_path):
    """Return all text found on the slides of a PowerPoint file.

    Args:
        file_path: Path of the ``.pptx`` file to open.

    Returns:
        The non-empty paragraph texts and graphic-frame texts of every shape,
        in slide and shape order, joined by single spaces.
    """
    # Open the PowerPoint file.
    deck = Presentation(file_path)

    # Text fragments gathered across all slides.
    collected = []

    for slide in deck.slides:
        # Walk every shape (text boxes etc.) on the slide.
        for shape in slide.shapes:
            # Shapes that carry a text frame contribute one entry per
            # non-empty paragraph.
            if hasattr(shape, "text_frame"):
                stripped = (
                    paragraph.text.strip()
                    for paragraph in shape.text_frame.paragraphs
                )
                collected.extend(piece for piece in stripped if piece != "")

            # Graphic frames (e.g. tables) are flattened by the helper.
            if isinstance(shape, pptx.shapes.graphfrm.GraphicFrame):
                frame_text = extract_text_from_graphic_frame(shape)

                if frame_text not in ["", " "]:
                    collected.append(frame_text)

    # Join everything into a single string.
    return " ".join(collected)


# Extract the text from the sample PowerPoint file
file_path = "./data/PPTX/sample.pptx"
text = extract_text_from_pptx(file_path)


# Print the extracted text
print(text)

In [None]:
# Load the sample Excel file with pandas and display the result.
df = pd.read_excel("./data/XLS/sample.xls")
df

In [None]:
import numpy as np

# Flatten every spreadsheet row into text: the non-missing cells of a row are
# joined by spaces, then newlines and soft hyphens ("\xad") are removed.
row_texts = []

for idx, row in df.iterrows():
    # pd.isna() detects missing cells reliably. A membership test against
    # np.nan depends on object identity because NaN never compares equal to
    # itself, so NaN values taken from a float array slip through as "nan".
    cells = [str(x) for x in row.values if not pd.isna(x) and str(x) != "nan"]
    text = " ".join(cells)

    if text != "":
        row_texts.append(text.replace("\n", "").replace("\xad", ""))

# NOTE(review): rows are concatenated without a separator, as before, so the
# last cell of one row touches the first cell of the next — confirm intended.
texts = "".join(row_texts)

In [None]:
# Display the text assembled from the spreadsheet rows.
texts