-
Notifications
You must be signed in to change notification settings - Fork 5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add RapidOCRPDFLoader and RapidOCRLoader (#1275)
* add RapidOCRPDFLoader
* update mypdfloader.py and requirements.txt
* add myimgloader.py
* add test samples
* add TODO to mypdfloader
* add loaders to KnowledgeFile class
* add loaders to KnowledgeFile class
- Loading branch information
1 parent
72b9da2
commit 6c4ef26
Showing 8 changed files with 74 additions and 2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from .mypdfloader import RapidOCRPDFLoader | ||
from .myimgloader import RapidOCRLoader |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
from typing import List | ||
from langchain.document_loaders.unstructured import UnstructuredFileLoader | ||
|
||
|
||
class RapidOCRLoader(UnstructuredFileLoader):
    """Loader that runs RapidOCR on an image file and partitions the recognized text."""

    def _get_elements(self) -> List:
        """Return the elements ``partition_text`` builds from the OCR output of ``self.file_path``."""

        def img2text(filepath):
            """Run OCR on ``filepath`` and return the recognized entries joined by newlines.

            Returns an empty string when the OCR engine reports no result.
            """
            # Imported lazily so the OCR dependency is only needed when the loader runs.
            from rapidocr_onnxruntime import RapidOCR

            engine = RapidOCR()
            detections, _ = engine(filepath)
            if not detections:
                return ""
            # Index 1 of every entry is the piece joined into the output
            # (presumably the recognized string; confirm against RapidOCR docs).
            return "\n".join([entry[1] for entry in detections])

        text = img2text(self.file_path)
        from unstructured.partition.text import partition_text

        return partition_text(text=text, **self.unstructured_kwargs)
|
||
|
||
if __name__ == "__main__":
    # Manual smoke run: load the sample image through the OCR loader and
    # print the resulting documents.
    image_loader = RapidOCRLoader(file_path="../tests/samples/ocr_test.jpg")
    loaded_docs = image_loader.load()
    print(loaded_docs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
from typing import List | ||
from langchain.document_loaders.unstructured import UnstructuredFileLoader | ||
|
||
|
||
class RapidOCRPDFLoader(UnstructuredFileLoader):
    """Loader that extracts a PDF's text layer and OCRs its embedded images with RapidOCR."""

    def _get_elements(self) -> List:
        """Return the elements ``partition_text`` builds from the text of ``self.file_path``."""

        def pdf2text(filepath):
            """Return the page text and image OCR text of the PDF, separated by newlines.

            For every page the extracted text comes first, followed by the OCR
            output of each image listed for that page.
            """
            # Imported lazily so these dependencies are only needed when the loader runs.
            import fitz
            from rapidocr_onnxruntime import RapidOCR
            import numpy as np

            ocr = RapidOCR()
            # Collect the pieces and join once; joining with "\n" also keeps the
            # OCR output of one image from running into the next image or page.
            chunks = []
            doc = fitz.open(filepath)
            try:
                for page in doc:
                    # TODO: adjust the processing to follow the actual order of
                    # text and images on the page.
                    chunks.append(page.get_text(""))

                    for img in page.get_images():
                        # img[0] is passed to Pixmap together with the document
                        # to obtain the image's pixel data.
                        pix = fitz.Pixmap(doc, img[0])
                        img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                            pix.height, pix.width, -1
                        )
                        result, _ = ocr(img_array)
                        if result:
                            chunks.append("\n".join(line[1] for line in result))
            finally:
                # Always release the underlying file, even if OCR raises.
                doc.close()
            return "\n".join(chunks)

        text = pdf2text(self.file_path)
        from unstructured.partition.text import partition_text

        return partition_text(text=text, **self.unstructured_kwargs)
|
||
|
||
if __name__ == "__main__":
    # Manual smoke run: load the sample PDF through the OCR loader and
    # print the resulting documents.
    pdf_loader = RapidOCRPDFLoader(file_path="../tests/samples/ocr_test.pdf")
    loaded_docs = pdf_loader.load()
    print(loaded_docs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.