# Using Jina Flow to Segment a Collection of PDFs

Stepping through
https://medium.com/jina-ai/building-an-ai-powered-pdf-search-engine-with-python-part-1-9102654e6ea1

In [41]:
from docarray import Document, DocumentArray

### Loading pdf files into DocArray object

Note that the tutorial article omits the call to `load_uri_to_blob()`, which seems to be necessary.

In [42]:
# Build one Document per file matched by the glob under toy_data/
# (recursive=True also descends into sub-directories).
docs = DocumentArray.from_files("toy_data/*", recursive=True)

# Load each file's content into the Document's blob; the tutorial article
# leaves this step out, but it appears to be required before segmenting.
for doc in docs:
    doc.load_uri_to_blob()

Note that we have 4 documents

- 3 pdfs and a jpg

In [43]:
display(docs)

### Logging into Jina and using its Flow Executor to Segment the PDF

In [44]:
!jina auth login

🔐 You are already logged in as samtonetto.

If you want to log in to another account, please run either:
- jina auth logout
- or, jina auth login -f


In [45]:
from jina import Flow

In [46]:
# This cell takes a little while to execute...

# Single-stage Flow: the PDFSegmenter executor pulled from Jina Hub and run
# via the "jinahub+sandbox://" scheme, named "segmenter" inside the Flow.
flow = Flow().add(uses="jinahub+sandbox://PDFSegmenter", install_requirements=True, name="segmenter")

# The Flow is only usable inside the `with` block; send all loaded
# documents to its index endpoint and keep the returned documents.
with flow:
    indexed_docs = flow.index(docs)

Taking a look at the indexed documents

- `Homogenous Documents = True` because we only have PDFs. It would be `False` if we had multiple document types.

In [47]:
indexed_docs

Summary of first document

In [48]:
indexed_docs[0].chunks.summary()

In [49]:
chunks = indexed_docs[0].chunks
print(len(chunks))

58


Some chunks are numpy arrays representing images:

In [95]:
chunks[0].content

array([[[184., 193., 188.],
        [163., 174., 158.],
        [150., 162., 140.],
        ...,
        [ 43.,  42.,  24.],
        [ 41.,  40.,  22.],
        [ 41.,  38.,  23.]],

       [[222., 231., 228.],
        [186., 197., 180.],
        [157., 170., 144.],
        ...,
        [ 49.,  47.,  32.],
        [ 46.,  44.,  29.],
        [ 47.,  44.,  29.]],

       [[220., 226., 226.],
        [208., 220., 200.],
        [166., 180., 147.],
        ...,
        [ 62.,  60.,  48.],
        [ 54.,  50.,  39.],
        [ 54.,  50.,  39.]],

       ...,

       [[ 57.,  54.,  65.],
        [ 44.,  41.,  52.],
        [ 46.,  43.,  52.],
        ...,
        [ 55.,  49.,  49.],
        [ 31.,  25.,  25.],
        [ 31.,  25.,  27.]],

       [[ 45.,  43.,  54.],
        [ 47.,  45.,  56.],
        [ 40.,  39.,  47.],
        ...,
        [ 98.,  92.,  92.],
        [ 39.,  33.,  35.],
        [ 39.,  33.,  35.]],

       [[ 72.,  70.,  81.],
        [ 48.,  46.,  57.],
        [ 29.,  

Other chunks are strings of text:

In [60]:
chunks[-1]

## Sentencizing

The strings can be quite long, so we need a **sentencizer** to break them up into shorter sentences.

In [65]:
import jina

# Load the SpacySentencizer executor from Jina Hub as a local Python object
# (not as part of a Flow), installing its requirements if needed.
# Splits into sentences based on periods with a space.
# Alternative is "jinahub://Sentencizer", but that would split "10a.m." into multiple sentences.
# NOTE(review): the name `exec` shadows Python's built-in exec(); consider
# renaming it together with the later cell that calls `exec.segment(...)`.
exec = jina.Executor.from_hub("jinahub://SpacySentencizer", install_requirements = True)

Perform the sentencizing segmentation

In [66]:
exec.segment(docs, parameters={})

In [68]:
# Re-run indexing and overwrite indexed_docs.
# NOTE(review): `flow` does not appear to be redefined between the first
# indexing cell and this one, so this presumably still runs the
# PDFSegmenter-only Flow -- confirm.
with flow:
    indexed_docs = flow.index(docs)

In [69]:
indexed_docs[0].chunks[-1]

In [82]:
class TextChunkMerger(Executor):
    """
    Sentencize the text chunks of each document and add the sentences as level 1 chunks.

    Only chunks whose ``mime_type`` is ``"text/plain"`` are handed to the
    Sentencizer; every other chunk (e.g. an image) is left untouched.
    The sentences produced for each text chunk are then appended to the
    parent document's own chunk list, i.e.
       [text_chunk_A, text_chunk_B, ... ] -> [[A1, A2, A3, ...], [B1, B2, ...], ...]
                                          -> [A1, A2, A3, ..., B1, B2, ...]

    The original chunks are not removed from ``doc.chunks``; the sentences
    are appended after them.
    """

    # Registers the method below as the handler for requests sent to the
    # "/index" endpoint, i.e. it runs when the Flow receives `flow.index(...)`.
    @requests(on="/index")
    def sentencize_text_chunks(self, docs, **kwargs):
        """
        Split every text chunk of every document in ``docs`` into sentences.

        Args:
            docs: the level 0 documents of the request; modified in place.
            **kwargs: extra arguments passed by the Flow (unused).
        """
        # Loaded lazily and at most once per request: pulling an executor
        # from the hub is expensive and does not depend on the document.
        sentencizer = None

        for doc in docs:  # level 0 document
            # Collect this document's text chunks first ...
            text_chunks = DocumentArray()
            for chunk in doc.chunks:
                if chunk.mime_type == "text/plain":
                    text_chunks.append(chunk)

            if len(text_chunks) == 0:
                continue  # nothing to sentencize for this document

            if sentencizer is None:
                sentencizer = Executor.from_hub("jinahub://Sentencizer")

            # ... then segment them exactly once, after collection has
            # finished, so that no chunk is sentencized more than once.
            sentencizer.segment(text_chunks, parameters={})

            # Extend level 1 chunk DocumentArray with the sentences
            for text_chunk in text_chunks:
                doc.chunks.extend(text_chunk.chunks)

In [83]:
# Two-stage Flow: segment the PDFs into chunks, then pass the result to
# TextChunkMerger (the executors run in the order they are added).
flow = (
    Flow()
    .add(uses="jinahub+sandbox://PDFSegmenter", install_requirements=True, name="segmenter")
    .add(uses=TextChunkMerger)
)

In [84]:
# Index the documents through the current Flow and overwrite indexed_docs
# with the documents it returns.
with flow:
    indexed_docs = flow.index(docs)

            sentencizer shadows one of built-in Python module name.
            It is imported as `user_module.sentencizer`

            Affects:
            - Either, change your code from using `from sentencizer import ...`
              to `from user_module.sentencizer import ...`
            - Or, rename sentencizer to another name
            [0m [1;30m(raised from /Users/samtonetto/Learning/FSDL2022/FSDL-2022-Semantic-Search-QA/.venv/lib/python3.10/site-packages/jina/importer.py:112)[0m


In [87]:
indexed_docs[0].chunks[-2]

In [89]:
class ImageNormalizer(Executor):
    """
    Normalizes images and resizes them to 64x64 to be fed into a neural network.

    For every chunk that ends up with an image tensor, the image is first
    exported to ``chunk.uri`` and copied into ``chunk.tags["image_datauri"]``,
    and only afterwards cast to ``uint8``, resized and normalized.
    Chunks without a tensor (e.g. text chunks) are skipped.
    """

    # Registers the method below as the handler for requests sent to the
    # "/index" endpoint of the Flow.
    @requests(on="/index")
    def normalize_chunks(self, docs, **kwargs):
        """
        Normalize the image chunks of every document in ``docs`` in place.

        Args:
            docs: the level 0 documents of the request; their chunks are modified.
            **kwargs: extra arguments passed by the Flow (unused).
        """
        # Imported here so the executor does not rely on an `np` alias
        # having been defined elsewhere in the notebook.
        import numpy as np

        for doc in docs:
            for chunk in doc.chunks:
                # A chunk that only carries raw bytes is decoded first.
                if chunk.blob:
                    chunk.convert_blob_to_image_tensor()

                if getattr(chunk, "tensor", None) is None:
                    continue  # not an image chunk

                # Export the image to the chunk's URI and keep a copy in the
                # tags *before* the tensor is resized and normalized below.
                chunk.convert_image_tensor_to_uri()
                chunk.tags["image_datauri"] = chunk.uri

                chunk.tensor = chunk.tensor.astype(np.uint8)
                chunk.set_image_tensor_shape((64, 64))
                chunk.set_image_tensor_normalization()

In [90]:
# Three-stage Flow: segment the PDFs, sentencize the text chunks, then
# normalize the image chunks (the executors run in the order they are added).
flow = (
    jina.Flow()
    .add(uses="jinahub+sandbox://PDFSegmenter", install_requirements=True, name="segmenter")
    .add(uses=TextChunkMerger, name="text_chunk_merger")
    .add(uses=ImageNormalizer, name="image_normalizer")
)

In [91]:
# Index the documents through the three-stage Flow.
# NOTE(review): the output recorded for this cell ends in a BadClient error;
# if the call raises, indexed_docs presumably keeps the value from the
# previous successful run -- confirm before relying on later cells.
with flow:
    indexed_docs = flow.index(docs)

            sentencizer shadows one of built-in Python module name.
            It is imported as `user_module.sentencizer`

            Affects:
            - Either, change your code from using `from sentencizer import ...`
              to `from user_module.sentencizer import ...`
            - Or, rename sentencizer to another name
            [0m [1;30m(raised from /Users/samtonetto/Learning/FSDL2022/FSDL-2022-Semantic-Search-QA/.venv/lib/python3.10/site-packages/jina/importer.py:112)[0m


Exception in thread Thread-123:
Traceback (most recent call last):
  File "/Users/samtonetto/Learning/FSDL2022/FSDL-2022-Semantic-Search-QA/.venv/lib/python3.10/site-packages/jina/clients/base/grpc.py", line 87, in _get_results
    async for resp in stub.Call(
  File "/Users/samtonetto/Learning/FSDL2022/FSDL-2022-Semantic-Search-QA/.venv/lib/python3.10/site-packages/grpc/aio/_call.py", line 326, in _fetch_stream_responses
    await self._raise_for_status()
  File "/Users/samtonetto/Learning/FSDL2022/FSDL-2022-Semantic-Search-QA/.venv/lib/python3.10/site-packages/grpc/aio/_call.py", line 236, in _raise_for_status
    raise _create_rpc_error(await self.initial_metadata(), await
grpc.aio._call.AioRpcError: <AioRpcError of RPC that terminated with:
	status = StatusCode.UNKNOWN
	details = "Unexpected <class 'grpc.aio._call.AioRpcError'>: <AioRpcError of RPC that terminated with:
	status = StatusCode.UNKNOWN
	details = "Unexpected <class 'TypeError'>: format_exception() got an unexpected key

BadClient: something wrong when running the eventloop, result can not be retrieved

In [96]:
indexed_docs[0].chunks[0].content

array([[[184., 193., 188.],
        [163., 174., 158.],
        [150., 162., 140.],
        ...,
        [ 43.,  42.,  24.],
        [ 41.,  40.,  22.],
        [ 41.,  38.,  23.]],

       [[222., 231., 228.],
        [186., 197., 180.],
        [157., 170., 144.],
        ...,
        [ 49.,  47.,  32.],
        [ 46.,  44.,  29.],
        [ 47.,  44.,  29.]],

       [[220., 226., 226.],
        [208., 220., 200.],
        [166., 180., 147.],
        ...,
        [ 62.,  60.,  48.],
        [ 54.,  50.,  39.],
        [ 54.,  50.,  39.]],

       ...,

       [[ 57.,  54.,  65.],
        [ 44.,  41.,  52.],
        [ 46.,  43.,  52.],
        ...,
        [ 55.,  49.,  49.],
        [ 31.,  25.,  25.],
        [ 31.,  25.,  27.]],

       [[ 45.,  43.,  54.],
        [ 47.,  45.,  56.],
        [ 40.,  39.,  47.],
        ...,
        [ 98.,  92.,  92.],
        [ 39.,  33.,  35.],
        [ 39.,  33.,  35.]],

       [[ 72.,  70.,  81.],
        [ 48.,  46.,  57.],
        [ 29.,  