# Use pymupdf to convert PDF to JSON
1. Use `PyMuPDFLoader` (the LangChain document loader backed by PyMuPDF)

In [4]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import JSONLoader

from pathlib import Path
from pprint import pprint

# Point the LangChain PyMuPDF loader at a single sample receipt.
loader = PyMuPDFLoader("./pdf/eReceipt-TBL_21_2-2746e.pdf")
# load() reads the PDF and returns an indexable collection of Document objects.
data = loader.load()
# Bare last expression: the notebook displays the first Document
# (its page_content text plus the PDF metadata).
data[0]

Document(page_content='Nobel Burger\nYYZ Terminal 3\nDate: 2019-04-14 16:44:31\nCard Type: MasterCard\nAcct: xxxxxxxxxxxxxxxx\nCard Entry: Swiped\nCheck: 7462 - TBL_21_2-2746e\nServer: iPad\nSubtotal: $15.82\nTip: $2.52\nTotal: $18.34\n-- Thank You --\n-- Customer Copy --\n', metadata={'source': './pdf/eReceipt-TBL_21_2-2746e.pdf', 'file_path': './pdf/eReceipt-TBL_21_2-2746e.pdf', 'page': 0, 'total_pages': 1, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'FPDF 1.7', 'creationDate': 'D:20190414204456', 'modDate': '', 'trapped': ''})

2. Use `fitz` (the PyMuPDF module) directly

In [10]:

import fitz
# Extract the text of every page of one PDF with PyMuPDF's own API.
page_text_list = []
# Opening the document as a context manager closes it (and the underlying
# file handle) as soon as extraction is finished, even if a page fails.
with fitz.open("./pdf/eReceipt-TBL_21_2-2746e.pdf") as doc:
    for page_number, page in enumerate(doc, start=1):
        # "blocks" mode yields one tuple per text block; index 4 is the block's text
        # (see the printed tuples below).
        blocks = page.get_text("blocks")
        print(blocks)
        # Separate blocks with a newline, then trim leading/trailing whitespace.
        page_text = "\n".join(block[4] for block in blocks).strip()
        page_text_list.append({"page_number": page_number, "page_text": page_text})
print(page_text_list)

[(263.05999755859375, 72.32999420166016, 369.09796142578125, 97.06199645996094, 'Nobel Burger\n', 0, 0), (266.4599914550781, 91.15999603271484, 349.8119812011719, 107.64799499511719, 'YYZ Terminal 3\n', 1, 0), (144.57000732421875, 119.5000228881836, 315.3299560546875, 206.85801696777344, 'Date: 2019-04-14 16:44:31\nCard Type: MasterCard\nAcct: xxxxxxxxxxxxxxxx\nCard Entry: Swiped\nCheck: 7462 - TBL_21_2-2746e\nServer: iPad\n', 2, 0), (144.57000732421875, 217.17002868652344, 238.0819854736328, 236.40603637695312, 'Subtotal: $15.82\n', 3, 0), (144.57000732421875, 245.50999450683594, 206.82798767089844, 264.7460021972656, 'Tip: $2.52\n', 4, 0), (144.57000732421875, 273.86004638671875, 226.281982421875, 293.0960388183594, 'Total: $18.34\n', 5, 0), (257.95001220703125, 316.3800354003906, 352.0859375, 335.61602783203125, '-- Thank You --\n', 6, 0), (235.27999877929688, 343.1700134277344, 376.6239929199219, 365.1540222167969, '-- Customer Copy --\n', 7, 0)]


[{'page_number': 1,
  'page_text': 'Nobel Burger\n\nYYZ Terminal 3\n\nDate: 2019-04-14 16:44:31\nCard Type: MasterCard\nAcct: xxxxxxxxxxxxxxxx\nCard Entry: Swiped\nCheck: 7462 - TBL_21_2-2746e\nServer: iPad\n\nSubtotal: $15.82\n\nTip: $2.52\n\nTotal: $18.34\n\n-- Thank You --\n\n-- Customer Copy --'}]

# How to write JSON to file

In [14]:
import json
from pathlib import Path

# Serialise first: if the data cannot be encoded, an existing output file
# is left untouched instead of being truncated to zero bytes.
payload = json.dumps(page_text_list)

output_path = Path("./json/output.json")
# Create the target folder so the cell also works on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)

# `with` guarantees the handle is flushed and closed even if the write raises.
with open(output_path, "w", encoding="utf-8") as f:
    f.write(payload)

# Combining everything together
Read all PDFs in a folder, use `fitz` to extract each one's page text as JSON, and write one JSON file per PDF.

In [11]:
import os
import json
from pathlib import Path
from pprint import pprint
import fitz

PDF_DIR = "./pdf"    # folder that is scanned (recursively) for input PDFs
JSON_DIR = "./json"  # folder that receives one JSON file per PDF
extensions = [".pdf"]


def json_output_path(file_path, pdf_dir=PDF_DIR, json_dir=JSON_DIR):
    """Return the JSON file path that corresponds to the PDF at ``file_path``.

    The PDF's location relative to ``pdf_dir`` is mirrored under ``json_dir``
    and ".json" is appended to the full file name, e.g.
    "./pdf/receipt.pdf" -> "./json/receipt.pdf.json".

    Args:
        file_path: Path of a PDF located inside ``pdf_dir``.
        pdf_dir: Root folder the PDFs were collected from.
        json_dir: Root folder the JSON files are written to.

    Returns:
        The output path as a string.
    """
    # relpath works for nested folders and for any OS path separator,
    # unlike stripping a hard-coded "./pdf/" prefix.
    relative_path = os.path.relpath(file_path, pdf_dir)
    return os.path.join(json_dir, relative_path + ".json")


def extract_pages(file_path):
    """Return one dict per page of the PDF at ``file_path``.

    Each dict has the keys "page_number" (1-based), "page_text" (the text of
    all blocks on the page joined by newlines, stripped) and "file_path".
    """
    pages = []
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(file_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            blocks = page.get_text("blocks")
            # Index 4 of each block tuple is the block's text.
            page_text = "\n".join(block[4] for block in blocks).strip()
            pages.append({"page_number": page_number, "page_text": page_text, "file_path": file_path})
    return pages


# Collect every file under PDF_DIR whose extension matches (case-insensitive).
file_paths = []
for root, dirs, files in os.walk(PDF_DIR):
    for file in files:
        _, file_extension = os.path.splitext(file)
        if file_extension.lower() in extensions:
            file_paths.append(os.path.join(root, file))
print(file_paths)

# Convert each PDF, write its JSON file, and keep the result in memory as well.
data_list = []
for file_path in file_paths:
    doc_json = extract_pages(file_path)

    output_path = json_output_path(file_path)
    # PDFs found in sub-folders need the matching output folder to exist.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(doc_json))

    data_list.append(doc_json)


['./pdf/receipt.pdf', './pdf/eReceipt-TBL_21_2-2746e.pdf']
