In [None]:
import sys
from nlputils import utils
from nlputils.components.pymupdf_util import pymuprocessor
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import json
import os

In [None]:
###### Declare the variables #######

# JSON file describing the documents to process; it is loaded with
# pd.read_json in the next cell (per the placeholder, produced by load_docs.ipynb).
path_to_files_list = "path to files from list from **load_docs.ipynb**"

# Output folder: passed to pymuprocessor as `folder_location`, and the summary
# JSON files written by this notebook are stored under it as well.
# Downstream cells build file paths from this value; ending it with a path
# separator (e.g. ".../output/") is the safe choice.
# NOTE(review): the placeholder text mentions axaparsr, but in this notebook the
# value is only handed to pymuprocessor — confirm and reword the placeholder.
path_to_save_output_from_pymuprocessor = "path where the output from axaparsr needs to save the files"

# Tesseract `tessdata` folder, passed to pymuprocessor.useOCR_create_text for image PDFs.
# https://pymupdf.readthedocs.io/en/latest/installation.html#enabling-integrated-ocr-support
tessdata = "path to tessdata in Tesseract OCR .../.../Tesseract-OCR/tessdata"

In [None]:
def _select_pdf_rows(files_df, image_pdf_query):
    """Filter the file list and attach the PDF path to process for each row.

    Parameters
    ----------
    files_df : pandas.DataFrame
        File list as loaded from ``path_to_files_list``. The code reads the
        columns ``ImagePDF`` (via the query), ``filetype``, ``orig_filepath``,
        ``converted_filepath`` and ``page_count``.
    image_pdf_query : str
        Expression handed to ``DataFrame.query`` to pick the rows.

    Returns
    -------
    pandas.DataFrame
        New frame with a ``pdf_path`` column, without rows whose ``pdf_path``
        is missing, sorted by ``page_count`` ascending with a fresh index.
    """
    subset = files_df.query(image_pdf_query).reset_index(drop=True)
    # Files that already are PDFs use their original path; every other file
    # type uses the path of its converted version.
    subset['pdf_path'] = subset.apply(
        lambda row: row['orig_filepath'] if row['filetype'] == 'pdf'
        else row['converted_filepath'], axis=1)
    return subset[subset.pdf_path.notna()]\
                .sort_values(by = 'page_count',ascending=True)\
                .reset_index(drop=True)


# read the files dataframe once; both subsets below are derived from it
df_files = pd.read_json(path_to_files_list)

# text-based pdf files (ImagePDF is set and False)
df_pdf = _select_pdf_rows(df_files, 'ImagePDF.notna() & ImagePDF==False')
print("normal pdf files to be processed:",len(df_pdf))

# keeping image pdf as separate dataframe (ImagePDF is set and True)
df_imagepdf = _select_pdf_rows(df_files, 'ImagePDF.notna() & ImagePDF==True')
print("image pdf files to be processed:",len(df_imagepdf))

# Document Processing

In [None]:
# Convert every text-based PDF with pymuprocessor.create_markdown. The returned
# value is stored in `path_to_docs`; it is later used as a folder location, so
# presumably it is the folder holding the page-wise markdown files — TODO confirm.
df_pdf['path_to_docs'] = df_pdf.progress_apply(
    lambda x: pymuprocessor.create_markdown(
        filepath=x['pdf_path'],
        folder_location=path_to_save_output_from_pymuprocessor,
        filename=os.path.splitext(os.path.basename(x['pdf_path']))[0]),
    axis=1)

# Save the updated file list next to the processor output. os.path.join keeps
# the file inside the output folder whether or not the configured folder ends
# with a path separator (plain string concatenation would otherwise glue the
# file name onto the folder name).
markdown_files_json = os.path.join(path_to_save_output_from_pymuprocessor,
                                   'pymupdf_markdown_files.json')
jsonfile = df_pdf.to_json(orient="records")
parsed = json.loads(jsonfile)
with open(markdown_files_json, 'w') as file:
    json.dump(parsed, file, indent=4)

Processing image PDFs using OCR

In [None]:
# Run OCR on every image PDF with pymuprocessor.useOCR_create_text. The returned
# value is stored in `path_to_docs`; it is later used as a folder location, so
# presumably it is the folder holding the page-wise text files — TODO confirm.
df_imagepdf['path_to_docs'] = df_imagepdf.progress_apply(
    lambda x: pymuprocessor.useOCR_create_text(
        filepath=x['pdf_path'],
        tessdata=tessdata,
        folder_location=path_to_save_output_from_pymuprocessor,
        filename=os.path.splitext(os.path.basename(x['pdf_path']))[0]),
    axis=1)

# Save the updated file list next to the processor output. os.path.join keeps
# the file inside the output folder whether or not the configured folder ends
# with a path separator (plain string concatenation would otherwise glue the
# file name onto the folder name).
text_files_json = os.path.join(path_to_save_output_from_pymuprocessor,
                               'pymupdf_text_files.json')
jsonfile = df_imagepdf.to_json(orient="records")
parsed = json.loads(jsonfile)
with open(text_files_json, 'w') as file:
    json.dump(parsed, file, indent=4)

100%|██████████| 5/5 [00:00<00:00,  9.45it/s]


# Chunking

In [None]:
def create_save_chunks(filename, path_to_docs, file_extension = 'md'):
    """Chunk the page-wise files of one document and save the chunks as JSON.

    Parameters
    ----------
    filename : str
        Document name without extension; used for the sub-folder and for the
        name of the chunks file.
    path_to_docs : str
        Handed to ``pymuprocessor.create_chunks`` as ``folder_location``.
    file_extension : str, default 'md'
        Handed to ``pymuprocessor.create_chunks`` as ``file_extension``.

    Returns
    -------
    str
        Path of the written file:
        ``<output folder>/tmp/<filename>/<filename>.chunks.json``.
    """
    # create chunks (overlap and chunk size are fixed for the whole notebook)
    chunks = pymuprocessor.create_chunks(folder_location=path_to_docs,overlap= 10,
                                         chunk_size=800,file_extension=file_extension)
    # Build the target path once so that the returned path is exactly the file
    # that gets written (previously the returned path lacked the 'tmp' folder).
    chunks_filepath = os.path.join(path_to_save_output_from_pymuprocessor,
                                   'tmp', filename, f'{filename}.chunks.json')
    # Do not rely on an earlier step having created the document folder.
    os.makedirs(os.path.dirname(chunks_filepath), exist_ok=True)
    # save chunks
    with open(chunks_filepath, 'w') as file:
        json.dump(chunks, file)
    # return chunks filepath
    return chunks_filepath

In [None]:
# process normal pdf as processing returned pagewise markdown files;
# rows with an empty `path_to_docs` are skipped and get None
df_pdf['chunks_filepath'] = df_pdf.progress_apply(lambda x: create_save_chunks(
                            filename=os.path.splitext(os.path.basename(x['pdf_path']))[0],
                            path_to_docs= x['path_to_docs']) if x['path_to_docs'] else None,axis=1)

# process image_pdf as processing returned pagewise text files;
# rows with an empty `path_to_docs` are skipped and get None
df_imagepdf['chunks_filepath'] = df_imagepdf.progress_apply(lambda x: create_save_chunks(
                            filename=os.path.splitext(os.path.basename(x['pdf_path']))[0],
                            path_to_docs= x['path_to_docs'],file_extension='txt')
                                    if x['path_to_docs'] else None,axis=1)

# save the processed files metadata info (both document types in one frame)
df = pd.concat([df_pdf,df_imagepdf],ignore_index=True)
# os.path.join keeps the file inside the output folder whether or not the
# configured folder ends with a path separator.
processed_chunks_json = os.path.join(path_to_save_output_from_pymuprocessor,
                                     'processed_chunks.json')
jsonfile = df.to_json(orient="records")
parsed = json.loads(jsonfile)
with open(processed_chunks_json, 'w') as file:
    json.dump(parsed, file, indent=4)

# Quality Check

In [None]:
def get_pagewise_text(folder_location):
    """Return the content of every file found in ``folder_location``, in page order.

    The file paths come from ``utils.get_files(...)["allfiles"]`` (all
    extensions). They are ordered by the integer formed from all digit
    characters of the path, so a path without any digit raises ``ValueError``.

    Returns
    -------
    list of str
        One entry per file, in sorted order.
    """
    def _digits_as_int(path):
        # concatenate every digit character of the path and read it as a number
        return int(''.join(ch for ch in path if ch.isdigit()))

    page_files = utils.get_files(folder_location,file_extensions = "*")["allfiles"]
    # order the pages
    page_files.sort(key=_digits_as_int)
    texts = []
    for page_file in page_files:
        with open(page_file, 'r') as handle:
            texts.append(handle.read())
    return texts

In [None]:
# keep only the documents for which a chunks file path was recorded
has_chunks = df['chunks_filepath'].notna()
df = df.loc[has_chunks].reset_index(drop=True)
# read the page-wise text of every remaining document
df['pages'] = df.progress_apply(lambda row: get_pagewise_text(row['path_to_docs']),axis=1)

# page-wise text quality check
def check_pages(pages):
    """Return the ``utils.is_gibberish`` result for each page, in input order."""
    return [utils.is_gibberish(page_text) for page_text in pages]

# flag every page of every document, then count how many pages were checked
df['gibberish_page_check'] = df['pages'].progress_apply(check_pages)
df['extracted_page_count'] = df['gibberish_page_check'].apply(len)
# documents without any extracted page are dropped (the share below divides by the page count)
no_pages = df['extracted_page_count'] == 0
df = df[~no_pages].reset_index(drop=True)

# share of pages per document that were flagged as gibberish, rounded to 2 decimals
df['gibberish_doc_percent'] = df['gibberish_page_check'].progress_apply(
    lambda flags: round(sum(flags) / len(flags), 2))
# A document counts as okay when at most 35% of its pages are flagged.
# Experience shows this to be a good benchmark: in a 3-page document a single
# bad page (33%) still passes, anything beyond that is worth checking manually.
okay_docs = df['gibberish_doc_percent'] <= 0.35
print("Total Number of okay files:", int(okay_docs.sum()))

In [None]:
import matplotlib.pyplot as plt

# number of pages flagged as gibberish in each document
df['gibberish_page_count'] = df['gibberish_page_check'].apply(lambda flags: sum(flags))

# documents with at least one page classified as gibberish
docs_with_gibberish = df.query('gibberish_doc_percent > 0')
print("Total docs having atleast 1 page classified as gibberish:", len(docs_with_gibberish))

# distribution of the gibberish share across those documents
plt.hist(docs_with_gibberish['gibberish_doc_percent'],
         bins=20, color='lightgreen', ec='black')
plt.title("Distribution of Percentage of Gibberish page per doc (filtered: >0%)")
plt.show()