In [224]:
# Here are the imports we need
from typing import Callable, Dict, List, Optional

from pathlib import Path
import re
import logging
import string 

logger = logging.getLogger(__name__)

from haystack.utils import convert_files_to_docs, fetch_archive_from_http
from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter
from haystack.schema import Document

'''to compare performance to'''
import pdfplumber

from haystack.nodes import PreProcessor

In [225]:
# Input document to load (Kenya NDC PDF); the relative path is resolved against the current working directory.
file = "../data/150723_Kenya_First NDC.pdf"

In [174]:
# Alternative input (PDF). This re-assigns `file`, so running this cell replaces any earlier selection.
file = "../data/Research_Project_Proposal_15-04-2022.pdf"

In [178]:
# Alternative input (.docx). This re-assigns `file`; as the last `file = ...` before `load_document(file)`
# is called, a top-to-bottom run uses this path.
# NOTE(review): the saved outputs further down show the Kenya NDC PDF as the document name — confirm which input is intended.
file = "../data/test_docx.docx"

In [226]:
#load document

def load_document(
    file: str,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    """
    Extract the text of a single .pdf, .txt or .docx file into a Haystack Document.

    The file is converted with the Haystack converter that matches its extension
    (matched case-insensitively). Haystack does not manage to extract text from
    every PDF, so when a PDF comes back without any text the extraction is
    repeated with pdfplumber.

    :param file: Path of the file to load. It is also stored as ``meta["name"]``.
    :param encoding: Forwarded to the converter's ``convert`` call.
    :param id_hash_keys: Forwarded to the converter and to the returned Document.
    :return: A list holding exactly one Document with the extracted text.
    :raises ValueError: If the extension is not .pdf, .txt or .docx.
    """
    suffix = Path(file).suffix.lower()
    if suffix == ".pdf":
        converter = PDFToTextConverter(remove_numeric_tables=True)
    elif suffix == ".txt":
        converter = TextConverter()
    elif suffix == ".docx":
        converter = DocxToTextConverter()
    else:
        # Without this branch `converter` would be unbound and the call below
        # would fail with an unhelpful UnboundLocalError.
        raise ValueError(
            "Unsupported file type {!r} for {}; expected .pdf, .txt or .docx".format(suffix, file)
        )

    logger.debug("Using converter %r", converter)
    logger.info("Converting {}".format(file))
    # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single Document
    converted = converter.convert(
        file_path=file, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
    )[0]
    text = converted.content

    # Fall back to pdfplumber when Haystack extracted nothing from a PDF. This can
    # happen with certain PDF types. The fallback is restricted to PDFs because
    # pdfplumber cannot open .txt/.docx files, and whitespace-only output counts
    # as "nothing".
    if suffix == ".pdf" and not text.strip():
        with pdfplumber.open(file) as pdf:
            # extract_text() may return None for a page without text; treat it as empty
            # so that the join below cannot raise TypeError.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        text = " ".join(page_texts)

    # Build the Document only after the final text is known, so that its
    # content-derived id matches the content it actually carries.
    return [Document(content=text, meta={"name": file}, id_hash_keys=id_hash_keys)]
    

In [227]:
# Load the selected file; `load_document` returns a list of Haystack Documents.
docs = load_document(file)

pdftotext version 4.04 [www.xpdfreader.com]
Copyright 1996-2022 Glyph & Cog, LLC
pdftotext version 4.04 [www.xpdfreader.com]
Copyright 1996-2022 Glyph & Cog, LLC
Usage: pdftotext [options] <PDF-file> [<text-file>]
  -f <int>               : first page to convert
  -l <int>               : last page to convert
  -layout                : maintain original physical layout
  -simple                : simple one-column page layout
  -simple2               : simple one-column page layout, version 2
  -table                 : similar to -layout, but optimized for tables
  -lineprinter           : use strict fixed-pitch/height layout
  -raw                   : keep strings in content stream order
  -fixed <number>        : assume fixed-pitch (or tabular) text
  -linespacing <number>  : fixed line spacing for LinePrinter mode
  -clip                  : separate clipped text
  -nodiag                : discard diagonal text
  -enc <string>          : output text encoding name
  -eol <string>      

<haystack.nodes.file_converter.pdf.PDFToTextConverter object at 0x7f901a7ef1f0>


In [228]:
# Inspect the loaded documents (a bare last expression is rendered as the cell output).
docs

[<Document: {'content': " \n \nMINISTRY OF ENVIRONMENT AND NATURAL RESOURCES \n   \nKenya’s Intended Nationally Determined Contribution (INDC) \n23 July 2015 \n1.  Introduction \nKenya, like other countries in the region, is bearing the brunt of climate change impacts \nand  the  associated  socio-economic  losses.  The  situation  is  exacerbated  by  the  high \ndependence on climate sensitive natural resources. In response to the challenges posed by \nClimate  Change,  Kenya  has  developed  a  National  Climate  Change  Response  Strategy \n(NCCRS  2010),  National  Climate  Change  Action  Plan  (NCCAP  2013),  and  a  National \nAdaptation Plan (NAP) - under preparation which provides a vision for low carbon and \nclimate resilient development pathway, while a National Climate Change Framework Policy \nand legislation are in their final stages of enactment to facilitate effective response to \nclimate  change.  Kenya  is  operationalising  these  policies  and  plans  through  th

In [239]:
# Clean the extracted text and split it into overlapping passages of roughly
# 100 words that do not cut sentences in half.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
    split_overlap=10
)
# Process the whole list in one call. Looping and re-assigning `docs_default`
# per document kept only the splits of the last document and left the name
# undefined when `docs` was empty.
docs_default = preprocessor.process(docs)
print(f"n_docs_input: {len(docs)}\nn_docs_output: {len(docs_default)}")


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 83.93docs/s][A

n_docs_input: 1
n_docs_output: 41





In [241]:
# Inspect the passages produced by the preprocessor (a bare last expression is rendered as the cell output).
docs_default

[<Document: {'content': '\n\nMINISTRY OF ENVIRONMENT AND NATURAL RESOURCES\n\nKenya’s Intended Nationally Determined Contribution (INDC)\n23 July 2015\n1. Introduction\nKenya, like other countries in the region, is bearing the brunt of climate change impacts\nand  the  associated  socio-economic  losses. The  situation  is  exacerbated  by  the  high\ndependence on climate sensitive natural resources.', 'content_type': 'text', 'score': None, 'meta': {'name': '../data/150723_Kenya_First NDC.pdf', '_split_id': 0}, 'embedding': None, 'id': '83a7752488a4ac0fbadcf94486d6c2e0'}>,
 <Document: {'content': 'The  situation  is  exacerbated  by  the  high\ndependence on climate sensitive natural resources. In response to the challenges posed by\nClimate  Change,  Kenya  has  developed  a  National  Climate  Change  Response  Strategy\n(NCCRS  2010),  National  Climate  Change  Action  Plan  (NCCAP  2013),  and  a  National\nAdaptation Plan (NAP) - under preparation which provides a vision for low