In [9]:
import glob, os, sys; sys.path.append('../src')
#import helper
import preprocessing as pre
import cleaning as clean

from typing import Callable, Dict, List, Optional

from pathlib import Path
import re
import logging
import string 

logger = logging.getLogger(__name__)

from haystack.utils import convert_files_to_docs, fetch_archive_from_http
from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter
from haystack.schema import Document
import pdfplumber

In [10]:
file = "../data/150723_Kenya_First NDC.pdf"

In [3]:
file = "../data/Research_Project_Proposal_15-04-2022.pdf"

In [4]:
file = "../data/test_docx.docx"

In [11]:
def load_document(
    file: str,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    
    """
    takes docx, txt and pdf files as input and extracts text as well as the filename as metadata. Since haystack
    does not take care of all pdf files, pdfplumber is attached to the pipeline in case the pdf extraction fails
    via Haystack.

    Returns a list of type haystack.schema.Document
    """

    if file.endswith('.pdf'):
        converter = PDFToTextConverter(remove_numeric_tables=True)
    if file.endswith('.txt'):
        converter = TextConverter()
    if file.endswith('.docx'):
        converter = DocxToTextConverter()

    print(converter)
    documents = []

    logger.info("Converting {}".format(file))
    # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single Document
    document = converter.convert(
                file_path=file, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
            )[0]
    text = document.content
    documents.append(Document(content=text, meta={"name": file}, id_hash_keys=id_hash_keys))
    
    '''check if text is empty and apply different pdf processor. This can happen whith certain pdf types.'''
    for i in documents: 
        if i.content == "":
            text = []
            with pdfplumber.open(file) as pdf:
                for page in pdf.pages:
                    text.append(page.extract_text())
            i.content = ' '.join([page for page in text])
        
    return documents

In [13]:
docs = load_document(file)

pdftotext version 4.04 [www.xpdfreader.com]
Copyright 1996-2022 Glyph & Cog, LLC
pdftotext version 4.04 [www.xpdfreader.com]
Copyright 1996-2022 Glyph & Cog, LLC
Usage: pdftotext [options] <PDF-file> [<text-file>]
  -f <int>               : first page to convert
  -l <int>               : last page to convert
  -layout                : maintain original physical layout
  -simple                : simple one-column page layout
  -simple2               : simple one-column page layout, version 2
  -table                 : similar to -layout, but optimized for tables
  -lineprinter           : use strict fixed-pitch/height layout
  -raw                   : keep strings in content stream order
  -fixed <number>        : assume fixed-pitch (or tabular) text
  -linespacing <number>  : fixed line spacing for LinePrinter mode
  -clip                  : separate clipped text
  -nodiag                : discard diagonal text
  -enc <string>          : output text encoding name
  -eol <string>      

<haystack.nodes.file_converter.pdf.PDFToTextConverter object at 0x7fbd4a0215e0>


In [14]:
docs_processed = clean.preprocessing(docs)

100%|██████████| 1/1 [00:00<00:00, 71.83docs/s]


In [26]:
import pandas as pd
df = pd.DataFrame(docs_processed)
df

Unnamed: 0,content,content_type,id,meta,score,embedding
0,ministry of environment and natural resources kenya s intended nationally de...,text,83a7752488a4ac0fbadcf94486d6c2e0,"{'name': '../data/150723_Kenya_First NDC.pdf', '_split_id': 0}",,
1,in response to the challenges posed by climate change kenya has developed a ...,text,6da8b06b5ce8d49d65f99701cc9a8f26,"{'name': '../data/150723_Kenya_First NDC.pdf', '_split_id': 1}",,
2,kenya is operationalising these policies and plans through the implementatio...,text,249f5bb39fde31b215d2da1e94bf602c,"{'name': '../data/150723_Kenya_First NDC.pdf', '_split_id': 2}",,
3,kenya s indc builds on the participatory multistakeholder and crosssectoral ...,text,75fbc8c6fcf8779011fae1af85d4d1cf,"{'name': '../data/150723_Kenya_First NDC.pdf', '_split_id': 3}",,
4,climate hazards have caused considerable losses across the country s differe...,text,c54b899f567aa012c31356ba142072de,"{'name': '../data/150723_Kenya_First NDC.pdf', '_split_id': 4}",,
5,the other significant emissions are from the energy and transport sectors wi...,text,80f55eaf04e76d6331bebe7d1f559e3c,"{'name': '../data/150723_Kenya_First NDC.pdf', '_split_id': 5}",,
6,contribution kenya s indc includes both mitigation and adaptation components...,text,7859cdb3e2b8629de340dae25c946081,"{'name': '../data/150723_Kenya_First NDC.pdf', '_split_id': 6}",,
7,promotion and implementation of the this is also subject to international su...,text,150b4e096478a75a32b76302d02ef393,"{'name': '../data/150723_Kenya_First NDC.pdf', '_split_id': 7}",,
8,sustainable waste management systems ministry of environment and natural res...,text,90c5ba796592ecbb1a4229ef304559cb,"{'name': '../data/150723_Kenya_First NDC.pdf', '_split_id': 8}",,
9,assumptions and methodological approaches methodology for the ipcc revised g...,text,ba5b2d699fa84a330659ca18bcf5225d,"{'name': '../data/150723_Kenya_First NDC.pdf', '_split_id': 9}",,


In [27]:
all_text = "".join(df.content.to_list())

In [28]:
all_text

'ministry of environment and natural resources kenya s intended nationally determined contribution indc july introduction kenya like other countries in the region is bearing the brunt of climate change impacts and the associated socioeconomic losses the situation is exacerbated by the high dependence on climate sensitive natural resourcesin response to the challenges posed by climate change kenya has developed a national climate change response strategy nccrs national climate change action plan nccap and a national adaptation plan nap under preparation which provides a vision for low carbon and climate resilient development pathway while a national climate change framework policy and legislation are in their final stages of enactment to facilitate effective response to climate changekenya is operationalising these policies and plans through the implementation of climate change actions in various areas such as afforestation and reforestation geothermal and other clean energy development