Actions performed:
- PDF-to-text conversion with PDFMiner, for all input PDFs
- From the extracted text, description using nltk
- Summarization using nltk

In [1]:
# general imports
from pathlib import Path
import os

# processing imports
import pandas as pd

In [72]:
# utils elements to move to utils after development

def getListOfFiles(dirName):
    '''
        For the given path, get the list of all files in the directory tree.

        Walks dirName recursively with os.walk and returns one pathlib.Path per
        file found (directories themselves are not listed). Returns an empty
        list when dirName contains no files or does not exist.
    '''
    paths = []
    for path, _subdirs, files in os.walk(dirName):
        # os.walk reports directories without a trailing separator, so the
        # directory and file name must be joined as path components rather
        # than concatenated as strings.
        directory = Path(path)
        paths.extend(directory / name for name in files)
    return paths

from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTPage, LTChar, LTAnno, LAParams, LTTextBox, LTTextLine

class PDFPageDetailedAggregator(PDFPageAggregator):
    '''
        Page aggregator that additionally records every non-empty text line.

        Each recorded row is a tuple
        (page_number, bbox[0], bbox[1], bbox[2], bbox[3], text)
        appended to self.rows. page_number is a counter starting at 0 that is
        incremented once per call to receive_layout.
    '''

    def __init__(self, rsrcmgr, pageno=1, laparams=None):
        PDFPageAggregator.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
        self.rows = []
        self.page_number = 0

    def receive_layout(self, ltpage):
        '''
            Collect the text lines of ltpage into self.rows, then keep the
            page as self.result.
        '''
        def render(item, page_number):
            # Containers: descend into their children.
            if isinstance(item, (LTPage, LTTextBox)):
                for child in item:
                    render(child, page_number)
                return
            # Anything that is not a text line is ignored.
            if not isinstance(item, LTTextLine):
                return
            fragments = [
                child.get_text()
                for child in item
                if isinstance(child, (LTChar, LTAnno))
            ]
            # Collapse every run of whitespace into a single space.
            line_text = ' '.join(''.join(fragments).split()).strip()
            if line_text:
                bbox = item.bbox  # bbox == (x1, y1, x2, y2)
                self.rows.append(
                    (page_number, bbox[0], bbox[1], bbox[2], bbox[3], line_text)
                )
            for child in item:
                render(child, page_number)

        render(ltpage, self.page_number)
        self.page_number += 1
        # Order rows by page, then by descending second bbox coordinate.
        self.rows = sorted(self.rows, key=lambda row: (row[0], -row[2]))
        self.result = ltpage

from collections import OrderedDict
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams

# TODO: deal with words cut in half "pro- pagation of..."
def convert(input_file):
    '''
        Extract the text of a PDF file.

        Every page is run through a PDFPageDetailedAggregator; the text lines
        it records are grouped per page by their left x coordinate (bucketed
        by tens) and then concatenated.

        Parameters:
            input_file: path of the PDF to read (anything accepted by open()).

        Returns:
            (pdf_full_txt, text_by_page) where text_by_page is an OrderedDict
            mapping page number (0-based, only pages that produced text) to
            that page's text, and pdf_full_txt is all page texts concatenated,
            each followed by a single space.
    '''
    # The context manager closes the file even when parsing raises.
    with open(input_file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        # doc.initialize("passwrd") # leave empty for no password

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageDetailedAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            # receive the LTPage object for this page
            device.get_result()

    # GROUPING
    # Pages are created on demand, so there is no upper bound on the number
    # of pages. device.rows is sorted by page, so insertion order is page order.
    grouped_text = OrderedDict()
    for (page_nb, x_min, _y_min, _x_max, _y_max, text) in device.rows:
        # manipulate the level of aggregation --> x_min might be slightly different
        column = round(x_min) // 10
        columns = grouped_text.setdefault(page_nb, {})
        if column in columns:
            columns[column] += " " + text
        else:
            columns[column] = text

    # FULL TEXT BY PAGE
    text_by_page = OrderedDict()
    for page_nb, columns in grouped_text.items():
        text_by_page[page_nb] = "".join(
            column_text + " " for column_text in columns.values()
        )

    # FULL TEXT
    pdf_full_txt = "".join(page_text + " " for page_text in text_by_page.values())

    return (pdf_full_txt, text_by_page)

In [75]:
# start with one pdf
# Folder scanned for input PDFs, and the folder meant to receive the results.
input_path = "../../data/input/DPEFs/Construction/"
output_path = "../../data/processed/DPEFs/"
# Keep only the files whose name ends in ".pdf" (case-insensitive).
all_input_files = [
    candidate
    for candidate in getListOfFiles(input_path)
    if candidate.name.lower().endswith(".pdf")
]
# input_filename = "Construction/vinci-rapport-annuel-2018.pdf"
# output_filename = input_filename.replace(".pdf",".txt")
# input_filename = Path(input_path+input_filename)
# output_filename = Path(output_path+output_filename)

In [76]:
# TODO: deal with the MICHELIN file, on which the conversion function hangs...
# According to the answer in https://stackoverflow.com/questions/53219016/detecting-sections-of-a-pdf-with-pdfminer
# it is not possible to parse the content of a randomly structured pdf.
# Convert every input PDF to a UTF-8 text file in a mirrored "processed" tree.
for i, input_file in enumerate(all_input_files):
    # Case-insensitive, so that ".PDF" files are not silently skipped.
    if not input_file.name.lower().endswith(".pdf"):
        continue
    print("Processing {}/{} - file: {}".format(i,len(all_input_files),input_file.name))
    pdf_full_txt, text_by_page = convert(input_file)
    # create the output folder if needed
    # Only the directory part is rewritten, so a file name that happens to
    # contain "input" is left untouched.
    output_dir = Path(str(input_file.parent).replace("input","processed"))
    output_dir.mkdir(parents=True, exist_ok=True)
    # Swap only the final extension for ".txt".
    output_file = output_dir / input_file.with_suffix(".txt").name
    # The context manager closes the file even if the write fails.
    with open(output_file, "w", encoding="utf-8") as fileConverted:
        fileConverted.write(pdf_full_txt)
#         if i == 0:
#             break # remove to do all of them

Processing 0/4 - file: all.pdf
Processing 1/4 - file: ddr2018_vf.pdf
Processing 2/4 - file: document-de-reference-2018_v2.pdf
Processing 3/4 - file: vinci-rapport-annuel-2018.pdf


In [95]:
# Summarize the text stored under page key 206 of text_by_page (the value left
# behind by the most recent convert() call in this session).
# NOTE(review): `summarize` is not imported in this cell -- the import below is
# commented out, so the name must already exist in the session (presumably from
# summa.summarizer -- TODO confirm), otherwise this line raises NameError.
# from summa.summarizer import summarize
summarize(text_by_page[206], words=60, split=True, language="french")

['Advance, questionnaire d’autoévaluation développement durable, qui permet aux managers de passer en revue les thématiques sociales, environnementales et sociétales du Groupe et de prendre des décisions stratégiques sur leurs orientations, a été développé par VINCI, conformément aux principes du Global Compact, des conventions fondamentales de l’Organisation internationale du travail (OIT) et de la norme ISO 26000.']

In [85]:
# Number of characters in pdf_full_txt, i.e. the full text returned by the
# most recent convert() call in this session.
len(pdf_full_txt)

1547176