# Data Preparation - parsing paragraphs from all PDFs

TODO: rename the input file, work on entreprises.csv, and do this for all files.

In [4]:
# general imports
from pathlib import Path
import os

# processing imports
import pandas as pd
from tqdm.notebook import tqdm

# pdfminer imports
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTPage, LTChar, LTAnno, LAParams, LTTextBox, LTTextLine
from difflib import SequenceMatcher

In [35]:
# utils elements to move to utils after development

def getListOfFiles(dirName):
    '''
        Walk the directory tree rooted at dirName and return every file
        found in it (at any depth) as a list of Path objects.
    '''
    return [
        Path(folder + "/" + filename)
        for folder, _, filenames in os.walk(dirName)
        for filename in filenames
    ]


class PDFPageDetailedAggregator(PDFPageAggregator):
    """Page aggregator that also records every non-empty text line it sees.

    Each page handed to ``receive_layout`` adds tuples
    ``(page_number, bbox[0], bbox[1], bbox[2], bbox[3], text)`` to
    ``self.rows``. ``page_number`` counts the pages received by this device,
    starting at 0, and ``bbox`` is the bounding box of the text line
    (x1, y1, x2, y2).
    """

    def __init__(self, rsrcmgr, pageno=1, laparams=None):
        super().__init__(rsrcmgr, pageno=pageno, laparams=laparams)
        self.rows = []        # one tuple per text line, see class docstring
        self.page_number = 0  # index of the next page this device will receive

    def receive_layout(self, ltpage):
        """Collect the text lines of ``ltpage`` into ``self.rows``.

        After the page is collected, ``self.rows`` is re-sorted by page number
        and then by decreasing ``bbox[1]``, and the page is stored in
        ``self.result``.
        """
        def render(item, page_number):
            if isinstance(item, (LTPage, LTTextBox)):
                # Containers: descend until text lines are reached.
                for child in item:
                    render(child, page_number)
            elif isinstance(item, LTTextLine):
                raw_text = ''.join(
                    child.get_text()
                    for child in item
                    if isinstance(child, (LTChar, LTAnno))
                )
                # Collapse any run of whitespace into a single space.
                child_str = ' '.join(raw_text.split())
                if child_str:
                    # bbox == (x1, y1, x2, y2)
                    self.rows.append((page_number,
                                      item.bbox[0], item.bbox[1],
                                      item.bbox[2], item.bbox[3],
                                      child_str))
                # No recursion below a text line: its children are characters
                # and annotations, which render() has nothing to do with.

        render(ltpage, self.page_number)
        self.page_number += 1
        self.rows = sorted(self.rows, key=lambda row: (row[0], -row[2]))
        self.result = ltpage

from collections import OrderedDict
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams

# TODO: deal with words cut in half "pro- pagation of..."
def _split_column_into_paragraphs(lines):
    """Merge the text lines of one column into paragraphs.

    A line is attached to the running paragraph when the vertical gap to it
    is smaller than the height of the shorter of the two neighbouring lines;
    otherwise it starts a new paragraph.

    :param lines: non-empty list of ``(y_min, y_max, text)`` tuples
    :return: list of dicts with keys ``y_min``, ``y_max`` and ``paragraph``,
        ordered by decreasing y
    """
    # Sort vertically: a higher y comes first.
    lines = sorted(lines, key=lambda line: line[0], reverse=True)

    first_y_min, first_y_max, first_text = lines[0]
    current = {"y_min": first_y_min, "y_max": first_y_max, "paragraph": first_text}
    previous_height = first_y_max - first_y_min
    paragraphs = []
    for y_min, y_max, text in lines[1:]:
        current_height = y_max - y_min
        # Take the smaller height when they differ, to account for titles.
        min_height = min(previous_height, current_height)
        if current["y_min"] - y_max < min_height:
            # Close enough: extend the running paragraph downwards.
            current["y_min"] = y_min
            current["paragraph"] = current["paragraph"] + " " + text
        else:
            paragraphs.append(current)
            current = {"y_min": y_min, "y_max": y_max, "paragraph": text}
        previous_height = current_height
    paragraphs.append(current)  # last paragraph of the column
    return paragraphs


def convert(input_file, rse_ranges):
    """
    Extract the paragraphs found in a page range of a PDF.

    Text lines are grouped per page and per column (lines whose rounded left
    x coordinate falls in the same 50-unit bucket), then each column is split
    into paragraphs.

    :param input_file: PDF filename
    :param rse_ranges: (nb_first_page_rse:int, nb_last_page_rse:int), starting at 1
    :return: list of dicts with keys ``paragraph_id`` (running index starting
        at 0), ``page_nb`` (zero-based index of the page in the document),
        ``x_group``, ``y_min_paragraph``, ``y_max_paragraph`` (rounded) and
        ``paragraph``
    """
    first_page_index = rse_ranges[0] - 1
    pages_selection = range(first_page_index, rse_ranges[1])

    rsrcmgr = PDFResourceManager()
    device = PDFPageDetailedAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager guarantees the PDF is closed, even on a parse error.
    with open(input_file, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        # doc.initialize("passwrd") # leave empty for no password
        for nb_page_parsed, page in enumerate(PDFPage.create_pages(doc)):
            if nb_page_parsed >= pages_selection.stop:
                break  # past the last requested page: nothing left to collect
            if nb_page_parsed in pages_selection:
                # The device appends this page's text lines to device.rows.
                interpreter.process_page(page)

    # GROUPING BY COLUMN
    column_text = OrderedDict()  # keeps the order of identification in the document
    for device_page_nb, x_min, y_min, _, y_max, text in device.rows:
        # The device numbers the pages it received from 0; shift them back.
        page_nb = first_page_index + device_page_nb
        # With three paragraphs side by side the shift is ~170, rightmost x ~600.
        x_group = round(x_min) // 50
        column_text.setdefault(page_nb, {}).setdefault(x_group, []).append((y_min, y_max, text))

    # CREATE THE PARAGRAPHS IN EACH COLUMN
    grouped_data_final = []
    for page_nb, x_groups_dict in column_text.items():
        for x_group_name, x_groups_data in x_groups_dict.items():
            for p in _split_column_into_paragraphs(x_groups_data):
                grouped_data_final.append({"paragraph_id": len(grouped_data_final),
                                           "page_nb": page_nb,
                                           # column of THIS paragraph
                                           "x_group": x_group_name,
                                           "y_min_paragraph": round(p["y_min"]),
                                           "y_max_paragraph": round(p["y_max"]),
                                           "paragraph": p["paragraph"]})
    return grouped_data_final


def similar(a, b):
    """Return the similarity ratio of two strings, a value between 0 and 1."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()


def cut_footer(Df, p, siren, verbose=False):
    """Blank out, in place, the repeated footer paragraphs of one company.

    The paragraphs of the company sitting at its lowest ``y_min_paragraph``
    are compared with ``similar``. When there are at least two of them and
    each one has a similarity of at least ``p`` with the first, they are
    treated as a footer and cut. The check is then repeated on the next
    lowest height, until a height fails the test.

    :param Df: DataFrame with the columns ``SIREN``, ``y_min_paragraph`` and
        ``paragraph`` (``denomination`` and ``page_nb`` are also read when
        ``verbose`` is set); modified in place
    :param p: minimal similarity ratio (between 0 and 1) for two paragraphs
        to count as the same footer
    :param siren: value of the ``SIREN`` column selecting the company
    :param verbose: print what was cut and the lowest paragraph that was kept
    :return: None. Errors are reported on stdout instead of being raised.
    """
    try:
        footers = []
        while True:
            company_rows = Df[Df['SIREN'] == siren]
            y_footer = company_rows['y_min_paragraph'].min()
            candidates = company_rows[company_rows['y_min_paragraph'] == y_footer]['paragraph'].values
            if len(candidates) < 2:
                break  # a single paragraph (or none) cannot be a repeated footer
            reference = str(candidates[0])
            if any(similar(reference, str(other)) < p for other in candidates[1:]):
                break  # the paragraphs differ: real content, keep it
            footers.append(candidates[0])
            # The right-hand side lacks the footer rows; pandas aligns on the
            # index, so those rows are expected to end up filled with NaN in Df.
            Df[Df["SIREN"] == siren] = Df[(Df["SIREN"] == siren) & (Df['y_min_paragraph'] > y_footer)]

        # Below part is for human check that the function works properly
        if verbose:
            print("Denomination:", *company_rows['denomination'].unique(), siren)
            if footers:
                print("Footer(s) --->", *footers)
            lowest_kept = company_rows[company_rows['y_min_paragraph'] == y_footer]
            print("Not footer --->",
                  lowest_kept['paragraph'].values[:1][0][:50],
                  " - Page", *lowest_kept['page_nb'].values[:1])
    except Exception as err:
        # Best effort: one failing company must not stop the others,
        # but the cause is shown so it can be investigated.
        print("Error with SIREN =", siren, repr(err))
    
def cut_header(Df, p, siren, verbose=False):
    """Blank out, in place, the repeated header paragraphs of one company.

    The paragraphs of the company sitting at its highest ``y_max_paragraph``
    are compared with ``similar``. When there are at least two of them and
    each one has a similarity of at least ``p`` with the first, they are
    treated as a header and cut. The check is then repeated on the next
    highest height, until a height fails the test.

    :param Df: DataFrame with the columns ``SIREN``, ``y_max_paragraph`` and
        ``paragraph`` (``page_nb`` is also read when ``verbose`` is set);
        modified in place
    :param p: minimal similarity ratio (between 0 and 1) for two paragraphs
        to count as the same header
    :param siren: value of the ``SIREN`` column selecting the company
    :param verbose: print what was cut and the highest paragraph that was kept
    :return: None. Errors are reported on stdout instead of being raised.
    """
    try:
        headers = []
        while True:
            company_rows = Df[Df['SIREN'] == siren]
            y_header = company_rows['y_max_paragraph'].max()
            candidates = company_rows[company_rows['y_max_paragraph'] == y_header]['paragraph'].values
            if len(candidates) < 2:
                break  # a single paragraph (or none) cannot be a repeated header
            reference = str(candidates[0])
            if any(similar(reference, str(other)) < p for other in candidates[1:]):
                break  # the paragraphs differ: real content, keep it
            headers.append(candidates[0])
            # The right-hand side lacks the header rows; pandas aligns on the
            # index, so those rows are expected to end up filled with NaN in Df.
            Df[Df["SIREN"] == siren] = Df[(Df["SIREN"] == siren) & (Df['y_max_paragraph'] < y_header)]

        # Below part is for human check that the function works properly
        if verbose:
            if headers:
                print("Header(s) --->", headers)
            highest_kept = company_rows[company_rows['y_max_paragraph'] == y_header]
            print("Not header --->",
                  highest_kept['paragraph'].values[:1][0][:50],
                  " - Page", *highest_kept['page_nb'].values[:1], '\n')
    except Exception as err:
        # Best effort: one failing company must not stop the others,
        # but the cause is shown so it can be investigated.
        print("Error with SIREN =", siren, repr(err), '\n')

In [38]:
# Parse the RSE page ranges of every PDF into paragraphs, cut the repeated
# footers/headers of each company, and write the result to one CSV file.
import ast  # safe parsing of the page ranges stored as text in the CSV

# filepaths
entreprises_filename = "../../data/input/Entreprises/entreprises.csv"
input_path = "../../data/input/DPEFs/"
output_filename= "../../data/processed/DPEFs/dpef_paragraphs.csv"

# Entreprises : data
dict_entreprises = pd.read_csv(entreprises_filename, sep=";").set_index("project_denomination").T.to_dict()
# Looks like 'vinci': {'SIREN': 552037806, 'denomination': 'VINCI', 'rse_ranges':(38,48)|(207,266)},

# DPEF: every PDF below input_path, whatever the case of its extension
all_input_files = getListOfFiles(input_path)
all_input_files = [p for p in all_input_files if p.name.lower().endswith(".pdf")]

# output has shape...
output_columns = ["SIREN",
                  "project_denomination",
                  "denomination",
                  "pdf_name",
                  "page_nb",
                  "paragraph_id", # x_min for now
                  "paragraph",
                  "x_group",
                  "y_min_paragraph",
                  "y_max_paragraph"]

# Rows are gathered in a plain list and turned into a DataFrame once:
# growing a DataFrame row by row copies it each time, and DataFrame.append
# no longer exists in pandas >= 2.0.
parsed_rows = []
# tqdm wraps the list itself (not the enumerate) so that it knows the total.
for i, input_file in enumerate(tqdm(all_input_files)):
    pdf_name = input_file.name.split("\\")[-1]
    project_denomination = pdf_name.split("_")[0] # first word of pdf name
    entreprise = dict_entreprises[project_denomination]
    print("Processing {}/{} {} [{}]".format(i+1,len(all_input_files),project_denomination, input_file.name))
    for rse_ranges in entreprise["rse_ranges"].split("|"):
        # "(38,48)" -> (38, 48); literal_eval only accepts Python literals,
        # so text coming from the CSV can never execute code (unlike eval).
        rse_ranges = ast.literal_eval(rse_ranges)
        print("Pages: {} to {}".format(rse_ranges[0], rse_ranges[1]))
        for paragraph_data in convert(input_file, rse_ranges):
            row = {"SIREN": entreprise["SIREN"],
                   "denomination": entreprise["denomination"],
                   "project_denomination": project_denomination,
                   "pdf_name": pdf_name}
            row.update(paragraph_data)
            parsed_rows.append(row)

df_parsed_data = pd.DataFrame(parsed_rows, columns=output_columns)

for siren in df_parsed_data["SIREN"].unique():
    cut_footer(df_parsed_data, 0.7, siren)
    cut_header(df_parsed_data, 0.7, siren)
# Drop the rows without paragraph (presumably the rows blanked by the cutters).
df_parsed_data = df_parsed_data[df_parsed_data.paragraph.notna()]
df_parsed_data.to_csv(output_filename,sep=";", index=False)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Processing 0/14 michelin [michelin_2018_ddr.pdf]
Pages: 204 to 282
Processing 1/14 bouygues [bouygues_2018_ddr.pdf]
Pages: 98 to 124
Processing 2/14 eiffage [eiffage_2018_ddr.pdf]
Pages: 125 to 202
Processing 3/14 saintgobain [saintgobain_2018_ddr.pdf]
Pages: 76 to 79
Pages: 101 to 104
Pages: 329 to 332
Processing 4/14 vinci [vinci_2018_ddr.pdf]
Pages: 38 to 48
Pages: 207 to 266
Processing 5/14 edf [edf_2018_ddr.pdf]
Pages: 149 to 236
Processing 6/14 engie [engie_2018_ddr.pdf]
Pages: 63 to 110
Processing 7/14 orano [orano_2018_ddr.pdf]
Pages: 61 to 78
Processing 8/14 total [total_2018_ddr.pdf]
Pages: 179 to 226
Processing 9/14 auchanholding [auchanholding_2018_ddr.pdf]
Pages: 115 to 165
Processing 10/14 carrefour [carrefour_2018_ddr.pdf]
Pages: 39 to 130
Processing 11/14 casino [casino_2018_dpef.pdf]
Pages: 1 to 76
Processing 12/14 scaouest [scaouest_2018_dpef.pdf]
Pages: 1 to 39
Processing 13/14 lvmh [lvmh_2018_dpef.pdf]
Pages: 1 to 54



In [78]:
# df_parsed_data.groupby("project_denomination").size()