# LaTeX PDF

In [21]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar

In [22]:
pdf = '../verslag/output/CluyseDylanBP.pdf'

## Gathering specific pages from a LaTeX-PDF

In [23]:
pdf

'../verslag/output/CluyseDylanBP.pdf'

In [24]:
all_pages = extract_pages(
    pdf,
    page_numbers=[30],
    maxpages=1
)

full_text = ""


for page_layout in all_pages:
    for element in page_layout:
        if isinstance(element, LTTextContainer):
            for text_line in element:
                full_text += text_line.get_text()

In [25]:
full_text[:200]

'2.4. De verschillende soorten tekstvereenvoudiging\n23\nFiguur (2.12)\nVoorbeeld van manuele tekstvereenvoudiging. Oorspronkelijke tekst uit Historia 5 bron toe te voegen\n2.4.1. Lexicale vereenvoudiging\n'

## Gathering different fonts from a LaTeX-PDF.

In [26]:
fonts = []

for page_layout in extract_pages(pdf):
    for element in page_layout:
        if isinstance(element, LTTextContainer):
            for text_line in element:
                for character in text_line:
                    if isinstance(character, LTChar):
                        if character.fontname not in fonts:
                            fonts.append(character.fontname)

In [27]:
fonts[:5]

['EZIVTI+Montserrat-Black',
 'CAYWFX+Montserrat-ExtraBold',
 'LWLRJE+Montserrat-Regular',
 'VSVKJZ+Montserrat-Bold',
 'RSVGBK+Montserrat-Italic']

In [28]:
full_text = []

for page_layout in extract_pages(pdf):
    for element in page_layout:
        if isinstance(element, LTTextContainer):
            for text_line in element:
                full_text.append(text_line)  

In [29]:
full_text[:10]

[<LTTextLineHorizontal 42.520,758.625,552.762,783.413 'Scholieren met dyslexie van de derde\n'>,
 <LTTextLineHorizontal 42.520,732.324,570.634,757.112 'graad middelbaar onderwijs ondersteu-\n'>,
 <LTTextLineHorizontal 42.520,706.022,557.571,730.810 'nen bij het lezen van wetenschappe-\n'>,
 <LTTextLineHorizontal 42.520,679.721,500.164,704.509 'lijke papers via tekstsimplificatie.\n'>,
 <LTTextLineHorizontal 42.520,655.321,234.541,672.536 'Optionele ondertitel.\n'>,
 <LTTextLineHorizontal 42.520,617.747,145.813,632.093 'Dylan Cluyse.\n'>,
 <LTTextLineHorizontal 42.520,237.350,355.928,248.259 'Scriptie voorgedragen tot het bekomen van de graad van\n'>,
 <LTTextLineHorizontal 42.520,221.091,326.615,232.000 'Professionele bachelor in de toegepaste informatica\n'>,
 <LTTextLineHorizontal 42.520,164.663,187.712,175.572 'Promotor: Mevr. L. De Mol\n'>,
 <LTTextLineHorizontal 42.520,148.404,261.829,159.313 'Co-promotor: J. Decorte; J. Van Damme\n'>]

## Analysis of a PDF document

In [30]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1

fp = open(pdf, 'rb')
parser = PDFParser(fp)
document = PDFDocument(parser)

arr_outlines = []
toc = []

if "Outlines" in document.catalog:
    outlines = document.get_outlines()
    for (level,title,dest,a,se) in outlines:
        arr_outlines.append([level, title])

In [31]:
document.is_extractable

True

In [32]:
document.catalog

{'OpenAction': [<PDFObjRef:3>, /'Fit'],
 'PageMode': /'UseOutlines',
 'PageLabels': {'Nums': [0, {'S': /'r'}, 8, {'S': /'D'}]},
 'Names': <PDFObjRef:1103>,
 'Outlines': <PDFObjRef:1104>,
 'Pages': <PDFObjRef:1170>,
 'Type': /'Catalog'}

In [33]:
arr_outlines[:10]

[[1, 'Lĳst van figuren'],
 [1, 'Inleiding'],
 [2, 'Probleemstelling'],
 [2, 'Onderzoeksvraag'],
 [2, 'Onderzoeksdoelstelling'],
 [2, 'Opzet van deze bachelorproef'],
 [1, 'Stand van zaken'],
 [2, 'Onderzoeken rond dyslexie'],
 [3, 'Centraal zicht op dyslexie'],
 [3, 'Fonologische dyslexie']]

# LaTeX (compact format)

In [34]:
pdf = "C:/Users/dylan/Downloads/bap/simplification-of-literary-and-scientific-texts-to-improve-reading-fluency-and-comprehension-in-beginning-readers-of-french.pdf"

## Gathering different fonts

In [35]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar

fonts = []

for page_layout in extract_pages(pdf):
    for element in page_layout:
        if isinstance(element, LTTextContainer):
            for text_line in element:
                for character in text_line:
                    if isinstance(character, LTChar):
                        if character.fontname not in fonts:
                            fonts.append(character.fontname)

In [36]:
fonts

['MOBAKO+AdvOT7d6df7ab.I',
 'MOBAII+AdvOT1ef757c0',
 'MOBAJM+AdvOTb65e897d.B',
 'MOBAJL+AdvOT1ef757c0+20',
 'MOBAIH+AdvOT031da8bf',
 'AAAHFE+NotoSans',
 'MOBBDN+AdvOT6a84369d.BI',
 'MOBAPO+AdvOT6b1ec377',
 'BKMJDE+AdvOT8608a8d1+21',
 'MOBHMM+AdvOT7d6df7ab.I+20',
 'APMKIN+AdvP4C4E51',
 'MOBBFO+AdvOTcabc3928.I',
 'MOBBOO+AdvOTa1b8d78f.I',
 'MOBBKO+AdvOT0e40dc65',
 'ALJJAJ+AdvOTa42dc7d3.B',
 'MOBBOP+AdvOT8608a8d1+22',
 'MOBBKP+AdvOT0e40dc65+20',
 'APMOLI+AdvP49C2A1',
 'ALKALC+AdvOTb65e897d.B+20',
 'APMKLP+AdvOT45bf69de.BI']

In [37]:
fp = open(pdf, 'rb')
parser = PDFParser(fp)
document = PDFDocument(parser)

arr_outlines = []

if "Outlines" in document.catalog:
    outlines = document.get_outlines()
    for (level,title,dest,a,se) in outlines:
        arr_outlines.append([level, title])

In [38]:
document.is_extractable

True

In [53]:
document.catalog

{'AcroForm': <PDFObjRef:610>,
 'Metadata': <PDFObjRef:210>,
 'Pages': <PDFObjRef:570>,
 'Type': /'Catalog'}

## Shell

In [None]:
import os

pdf2txt = "C:/_hogeschool-gent/pdfminer.six/tools/pdf2txt.py"
file = "C:/_hogeschool-gent/bachelorproef-nlp-tekstvereenvoudiging/verslag/output/CluyseDylanBP.pdf"

output_dir = ""
to_dir = f"-o {output_dir}test.txt"


os.system(pdf2txt + ' ' + file + ' ' + to_dir + '--extract-toc')

0