# Problem 1-2 PDF Conversion and Regular Expressions

## 2. Quantitative comparison via SequenceMatcher.ratio()

Convert PDFs to plain text

To use pdftotext the following dependencies need to be installed: `sudo apt-get install build-essential libpoppler-cpp-dev pkg-config python-dev`

In [1]:
import pdftotext
from PyPDF2 import PdfFileReader
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

In [2]:
input_file_path = 'data/phone_numbers/FL_SYB_BetriebsaerztlicherDienst_ID8414.pdf'
output_path_pdfminer = 'comparison_txt/pdfminer - FL_SYB_BetriebsaerztlicherDienst_ID8414.txt'
output_path_pdftotext = 'comparison_txt/pdftotext - FL_SYB_BetriebsaerztlicherDienst_ID8414.txt'
output_path_pypdf2 = 'comparison_txt/pypdf2 - FL_SYB_BetriebsaerztlicherDienst_ID8414.txt'

def run_pdfminer(input_file_path):
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    
    with open(input_file_path, "rb") as input_file:
        for page in PDFPage.get_pages(input_file):
            interpreter.process_page(page)    
        converter.close()
        text = output.getvalue()
    return text
    
pdfminer = run_pdfminer(input_file_path)

def run_pdftotext(input_path_file):
    with open(input_file_path, "rb") as input_file:
        text = pdftotext.PDF(input_file)
    return text

pdftotext = run_pdftotext(input_file_path)

def run_pypdf2(input_file_path):
    with open(input_file_path, "rb") as input_file:
        pypdf2reader = PdfFileReader(input_file,strict=False)
        totalPageNumber = pypdf2reader.numPages
        currentPageNumber = 0
        pdf_content = []
        while (currentPageNumber < totalPageNumber):
            page = pypdf2reader.getPage(currentPageNumber)
            text = page.extractText()
            pdf_content.append(text)
            currentPageNumber += 1
    return pdf_content

pypdf2 = run_pypdf2(input_file_path)

def write_file(output_file, content):
    output_file = open(output_file, 'w')
    output_file.write(content)
    output_file.close()

write_file(output_path_pdftotext, "\n\n".join(pdftotext))
write_file(output_path_pypdf2, "\n\n".join(pypdf2))
write_file(output_path_pdfminer, pdfminer)

NameError: name 'StringIO' is not defined

Compare results of pdf to text conversion

In [3]:
import difflib

In [4]:
# read files to strings
filename = " - FL_SYB_BetriebsaerztlicherDienst_ID8414.txt"

online = open('comparison_txt/online pdftotext' + filename, mode='r', encoding="utf8").read()
pdfminer = open('comparison_txt/pdfminer' + filename, mode='r', encoding="utf8").read()
pdftotext = open('comparison_txt/pdftotext' + filename, mode='r', encoding="utf8").read()
pypdf2 = open('comparison_txt/pypdf2' + filename, mode='r', encoding="utf8").read()

In [5]:
baseline = difflib.SequenceMatcher(None, online, online).ratio()
pdfminer_ratio = difflib.SequenceMatcher(None, pdfminer, online).ratio()
pdftotext_ratio = difflib.SequenceMatcher(None, pdftotext, online).ratio()
pypdf2_ratio = difflib.SequenceMatcher(None, pypdf2, online).ratio()

print("baseline:", baseline)
print("pdfminer:", pdfminer_ratio)
print("pdftotext:", pdftotext_ratio)
print("pypdf2:", pypdf2_ratio)

baseline: 1.0
pdfminer: 0.4150660264105642
pdftotext: 0.19538226965802657
pypdf2: 0.39060718899612434


## 4. Regex Extractions

### (i)

In [6]:
import os
import re

directory = "processed_txt/phone_numbers"
phone_numbers = r"\(?\d[\d\ \/\-\–)]{6,}\d"

file = open("extracted_phone_numbers.txt", "w")

for filename in os.listdir(directory):
  lines = open(os.path.join(directory, filename)).readlines()
  for line in lines:
    result = re.search(phone_numbers, line)
    if result:
        phone_number = result.group()
        phone_number = phone_number.replace(" ", "")
        phone_number = phone_number.replace("/", "")
        phone_number = phone_number.replace("-", "")
        phone_number = phone_number.replace("–", "")
        phone_number = phone_number.replace("(", "")
        phone_number = phone_number.replace(")", "")
        file.write(phone_number + "\n")
        print(phone_number)
        
file.close()

06221560
06221566220
06221568781
06221568080
06221567211
06221566996
06221567856
06221567971
06221567011
06221562319
06221564467
06221566351
06221564878
062215638888
062215638961
062215638961
062215638961
06221566254
06221566604
06221566655
06221566604
06221566613
06221566625
06221562898
06221566613
06221566695
06221566695
06221566619
06221566613
06221566613
06221566642
06221564573
06221566634
06221564807
06221566201
06221564878
06221566202
06221566220
06221566220
06221566215
06221566209
06221567807
5638421
06221566209
06221566209
062215636217
06221566252
06221566223
06221566220
06221566223
062215638888
06221566323
06221566538
062215636467
016090860372
06221566224
06221566225
06221566281
015117408217
062215639580
06221566282
06221566220
06221566220
06221566282
06221566283
06221566272
06221566272
06221566272
06221566276
06221566110
06221566249
062215639105
06221566249
06221566220
06221566220
06221566226
06221566226
06221566226
06221566226
06221566226
06221566321
06221567501
5636470
0622

### (ii)


In [7]:
import os
import re

directory = "processed_txt/phone_numbers"
urls = r"www.[a-zA-Z0-9+\-&@#\/%?=~_|!:.]*"

file = open("extracted_urls.txt", "w")

for filename in os.listdir(directory):
  lines = open(os.path.join(directory, filename)).readlines()
  for line in lines:
    result = re.search(urls, line)
    if result:
        url = result.group()
        file.write(url + "\n")
             
file.close()

In [8]:
import os
import re

directory = "processed_txt/phone_numbers"
emails = r"[a-zA-Z0-9\-_.]+@[a-zA-Z0-9\-_.]*\.[a-zA-Z0-9\-]+"

file = open("extracted_emails.txt", "w")

for filename in os.listdir(directory):
  lines = open(os.path.join(directory, filename)).readlines()
  for line in lines:
    result = re.search(emails, line)
    if result:
        email = result.group()
        file.write(email + "\n")
        
file.close()

### (iii)

In [9]:
import os
import re

isbn_path = "processed_txt/isbn/gelbe_seiten_2019-43.txt"
isbn_numbers = r"^(?=(?:\D*\d){10}(?:(?:\D*\d){3})?$)(978|979)[\d-]+$"

file = open("extracted_isbn.txt", mode="w", encoding="utf-8")

lines = open(isbn_path, encoding="utf-8").readlines()
for line in lines:
    result = re.search(isbn_numbers, line)
    if result:
        file.write(result.group() + "\n")
            
file.close()

### (iv)

In [19]:
import os
import regex as re

unit_path = "data/unit_conversion/si.txt"
file = open("converted_units.txt", "w")
regex = r"((\d*?)|(\d+?),)(\d{1,3})(\.\d+)?\s?(ml|milliliters)"

for line in open(unit_path, encoding="utf-8").readlines():
    result = re.search(regex, line)
    if result:
        unit = ""
        if (result.group(2)):
            unit += re.subf(regex, r"{2}", result.group())
        elif (result.group(3)):
            unit += re.subf(regex, r"{3}", result.group())
        else:
            unit = "0"
        
        unit += "."
        
        if len(result.group(4)) == 1:
            unit += re.subf(regex, r"00{4} l", result.group())
        elif len(result.group(4)) == 2:
            unit += re.subf(regex, r"0{4} l", result.group())
        else:
            unit += re.subf(regex, r"{4} l", result.group())
        file.write(unit + "\n")
        print(unit)
        
file.close()

1.337 l
2.500 l
0.012 l
18.421 l
8.321 l


##### 5.

In [11]:
import re
import os

directory = "processed_txt/scans"
phone_numbers = r"\(?\d[\d\ \/\-\–)]{6,}\d"

file = open("extracted_phone_numbers_scans.txt", "w")

for filename in os.listdir(directory):
    lines = open(os.path.join(directory, filename), encoding="utf-8").readlines()
    for line in lines:
        result = re.search(phone_numbers, line)
        if result:
            phone_number = result.group()
            phone_number = phone_number.replace(" ", "")
            phone_number = phone_number.replace("/", "")
            phone_number = phone_number.replace("-", "")
            phone_number = phone_number.replace("–", "")
            phone_number = phone_number.replace("(", "")
            phone_number = phone_number.replace(")", "")
            
            file.write(phone_number + "\n")
            
file.close()

It works quite okay with the scanned and ocr interpreted text. But some phone numbers are missed, because they have wrong characters recognized in between them.
Examples:
- "(06221) 43.41 49-0" is not recognized because of the point in between numbers
- "(0 62 21] 4 18 55 58" is not recognized because the closing bracket is

The extraction can be changed to take these into account, but one never can be sure to get all misinterpretations right.

Furthermore, one difficulty is, that a lot of the phone numbers don't have a prefix, because e.g. it is a prefix for all numbers on one page. 