# Problem 1-2 PDF Conversion and Regular Expressions

## 2. Quantitative comparison via SequenceMatcher.ratio()

Convert PDFs to plain text

To use pdftotext the following dependencies need to be installed: `sudo apt-get install build-essential libpoppler-cpp-dev pkg-config python-dev`

In [101]:
import pdftotext
from PyPDF2 import PdfFileReader
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

In [102]:
# PDF that is converted by each of the three libraries below.
input_file_path = 'data/phone_numbers/FL_SYB_BetriebsaerztlicherDienst_ID8414.pdf'
# One text file per converter; these files are read back in the comparison cell.
output_path_pdfminer = 'comparison_txt/pdfminer - FL_SYB_BetriebsaerztlicherDienst_ID8414.txt'
output_path_pdftotext = 'comparison_txt/pdftotext - FL_SYB_BetriebsaerztlicherDienst_ID8414.txt'
output_path_pypdf2 = 'comparison_txt/pypdf2 - FL_SYB_BetriebsaerztlicherDienst_ID8414.txt'

def run_pdfminer(input_file_path):
    """Extract the text of a PDF with pdfminer.

    Args:
        input_file_path: Path of the PDF file to convert.

    Returns:
        The text collected from all pages as one string.
    """
    # Imported here because the notebook's import cell does not provide it.
    from io import StringIO

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    try:
        with open(input_file_path, "rb") as input_file:
            for page in PDFPage.get_pages(input_file):
                interpreter.process_page(page)
    finally:
        # Release the converter even when a page cannot be processed.
        converter.close()
    return output.getvalue()
    
# Text of the whole PDF as one string, as extracted by pdfminer.
pdfminer = run_pdfminer(input_file_path)

def run_pdftotext(input_path_file):
    """Load a PDF with pdftotext.

    Args:
        input_path_file: Path of the PDF file to convert.

    Returns:
        The ``pdftotext.PDF`` object built from the file.
    """
    # Open the path that was passed in rather than the notebook-level
    # ``input_file_path`` variable.
    with open(input_path_file, "rb") as input_file:
        return pdftotext.PDF(input_file)

# NOTE(review): this rebinds the name `pdftotext` from the imported module to
# the conversion result, so run_pdftotext() cannot be called a second time
# without re-running the import cell.
pdftotext = run_pdftotext(input_file_path)

def run_pypdf2(input_file_path):
    """Extract the text of every page of a PDF with PyPDF2.

    Args:
        input_file_path: Path of the PDF file to convert.

    Returns:
        A list with one extracted text per page, in page order.
    """
    with open(input_file_path, "rb") as pdf_handle:
        reader = PdfFileReader(pdf_handle, strict=False)
        # Pages have to be extracted while the file is still open.
        return [
            reader.getPage(page_index).extractText()
            for page_index in range(reader.numPages)
        ]

# List of page texts as extracted by PyPDF2.
pypdf2 = run_pypdf2(input_file_path)

def write_file(output_file, content):
    """Write a string to a text file, replacing any existing content.

    Args:
        output_file: Path of the file to write.
        content: Text to store in the file.
    """
    # UTF-8 is set explicitly because the comparison cell reads these files
    # back as UTF-8; the context manager closes the file even if writing fails.
    with open(output_file, 'w', encoding='utf-8') as handle:
        handle.write(content)

# pypdf2 is a list of page texts (pdftotext is presumably iterated page by
# page as well); the pages are separated by a blank line in the output file.
write_file(output_path_pdftotext, "\n\n".join(pdftotext))
write_file(output_path_pypdf2, "\n\n".join(pypdf2))
# The pdfminer result is already a single string.
write_file(output_path_pdfminer, pdfminer)

Compare results of pdf to text conversion

In [1]:
import difflib

In [2]:
# read files to strings
filename = " - FL_SYB_BetriebsaerztlicherDienst_ID8414.txt"


def _read_text(path):
    """Return the full content of the UTF-8 text file at *path*."""
    # The context manager closes the file as soon as it has been read.
    with open(path, mode='r', encoding="utf8") as handle:
        return handle.read()


# One string per conversion result; `online` is the reference text that the
# other three are compared against.
online = _read_text('comparison_txt/online pdftotext' + filename)
pdfminer = _read_text('comparison_txt/pdfminer' + filename)
pdftotext = _read_text('comparison_txt/pdftotext' + filename)
pypdf2 = _read_text('comparison_txt/pypdf2' + filename)

In [3]:
def _ratio_against_online(candidate):
    """Return the SequenceMatcher similarity of *candidate* to `online`."""
    # NOTE(review): the matcher's default junk heuristic is left enabled, as
    # in the recorded results; confirm that is intended for a
    # character-level comparison of long texts.
    matcher = difflib.SequenceMatcher(None, candidate, online)
    return matcher.ratio()


# Comparing the reference with itself yields the maximum score as a sanity check.
baseline = _ratio_against_online(online)
pdfminer_ratio = _ratio_against_online(pdfminer)
pdftotext_ratio = _ratio_against_online(pdftotext)
pypdf2_ratio = _ratio_against_online(pypdf2)

for label, score in (
    ("baseline:", baseline),
    ("pdfminer:", pdfminer_ratio),
    ("pdftotext:", pdftotext_ratio),
    ("pypdf2:", pypdf2_ratio),
):
    print(label, score)

baseline: 1.0
pdfminer: 0.41503488633805985
pdftotext: 0.19538226965802657
pypdf2: 0.3905145549897393


## 4. Regex Extractions

### (i)

In [2]:
import os
import re

# Extract the first phone number of every line of every file in `directory`,
# strip its separators and collect the results in extracted_phone_numbers.txt.
directory = "processed_txt/phone_numbers"
phone_numbers = r"\(?\d[\d\ \/\-\–)]{6,}\d"

# Compiled once instead of looking the pattern up again for every line.
phone_number_pattern = re.compile(phone_numbers)
# Deletes every separator character that the pattern allows inside a number.
separator_table = str.maketrans("", "", " /-–()")

# Context managers close the output file and each input file even on errors.
with open("extracted_phone_numbers.txt", "w") as file:
    for filename in os.listdir(directory):
        # UTF-8 is set explicitly so that the "–" in the pattern can match
        # regardless of the platform's default encoding.
        with open(os.path.join(directory, filename), encoding="utf-8") as text_file:
            for line in text_file:
                # NOTE(review): only the first match per line is extracted.
                result = phone_number_pattern.search(line)
                if result:
                    phone_number = result.group().translate(separator_table)
                    file.write(phone_number + "\n")
                    print(phone_number)

06221568966
06221568970
06221565734
06221560
0908
06221566752
0010
06221562337
06221564801
06221566307
06221567510
01798
06221566914
01798
06221564701
06221567611
06221560
0908
06221566752
0010
06221562337
06221564801
06221566307
06221567510
01798
06221566914
01798
06221564701
06221567611
06221567842
07261660
062213190
0726119292
06221884010
062216540263
07253825200
06223822278
062213190
0726119292
07215961209
0724891810
04069791715
09312501300
06221314633
070046746700
06221412481
0711925410
06221160563
0622125263
062213191532
06221882666
06221472135
062222341
06221412819
0622137030
062236060
06223488999
062215838390
062215838390
062215849000
062215837000
06221779188
062213881
0622178750
06221893700
06221149744
0622133900
062215849000
062215221257
062215825300
062215849000
08004050200
06221901037
06221160864
0622190100
0622133030
062215849000
5838390
0622133030
0622153750
06221901033
0622197830
06221160864
0622190100
0622133030
0622153750
06221720022
0622197830
06221182428
062215849000

### (ii)


In [5]:
import os
import re

# Extract the first "www." URL of every line of every file in `directory`
# and collect the results in extracted_urls.txt.
directory = "processed_txt/phone_numbers"
# The dot after "www" is escaped so that it only matches a literal "."
# (unescaped it would accept any character, e.g. "wwwx").
urls = r"www\.[a-zA-Z0-9+\-&@#\/%?=~_|!:.]*"

# Compiled once instead of looking the pattern up again for every line.
url_pattern = re.compile(urls)

# Context managers close the output file and each input file even on errors.
with open("extracted_urls.txt", "w") as file:
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename), encoding="utf-8") as text_file:
            for line in text_file:
                # NOTE(review): only the first match per line is extracted.
                result = url_pattern.search(line)
                if result:
                    url = result.group()
                    file.write(url + "\n")

In [6]:
import os
import re

# Extract the first e-mail address of every line of every file in `directory`
# and collect the results in extracted_emails.txt.
directory = "processed_txt/phone_numbers"
emails = r"[a-zA-Z0-9\-_.]+@[a-zA-Z0-9\-_.]*\.[a-zA-Z0-9\-]+"

# Compiled once instead of looking the pattern up again for every line.
email_pattern = re.compile(emails)

# Context managers close the output file and each input file even on errors.
with open("extracted_emails.txt", "w") as file:
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename), encoding="utf-8") as text_file:
            for line in text_file:
                # NOTE(review): only the first match per line is extracted.
                result = email_pattern.search(line)
                if result:
                    email = result.group()
                    file.write(email + "\n")

### (iii)

In [9]:
import os
import re

# Collect every line of the input file that consists solely of an ISBN
# starting with 978 or 979 and write it to extracted_isbn.txt.
isbn_path = "processed_txt/isbn/gelbe_seiten_2019-43.txt"
# The lookahead requires exactly 10 or 13 digits on the line; the rest of the
# pattern requires the 978/979 prefix followed only by digits and hyphens.
isbn_numbers = r"^(?=(?:\D*\d){10}(?:(?:\D*\d){3})?$)(978|979)[\d-]+$"

# Compiled once instead of looking the pattern up again for every line.
isbn_pattern = re.compile(isbn_numbers)

# Context managers close both files even if reading or writing fails.
with open("extracted_isbn.txt", mode="w", encoding="utf-8") as file, \
        open(isbn_path, encoding="utf-8") as isbn_file:
    for line in isbn_file:
        result = isbn_pattern.search(line)
        if result:
            file.write(result.group() + "\n")

### (iv)

In [10]:
import os
import regex as re

# Rewrite the first milliliter quantity of every line of the input file as
# liters and collect the results in converted_units.txt.
# `re` is the third-party `regex` module here; `subf` is its format-style sub.
unit_path = "data/unit_conversion/si.txt"
# Group 2 holds the leading digits and group 6 the last three digits of a
# number with four or more digits (optionally comma-separated); group 8 holds
# a shorter number.
regex = r"((\d+)(?=((\d{3})|,(\d{3}))),?(\d{3})(\.\d)?|(\d+)?) ?(ml|milliliters)"

# Context managers close both files even if reading or writing fails.
with open("converted_units.txt", "w") as file, \
        open(unit_path, encoding="utf-8") as unit_file:
    for line in unit_file:
        result = re.search(regex, line)
        if result:
            if (result.group(2) and result.group(6)):
                # Thousands and remainder become "<thousands>.<remainder> l".
                unit = re.subf(regex, r"{2}.{6} l", result.group())
            else:
                # NOTE(review): the fixed "0.0" prefix is only correct for
                # two-digit values; "5 ml" becomes "0.05 l" and "500 ml"
                # becomes "0.0500 l" - confirm against the input data.
                unit = re.subf(regex, r"0.0{8} l", result.group())
            file.write(unit + "\n")
            print(unit)

ModuleNotFoundError: No module named 'regex'

##### 5.

In [11]:
import re
import os

# Extract the first phone number of every line of every OCR text file in
# `directory`, strip its separators and collect the results in
# extracted_phone_numbers_scans.txt.
directory = "processed_txt/scans"
phone_numbers = r"\(?\d[\d\ \/\-\–)]{6,}\d"

# Compiled once instead of looking the pattern up again for every line.
phone_number_pattern = re.compile(phone_numbers)
# Deletes every separator character that the pattern allows inside a number.
separator_table = str.maketrans("", "", " /-–()")

# Context managers close the output file and each input file even on errors.
with open("extracted_phone_numbers_scans.txt", "w") as file:
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename), encoding="utf-8") as text_file:
            for line in text_file:
                # NOTE(review): only the first match per line is extracted.
                result = phone_number_pattern.search(line)
                if result:
                    phone_number = result.group().translate(separator_table)
                    file.write(phone_number + "\n")

The extraction works reasonably well on the scanned, OCR-interpreted text. However, some phone numbers are missed because the OCR recognized wrong characters within them.
Examples:
- "(06221) 43.41 49-0" is not recognized because of the period between the digits
- "(0 62 21] 4 18 55 58" is not recognized because the closing parenthesis was misread as a square bracket

The extraction can be changed to take these cases into account, but one can never be sure to cover all misinterpretations.

Furthermore, one difficulty is that many of the phone numbers do not carry a prefix of their own, e.g. because a single prefix is given once for all numbers on a page.