# Problem 1-2 PDF Conversion and Regular Expressions

## 2. Quantitative comparison via SequenceMatcher.ratio()

In [26]:
import difflib

In [27]:
# Read each converter's plain-text output into a string.
# Every file name ends with this suffix; only the tool-name prefix differs.
filename = " - FL_SYB_BetriebsaerztlicherDienst_ID8414.txt"


def _read_text(path):
    """Return the complete contents of the UTF-8 encoded text file at ``path``."""
    # The context manager closes the handle even if reading raises.
    with open(path, mode='r', encoding="utf8") as handle:
        return handle.read()


pdftotext = _read_text('comparison_txt/online pdftotext' + filename)
pdf2go = _read_text('comparison_txt/online pdf2go' + filename)
calibre = _read_text('comparison_txt/calibre' + filename)
pypdf2 = _read_text('comparison_txt/pypdf2' + filename)

In [28]:
def _ratio_to_pdftotext(candidate):
    """Return SequenceMatcher.ratio() of ``candidate`` compared with ``pdftotext``.

    No junk function is passed (first argument is None) and ``autojunk`` is
    left at its default, exactly as a direct SequenceMatcher call would do.
    """
    matcher = difflib.SequenceMatcher(None, candidate, pdftotext)
    return matcher.ratio()


# pdftotext compared with itself serves as the reference value.
baseline = _ratio_to_pdftotext(pdftotext)
pdf2go_ratio = _ratio_to_pdftotext(pdf2go)
calibre_ratio = _ratio_to_pdftotext(calibre)
pypdf2_ratio = _ratio_to_pdftotext(pypdf2)

# Report the scores in the same order they were computed.
for label, score in (
    ("baseline:", baseline),
    ("pdf2go:", pdf2go_ratio),
    ("calibre:", calibre_ratio),
    ("pypdf2:", pypdf2_ratio),
):
    print(label, score)

baseline: 1.0
pdf2go: 0.45782220202938056
calibre: 0.35420936764594246
pypdf2: 0.3905145549897393


## 4. Regex Extractions

### (i)

In [56]:
import os
import re

directory = "processed_txt/phone_numbers"
# Phone number: optional "(", one digit, then at least six characters that are
# digits or separators (space, "/", "-", en dash, ")"), ending on a digit.
phone_numbers = r"\(?\d[\d\ \/\-\–)]{6,}\d"

# Separator characters that can occur inside a match; they are deleted so that
# only the digits are written to the output file.
_separators = str.maketrans("", "", " /-–()")

# Context managers close every file even if an error occurs part-way through.
with open("extracted_phone_numbers.txt", "w", encoding="utf-8") as file:
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # output reproducible across runs and machines.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip sub-directories (e.g. .ipynb_checkpoints), which cannot be opened.
        if not os.path.isfile(path):
            continue
        # The pattern contains a non-ASCII en dash, so the input is decoded
        # explicitly instead of relying on the platform default encoding.
        # NOTE(review): assumes the files are UTF-8 like the other
        # processed_txt inputs read in this notebook - confirm.
        with open(path, encoding="utf-8") as source:
            for line in source:
                # finditer() yields every number on the line, not just the first.
                for result in re.finditer(phone_numbers, line):
                    phone_number = result.group().translate(_separators)
                    file.write(phone_number + "\n")

### (ii)


In [58]:
import os
import re

directory = "processed_txt/phone_numbers"
# URL: the literal prefix "www." followed by any run of URL characters.
# The dot is escaped so that it matches only a real "." and not any character
# (the unescaped form would also accept strings such as "wwwx").
urls = r"www\.[a-zA-Z0-9+\-&@#\/%?=~_|!:.]*"

# Context managers close every file even if an error occurs part-way through.
with open("extracted_urls.txt", "w", encoding="utf-8") as file:
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # output reproducible.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip sub-directories (e.g. .ipynb_checkpoints), which cannot be opened.
        if not os.path.isfile(path):
            continue
        # NOTE(review): assumes the files are UTF-8 like the other
        # processed_txt inputs read in this notebook - confirm.
        with open(path, encoding="utf-8") as source:
            for line in source:
                # finditer() yields every URL on the line, not just the first.
                for result in re.finditer(urls, line):
                    url = result.group()
                    file.write(url + "\n")

In [59]:
import os
import re

directory = "processed_txt/phone_numbers"
# E-mail address: local part of letters, digits, "-", "_" or "."; an "@";
# domain characters from the same set; a "."; and a final label of letters,
# digits or "-".
emails = r"[a-zA-Z0-9\-_.]+@[a-zA-Z0-9\-_.]*\.[a-zA-Z0-9\-]+"

# Context managers close every file even if an error occurs part-way through.
with open("extracted_emails.txt", "w", encoding="utf-8") as file:
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # output reproducible.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip sub-directories (e.g. .ipynb_checkpoints), which cannot be opened.
        if not os.path.isfile(path):
            continue
        # NOTE(review): assumes the files are UTF-8 like the other
        # processed_txt inputs read in this notebook - confirm.
        with open(path, encoding="utf-8") as source:
            for line in source:
                # finditer() yields every address on the line, not just the first.
                for result in re.finditer(emails, line):
                    email = result.group()
                    file.write(email + "\n")

### (iii)

In [60]:
import os
import re

isbn_path = "processed_txt/isbn/gelbe_seiten_2019-43.txt"
# A line is accepted when it
#   * contains exactly 10 or 13 digits in total (checked by the lookahead), and
#   * starts with 978 or 979 and otherwise consists only of digits and hyphens.
isbn_numbers = r"^(?=(?:\D*\d){10}(?:(?:\D*\d){3})?$)(978|979)[\d-]+$"

# The input is opened first so that a missing source file does not truncate an
# existing result file; both handles are closed automatically on exit.
with open(isbn_path, encoding="utf-8") as source, \
        open("extracted_isbn.txt", mode="w", encoding="utf-8") as file:
    for line in source:
        result = re.search(isbn_numbers, line)
        if result:
            # group() excludes the trailing newline, so one is added back.
            file.write(result.group() + "\n")

### (iv)

In [61]:
import os
import re

unit_path = "data/unit_conversion/si.txt"
# Matches the text that precedes "ml" / "milliliters" on a line. The leading
# ".*" is greedy, so the match extends up to the last such unit on the line.
regex1 = r".*(?=\s?(ml|milliliters))"

# Both handles are closed automatically, even if a line fails to convert.
with open(unit_path, encoding="utf-8") as source, \
        open("converted_units.txt", "w", encoding="utf-8") as file:
    for line in source:
        result = re.search(regex1, line)
        if not result:
            continue
        # Commas are treated as thousands separators and removed. Only the
        # last whitespace-separated token before the unit is parsed, so lines
        # with leading text (e.g. "Add 250 ml") no longer abort the cell.
        tokens = result.group(0).replace(',', '').split()
        try:
            milliliters = float(tokens[-1])
        except (IndexError, ValueError):
            # No number directly before the unit (for instance "ml" occurring
            # inside another word): this line holds no quantity to convert.
            continue
        unit = str(round(milliliters / 1000, 3)) + " l"
        file.write(unit + "\n")

##### 5.

In [31]:
import re
import os

directory = "processed_txt/scans"
# Phone number: optional "(", one digit, then at least six characters that are
# digits or separators (space, "/", "-", en dash, ")"), ending on a digit.
phone_numbers = r"\(?\d[\d\ \/\-\–)]{6,}\d"

# Context managers close every file even if an error occurs part-way through.
with open("extracted_phone_numbers_scans.txt", "w", encoding="utf-8") as file:
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # output reproducible across runs and machines.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip sub-directories (e.g. .ipynb_checkpoints), which cannot be opened.
        if not os.path.isfile(path):
            continue
        with open(path, encoding="utf-8") as source:
            for line in source:
                # finditer() yields every number on the line, not just the first.
                for result in re.finditer(phone_numbers, line):
                    # Drop all separators so that only the digits remain.
                    phone_number = re.sub(r"[ /\-–()]", "", result.group())
                    file.write(phone_number + "\n")

The extraction works reasonably well on the scanned, OCR-interpreted text. However, some phone numbers are missed because the OCR recognized wrong characters inside them.
Examples:
- "(06221) 43.41 49-0" is not recognized because of the period between the digits
- "(0 62 21] 4 18 55 58" is not recognized because the closing bracket is read as a square bracket "]" instead of ")"

The extraction can be changed to take these cases into account, but one can never be sure to catch every misinterpretation.

A further difficulty is that many of the phone numbers do not have a prefix, e.g. because a single prefix is given once for all numbers on a page.