In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each cell
# executes, so edits to imported .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Import libraries 
from PIL import Image 
import pytesseract 
import re
from pdf2image import convert_from_path 
from os import getcwd, remove, path
# Tell pytesseract which Tesseract executable to invoke.
# NOTE(review): this is a hard-coded Windows install location; it has to be edited on any
# machine where Tesseract is installed elsewhere.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'

from tika import parser
import glob
import pickle
import pandas as pd

In [3]:
# Project layout constants. Every data directory used by this notebook hangs off PROJECT_PATH.
PROJECT_NAME = 'modern_slavery_registry'

# The project root is the part of the working directory up to and including PROJECT_NAME.
_cwd = getcwd()
_name_start = _cwd.find(PROJECT_NAME)
if _name_start == -1:
    # str.find() returns -1 when the name is absent; slicing with -1 would silently drop the
    # last character of the cwd and produce a bogus root. Fall back to the cwd and say so.
    print(f"WARNING: '{PROJECT_NAME}' not found in working directory '{_cwd}'; "
          "using the working directory as PROJECT_PATH.")
    PROJECT_PATH = _cwd
else:
    PROJECT_PATH = _cwd[:_name_start] + PROJECT_NAME

# path.join produces the same backslash-separated strings on Windows as the previous
# hand-written "\\" concatenation, without baking the separator into each constant.
DATA_PATH = path.join(PROJECT_PATH, "data")
PICKLE_PATH = path.join(DATA_PATH, "pickles")
PDF_PATH = path.join(DATA_PATH, "pdfs")
TEXT_PATH = path.join(DATA_PATH, "texts")
SLAVERY_TEXT_PATH = path.join(DATA_PATH, "slavery_texts")

# NOTE(review): not referenced by any cell visible in this notebook; units are not stated.
MAX_DOWNLOAD_TIME = 5

In [4]:
def save_pickle(obj, file_name):
    """Serialize ``obj`` with pickle into ``<file_name>.pickle``.

    Args:
        obj: Any picklable object.
        file_name: Destination path WITHOUT the ``.pickle`` suffix; it is appended here.
    """
    # The context manager flushes and closes the handle even if dump() raises;
    # the previous bare open() left the file open until garbage collection.
    with open(f"{file_name}.pickle", "wb") as pickle_file:
        pickle.dump(obj, pickle_file)
    
def load_pickle(file_name):
    """Load and return the object stored in ``<file_name>.pickle``.

    Args:
        file_name: Source path WITHOUT the ``.pickle`` suffix; it is appended here.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files you produced yourself.
    # The context manager closes the handle promptly instead of leaking it.
    with open(f"{file_name}.pickle", "rb") as pickle_file:
        return pickle.load(pickle_file)

In [5]:
# Collect every PDF directly under PDF_PATH and report how many were found.
pdf_glob_pattern = f"{PDF_PATH}\\*.pdf"
pdfs = glob.glob(pdf_glob_pattern)
TOTAL_PDFs = len(pdfs)
print(f"Found {TOTAL_PDFs} pdfs.")

Found 15068 pdfs.


In [6]:
# Load the registry export that maps Statement IDs to statement URLs.
# The separator is written as "\\": a bare "\m" is not a valid escape sequence, so Python only
# kept the backslash by accident while emitting an invalid-escape warning. The path is unchanged.
df = pd.read_csv(f"{DATA_PATH}\\modernslaveryregistry-2020-09-14.csv")
df.head()

Unnamed: 0,Company ID,Company,Is Publisher,Statement ID,URL,Override URL,Companies House Number,Industry,HQ,Is Also Covered,UK Modern Slavery Act,California Transparency in Supply Chains Act,Australia Modern Slavery Act,Period Covered
0,7676,"""K"" Line Holding Europe Limited",True,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,5005018.0,Marine,United Kingdom,False,True,False,False,2018-2019
1,28660,"""K"" Line Bulk Shipping (UK) Limited",False,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,4830352.0,Marine,United Kingdom,True,True,False,False,2018-2019
2,28659,"""K"" Line (Europe) Limited",False,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,5639474.0,Marine,United Kingdom,True,True,False,False,2018-2019
3,28661,"""K"" Line LNG Shipping Limited",False,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,,Marine,United Kingdom,True,True,False,False,2018-2019
4,28658,Polar LNG Shipping (UK) Limited,False,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,2205323.0,Marine,United Kingdom,True,True,False,False,2018-2019


In [7]:
def get_pdf_url_from_sid(sid):
    """Look up the URL recorded for a Statement ID in the module-level ``df``.

    When several rows share the Statement ID, the URL of the first matching row is
    returned. Indexing the empty result raises IndexError if no row matches.
    """
    matching_rows = df[df['Statement ID'] == sid]
    urls = matching_rows['URL'].values
    return urls[0]

In [8]:
def get_sid_from_filename(file_name):
    """Return the first run of digits in the file's base name as an int.

    Only the final path component is inspected, so digits in parent directories are
    ignored; e.g. "SID-8745.pdf" gives 8745. Raises IndexError when the base name
    contains no digits.
    """
    base_name = path.basename(file_name)
    digit_runs = re.findall(r"[\d]+", base_name)
    return int(digit_runs[0])

In [9]:
def extract_text_from_pdf_using_tika(pdf_name):
    """Parse the PDF at ``pdf_name`` with Tika and return the parse result's 'content' entry.

    Callers in this notebook treat a falsy return value as "no text extracted".
    """
    parsed = parser.from_file(pdf_name)
    return parsed['content']

In [10]:
def extract_text_from_pdf_using_ocr(pdf_file_name):
    """Extract and return the text of a PDF by running OCR on every page.

    Each page is rasterised at 500 dpi, written to a temporary JPEG named
    ``<sid>-<page index>.jpg`` in the current working directory, passed to
    pytesseract, and then deleted.

    Args:
        pdf_file_name: Path to the PDF. Its base name must contain the SID digits
            (see ``get_sid_from_filename``); they are used to name the temp images.

    Returns:
        str: The OCR text of all pages, each page's text prefixed by one space.
        An empty string if the PDF yields no pages.
    """
    sid = get_sid_from_filename(pdf_file_name)
    pages = convert_from_path(pdf_path=pdf_file_name, dpi=500)
    page_texts = []
    for i, page in enumerate(pages):
        image_name = f"{sid}-{i}.jpg"
        try:
            page.save(image_name, "JPEG")
            # Close the image handle before deleting: an open handle can block removal on Windows.
            with Image.open(image_name) as page_image:
                page_texts.append(str(pytesseract.image_to_string(page_image)))
        finally:
            # Always clean up the temp image, even when saving or OCR raised.
            if path.exists(image_name):
                remove(image_name)
    # Same layout as the original concatenation (" " + page text per page), built in one pass.
    return "".join(f" {page_text}" for page_text in page_texts)

In [11]:
def print_loop_no(total_sids, sid, itr, type_, type_itr, msg=""):
    """Print a one-line progress report for the current loop iteration.

    Args:
        total_sids: Total number of SIDs being processed.
        sid: SID handled in this iteration.
        itr: 1-based iteration counter.
        type_: "Success" or "Failed"; any other value is reported as "Processed".
        type_itr: Running count for this outcome.
        msg: Optional free-text suffix.
    """
    # Default label for anything that is not one of the two known outcomes.
    outcome = "Processed"
    for known_outcome in ("Success", "Failed"):
        if type_ == known_outcome:
            outcome = known_outcome
            break
    rule = '-' * 8
    progress = f"Running - {itr}/{total_sids} , SID - {sid}"
    print(f"{rule} {progress} - {outcome} - {type_itr} - {msg} - {rule}")

In [12]:
def clean_text(text):
    """Collapse every run of whitespace in ``text`` into one space and trim both ends."""
    # str.split() with no separator already treats newlines and tabs as whitespace,
    # so the tokens are the same as after replacing them with spaces first.
    tokens = text.split()
    return " ".join(tokens)

In [13]:
def save_text(text, sid):
    """Write ``text`` to ``SID-<sid>.txt`` inside TEXT_PATH, UTF-8 encoded.

    An existing file with the same name is overwritten.

    Args:
        text: The string to store.
        sid: Statement ID used to build the file name.
    """
    target = path.join(TEXT_PATH, f"SID-{sid}.txt")
    # The context manager closes the file even if write() raises (e.g. text is None).
    with open(target, 'w', encoding='utf-8') as file_obj:
        file_obj.write(text)

In [None]:
# Load the pickled collection of Statement IDs and report its size.
# NOTE(review): presumably written by an earlier download step -- confirm where this pickle comes from.
pickle_name = "leftover_pdf_to_downloads_SIDs"
leftover_SIDs = load_pickle(f"{PICKLE_PATH}\\{pickle_name}")
TOTAL_SIDs = len(leftover_SIDs)
print(f"Total SIDs - PDFs left to download : {TOTAL_SIDs}")

In [None]:
# Keep only the leftover SIDs for which a PDF file is present in `pdfs`.
# A set gives constant-time membership tests; the previous list lookup rescanned every
# PDF SID for each leftover SID. Order (and any duplicates) of leftover_SIDs is preserved.
_available_pdf_sids = {get_sid_from_filename(pdf_file_name) for pdf_file_name in pdfs}
PDF_SIDs = [sid for sid in leftover_SIDs if sid in _available_pdf_sids]

In [None]:
# Re-point TOTAL_SIDs (previously the number of leftover SIDs) at the number of PDFs to
# process; the extraction loop uses it as the denominator in its progress lines.
TOTAL_SIDs = len(PDF_SIDs)
print(f"Total text extraction required from : {TOTAL_SIDs} pdfs")

In [16]:
# Accumulators appended to by the extraction loops in this notebook:
# SIDs whose text was saved, and SIDs for which extraction raised.
# Re-running this cell resets both lists.
extracted_text_SIDs = []
failed_text_SIDs = []

In [None]:
# pdf_name = f"{PDF_PATH}\\SID-{8745}.pdf"
# text = extract_text_from_pdf_using_tika(pdf_name)
# if not text:
#     text = extract_text_from_pdf_using_ocr(pdf_name)
# text = clean_text(text)     
# text

In [None]:
# Extract text for every pending PDF: try Tika first and fall back to OCR when Tika
# returns no content. Each SID ends up in exactly one of the two accumulator lists.
for i, sid in enumerate(PDF_SIDs):
    pdf_name = f"{PDF_PATH}\\SID-{sid}.pdf"
    try:
        text = extract_text_from_pdf_using_tika(pdf_name)
        if not text:
            text = extract_text_from_pdf_using_ocr(pdf_name)
        text = clean_text(text)
        save_text(text=text, sid=sid)
        extracted_text_SIDs.append(int(sid))
        print_loop_no(total_sids=TOTAL_SIDs,
                      sid=sid,
                      itr=i+1,
                      type_="Success",
                      type_itr=len(extracted_text_SIDs))
    except Exception as err:
        # `except Exception` (not a bare `except:`) lets KeyboardInterrupt stop this long-running
        # loop, and the error is reported so failures can be diagnosed afterwards.
        failed_text_SIDs.append(int(sid))
        print_loop_no(total_sids=TOTAL_SIDs,
                      sid=sid,
                      itr=i+1,
                      type_="Failed",
                      type_itr=len(failed_text_SIDs),
                      msg=repr(err))

# Re-running text extraction for the small text files whose text was not extracted properly

In [14]:
# Find saved text files under SLAVERY_TEXT_PATH smaller than 1000 bytes and recover their SIDs.
all_text_files = glob.glob(f"{SLAVERY_TEXT_PATH}//*.txt")
small_text_files = [txt_path for txt_path in all_text_files if path.getsize(txt_path) < 1000]
print(f"Found {len(small_text_files)} small text files.")
small_text_SIDs = list(map(get_sid_from_filename, small_text_files))

Found 273 small text files.


In [17]:
# Re-extract text for the PDFs behind the small text files: OCR first, Tika as fallback.
# NOTE(review): save_text writes into TEXT_PATH, while the small files were discovered under
# SLAVERY_TEXT_PATH -- confirm that this is the intended destination.

# (label, extractor) pairs in the order they are attempted for each PDF.
_EXTRACTION_ATTEMPTS = (
    ("OCR", extract_text_from_pdf_using_ocr),
    ("TIKA", extract_text_from_pdf_using_tika),
)
_total_small = len(small_text_SIDs)

for i, sid in enumerate(small_text_SIDs):
    pdf_name = f"{PDF_PATH}\\SID-{sid}.pdf"
    attempt_errors = []
    for method, extract in _EXTRACTION_ATTEMPTS:
        try:
            # A None result from an extractor makes clean_text raise, which counts as a
            # failed attempt, exactly as it did with the nested try/except.
            text = clean_text(extract(pdf_name))
            save_text(text=text, sid=sid)
        except Exception as err:
            # `except Exception` keeps KeyboardInterrupt working; remember why this attempt failed.
            attempt_errors.append(f"{method}: {err!r}")
            continue
        extracted_text_SIDs.append(int(sid))
        print_loop_no(total_sids=_total_small,
                      sid=sid,
                      itr=i+1,
                      type_="Success",
                      type_itr=len(extracted_text_SIDs),
                      msg=method)
        break
    else:
        # Reached only when every extractor raised.
        failed_text_SIDs.append(int(sid))
        print_loop_no(total_sids=_total_small,
                      sid=sid,
                      itr=i+1,
                      type_="Failed",
                      type_itr=len(failed_text_SIDs),
                      msg="; ".join(attempt_errors))

-------- Running - 1/273 , SID - 10090 - Success - 1 - TIKA - --------
-------- Running - 2/273 , SID - 10095 - Success - 2 - TIKA - --------
-------- Running - 3/273 , SID - 10109 - Success - 3 - TIKA - --------
-------- Running - 4/273 , SID - 10124 - Success - 4 - TIKA - --------
-------- Running - 5/273 , SID - 10254 - Success - 5 - OCR - --------
-------- Running - 6/273 , SID - 10426 - Success - 6 - TIKA - --------
-------- Running - 7/273 , SID - 10510 - Success - 7 - TIKA - --------
-------- Running - 8/273 , SID - 10514 - Success - 8 - TIKA - --------
-------- Running - 9/273 , SID - 10529 - Success - 9 - TIKA - --------
-------- Running - 10/273 , SID - 10611 - Success - 10 - TIKA - --------
-------- Running - 11/273 , SID - 10667 - Success - 11 - OCR - --------
-------- Running - 12/273 , SID - 10726 - Success - 12 - TIKA - --------
-------- Running - 13/273 , SID - 10799 - Success - 13 - OCR - --------
-------- Running - 14/273 , SID - 10892 - Success - 14 - OCR - --------


In [None]:
# Statement ID inspected by the ad-hoc cells that follow.
sid = 9010

In [None]:
# Ad-hoc check: display the raw Tika output for the PDF of the current `sid`.
extract_text_from_pdf_using_tika(f"{PDF_PATH}\\SID-{sid}.pdf")

In [None]:
# Ad-hoc check: display the raw OCR output for the same PDF, for comparison with Tika.
extract_text_from_pdf_using_ocr(f"{PDF_PATH}\\SID-{sid}.pdf")

In [None]:
# Size in bytes of the saved text for the current `sid`. Built from SLAVERY_TEXT_PATH rather
# than a machine-specific absolute path, so it follows the project's configured data directory.
path.getsize(f"{SLAVERY_TEXT_PATH}\\SID-{sid}.txt")

In [None]:
# Ad-hoc check: show the source URL recorded in `df` for the current `sid`.
get_pdf_url_from_sid(sid)