# Extracting text from PDFs

Tika is used to extract text from the PDFs. Drawback: it cannot extract text from image-converted PDFs.

Note: to use Tika, a Java JDK is required and the Tika server must be running; start it with `java -jar tika-server-1.24.1.jar`.
Link to download tika-server: `https://downloads.apache.org/tika/tika-server-1.24.1.jar`

In [1]:
# Enable IPython's autoreload extension so edits to imported modules are picked up live.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from os import getcwd, path
import urllib
import pickle
from time import sleep
import functools
import PyPDF2
import glob
from func_timeout import func_timeout, FunctionTimedOut
import re
from langdetect import detect
from langdetect import lang_detect_exception

from tika import parser

In [3]:
# Project layout. Every path below is built with Windows-style "\\" separators.
PROJECT_NAME = 'modern_slavery_registry'
# Working directory truncated just before PROJECT_NAME, with PROJECT_NAME re-appended.
# NOTE: if the working directory does not contain PROJECT_NAME, find() returns -1
# and this silently produces a wrong path, so run the notebook from inside the project.
PROJECT_PATH = getcwd()[:getcwd().find(PROJECT_NAME)] + PROJECT_NAME
DATA_PATH = PROJECT_PATH + "\\data"
PICKLE_PATH = DATA_PATH + "\\pickles"
PDF_PATH = DATA_PATH + "\\pdfs"
TEXT_PATH = DATA_PATH + "\\texts"
PROCESSED_TEXT_PATH = DATA_PATH + "\\processed_texts"
# NOTE(review): presumably a download timeout; unit and usage are not visible here - confirm.
MAX_DOWNLOAD_TIME = 5

In [4]:
def save_pickle(obj, file_name):
    """Serialize ``obj`` to ``<file_name>.pickle``.

    Args:
        obj: Any picklable object.
        file_name: Destination path WITHOUT the ``.pickle`` extension
            (the extension is appended here).
    """
    # The context manager flushes and closes the handle even if dump() raises;
    # previously the handle was left for the garbage collector to close.
    with open(f"{file_name}.pickle", "wb") as file_obj:
        pickle.dump(obj, file=file_obj)
    
def load_pickle(file_name):
    """Load and return the object stored in ``<file_name>.pickle``.

    Args:
        file_name: Source path WITHOUT the ``.pickle`` extension
            (the extension is appended here).

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code - only load files that
    # this project wrote itself, never pickles from an untrusted source.
    # The context manager closes the handle even if load() raises.
    with open(f"{file_name}.pickle", "rb") as file_obj:
        return pickle.load(file=file_obj)

In [5]:
def _load_or_create_sid_pickle(pickle_name):
    """Return the object pickled at ``pickle_name``; create an empty list there if absent."""
    if not path.exists(f"{pickle_name}.pickle"):
        sids = []
        save_pickle(sids, file_name=pickle_name)
        print(f"{pickle_name} created")
        return sids
    sids = load_pickle(pickle_name)
    print(f"{pickle_name} exists")
    return sids


# Bookkeeping lists used by the extraction loop to skip PDFs it has already handled:
# SIDs whose text was extracted, and SIDs for which extraction failed.
pickle_name = f"{PICKLE_PATH}\\extracted_text_SIDs"
extracted_text_SIDs = _load_or_create_sid_pickle(pickle_name)

pickle_name = f"{PICKLE_PATH}\\failed_text_SIDs"
failed_text_SIDs = _load_or_create_sid_pickle(pickle_name)

E:\Projects\modern_slavery_registry\data\pickles\extracted_text_SIDs exists
E:\Projects\modern_slavery_registry\data\pickles\failed_text_SIDs exists


In [6]:
# Load the registry export. The separator is written as "\\" because "\m" is not
# a recognised escape sequence: it only produced a backslash by accident and
# triggers an invalid-escape warning on current Python versions.
df = pd.read_csv(f"{DATA_PATH}\\modernslaveryregistry-2020-09-14.csv")
df.head()

Unnamed: 0,Company ID,Company,Is Publisher,Statement ID,URL,Override URL,Companies House Number,Industry,HQ,Is Also Covered,UK Modern Slavery Act,California Transparency in Supply Chains Act,Australia Modern Slavery Act,Period Covered
0,7676,"""K"" Line Holding Europe Limited",True,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,5005018.0,Marine,United Kingdom,False,True,False,False,2018-2019
1,28660,"""K"" Line Bulk Shipping (UK) Limited",False,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,4830352.0,Marine,United Kingdom,True,True,False,False,2018-2019
2,28659,"""K"" Line (Europe) Limited",False,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,5639474.0,Marine,United Kingdom,True,True,False,False,2018-2019
3,28661,"""K"" Line LNG Shipping Limited",False,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,,Marine,United Kingdom,True,True,False,False,2018-2019
4,28658,Polar LNG Shipping (UK) Limited,False,35092,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,2205323.0,Marine,United Kingdom,True,True,False,False,2018-2019


In [7]:
# Several rows can share one statement ID, so count distinct IDs separately from rows.
UNIQUE_SIDs = np.unique(df['Statement ID'])
TOTAL_UNIQUE_SIDs = len(UNIQUE_SIDs)
print(f"Total enteries : {len(df)}, total unique statements : {TOTAL_UNIQUE_SIDs}")

Total enteries : 27531, total unique statements : 17799


In [8]:
# Every *.pdf file directly inside PDF_PATH.
pdfs = glob.glob(f"{PDF_PATH}\\*.pdf")
TOTAL_PDFs = len(pdfs)
print(f"Found {TOTAL_PDFs} pdfs.")

Found 15064 pdfs.


In [26]:
def get_pdf_url_from_sid(sid):
    """Look up the URL recorded for a statement ID.

    Reads the module-level ``df`` frame and returns the ``URL`` value of the
    first row whose ``Statement ID`` equals ``sid``. Raises ``IndexError`` when
    no row matches.
    """
    matches_sid = df['Statement ID'] == sid
    urls = df[matches_sid]['URL'].values
    return urls[0]

In [56]:
def get_sid_from_filename(file_name):
    """Parse the statement ID out of a file path.

    Only the final path component is inspected; the first run of digits found
    in it is returned as an ``int``. Raises ``IndexError`` when that component
    contains no digits.
    """
    base_name = path.basename(file_name)
    digit_runs = re.findall(r"[\d]+", base_name)
    return int(digit_runs[0])

In [57]:
def clean_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Newlines and tabs count as whitespace, and leading/trailing whitespace is
    dropped, so the result is a single line of space-separated tokens.
    """
    # str.split() with no argument already splits on newlines and tabs,
    # so no separate replace step is needed before re-joining.
    return " ".join(text.split())

In [58]:
def extract_text_from_pdf(pdf_name):
    """Extract and return the concatenated text of every page using PyPDF2.

    Args:
        pdf_name: Path of the PDF file to read.

    Returns:
        The text of all pages joined together, in page order, with no separator.
    """
    # Pages are read while the file is still open; the context manager then
    # closes the handle (it was previously never closed).
    with open(pdf_name, 'rb') as pdf_file_obj:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file_obj)
        page_texts = [
            pdf_reader.getPage(page_no).extractText()
            for page_no in range(pdf_reader.numPages)
        ]
    return "".join(page_texts)

In [59]:
def extract_text_from_pdf_using_tika(pdf_name):
    """Parse ``pdf_name`` with Tika and return the ``'content'`` entry of the result.

    NOTE(review): the content is presumably ``None`` for PDFs without a text
    layer (e.g. image-only PDFs) - confirm and handle in callers if so.
    """
    parsed = parser.from_file(pdf_name)
    return parsed['content']

In [60]:
def save_text(text, sid, text_path=None):
    """Write ``text`` to ``SID-<sid>.txt`` as UTF-8.

    Args:
        text: String to write.
        sid: Statement ID used to build the file name.
        text_path: Directory to write into. Defaults to the module-level
            ``TEXT_PATH`` when omitted, which matches the previous behaviour.
    """
    if text_path is None:
        text_path = TEXT_PATH
    file_name = f"SID-{sid}"
    # path.join replaces the hard-coded "//" separator, and the context manager
    # closes the handle even when write() raises.
    with open(path.join(text_path, f"{file_name}.txt"), 'w', encoding='utf-8') as file_obj:
        file_obj.write(text)

In [14]:
# # Using PyPDF2
# for pdf in pdfs:
#     sid = re.findall(pattern=r"[\d]+",string=path.basename(pdf))[0]
#     print(f"Running {len(extracted_text_SIDs) + len(failed_text_SIDs)}/{len(pdfs)}")
#     try:
#         text = extract_text_from_pdf(pdf)
#         save_text(text=text, sid=sid)
#         extracted_text_SIDs.append(int(sid))
#         save_pickle(extracted_text_SIDs, file_name=f"{PICKLE_PATH}\\extracted_text_SIDs")
#         print(f"Extracted text for {len(extracted_text_SIDs)}/{len(pdfs)}")
#     except:
#         failed_text_SIDs.append(failed_text_SIDs)
#         save_pickle(failed_text_SIDs, file_name=f"{PICKLE_PATH}\\failed_text_SIDs")
#         print(f"Failed extract for {len(failed_text_SIDs)}/{len(pdfs)}")

In [61]:
def print_loop_no(total_sids, sid, itr, type_, type_itr):
    """Print a one-line progress banner for the extraction loop.

    The line shows the iteration (``itr``/``total_sids``), the ``sid``, a status
    word and ``type_itr``. The status word is ``type_`` when it is "Success" or
    "Failed"; any other value is reported as "Processed".
    """
    status = type_ if type_ in ("Success", "Failed") else "Processed"
    rule = "-" * 8
    print(f"{rule} Running - {itr}/{total_sids} , SID - {sid} - {status} - {type_itr} {rule}")

In [142]:
# Display the resolved PDF directory as a quick sanity check.
PDF_PATH

'E:\\Projects\\modern_slavery_registry\\data\\pdfs'

In [147]:
# Ad-hoc check: run the Tika extractor on a single PDF (SID 26198).
extract_text_from_pdf_using_tika(f"{PDF_PATH}\\SID-{26198}.pdf")

In [None]:
# using Tika
# Note : start tika-server using java -jar tika-server-1.24.1.jar
#
# For every PDF: skip it if its SID was already handled, otherwise extract the
# text with Tika, clean it and write it to the text directory. Each SID ends up
# in exactly one of extracted_text_SIDs / failed_text_SIDs.
try:
    for i, pdf_name in enumerate(pdfs):
        sid = get_sid_from_filename(file_name=pdf_name)
        if sid in extracted_text_SIDs or sid in failed_text_SIDs:
            print_loop_no(total_sids=TOTAL_PDFs,
                          sid=sid,
                          itr=i+1,
                          type_="Processed",
                          type_itr=len(extracted_text_SIDs)+len(failed_text_SIDs))
            continue
        try:
            text = extract_text_from_pdf_using_tika(pdf_name)
            text = clean_text(text)
            save_text(text=text, sid=sid)
        except Exception:
            # `except Exception` (not a bare except) so that Ctrl-C still stops the run.
            # Record the SID itself; this previously appended the list to itself,
            # so failed SIDs were never actually remembered.
            failed_text_SIDs.append(sid)
            print_loop_no(total_sids=TOTAL_PDFs,
                          sid=sid,
                          itr=i+1,
                          type_="Failed",
                          type_itr=len(failed_text_SIDs))
        else:
            extracted_text_SIDs.append(sid)
            print_loop_no(total_sids=TOTAL_PDFs,
                          sid=sid,
                          itr=i+1,
                          type_="Success",
                          type_itr=len(extracted_text_SIDs))
finally:
    # Persist progress even when the run is interrupted; without this the
    # "already handled" check above cannot skip anything in a later session.
    save_pickle(extracted_text_SIDs, file_name=f"{PICKLE_PATH}\\extracted_text_SIDs")
    save_pickle(failed_text_SIDs, file_name=f"{PICKLE_PATH}\\failed_text_SIDs")

In [None]:
# Summary of the extraction run: successes, failures, and their combined total.
print(f"No. of pdf from which texts were extracted :     {len(extracted_text_SIDs)}")
print(f"No. of pdf from which texts were not extracted : {len(failed_text_SIDs)}")
# This line reports the sum of both lists; it previously reused the "not extracted" label.
print(f"No. of pdf processed in total :                  {len(failed_text_SIDs) + len(extracted_text_SIDs)}")

# Fixing PDF SID pickles

In [32]:
# Load the four per-outcome SID lists from PICKLE_PATH.
# NOTE(review): presumably written by the PDF download step - confirm.
pickle_name = f"{PICKLE_PATH}\\saved_SIDs"
saved_SIDs = load_pickle(pickle_name)

pickle_name = f"{PICKLE_PATH}\\failed_SIDs"
failed_SIDs = load_pickle(pickle_name)

pickle_name = f"{PICKLE_PATH}\\pdf_not_found_SIDs"
pdf_not_found_SIDs = load_pickle(pickle_name)

pickle_name = f"{PICKLE_PATH}\\timeout_SIDs"
timeout_SIDs = load_pickle(pickle_name)

# Compare the number of PDF files on disk with the size of each list.
print(f"Total PDFs available       : {len(pdfs)}")
print(f"Total PDFs downloaded      : {len(saved_SIDs)}")
print(f"Total PDFs not found       : {len(pdf_not_found_SIDs)}")
print(f"Total PDFs failed download : {len(failed_SIDs)}")
print(f"Total PDFs timeout download: {len(timeout_SIDs)}")
print(f"Total PDFs worked on       : {sum(map(len, (saved_SIDs, pdf_not_found_SIDs, failed_SIDs, timeout_SIDs)))}")

# failed_SIDs = [i for i in failed_SIDs if i not in saved_SIDs]
# pdf_not_found_SIDs = [i for i in pdf_not_found_SIDs if i not in saved_SIDs]
# timeout_SIDs = [i for i in timeout_SIDs if i not in saved_SIDs]

# pickle_name = f"{PICKLE_PATH}\\saved_SIDs"
# save_pickle(saved_SIDs, file_name=pickle_name)
    
# pickle_name = f"{PICKLE_PATH}\\failed_SIDs"
# save_pickle(failed_SIDs, file_name=pickle_name)

# pickle_name = f"{PICKLE_PATH}\\pdf_not_found_SIDs"
# save_pickle(pdf_not_found_SIDs, file_name=pickle_name)
    
# pickle_name = f"{PICKLE_PATH}\\timeout_SIDs"
# save_pickle(timeout_SIDs, file_name=pickle_name)

Total PDFs available       : 15064
Total PDFs downloaded      : 11044
Total PDFs not found       : 2471
Total PDFs failed download : 4020
Total PDFs timeout download: 264
Total PDFs worked on       : 17799


In [50]:
# Every *.txt file directly inside TEXT_PATH.
text_files = glob.glob(f"{TEXT_PATH}\\*.txt")
TOTAL_TEXT_FILES = len(text_files)
print(f"Found {TOTAL_TEXT_FILES} text files")

Found 14309 text files
