# Data loading

In [1]:
import os
import shutil
import requests
import re
import magic
import numpy as np
import pandas as pd
import concurrent.futures
import datetime
import pytesseract
import pdf2image

# gs
gs_url = "https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/gs9540/ghostscript-9.54.0-linux-x86_64.tgz"
r = requests.get(gs_url, allow_redirects=True)
open("gs.tgz", "wb").write(r.content)
os.system("tar -xzvf gs.tgz")
os.remove("gs.tgz")
shutil.move("ghostscript-9.54.0-linux-x86_64/gs-9540-linux-x86_64","gs")
shutil.rmtree("ghostscript-9.54.0-linux-x86_64", ignore_errors=True)

# tes language model
tes_url = "https://github.com/tesseract-ocr/tessdata/raw/master/eng.traineddata"
r = requests.get(tes_url, allow_redirects=True)
open("eng.traineddata", "wb").write(r.content)

23466654

In [2]:
docs = pd.read_excel("../../data/un-global-impact.xlsx", sheet_name=[0,1])
docs = pd.concat([v.loc[:,"Participant":"Link"] for k,v in docs.items()])
# get only the eng ones with a link
docs = docs[(docs.Language=="english") & (docs.Link.notnull())].\
    sort_values(["Year", "Participant", "Link"]).drop_duplicates("Link").\
        reset_index(drop=True)
#docs = docs.loc[[74,75,76,77,78,79,80,81,82,151,544,568,810,863],:].reset_index(drop=True)
#docs = docs.loc[range(0,10),:].reset_index(drop=True)

# file types
file_type = docs.Link.str.split("/").apply(lambda x: x[-1].split("?")[0].split(".")[-1]).str.lower()
docs["FileType"] = ["" if f not in set(["pdf", "docx", "pptx", "doc", "docm", "htm", "html"]) else f\
    for f in file_type.values]
# prepare file names
def sanitize_names(r):
    ind = str(r.name).rjust(4,"0")
    year = str(r.Year)
    part = r.Participant
    part = part.lower()
    part = re.sub("[.!?\\-/,\"\(\)\+'\|]"," ",part)
    part = re.sub("&"," and ",part)
    part = re.sub(" +","-",part)
    part = re.sub("-$","",part)
    pp = [ind, year, part]
    res = "-".join(pp)
    if len(r.FileType)>0:
        res = res+"."+r.FileType
    return res
docs["FileName"] = docs.apply(sanitize_names, axis=1)

docs.head()

Unnamed: 0,Participant,Sector,Country,Year,Language,Link,FileType,FileName
0,AGNI MOTORS,Electronic & Electrical Equ...,India,2019,english,https://ungc-production.s3.us-west-2.amazonaws...,pdf,0000-2019-agni-motors.pdf
1,AMBEV S/A,Beverages,Brazil,2019,english,https://ungc-production.s3.us-west-2.amazonaws...,pdf,0001-2019-ambev-s-a.pdf
2,Access Spectrum Company Limited,Technology Hardware & Equip...,Myanmar,2019,english,https://ungc-production.s3.us-west-2.amazonaws...,pdf,0002-2019-access-spectrum-company-limited.pdf
3,Addit Sp. z o.o.,General Industrials,Poland,2019,english,https://ungc-production.s3.us-west-2.amazonaws...,pdf,0003-2019-addit-sp-z-o-o.pdf
4,Ahlsell AB,Technology Hardware & Equip...,Sweden,2019,english,https://ungc-production.s3.us-west-2.amazonaws...,pdf,0004-2019-ahlsell-ab.pdf


In [3]:
# download
raw_path = "../../data/raw-reports/"
if not os.path.exists(raw_path):
    os.mkdir(raw_path)
# func    
def download_report(ind=0, df=docs, output_path=raw_path):
    r = requests.get(df.loc[ind].Link, allow_redirects=True)
    open(output_path+df.loc[ind].FileName, "wb").write(r.content)
# par-all
st = datetime.datetime.now()
with concurrent.futures.ThreadPoolExecutor(max_workers=16) as exe: 
   exe.map(download_report,  docs.index) # docs.index
td = (datetime.datetime.now()-st).seconds/60
print("The download took {:.2f} mins on the threadpool back-end.".format(td))

The download took 55.62 mins on the threadpool back-end.


In [4]:
# annotate files
docs["FileTypeMagic"] = docs.FileName.apply(lambda x: magic.from_file(raw_path+x,
    mime=True))
docs["FileTypeMagic"].value_counts()

application/pdf                                                              2412
application/vnd.openxmlformats-officedocument.wordprocessingml.document        34
text/html                                                                       8
application/vnd.openxmlformats-officedocument.presentationml.presentation       8
application/msword                                                              2
inode/x-empty                                                                   1
application/octet-stream                                                        1
Name: FileTypeMagic, dtype: int64

In [5]:
# prep
text_path = "../../data/text-reports/"
if not os.path.exists(text_path):
    os.mkdir(text_path)

docs["ConversionStatus"] = np.nan
docs["ConvertedFile"] = np.nan
# conversion
st = datetime.datetime.now()

for i in docs.index:

    input_file = raw_path+docs.loc[i].FileName
    temp_file = raw_path+docs.loc[i].FileName.split(".")[0]+".pdf"
    output_file = text_path+docs.loc[i].FileName.split(".")[0]+".txt"

    # check if conversion is needed
    if ("inode/x-empty" not in docs.loc[i,"FileTypeMagic"]) &\
        ("application/pdf" not in docs.loc[i,"FileTypeMagic"]):
        # and handle that
        try:
            cmd = "unoconv -f pdf -T 30 -o "+temp_file+" "+input_file
            input_file = temp_file
            docs.loc[i,"ConversionStatus"] = os.system(cmd)
            os.system("pkill soffice.bin")       
        except:
            print("Conversion of the file {} failed.".format(input_file))
            continue
    # try to ocr the files
    try:
        cmd = "./gs -sDEVICE=ocr -r200 -dQUIET -dBATCH -dNOPAUSE -sOutputFile="+\
            output_file+" "+input_file
        docs.loc[i, "ConversionStatus"] = os.system(cmd)
        with open(output_file, "r") as of:
            docs.loc[i, "OutputText"] = of.read().replace("\n", "")
    except:
        print("OCR of the file {} failed.".format(input_file))
        continue
    os.system("pkill gs")

td = (datetime.datetime.now()-st).seconds/60

pdf_count = np.sum(docs.ConversionStatus==0)
pdf_remainder_count = docs.shape[0]-pdf_count
print("Type conversion finished in {} mins with {} pdf files and the remainder of {} raw files.".\
    format(td, pdf_count, pdf_remainder_count,))

OCR of the file ../../data/raw-reports/0151-2020-3m-españa-s-l.pdf failed.


In [None]:
docs.to_pickle("../../data/metadata_and_texts.pkl")

In [None]:
# text_path = "../../data/text-reports-0/"
# if not os.path.exists(text_path):
#    os.mkdir(text_path)#

# def convert_to_text(ind=0, df=docs.dropna(), output_path=text_path):
#    pdf_file = df.loc[ind].ConvertedFile
#    text_file = output_path+df.loc[ind].FileName.split(".")[0]+".txt"
#    imgs = pdf2image.convert_from_path(pdf_file, dpi=72)
#    resulting_text = []
#    for pg, img in enumerate(imgs):
#        resulting_text.append(pytesseract.image_to_string(img))
#    if os.path.exists(text_file):
#        os.remove(text_file)
#    open(text_file, "w").write(" ".join(resulting_text))

# st = datetime.datetime.now()
# for i in docs.dropna().index:
#    convert_to_text(ind=i, df=docs.dropna())
# td = (datetime.datetime.now()-st).seconds/60
# print("The ocr process took {:.2f} mins on the single thread back-end.".format(td))

In [None]:
# text file to single 