# Data loading

In [None]:
import os
import shutil
import requests
import re
import magic
import numpy as np
import pandas as pd
import concurrent.futures
import datetime
import pytesseract
#import pdf2image

# gs
gs_url = "https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/gs9540/ghostscript-9.54.0-linux-x86_64.tgz"
r = requests.get(gs_url, allow_redirects=True)
open("gs.tgz", "wb").write(r.content)
os.system("tar -xzvf gs.tgz")
os.remove("gs.tgz")
shutil.move("ghostscript-9.54.0-linux-x86_64/gs-9540-linux-x86_64","gs")
shutil.rmtree("ghostscript-9.54.0-linux-x86_64", ignore_errors=True)

# tes language model
tes_url = "https://github.com/tesseract-ocr/tessdata/raw/master/eng.traineddata"
r = requests.get(tes_url, allow_redirects=True)
open("eng.traineddata", "wb").write(r.content)

In [None]:
docs = pd.read_excel("../../data/un-global-impact.xlsx", sheet_name=[0,1])
docs = pd.concat([v.loc[:,"Participant":"Link"] for k,v in docs.items()])
# get only the eng ones with a link
docs = docs[(docs.Language=="english") & (docs.Link.notnull())].\
    sort_values(["Year", "Participant", "Link"]).drop_duplicates("Link").\
        reset_index(drop=True)
docs = docs.loc[[74,75,76,77,78,79,80,81,82,151,544,568,810,863],:].reset_index(drop=True)
#docs = docs.loc[[74,82],:].reset_index(drop=True)
#docs = docs.loc[range(0,20),:].reset_index()

# file types
file_type = docs.Link.str.split("/").apply(lambda x: x[-1].split("?")[0].split(".")[-1]).str.lower()
docs["FileType"] = ["" if f not in set(["pdf", "docx", "pptx", "doc", "docm", "htm", "html"]) else f\
    for f in file_type.values]
# prepare file names
def sanitize_names(r):
    
    ind = str(r.name).rjust(4,"0")
    year = str(r.Year)
    part = r.Participant
    part = part.lower()
    part = re.sub("[.!?\\-/,\"\(\)\+'\|]"," ",part)
    part = re.sub("&"," and ",part)
    part = re.sub(" +","-",part)
    part = re.sub("-$","",part)
    pp = [ind, year, part]
    res = "-".join(pp)
    if len(r.FileType)>0:
        res = res+"."+r.FileType
    return res

docs["FileName"] = docs.apply(sanitize_names, axis=1)

docs.head()

In [None]:
# download
raw_path = "../../data/raw-reports/"
if not os.path.exists(raw_path):
    os.mkdir(raw_path)
# func    
def download_report(ind=0, df=docs, output_path=raw_path):
    r = requests.get(df.loc[ind].Link, allow_redirects=True)
    open(output_path+df.loc[ind].FileName, "wb").write(r.content)
# par-all
st = datetime.datetime.now()
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as exe: 
   exe.map(download_report,  docs.index) # docs.index
td = (datetime.datetime.now()-st).seconds/60
print("The download took {:.2f} mins on the threadpool back-end.".format(td))

In [None]:
# annotate files
docs["FileTypeMagic"] = docs.FileName.apply(lambda x: magic.from_file(raw_path+x,
    mime=True))
docs["FileTypeMagic"].value_counts()

In [None]:
# filter & convert

# conversion
st = datetime.datetime.now()

docs["ConversionStatus"] = np.nan
docs["ConvertedFileName"] = docs["FileName"] # prefill

docs_to_convert_index = docs.loc[(docs.FileTypeMagic!="inode/x-empty") & (docs.FileTypeMagic!="application/pdf")].index
for i in docs_to_convert_index:
    input_file = raw_path+docs.loc[i].FileName
    output_file = raw_path+docs.loc[i].FileName.split(".")[0]+".pdf"
    try:
        cmd = "unoconv -f pdf -T 30 -o "+output_file+" "+input_file
        docs.loc[i,"ConversionStatus"] = os.system(cmd)
        docs.loc[i,"ConvertedFileName"] = output_file       
    except:
        print("Conversion of the file {} failed.".format(input_file))
        docs.loc[i,"ConvertedFileName"] = np.nan   
        continue
    os.system("pkill soffice.bin")

td = (datetime.datetime.now()-st).seconds/60
pdf_count = np.sum(docs.ConversionStatus==0)
pdf_remainder_count = len(docs_to_convert_index)-pdf_count
print("Type conversion finished in {} mins with {} pdf files and the remainder of {} raw files.".\
    format(td, pdf_count, pdf_remainder_count,))

In [63]:
# ocr in parallel

text_path = "../../data/text-reports/"
if not os.path.exists(text_path):
    os.mkdir(text_path)

st = datetime.datetime.now()    

docs["OCRStatus"] = np.nan
docs["OutputText"] = np.nan
docs["OutputFileName"] = np.nan

def doc2text(row, data_dir):
    import os
    import numpy as np
    
    input_file = data_dir+"raw-reports/"+row.ConvertedFileName
    output_file =  data_dir+"text-reports/"+row.FileName.split(".")[0]+".txt"
    # try to ocr the files
    try:
        cmd = "./gs -sDEVICE=ocr -r200 -dQUIET -dBATCH -dNOPAUSE -sOutputFile="+\
            output_file+" "+input_file
        row["OCRStatus"] = os.system(cmd)
        row["OutputFileName"] = output_file
        with open(output_file, "r") as of:
            row["OutputText"] = of.read().replace("\n", "")
    except:
        print("OCR of the file {} failed.".format(input_file))
        row["OutputFileName"] = np.nan
        return row
    #os.system("pkill gs")

    return row

from joblib import Parallel, delayed
temp_docs = Parallel(n_jobs=8, backend="multiprocessing")(
    delayed(doc2text)(docs.loc[i,:], "../../data/") for i in docs.index)
docs_augmented = pd.concat(temp_docs, axis=1).transpose()    
td = (datetime.datetime.now()-st).seconds/60

text_count = docs_augmented.shape[0]
text_remainder_count = docs_augmented.OutputText.isna().sum()
print("Type conversion finished in {} mins with {} pdf files and the remainder of {} raw files.".\
     format(td, text_count-text_remainder_count, pdf_remainder_count))

Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min 

OCR of the file ../../data/raw-reports/0009-2020-3m-españa-s-l.pdf failed.


Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (1x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (1x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (2x36 vs min width of 3)
Line cannot be recognized!!
Image too small to scale!! (1x36 vs min 

Type conversion finished in 13.133333333333333 mins with 13 pdf files and the remainder of 0 raw files.
