In [1]:
import sys
import io
import os
import shutil
from concurrent.futures import ThreadPoolExecutor

import requests

from wand.image import Image as wi
import cv2
import pandas as pd
import re
from dateutil.parser import parse
import json

from datetime import datetime,timedelta

In [2]:
# Convert Pdf to Jpgs
##############################################################################################
def pdftojpg(src_filename,dest_folder, dest_nameprefix=" ", dpi=300, start_fileindex=0):
    """Render every page of a PDF to its own JPEG file.

    Output files are named ``<dest_folder>/<dest_nameprefix> <NNNNN>.jpg`` where
    ``NNNNN`` is a zero-padded 5-digit counter. The counter starts at
    ``start_fileindex`` and skips any number whose file already exists, so
    existing files are never overwritten.

    Parameters
    ----------
    src_filename : str
        Path of the PDF to convert.
    dest_folder : str
        Existing folder that receives the JPEG files.
    dest_nameprefix : str, optional
        Text placed before the page counter in each file name.
    dpi : int, optional
        Resolution passed to wand when the PDF is rasterised.
    start_fileindex : int, optional
        First counter value to try.

    Returns
    -------
    list of str
        Paths of the JPEG files written, in page order. Empty when the source
        file or the destination folder does not exist (a message is printed).
    """
    new_jpgfilenames = []

    if not os.path.exists(src_filename):
        print('not available file: ', src_filename)
        return new_jpgfilenames

    if not os.path.exists(dest_folder):
        print('not available folder: ', dest_folder)
        return new_jpgfilenames

    def _numbered_name(number):
        # Joined with "/" on purpose: other cells split these paths on "/".
        return dest_folder + "/" + dest_nameprefix + " " + str(number).zfill(5) + ".jpg"

    i = start_fileindex  # next counter value to try
    # wand images wrap native ImageMagick resources; the `with` blocks release
    # them deterministically instead of waiting for garbage collection.
    with wi(filename=src_filename, resolution=dpi) as pdf:
        with pdf.convert("jpeg") as pdfimage:
            for img in pdfimage.sequence:
                dest_filename = _numbered_name(i)
                while os.path.exists(dest_filename):  # never overwrite an existing page
                    i += 1
                    dest_filename = _numbered_name(i)
                with wi(image=img) as page:
                    page.save(filename=dest_filename)
                new_jpgfilenames.append(dest_filename)
                i += 1

    return new_jpgfilenames

##############################################################################################
def pdf_converting_pool(src_folder,dest_folder,converted_files, jpg_nameprefix='pdftojpg'):
    """Convert every not-yet-converted PDF in a folder to JPEG pages.

    Parameters
    ----------
    src_folder : str
        Folder that is scanned (non-recursively) for PDF files.
    dest_folder : str
        Folder that receives the JPEG pages.
    converted_files : iterable of str
        Bare file names (no folder) that must be skipped.
    jpg_nameprefix : str, optional
        Prefix forwarded to ``pdftojpg`` for the JPEG file names.

    Returns
    -------
    pandas.DataFrame
        One row per exported page with columns ``pdf`` (source path), ``jpg``
        (page path) and ``pdf_sn`` (source file name without folder).
    """
    failconvertfilename = []
    page_records = []  # (pdf path, jpg path) pairs, one per exported page

    already_converted = set(converted_files)
    # Sorted so that JPEG numbering is reproducible (listdir/set order is
    # arbitrary); the extension test ignores case so "X.PDF" is picked up too.
    file_names = sorted(
        file_name for file_name in os.listdir(src_folder)
        if file_name not in already_converted and file_name.lower().endswith('.pdf')
    )
    for i, file_name in enumerate(file_names, start=1):
        pdf_filename = src_folder + "/" + file_name
        jpg_filenames = pdftojpg(pdf_filename, dest_folder, jpg_nameprefix,
                                 start_fileindex=len(os.listdir(dest_folder)) + 1)
        if len(jpg_filenames) == 0:  # nothing exported: record the failure
            failconvertfilename.append(pdf_filename)
            print('fail to converting pdf-file number {}: {}'.format(i,file_name))
        else:
            page_records.extend((pdf_filename, jpg_filename) for jpg_filename in jpg_filenames)
            print('successed converting pdf-file number {}: {}'.format(i,file_name))

    if len(failconvertfilename) > 0:
        print('failure to convert pdf-files as below list:')
        print(failconvertfilename)

    # Build the frame once instead of concatenating inside the loop.
    filename_df = pd.DataFrame(page_records, columns=['pdf','jpg'])
    filename_df['pdf_sn'] = filename_df['pdf'].apply(lambda fullname: re.split(r'\/',fullname)[-1])
    return filename_df

In [3]:
#api function to convert jpg to text 
def ocrtotxt(arg):
    """OCR one JPEG file through the OCR.space web API.

    Parameters
    ----------
    arg : sequence
        ``(fname, api_key)``: path of the image and the API key to use. A
        single argument is used so the function can be fed to ``pool.map``.

    Returns
    -------
    str
        The decoded response body on success, ``'jpg file error: <fname>'``
        when the image cannot be read or re-encoded, or ``'request error'``
        when the HTTP request fails (including a timeout).
    """
    fname = arg[0]
    api_key = arg[1]

    # Re-encode the image as JPEG at quality 80 to shrink the upload.
    try:
        img = cv2.imread(fname)
        if img is None:  # imread signals an unreadable file by returning None
            return ('jpg file error: ' + fname)
        encoded_ok, compressedimage = cv2.imencode(".jpg", img, [cv2.IMWRITE_JPEG_QUALITY, 80])
        if not encoded_ok:
            return ('jpg file error: ' + fname)
    except Exception:
        return ('jpg file error: ' + fname)

    # Post the image to the OCR service.
    file_bytes = io.BytesIO(compressedimage)
    try:
        result = requests.post('https://api.ocr.space/parse/image',
                               files={fname: file_bytes},
                               data={'apikey': api_key,
                                     'isTable': True
                                     },
                               # (connect, read) seconds: without a timeout a stalled
                               # connection would block its worker thread forever.
                               timeout=(10, 120)
                               )
        return result.content.decode()
    except requests.exceptions.RequestException:
        return 'request error'
    

# employ worker to bulk extracting    
def convert_jpgs_to_txts(argumentlist):
    """Run ``ocrtotxt`` over all argument tuples using 12 worker threads.

    Returns a list of response strings, in the same order as ``argumentlist``.
    """
    with ThreadPoolExecutor(max_workers=12) as executor:
        responses = [response for response in executor.map(ocrtotxt, argumentlist)]
    return responses

#write to Json-text file
def writeRawJson(results):
    """Save raw OCR responses to disk.

    Parameters
    ----------
    results : iterable of (str, str)
        ``(destination path, response text)`` pairs.

    Returns
    -------
    list of str
        For each pair, the destination path when the response text contains
        both ``ParsedResults`` and ``ParsedText`` (and was written), otherwise
        the literal string ``"Error"`` (nothing is written).
    """
    filelist = []
    for output in results:
        destination, response_text = output[0], output[1]
        if ('ParsedResults' in response_text) and ('ParsedText' in response_text):
            # `with` closes the handle even if the write fails part-way.
            with open(file=destination, mode='w') as f:
                f.write(response_text)
            filelist.append(destination)
        else:
            filelist.append("Error")
    return filelist
   
#write to Parsed-text file
def writeParseTxt(results):
    """Extract the recognised text from OCR responses and save it to disk.

    Parameters
    ----------
    results : iterable of (str, str)
        ``(destination path, response text)`` pairs; the response text is
        expected to be the JSON returned by the OCR service.

    Returns
    -------
    list of str
        For each pair, the destination path when ``ParsedResults[0].ParsedText``
        could be extracted and written, otherwise the literal string
        ``"Error"`` (a message is printed and no file is created).
    """
    filelist = []
    for output in results:
        destination, response_text = output[0], output[1]
        content = None
        if ('ParsedResults' in response_text) and ('ParsedText' in response_text):
            try:
                content = json.loads(response_text)['ParsedResults'][0]['ParsedText']
            except (ValueError, KeyError, IndexError, TypeError):
                # Malformed JSON, empty ParsedResults or missing ParsedText:
                # report this file as an error instead of aborting the batch.
                content = None

        if isinstance(content, str):
            with open(file=destination, mode='w') as f:
                f.write(content)
            filelist.append(destination)
        else:
            print("error on file: ", output[0] , ',' , output[1] )
            filelist.append("Error")
    return filelist

def setup_web_api_request():
    """Prepare the module-level state used by ``to_txt``.

    Sets three globals:

    * ``apikey_list`` - every API key repeated ``max_req_perday`` (500) times,
      so that a slice of the list yields one key per request.
    * ``today_request_num`` - batch counter read from ``dayrequest_filename``
      (first line), taken modulo 18; 0 when the file cannot be read or parsed.
    * ``latest_request_time`` - timestamp read from the second line of that
      file; 2013-01-01 00:00:00 when the file cannot be read or parsed.

    Also prints a warning when more than 180 rows of ``risc_df`` still have no
    ``txt`` value. Reads the globals ``risc_df`` and ``dayrequest_filename``.
    """
    global apikey_list
    global today_request_num
    global latest_request_time

    # NOTE(review): credentials are hard-coded in the notebook; they should be
    # loaded from an environment variable or a secrets store instead.
    keys = [
        'K84672750088957','K87887256288957','K87576188288957','K84236823888957',
        'K87600253188957','K84253289488957','K84124214688957',
        ]
    # The same seven keys three times over. The original literal was missing a
    # comma in its first row, which fused two keys into a single invalid one.
    apikeystore = keys * 3

    max_req_perday = 500  # max 500 request/key/day
    apikey_list = []
    for key in apikeystore:
        apikey_list.extend([key] * max_req_perday)

    non_converted_num = risc_df.loc[risc_df['txt'].isna(),'jpg'].count()
    if non_converted_num>180:
        print('we are going to exceed the max.allow (180) request per hours, '
              'please re-run the script with 60 mins time-span')

    # Defaults used when there is no (readable) history file yet.
    today_request_num = 0
    latest_request_time = datetime(2013, 1, 1)
    try:
        with open(dayrequest_filename,'r') as txtfile:
            # NOTE(review): because of the modulo the stored counter is always
            # below 18, so the ">= 18" guard in to_txt cannot trigger - confirm intent.
            stored_request_num = int(txtfile.readline()) % 18
            stored_request_time = datetime.strptime(txtfile.readline().strip(),'%Y-%m-%d %H:%M:%S')
    except (OSError, ValueError):
        pass  # missing or malformed history file: keep the defaults
    else:
        today_request_num = stored_request_num
        latest_request_time = stored_request_time

In [4]:
def request_no(givenstring):
    """Extract a 5-digit request number from OCR text.

    The text is lower-cased and searched for ``r.sc ... <4-5 digits>`` first,
    then ``request no ... <4-5 digits>``. Only the first pattern that matches
    is used, and the first run of five digits inside that match is returned.

    Returns the five digits as a string, or ``pd.NA`` when the input is not a
    string, no pattern matches, or the match holds fewer than five digits.
    """
    if not isinstance(givenstring, str):
        return pd.NA

    req_patterns = [re.compile(r'r.sc.+?\d{4,5}'),
                    re.compile(r'request no.+?\d{4,5}')]
    lowered = givenstring.lower()
    for req_pattern in req_patterns:
        found = req_pattern.search(lowered)
        if found is None:
            continue
        digits = re.search(r'\d{5}', found.group())
        return pd.NA if digits is None else digits.group()
    return pd.NA
    
def request_date(givenstring):
    """Extract the request date from OCR text.

    Searches for ``<any char>n date ... <1-2 digits> ... <2-4 digits>`` and
    treats the last three word tokens of the match as day, month and year
    (day first). The search is case-sensitive.

    Returns the parsed ``datetime``, or ``pd.NA`` when nothing matches or the
    tokens cannot be parsed as a date.
    """
    req_pattern = re.compile(r'.n date.+\d{1,2}.+\d{2,4}')
    try:
        found = req_pattern.search(givenstring)
        if found is None:
            return pd.NA
        reqdate = re.findall(r'[\d\w]+', found.group())
        datestring = '{}-{}-{}'.format(reqdate[-3],reqdate[-2],reqdate[-1])
        return parse(datestring, dayfirst=True, fuzzy=False)
    except Exception:
        # Best effort on noisy OCR text: non-string input or an unparseable
        # date yields NA rather than aborting the whole column.
        return pd.NA

# Regex alternatives per station, in the same order as the station columns
# built in update_data. Built once instead of on every call.
_STATION_PATTERNS = [
    ['tdn.{1,2}st.t.on','th.o.+d.en.+st.t.on','th.o.+di.n'],
    ['pl.{1,2}st.t.on','ph.+c.+l.ng'],
    ['vt.{1,2}st.t.on','v.n.+th.nh','vtp'],
    ['rc.{1,2}st.t.on','r.ch.+chi.c','r.ch.+ch.ec'],
    ['su.i.+tien','su.i ti.n','stt.sta'],
    ['nu.station', 'national univer.+', 'd.i h.c qu.c gia'],
    ['high tech','HT.+station','htp'],
    ['th. d.c','tdc station','td station'],
    ['binh.+thai','bt station','b.nh th.i'],
    ['an ph.','ap station'],
    ['t.n cang','tc station'],
    ['depot'],
    ['factory']
]

def match_station(givenstring,_index):
    """Tell whether the text mentions station number ``_index``.

    Parameters
    ----------
    givenstring : str
        Text to search (case-sensitive). Any non-string value yields ``False``.
    _index : int
        Position of the station in the pattern table.

    Returns
    -------
    bool
        ``True`` when at least one of the station's patterns is found.

    Raises
    ------
    IndexError
        When ``_index`` is outside the pattern table.
    """
    patterns = _STATION_PATTERNS[_index]
    if not isinstance(givenstring, str):
        # e.g. NaN from a missing cell: nothing to search.
        return False
    return any(re.search(pattern, givenstring) is not None for pattern in patterns)

# Regex alternatives per work type, in the same order as the work columns
# built in update_data. Built once instead of on every call.
_WORK_PATTERNS = [
    ['brick','stiffe.+','lintel','tie.+beam'], #brick
    ['plaster.+','render.+'],
    ['m.tal.+w.rk','met.l.+w.rk','m.ta.+cladding','met.l.+clad.+'], #metal
    ['susp.nd.d','suspension','cei.ing','c.i.ing','ceilling'], #ceiling
    ['epoxy','.poxy','ep.xy'], #epoxy
    ['paint','pa.nt','c.ng.+t.c.+s.n'], #paint
    ['stone.+w.rk','st.ne.+w.rk','sto.e.+w.rk','ston.+w.rk','s.one.+w.rk'], # stone
    ['partition','part.t..n','p..tition','dry wall'], #partition
    ['water.+fing'], #waterproofing
    ['mortar','to..ing','v.a b. m.t'] #topping
]

def match_works(givenstring,_index):
    """Tell whether the text mentions work type number ``_index``.

    Parameters
    ----------
    givenstring : str
        Text to search (case-sensitive). Any non-string value yields ``False``.
    _index : int
        Position of the work type in the pattern table.

    Returns
    -------
    bool
        ``True`` when at least one of the work type's patterns is found.

    Raises
    ------
    IndexError
        When ``_index`` is outside the pattern table.
    """
    patterns = _WORK_PATTERNS[_index]
    if not isinstance(givenstring, str):
        # e.g. NaN from a missing cell: nothing to search.
        return False
    return any(re.search(pattern, givenstring) is not None for pattern in patterns)

In [5]:
def to_jpg():
    """Convert new PDFs of every configured folder pair and append them to ``risc_df``.

    Walks ``src_folders`` / ``dest_folders`` in parallel; the last four
    characters of each destination folder become the JPEG name prefix.
    Returns the number of rows in ``risc_df`` whose ``txt`` value is missing.
    """
    global risc_df
    for pdf_dir, jpg_dir in zip(src_folders, dest_folders):
        prefix = jpg_dir[-4:]
        # Names already recorded are skipped by the converter.
        known_pdf_names = risc_df['pdf_sn']
        fresh_rows = pdf_converting_pool(pdf_dir, jpg_dir, known_pdf_names, prefix)
        risc_df = pd.concat([risc_df, fresh_rows], ignore_index=True)
    pending_rows = risc_df['txt'].isna()
    return risc_df.loc[pending_rows, 'jpg'].count()

In [6]:
def to_txt():
    """Send the next batch of pending JPEG pages to the OCR service and store the results.

    Pending pages are the rows of ``risc_df`` with a missing ``txt`` value; at
    most 180 of them are sent per call. The raw responses go to
    ``json_folder``, the extracted text to ``txt_folder``, and the resulting
    paths (or ``"Error"``) are written to the ``json`` / ``txt`` columns.
    Finally the batch counter and current time are saved to
    ``dayrequest_filename``.

    Returns
    -------
    int
        0 when nothing was sent (no pending page, previous batch less than 60
        minutes ago, or batch counter at 18 or more); otherwise the number of
        pages that were pending before this batch.
    """
    global risc_df
    global today_request_num
    global latest_request_time
    setup_web_api_request()

    jpg_list=list(risc_df.loc[risc_df['txt'].isna(),'jpg']) # load jpg list
    if len(jpg_list)==0:
        print('no file to convert to texts')
        return 0

    if datetime.now()<=latest_request_time+timedelta(minutes=60):
        print('it is too soon, the last request was sent at ', datetime.strftime(latest_request_time,'%Y-%m-%d %H:%M:%S'))
        print('please re-run the script later')
        return 0

    if today_request_num>=18:
        print('it is too late, we have send {} sets of request today'.format(today_request_num))
        print('please re-run the script later')
        return 0

    today_request_num +=1
    # Each batch uses its own window of 180 entries from the key list.
    apikeys = apikey_list[(today_request_num-1)*180:(today_request_num)*180]
    end = min(len(jpg_list),180)
    jpgnames = jpg_list[0:end]
    jsonnames =  list(map(lambda jpgname: json_folder + "/" + jpgname.split('/')[-1].replace('jpg','txt'),jpgnames))
    txtnames = list(map(lambda jpgname: txt_folder + "/" + jpgname.split('/')[-1].replace('jpg','txt'),jpgnames))
    api_argument = list(zip(jpgnames,apikeys))

    # Run API request
    print('the number jpg-files to be converted is ', end)
    print('start coverting at {} ...'.format(datetime.strftime(datetime.now(),'%H:%M:%S')))
    results = convert_jpgs_to_txts(api_argument)

    # Write to file
    # NOTE(review): assumes the rows selected by isin() are in the same order
    # and number as jpgnames (i.e. jpg paths are unique) - confirm.
    index_list = risc_df[risc_df['jpg'].isin(jpgnames)].index
    risc_df.loc[index_list,'json'] = writeRawJson(list(zip(jsonnames,results)))
    risc_df.loc[index_list,'txt'] = writeParseTxt(list(zip(txtnames,results)))

    # Write request num and time; `with` closes the handle even when a write fails.
    try:
        with open(dayrequest_filename,'w') as txtfile:
            txtfile.write(str(today_request_num)+'\n')
            txtfile.write(datetime.strftime(datetime.now(),'%Y-%m-%d %H:%M:%S'))
    except OSError:
        print('writing request history record error')

    return len(jpg_list)

In [37]:
def update_data():
    """Re-parse the OCR text of every incomplete row and update ``risc_df`` in place.

    A row is incomplete when any of its columns is missing. For each such row
    the text file named in ``txt`` is read and lower-cased, cut to the span
    between ``survey check`` and ``submitted by scc``, and scanned for the
    request number, request date, stations and work types. Rows whose text
    file cannot be read are parsed from the placeholder text ``'Error'``.
    Finally missing dates are forward-filled from the previous row.
    """
    index_list = list(risc_df[risc_df.isna().sum(axis=1)>=1].index)
    content_list = []
    df = pd.DataFrame()
    for index in index_list:
        try:
            with open(risc_df.loc[index,'txt'],'r') as file:
                content = file.read().lower()
        except (OSError, TypeError, ValueError):
            # Missing file, non-path value (e.g. NaN) or undecodable content.
            print('Error on reading file ', risc_df.loc[index,'jpg'])
            content_list.append('Error')
            continue
        startpos = content.find('survey check'.lower())
        if startpos<0: startpos=0
        if startpos>500: startpos=500
        endpos = content.find('Submitted by SCC'.lower())
        # find() returns -1 when the marker is absent; slicing with -1 would
        # silently drop the last character, so keep the text to its end instead.
        if endpos<0: endpos=len(content)
        content_list.append(content[startpos:endpos])
    df['output'] = content_list
    df['request_no'] = df['output'].map(request_no)
    df['date'] = df['output'].map(request_date)

    # station place
    stations =[
        'thaodien','phuoclong','vanthanh','rachchiec','suoitien','daihocquocgia','hightech',
        'thuduc','binhthai','anphu','tancang','depot','factory']
    for index,station in enumerate(stations):
        df[station] = df['output'].apply(lambda textstring: match_station(textstring,index))
    # work
    works = ['brick','plastering','metalcladding','ceiling','epoxy','painting','stone',
                 'partition','waterproofing','topping']
    for index,work in enumerate(works):
        df[work] = df['output'].apply(lambda textstring: match_works(textstring,index))

    df.index =  index_list
    del df['output']

    risc_df.loc[index_list,df.columns] = df
    # fillna(method="ffill") is deprecated in recent pandas; ffill() is the equivalent.
    risc_df['date'] = risc_df['date'].ffill()
print('Done')

Done


# MAIN

In [16]:
#Input
src_folders  =  ['D:/tmrisc/pdf/2023']
dest_folders = ['D:/tmrisc/jpg/2023']
json_folder  = 'D:/tmrisc/json/'
txt_folder   = 'D:/tmrisc/txt/'
risc_filename = 'D:/tmrisc/risc_data.csv'
dayrequest_filename = 'D:/tmrisc/dayrequest.txt'

#load file map
# Every later step reads and writes risc_df, so stop right here when the table
# is missing instead of failing later with a NameError.
if not os.path.exists(risc_filename):
    raise FileNotFoundError('Error: risc_data.csv is not available')
risc_df = pd.read_csv(risc_filename)

# Step 1: PDF -> JPEG (returns the number of pages still waiting for OCR)
if to_jpg()==0:
    print('no new pdf files')

# Step 2: JPEG -> text, then parse the text into the table columns
if to_txt()>0:
    print('finish jpg converting')
    update_data()

# Persist the table without the index column
risc_df.to_csv(risc_filename,index=False)

the number jpg-files to be converted is  4
start coverting at 20:19:16 ...
finish jpg converting


In [45]:
# Re-parse every row that still has a missing value, then save the table
# back to the CSV (without the index column).
update_data()
risc_df.to_csv(risc_filename,index=False)    

Error on reading file  D:/tmrisc/jpg/2018/2018 00416.jpg
Error on reading file  D:/tmrisc/jpg/2018/2018 00441.jpg
Error on reading file  D:/tmrisc/jpg/2018/2018 00442.jpg
Error on reading file  D:/tmrisc/jpg/2020/2020 03625.jpg
Error on reading file  D:/tmrisc/jpg/2020/2020 04548.jpg
Error on reading file  D:/tmrisc/jpg/2023/2023 00511.jpg


In [44]:
# Show every row that still has at least one missing value.
risc_df[risc_df.isna().any(axis=1)]

Unnamed: 0,pdf,jpg,json,txt,request_no,date,thaodien,phuoclong,vanthanh,rachchiec,...,plastering,metalcladding,ceiling,epoxy,painting,stone,partition,waterproofing,topping,pdf_sn
0,D:/tmrisc/pdf/2018/00359.pdf,D:/tmrisc/jpg/2018/2018 00001.jpg,D:/tmrisc/json//2018 00001.txt,D:/tmrisc/txt//2018 00001.txt,,0201-04-15 00:00:00,False,True,False,False,...,False,False,False,False,False,False,False,False,False,00359.pdf
12,D:/tmrisc/pdf/2018/00371-b.pdf,D:/tmrisc/jpg/2018/2018 00013.jpg,D:/tmrisc/json//2018 00013.txt,D:/tmrisc/txt//2018 00013.txt,,2018-05-29 00:00:00,False,False,False,False,...,False,False,False,False,False,False,False,False,False,00371-b.pdf
21,D:/tmrisc/pdf/2018/00380.pdf,D:/tmrisc/jpg/2018/2018 00022.jpg,D:/tmrisc/json//2018 00022.txt,D:/tmrisc/txt//2018 00022.txt,,2018-07-14 00:00:00,False,False,False,False,...,False,False,False,False,False,False,False,False,False,00380.pdf
37,D:/tmrisc/pdf/2018/12737.pdf,D:/tmrisc/jpg/2018/2018 00038.jpg,D:/tmrisc/json//2018 00038.txt,D:/tmrisc/txt//2018 00038.txt,,2018-01-04 00:00:00,False,False,False,False,...,False,False,False,False,False,False,False,False,False,12737.pdf
48,D:/tmrisc/pdf/2018/12737.pdf,D:/tmrisc/jpg/2018/2018 00049.jpg,D:/tmrisc/json//2018 00049.txt,D:/tmrisc/txt//2018 00049.txt,,2018-01-04 00:00:00,False,False,False,False,...,False,False,False,False,False,False,False,False,False,12737.pdf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25885,D:/tmrisc/pdf/2023/34352.pdf,D:/tmrisc/jpg/2023/2023 01312.jpg,D:/tmrisc/json//2023 01312.txt,D:/tmrisc/txt//2023 01312.txt,,2023-05-12 00:00:00,False,False,False,False,...,False,False,False,False,True,False,False,False,False,34352.pdf
25926,D:/tmrisc/pdf/2023/34354.pdf,D:/tmrisc/jpg/2023/2023 01353.jpg,D:/tmrisc/json//2023 01353.txt,D:/tmrisc/txt//2023 01353.txt,,2023-05-12 00:00:00,False,False,False,False,...,False,False,True,False,False,False,False,False,False,34354.pdf
25961,D:/tmrisc/pdf/2023/34312.pdf,D:/tmrisc/jpg/2023/2023 01388.jpg,D:/tmrisc/json//2023 01388.txt,D:/tmrisc/txt//2023 01388.txt,,2023-05-10 00:00:00,False,False,False,False,...,False,False,False,False,True,False,False,False,False,34312.pdf
25968,D:/tmrisc/pdf/2023/34355.pdf,D:/tmrisc/jpg/2023/2023 01395.jpg,D:/tmrisc/json//2023 01395.txt,D:/tmrisc/txt//2023 01395.txt,,2023-05-13 00:00:00,False,False,False,False,...,False,False,False,True,True,False,False,False,False,34355.pdf


In [22]:
def chk(ind):
    """Debug helper: re-run date and request-number extraction for one row.

    Reads the text file of row ``ind`` of ``risc_df``, parses the date that
    follows ``on date`` (raising if it is absent or unparseable) and prints
    the request number found by ``request_no``.
    """
    # `with` closes the file; the original left the handle open.
    with open(str(risc_df.loc[ind,'txt']),'r') as txt_file:
        checkcontent = txt_file.read().lower()
    req_pattern = re.compile(r'on date.+\d{1,2}.+\d{2,4}')
    match = req_pattern.search(checkcontent).group()
    reqdate = re.findall(r'[\d\w]+',match)
    datestring = '{}-{}-{}'.format(reqdate[-3],reqdate[-2],reqdate[-1])
    # The parsed value is not used; the call is kept so that a bad date still raises here.
    parse(datestring, dayfirst=True, fuzzy=False)
    print(request_no(checkcontent))

chk(26046)

33886
