In [1]:
import os
import shutil
import pandas as pd
import re

In [2]:
def empty_folder(folder):
    """Remove everything inside ``folder`` while keeping ``folder`` itself.

    Regular files and symbolic links are unlinked; sub-directories are
    removed recursively with ``shutil.rmtree``. A failure on one entry is
    printed and does not stop processing of the remaining entries.

    Parameters
    ----------
    folder : str
        Path of the directory to empty. ``os.listdir`` raises if it does
        not exist; that error is not caught here.
    """
    for entry_name in os.listdir(folder):
        target = os.path.join(folder, entry_name)
        try:
            # Links are handled together with plain files, so a link that
            # points at a directory is unlinked and never reaches rmtree.
            if os.path.islink(target) or os.path.isfile(target):
                os.unlink(target)
                continue
            if os.path.isdir(target):
                shutil.rmtree(target)
        except Exception as err:
            print('Failed to delete %s. Reason: %s' % (target, err))

In [3]:
# Compiled once instead of on every call. On a single line (``.`` does not
# match a newline) the reference must contain "cp", "art", then "r", any one
# character and "sc", and finally five consecutive digits.
_REQNO_REFERENCE = re.compile(r'cp.+art.+r.sc.+?\d{5}')
_REQNO_DIGITS = re.compile(r'\d{5}')


def found_reqno(givenstring):
    """Extract a five-digit request number from a RISC reference in the text.

    The text is lowercased, the first substring matching the reference
    pattern is located, and the first run of five digits inside that
    substring is returned.

    Parameters
    ----------
    givenstring : str
        Text to scan. Matching is case-insensitive because the text is
        lowercased before the search.

    Returns
    -------
    str
        The five digits as a string, or ``'unknown'`` when the input is not
        a ``str`` or contains no matching reference.
    """
    # Explicit checks replace the former bare ``except:``, which also hid
    # KeyboardInterrupt and any genuine programming error.
    if not isinstance(givenstring, str):
        return 'unknown'

    reference = _REQNO_REFERENCE.search(givenstring.lower())
    if reference is None:
        return 'unknown'

    # The reference always ends with five digits, so this search cannot fail.
    return _REQNO_DIGITS.search(reference.group()).group()

In [4]:
# Each entry pairs a place key with the regular expressions that identify it.
# Keeping key and patterns together avoids two parallel lists that must stay
# index-aligned.
# NOTE(review): matching is case-sensitive, so 'HT.+station' can only match
# text that still contains an uppercase "HT"; the search cell of this notebook
# lowercases the text before calling, so confirm whether that is intended.
_PLACE_PATTERNS = (
    ('thaodien', ('tdn.{1,2}st.t.on', 'th.o.+d.en.+st.t.on', 'th.o.+di.n')),
    ('phuoclong', ('pl.{1,2}st.t.on', 'ph.+c.+l.ng')),
    ('vanthanh', ('vt.{1,2}st.t.on', 'v.n.+th.nh', 'vtp')),
    ('rachchiec', ('rc.{1,2}st.t.on', 'r.ch.+chi.c', 'r.ch.+ch.ec')),
    ('suoitien', ('su.i.+tien', 'su.i ti.n', 'stt.sta')),
    ('daihocquocgia', ('nu.station', 'national univer.+', 'd.i h.c qu.c gia')),
    ('hightech', ('high tech', 'HT.+station', 'htp')),
    ('thuduc', ('th. d.c', 'tdc station', 'td station')),
    ('binhthai', ('binh.+thai', 'bt station', 'b.nh th.i')),
    ('anphu', ('an ph.', 'ap station')),
    ('tancang', ('t.n cang', 'tc station')),
    ('depot', ('depot',)),
    ('factory', ('factory',)),
)


def found_places(givenstring):
    """Return the set of place keys whose patterns occur in the text.

    A place is reported as soon as any one of its patterns matches
    (``re.search``, case-sensitive).

    Parameters
    ----------
    givenstring : str
        Text to scan.

    Returns
    -------
    set of str
        Keys of every recognised place, or ``{'unknown place'}`` when the
        input is not a ``str`` or nothing matches.
    """
    # The set literal is deliberate: set('unknown place') would split the
    # label into a set of single characters.
    unknown = {'unknown place'}

    # Non-text input used to be hidden by a bare ``except: pass``; it is now
    # handled explicitly with the same "nothing recognised" outcome.
    if not isinstance(givenstring, str):
        return unknown

    result = {
        station
        for station, station_patterns in _PLACE_PATTERNS
        if any(re.search(pattern, givenstring) is not None
               for pattern in station_patterns)
    }
    return result or unknown

In [5]:
# Each entry pairs a work key with the regular expressions that identify it.
# Keeping key and patterns together avoids two parallel lists that must stay
# index-aligned. Several patterns use "." in place of a letter, so a word with
# one differing character still matches.
_WORK_PATTERNS = (
    ('brick', ('brick', 'stiffe.+', 'lintel', 'tie.+beam')),
    ('plastering', ('plaster.+', 'render.+')),
    ('metalcladding', ('m.tal.+w.rk', 'met.l.+w.rk', 'm.ta.+cladding', 'met.l.+clad.+')),
    ('ceiling', ('susp.nd.d', 'suspension', 'cei.ing', 'c.i.ing', 'ceilling')),
    ('epoxy', ('epoxy', '.poxy', 'ep.xy')),
    ('painting', ('paint', 'pa.nt', 'c.ng.+t.c.+s.n')),
    ('stone', ('stone.+w.rk', 'st.ne.+w.rk', 'sto.e.+w.rk', 'ston.+w.rk', 's.one.+w.rk')),
    ('partition', ('partition', 'part.t..n', 'p..tition', 'dry wall')),
    ('waterproofing', ('water.+fing',)),
    ('topping', ('topping', 'concrete', 'mortar', 'to..ing')),
)


def found_works(givenstring):
    """Return the set of work keys whose patterns occur in the text.

    A work type is reported as soon as any one of its patterns matches
    (``re.search``, case-sensitive).

    Parameters
    ----------
    givenstring : str
        Text to scan.

    Returns
    -------
    set of str
        Keys of every recognised work type, or ``{'unknown work'}`` when the
        input is not a ``str`` or nothing matches.
    """
    # The set literal is deliberate: set('unknown work') would split the
    # label into a set of single characters.
    unknown = {'unknown work'}

    # Non-text input used to be hidden by a bare ``except: pass``; it is now
    # handled explicitly with the same "nothing recognised" outcome.
    if not isinstance(givenstring, str):
        return unknown

    result = {
        work
        for work, work_patterns in _WORK_PATTERNS
        if any(re.search(pattern, givenstring) is not None
               for pattern in work_patterns)
    }
    return result or unknown

In [6]:
def match_risc(txtcontent, reqno_key='', searched_place='', searched_work=''):
    """Tell whether a RISC text satisfies the given search criteria.

    Parameters
    ----------
    txtcontent : text passed on to found_reqno / found_places / found_works.
    reqno_key : request number to look for. When it is not the empty string
        it is the only criterion: the result is whether found_reqno() on the
        text equals it, and the place and work filters are not consulted.
    searched_place : place key that must be among found_places(); skipped
        when falsy.
    searched_work : work key that must be among found_works(); skipped
        when falsy.

    Returns
    -------
    bool
        True when every active criterion holds (also when none is active).
    """
    # A request number overrides the other filters entirely.
    if reqno_key != '':
        return found_reqno(txtcontent) == reqno_key

    outcomes = []
    if searched_place:
        outcomes.append(searched_place in found_places(txtcontent))
    if searched_work:
        outcomes.append(searched_work in found_works(txtcontent))

    # all([]) is True, so a call with no active filter accepts the text.
    return all(outcomes)

In [7]:
# ----------------------------------------------------------------
# Control board: input mapping files and search output location.
# ----------------------------------------------------------------

# Folder that the search cell empties with empty_folder() and then uses as
# the shutil.copy() destination for matching files.
search_output_folder = 'D:/tmrisc/searchs/'

# CSV files loaded by the search cell with pandas.read_csv; it reads their
# 'txt' and 'jpg' columns.
mappingfnames = [
    'D:/tmrisc/2021 filemapping.csv',
    'D:/tmrisc/2022 filemapping.csv',
    'D:/tmrisc/2023 filemapping.csv',
]

In [8]:
# Search criteria. A non-empty `request_no` takes precedence: match_risc then
# ignores `work` and `place`.
# Work keys reported by found_works:
#   'brick', 'plastering', 'metalcladding', 'ceiling', 'epoxy', 'painting',
#   'stone', 'partition', 'waterproofing', 'topping'
request_no = '23929'
work = ''
place = ''
#################################################################
# Only the text between these two markers is searched. They are lowercased
# because the file content is lowercased before the lookup.
start_marker = 'REQUEST FOR INSPECTION AND SURVEY CHECK (RISC)'.lower()
end_marker = 'Submitted by SCC'.lower()

# empty_folder() raises on a missing directory, so make sure it exists first.
os.makedirs(search_output_folder, exist_ok=True)
empty_folder(search_output_folder)

for mappingfname in mappingfnames:
    df = pd.read_csv(mappingfname)
    for index, file in df['txt'].items():
        # Context manager so every text file is closed after reading.
        with open(file) as txt_handle:
            content = txt_handle.read().lower()

        # str.find returns -1 when a marker is absent. Used directly as a
        # slice bound, -1 would reduce the text to (at most) its last
        # character or silently drop the final character, so fall back to the
        # corresponding end of the text instead.
        startpos = content.find(start_marker)
        if startpos == -1:
            startpos = 0
        # Look for the end marker only after the header, so an earlier
        # occurrence cannot produce an empty slice.
        endpos = content.find(end_marker, startpos)
        if endpos == -1:
            endpos = len(content)
        content = content[startpos:endpos]

        if match_risc(content,
                      reqno_key=request_no,
                      searched_place=place,
                      searched_work=work):
            shutil.copy(df.loc[index, 'jpg'], search_output_folder)