In [1]:
import pandas as pd
import re
import os
import shutil
from dateutil.parser import parse

# LOAD MAPPING

In [2]:
# Paths used by the notebook. The mapping path must be a plain string: a
# trailing comma here would make it a 1-tuple, which pd.read_csv rejects.
mappingfname    = r'D:/tmrisc/filemap.csv'
output_folder   = 'D:/tmrisc/output/'
txt_folder      = 'D:/tmrisc/txt'

df = pd.read_csv(mappingfname)

# Index labels of the rows whose 'output' excerpt is still empty.
list_remain_index = list(df[df['output'].isna()].index)

for index in list_remain_index:
    txt_path = df.loc[index,'txt']
    try:
        # The context manager closes the handle even when read() fails.
        with open(txt_path,'r') as txt_file:
            content = txt_file.read().lower()
    except (OSError, ValueError, TypeError) as e:
        # Best effort: a missing/unreadable/undecodable file (or a missing
        # path value) leaves 'output' empty for this row, but is reported
        # instead of being silently ignored.
        print('Failed to read %s. Reason: %s' % (txt_path, e))
        continue
    startpos = content.find('survey check'.lower())
    if startpos<0: startpos=0
    # NOTE(review): the start offset is capped at 500 characters, so a marker
    # found later in the text does not move the start past that point -
    # confirm this is the intended behavior.
    if startpos>500: startpos=500
    endpos = content.find('Submitted by SCC'.lower())
    # find() returns -1 when the end marker is absent; slicing with -1 would
    # drop the final character, so keep everything up to the end instead.
    if endpos<0: endpos=len(content)
    df.loc[index,'output']=content[startpos:endpos]

# NUMBER EXTRACT

In [3]:
def request_no(givenstring):
    """Extract a five-digit request number from a text excerpt.

    The lower-cased text is searched with two patterns in order
    (``r.sc ... <4-5 digits>`` then ``request no ... <4-5 digits>``). The
    first pattern that matches decides the result: the first run of five
    digits inside that match is returned.

    Parameters
    ----------
    givenstring : str
        Text to search. Any non-string value (e.g. NaN) yields ``pd.NA``.

    Returns
    -------
    str or pd.NA
        The five-digit number as a string, or ``pd.NA`` when no pattern
        matches or the matched text holds no five-digit run.
    """
    if not isinstance(givenstring, str):
        return pd.NA
    lowered = givenstring.lower()
    req_patterns = (re.compile(r'r.sc.+?\d{4,5}'),
                    re.compile(r'request no.+?\d{4,5}'))
    for req_pattern in req_patterns:
        match = req_pattern.search(lowered)
        if match is None:
            continue
        # NOTE(review): the patterns accept 4 or 5 digits but only a 5-digit
        # run is extracted, so 4-digit numbers give pd.NA - confirm intent.
        digits = re.search(r'\d{5}', match.group())
        return digits.group() if digits is not None else pd.NA
    return pd.NA
    
# Fill 'request_no' for the pending rows from their 'output' excerpt.
df.loc[list_remain_index,'request_no'] = (
    df.loc[list_remain_index,'output'].map(request_no)
)

# DATE EXTRACT

In [4]:
def request_date(givenstring):
    """Parse the date following a ``?n date`` label in a text excerpt.

    The text is searched with ``.n date.+\\d{1,2}.+\\d{2,4}``. The last three
    alphanumeric tokens of the match are joined with ``-`` and handed to
    ``dateutil.parser.parse`` with ``dayfirst=True``.

    Parameters
    ----------
    givenstring : str
        Text to search. Any non-string value (e.g. NaN) yields ``pd.NA``.

    Returns
    -------
    datetime.datetime or pd.NA
        The parsed date, or ``pd.NA`` when the label is not found or the
        tokens do not form a parseable date.
    """
    if not isinstance(givenstring, str):
        return pd.NA
    found = re.search(r'.n date.+\d{1,2}.+\d{2,4}', givenstring)
    if found is None:
        return pd.NA
    tokens = re.findall(r'[\d\w]+', found.group())
    if len(tokens) < 3:
        return pd.NA
    datestring = '{}-{}-{}'.format(tokens[-3], tokens[-2], tokens[-1])
    try:
        return parse(datestring, dayfirst=True, fuzzy=False)
    except (ValueError, OverflowError, TypeError):
        # dateutil signals an unparseable string with ParserError (a
        # ValueError) or OverflowError; treat those as "no date".
        return pd.NA

# Fill 'date' for the pending rows from their 'output' excerpt.
df.loc[list_remain_index,'date'] = df.loc[list_remain_index,'output'].map(request_date)
# Rows without a parsed date inherit the closest preceding date.
# Series.ffill() replaces the deprecated fillna(method="ffill") spelling.
df['date'] = df['date'].ffill()

# PLACE EXTRACT

In [5]:
# Station column names. The position of each name is the index handed to
# match_station, so the order must follow its pattern groups.
stations = [
    'thaodien',
    'phuoclong',
    'vanthanh',
    'rachchiec',
    'suoitien',
    'daihocquocgia',
    'hightech',
    'thuduc',
    'binhthai',
    'anphu',
    'tancang',
    'depot',
    'factory',
]

def match_station(givenstring,_index):
    """Tell whether a text mentions the station at position ``_index``.

    Parameters
    ----------
    givenstring : str
        Text to search. Any non-string value (e.g. NaN) yields ``False``.
    _index : int
        Position of the station's pattern group in the table below.

    Returns
    -------
    bool
        ``True`` when at least one regex of the selected group is found in
        the text, ``False`` otherwise.

    Raises
    ------
    IndexError
        If ``_index`` is outside the pattern table.
    """
    station_patterns =  [
        ['tdn.{1,2}st.t.on','th.o.+d.en.+st.t.on','th.o.+di.n'],
        ['pl.{1,2}st.t.on','ph.+c.+l.ng'],
        ['vt.{1,2}st.t.on','v.n.+th.nh','vtp'],
        ['rc.{1,2}st.t.on','r.ch.+chi.c','r.ch.+ch.ec'],
        ['su.i.+tien','su.i ti.n','stt.sta'],
        ['nu.station', 'national univer.+', 'd.i h.c qu.c gia'],
        # NOTE(review): 'HT.+station' is case-sensitive and only matches
        # upper-case 'HT'; if callers pass lower-cased text it never fires -
        # confirm whether that is intended.
        ['high tech','HT.+station','htp'],
        ['th. d.c','tdc station','td station'],
        ['binh.+thai','bt station','b.nh th.i'],
        ['an ph.','ap station'],
        ['t.n cang','tc station'],
        ['depot'],
        ['factory']
    ]
    # Look the group up first so a bad index is reported for any input.
    candidates = station_patterns[_index]
    if not isinstance(givenstring, str):
        return False
    return any(re.search(pattern, givenstring) is not None
               for pattern in candidates)
            
# One column per station: True/False for each pending row, taken from its
# 'output' excerpt.
for index,station in enumerate(stations):
    df.loc[list_remain_index,station] = (
        df.loc[list_remain_index,'output']
        .apply(lambda excerpt: match_station(excerpt,index))
    )

# WORK EXTRACT

In [6]:
# Work-type column names. The position of each name is the index handed to
# match_works, so the order must follow its pattern groups.
works = [
    'brick',
    'plastering',
    'metalcladding',
    'ceiling',
    'epoxy',
    'painting',
    'stone',
    'partition',
    'waterproofing',
    'topping',
]

def match_works(givenstring,_index):
    """Tell whether a text mentions the work type at position ``_index``.

    Parameters
    ----------
    givenstring : str
        Text to search. Any non-string value (e.g. NaN) yields ``False``.
    _index : int
        Position of the work type's pattern group in the table below.

    Returns
    -------
    bool
        ``True`` when at least one regex of the selected group is found in
        the text, ``False`` otherwise.

    Raises
    ------
    IndexError
        If ``_index`` is outside the pattern table.
    """
    work_patterns =  [
            ['brick','stiffe.+','lintel','tie.+beam'], #brick
            ['plaster.+','render.+'], #plastering
            ['m.tal.+w.rk','met.l.+w.rk','m.ta.+cladding','met.l.+clad.+'], #metal
            ['susp.nd.d','suspension','cei.ing','c.i.ing','ceilling'], #ceiling
            ['epoxy','.poxy','ep.xy'], #epoxy
            ['paint','pa.nt','c.ng.+t.c.+s.n'], #paint
            ['stone.+w.rk','st.ne.+w.rk','sto.e.+w.rk','ston.+w.rk','s.one.+w.rk'], # stone
            ['partition','part.t..n','p..tition','dry wall'], #partition
            ['water.+fing'], #waterproofing
            ['mortar','to..ing','v.a b. m.t'] #topping
            ]
    # Look the group up first so a bad index is reported for any input.
    candidates = work_patterns[_index]
    if not isinstance(givenstring, str):
        return False
    return any(re.search(pattern, givenstring) is not None
               for pattern in candidates)

# One column per work type: True/False for each pending row, taken from its
# 'output' excerpt.
for index,work in enumerate(works):
    df.loc[list_remain_index,work] = (
        df.loc[list_remain_index,'output']
        .apply(lambda excerpt: match_works(excerpt,index))
    )

# OS SCRIPT

In [7]:
def empty_folder(folder):
    """Remove everything inside ``folder`` while keeping the folder itself.

    Regular files and symbolic links are unlinked; real sub-directories are
    removed recursively. A failure on one entry is printed and the remaining
    entries are still processed.
    """
    for entry_name in os.listdir(folder):
        target = os.path.join(folder, entry_name)
        try:
            # A symlink is always unlinked, never followed into a directory.
            if os.path.isdir(target) and not os.path.islink(target):
                shutil.rmtree(target)
            elif os.path.isfile(target) or os.path.islink(target):
                os.unlink(target)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (target, e))

In [8]:
# index=False: this file is read back with pd.read_csv without index_col, so
# a saved index would return as an extra 'Unnamed: 0' column on every
# save/load round trip.
df.to_csv('D:/tmrisc/filemap.csv', index=False)

# COMMAND

In [76]:
# Copy the images of every row flagged for both 'thaodien' and 'plastering'
# into a freshly emptied search folder.
search_output_folder   = 'D:/tmrisc/searchs/'
seach_df = df[df['thaodien']*df['plastering']]
empty_folder(search_output_folder)
for jpg_filename in seach_df['jpg']:
    shutil.copy(jpg_filename,search_output_folder)

In [84]:
# Copy the images of every row whose request number is '32934' into a
# freshly emptied search folder.
search_output_folder   = 'D:/tmrisc/searchs/'
seach_df = df[df['request_no']=='32934']
empty_folder(search_output_folder)
for jpg_filename in seach_df['jpg']:
    shutil.copy(jpg_filename,search_output_folder)