In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import numpy as np
from tqdm.auto import trange, tqdm
import re
import pandas as pd

def tn_to_arb(line):
  """Return ``line`` with every Thai digit (๐-๙) replaced by its ASCII digit.

  All other characters are copied through unchanged.
  """
  zero = ord('๐')
  nine = ord('๙')
  return ''.join(
      str(ord(ch) - zero) if zero <= ord(ch) <= nine else ch
      for ch in line
  )

def get_level_of_line(String):
  """Classify the numbering prefix at the start of ``String``.

  The patterns are tried in order and the code of the first one that matches
  at the beginning of the string is returned; 0 means no pattern matched.
  Order matters: more specific patterns must come before the generic ones.

  Args:
    String: text of a line.

  Returns:
    int: the level code paired with the first matching pattern, or 0.
  """
  # Raw strings: the original non-raw literals relied on invalid escape
  # sequences ('\d', '\(', ...) which newer Python versions warn about.
  patterns = [(r'\d+(\.\d+){2}\)', 10),
              (r'\d+\.\d+\)', 9),
              (r'\d+\)', 8),
              (r'\(\d*(\.\d*){3}\)', 15),
              (r'\(\d*(\.\d*){2}\)', 14),
              (r'\(\d*(\.\d*)\)', 13),
              (r'\(\d*\)', 12),
              (r'\d+(\.\d+){5}\D+', 6),
              (r'\d+(\.\d+){4}\D+', 5),
              (r'\d+(\.\d+){3}\D+', 4),
              (r'\d+(\.\d+){2}\D+', 3),
              (r'\d+\.\d+\D+', 2),
              (r'\d+\.\D+', 1),
              # NOTE(review): the dot is unescaped, so this matches digits
              # followed by ANY character - kept as-is to preserve behavior.
              (r'\d+.', 11)]

  for pattern, level in patterns:
    if re.match(pattern, String):
      return level
  return 0

def get_patern_of_bullet(String):
  """Identify which bullet style the whole of ``String`` represents.

  Every pattern is anchored at the end with ``$``, so the full token must be
  a bullet (e.g. ``'1.'``, ``'1.2'``, ``'1)'``, ``'(1)'``, ``'12'``).

  Args:
    String: a single text token.

  Returns:
    ``(pattern, code)`` for the first matching pattern, or 0 when nothing
    matches. For the base codes 2, 20 and 50 the number of dots in
    ``String`` is added to the code.
  """
  # Raw strings avoid the invalid-escape warnings of the original literals;
  # the resulting pattern text is identical.
  patterns = [(r'[1-9][0-9]*(\.[1-9][0-9]*)*\)$', 20),
              (r'\(\d*(\.?\d*)*\)$', 50),
              (r'[1-9][0-9]*(\.[1-9][0-9]*)+$', 2),
              (r'[1-9][0-9]*\.$', 1),
              (r'[1-9][0-9]*$', 30)]

  for pattern, code in patterns:
    if re.match(pattern, String):
      if code in [2, 20, 50]:
        # Dotted styles get a distinct code per nesting depth.
        code = String.count('.') + code
      return pattern, code
  return 0

def mode(lst):
  """Return the most frequent element of ``lst`` (a list or NumPy array).

  Arrays are converted with ``tolist()`` first. Counting uses ``list.count``
  over the distinct values.
  """
  values = lst.tolist() if isinstance(lst, np.ndarray) else lst
  distinct = set(values)
  return max(distinct, key=values.count)

In [None]:
get_level_of_line('2.1)oavnobe')

9

In [None]:
def isthaichar(string):
    """Count the characters of ``string`` whose code point is outside 3585-3675.

    Despite the name this does not return a boolean: 0 means every character
    lies in the range 3585 <= ord(c) < 3676, a positive value is the number of
    characters outside it.
    """
    return sum(1 for ch in string if not (3585 <= ord(ch) < 3676))

In [None]:
def to_gregorian(input):
  """Subtract 543 from a year given as an int or numeric string.

  Falsy values (``None``, ``''``, ``0``) are returned unchanged.
  """
  return int(input) - 543 if input else input

In [None]:
class ThaiBudget:
  """Loads the budget JSON file and hands out ``Doc`` objects by reference id."""

  def __init__(self, json_path):
    """Read ``json_path`` and remember the available document ids.

    Args:
      json_path: path to a JSON file whose top-level keys are document ids.
    """
    # Explicit UTF-8: the file holds Thai text and must not depend on the
    # platform's default encoding.
    with open(json_path, encoding='utf-8') as json_file:
      self._data_ = json.load(json_file)
    self.docs = list(self._data_)

  def get_doc(self, ref_doc):
    """Return a ``Doc`` wrapping the raw pages stored under ``ref_doc``."""
    return Doc(self._data_[ref_doc], ref_doc)

In [None]:
class Doc:
  """One budget document: its raw pages plus helpers derived from the index pages.

  The index pages (pages whose text contains ``<number> ถึง <number>``) are
  parsed to find which page range belongs to which ministry / budgetary unit.
  """

  def __init__(self, raw_pages, ref_doc):
    """
    Args:
      raw_pages: indexable collection of raw pages; each entry is passed to ``Page``.
      ref_doc: document id containing exactly two dots; it is split into
        ``fiscal_year``, ``chabab`` and ``lem``.
    """
    self.fiscal_year, self.chabab, self.lem = ref_doc.split('.')
    self._raw_page = raw_pages
    self.ref_doc = ref_doc
    self.len = len(raw_pages)

  def page(self, num_page, true_num_page=False):
    """Build the ``Page`` at ``num_page``.

    With ``true_num_page`` the number is offset by ``self.start`` and the
    original ``num_page`` is stored on the page as ``pdfpage``.
    """
    if true_num_page:
      npage = num_page + self.start
      return Page(npage, self._raw_page[npage], num_page)
    return Page(num_page, self._raw_page[num_page])

  @property
  def index_page(self):
    """Indices of the first consecutive run of pages that look like an index.

    Only the first 65 pages are inspected (fewer if the document is shorter,
    which previously raised IndexError).
    """
    id_pages = []
    for i in range(min(65, self.len)):
      page = self.page(i).get_text_lines()
      if page is not None:
        text = ''.join(page)
        if re.findall(r'\d+ (ถึง) \d+', text):
          id_pages.append(i)
        elif id_pages:
          # The run of index pages has ended.
          break
    return id_pages

  @property
  def ministry(self):
    """Text lines of page 0 that fall inside a fixed vertical band."""
    ministry = []
    lines = self.page(0)._lines(yPos=True)
    for tline, ypline in lines:
      if (np.array([1250, 1400]) < ypline).any() and (ypline < np.array([2000, 2100])).any():
        # ``block not in '*-'`` is a substring test: it drops '', '*', '-' and '*-'.
        ministry.append(' '.join([tn_to_arb(block) for block in tline if block not in '*-']))
    return ministry

  @property
  def range(self):
    """List of dicts with keys ``ministry``, ``budget_unit`` and ``range``.

    Parsed from the index pages. Three documents use ``get_range_in_page``;
    all others use the line-based parser below.
    """
    if self.ref_doc in ['2022.3.12', '2022.3.16(2)', '2022.3.16(3)']:
      ministry = []
      page_rage = []
      for ipage in self.index_page:
        ministry, page_rage_ = get_range_in_page(self.page(ipage), ministry)
        page_rage += page_rage_
      return page_rage

    page_rage = []
    ministry = ''
    tempdict = {}
    # A line that starts with the first 60% of a ministry name begins a new ministry.
    stw = [ministr[:int(len(ministr)*0.6)] for ministr in self.ministry]
    for i in self.index_page:
      page = self.page(i)
      # NOTE(review): Page groups its lines in __init__ and get_text_list_lines()
      # reads that cached grouping, so this assignment looks ineffective - confirm.
      page.dline_tolerance = 50
      lines = page.get_text_list_lines()
      for line in lines:
        joined = ' '.join(line)
        if [sw for sw in stw if joined.startswith(sw)]:
          ministry = [word for word in line if word != 'ถึง' and not word.isdigit() and word != 'หน้า']
          ministry = ' '.join(ministry)
        if re.match(r'\(\d+\)', line[0]):
          # "(n)" starts a new budgetary unit; flush the previous one.
          if tempdict:
            page_rage.append(tempdict)
            tempdict = {}
          tempdict['ministry'] = ministry
          tempdict['budget_unit'] = ' '.join([word for word in line[1:] if word != 'ถึง' and not word.isdigit()])

        # These two units are their own ministry.
        # NOTE(review): unlike the "(n)" branch, tempdict is appended even when
        # it is still empty - confirm this is intended.
        if line[0] in ('สภากาชาดไทย', 'ส่วนราชการในพระองค์'):
          page_rage.append(tempdict)
          tempdict = {}
          tempdict['ministry'] = ministry
          tempdict['budget_unit'] = ministry
        if line[0] == 'งบกลาง':
          tempdict['ministry'] = ministry
          tempdict['budget_unit'] = ministry
          tempdict['range'] = (5, 8)
        if '7. ราย' in joined or 'ผลผลิต/โครงการ' in joined:
          if line[-1].isdigit():
            start = line[-1]
        if '8. ราย' in joined:
          # NOTE(review): ``start`` is unbound if no section-7 line came first.
          tempdict['range'] = (int(start), int(line[-1]))
          if self.ref_doc == '2022.3.13(1)':
            # Hand-entered corrections for this document.
            # NOTE(review): (680, 614) and (699, 614) end before they start and
            # repeat the 614 of the entry above - probably copy/paste; confirm.
            overrides = {'จังหวัดนราธิวาส': (542, 552),
                         'กลุ่มจังหวัดภาคใต้ชายแดน': (527, 535),
                         'จังหวัดฉะเชิงเทรา': (604, 614),
                         'จังหวัดจันทบุรี': (680, 614),
                         'จังหวัดตราด': (699, 614)}
            unit = tempdict['budget_unit']
            if unit in overrides:
              tempdict['range'] = overrides[unit]
    page_rage.append(tempdict)
    return page_rage

  @property
  def start(self):
    """Index of the page just before the first non-empty page after the index."""
    fstpage = self.index_page[-1] + 1
    while self.page(fstpage).get_blocks() is None:
      fstpage += 1
    return fstpage - 1

  def get_budgetary_unit(self, minis):
    """Return ``budget_unit``/``range`` dicts of every entry whose ministry is ``minis``."""
    return [{'budget_unit': budu['budget_unit'], 'range': budu['range']}
            for budu in self.range
            if minis == budu['ministry']]

  def get_ministry(self, budge_u):
    """Return ``ministry``/``range`` of the first entry whose unit is ``budge_u``.

    Returns ``None`` when no entry matches. (The original returned ``None``
    as soon as the first entry did not match, so only the first entry was
    ever considered.)
    """
    for budu in self.range:
      if budge_u == budu['budget_unit']:
        return {'ministry': budu['ministry'],
                'range': budu['range']}
    return None


In [None]:
class Page:
  """One page: a list of text blocks grouped into visual lines.

  ``page[0]`` is treated as the page-level block (its simplified box becomes
  ``whpage``); the remaining blocks are the page content. Each block is a
  dict with a ``description`` and four ``vertices`` carrying ``x``/``y``.
  """

  def __init__(self, index_page, page, pdfpage=None):
    """
    Args:
      index_page: index of this page inside the document.
      page: list of raw blocks (may be empty).
      pdfpage: optional page number stored as-is on the instance.
    """
    self.dline_tolerance = 30
    self.index_page = index_page
    self.pdfpage = pdfpage
    self._hpage = page[:1]
    self._page = page[1:]
    if page:
      sim = self._simplify_coordinates([[coo['x'], coo['y']] for coo in page[0]['vertices']])
    else:
      # ``np.nan`` instead of the ``np.NaN`` alias, which NumPy 2.0 removed.
      sim = [np.nan] * 2
    self.whpage = sim
    # Lines are grouped once, with the tolerance in effect at construction time.
    self.Lines = self._lines()

  def get_blocks(self):
    """Return the content blocks as ``{'text', 'sim_coord'}`` dicts, or ``None`` if empty."""
    if self._page == []:
      return None
    return [{'text': block['description'],
             'sim_coord': self._simplify_coordinates(
                 [[coo['x'], coo['y']] for coo in block['vertices']])}
            for block in self._page]

  @property
  def xSigPos(self):
    """x-range (as a list) of the first 'บาท' block near the right page edge.

    Lines containing one of the header words are skipped. Returns ``None``
    when no such block exists.
    """
    header_words = ['ตั้งงบประมาณ', 'ผูกพันงบประมาณ', 'วงเงินทั้งสิ้น', 'ผลผลิต']
    for line in self.Lines:
      if any(block['text'] in header_words for block in line):
        continue
      for block in line:
        if block['text'] == 'บาท' and (block['sim_coord'][0] > (self.whpage[0][1] - 200)).all():
          return block['sim_coord'][0].tolist()
    return None

  @property
  def level_dict(self):
    """Map each bullet code to the most common left x of lines starting with it.

    The first line of the page is ignored.
    """
    lvldict = {}
    for line in self.Lines[1:]:
      fidx = line[0]
      # NOTE(review): get_patern_of_bullet's patterns all end with ``$``, so the
      # appended space appears to prevent any match (result always {}) - confirm.
      pattern_lvel = get_patern_of_bullet(fidx['text'] + ' ')
      if pattern_lvel:
        lvldict.setdefault(pattern_lvel[1], []).append(fidx['sim_coord'][0][0])

    return {key: mode(lvldict[key]) for key in lvldict}

  def _simplify_coordinates(self, coord):
    """Collapse four ``[x, y]`` vertices to ``[[x_low, x_high], [y_low, y_high]]``.

    x and y are sorted independently; each bound is the mean of the two
    smallest / two largest values.
    """
    scrd = np.sort(coord, axis=0)
    x = scrd[:, 0]
    y = scrd[:, 1]
    return np.array([[(x[0] + x[1]) / 2, (x[2] + x[3]) / 2],
                     [(y[0] + y[1]) / 2, (y[2] + y[3]) / 2]])

  def _ypos_mean(self, list_of_blocks):
    """Mean ``[y_low, y_high]`` over ``list_of_blocks`` (``None`` for an empty page)."""
    if self._page == []:
      return None
    return np.mean([bx['sim_coord'][1] for bx in list_of_blocks], axis=0)

  def _lines(self, line_tolerance=None, yPos=False):
    """Group blocks into lines by vertical proximity.

    A block joins the first existing line whose mean lower-y differs from the
    block's by less than ``line_tolerance`` (default ``self.dline_tolerance``);
    otherwise it starts a new line. Blocks in a line are sorted by left x.

    Returns:
      ``None`` for an empty page; a list of block lists; or, with ``yPos``,
      a list of ``(texts, [mean_y_low, mean_y_high])`` tuples.
    """
    if line_tolerance is None:
      line_tolerance = self.dline_tolerance
    if self._page == []:
      return None
    lines = []
    for b in self.get_blocks():
      cm = b['sim_coord'][1]
      for line in lines:
        diff = abs(cm - self._ypos_mean(line))
        if diff[1] < line_tolerance:
          line.append(b)
          break
      else:
        # No existing line is close enough (or there are no lines yet).
        lines.append([b])
    lines = [sorted(line, key=lambda blk: blk['sim_coord'][0][0]) for line in lines]
    if not yPos:
      return lines
    return [([block['text'] for block in line], self._ypos_mean(line).tolist()) for line in lines]

  def get_text_lines(self):
    """Return each line as one space-joined string (``[]`` for an empty page)."""
    if self._page == []:
      return []
    return [' '.join(line) for line in self.get_text_list_lines()]

  def get_text_list_lines(self):
    """Return each line as a list of block texts (``[]`` for an empty page)."""
    if self._page == []:
      return []
    return [[block['text'] for block in line] for line in self.Lines]

  def xpos_text_lines(self):
    """Return ``(coords, texts)`` per line; coords are arrays of ``sim_coord``.

    Returns ``None`` for an empty page.
    """
    if self._page == []:
      return None
    xpos_lines = []
    text_lines = []
    for line in self.Lines:
      xpos_lines.append(np.array([bx['sim_coord'] for bx in line]))
      text_lines.append([bx['text'] for bx in line])
    return xpos_lines, text_lines

  def is_empty(self):
    """True when the page has no content blocks."""
    return self._page == []

In [None]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to plain Python values."""

    def default(self, obj):
        """Convert NumPy arrays/ints/floats; defer everything else to the base class."""
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        return super().default(obj)

# FUNCTIONS

In [None]:
def get_range_in_page(page, ministry=None):
  """Parse one index page into ``{'budget_unit', 'ministry', 'range'}`` dicts.

  Reads the global ``curr_doc`` for the ministry names of the document.

  Args:
    page: object exposing ``Lines`` (lists of block dicts) and ``whpage``.
    ministry: words of the ministry carried over from the previous page;
      defaults to a fresh empty list (previously a shared mutable default).

  Returns:
    ``(ministry, page_rage)``: the ministry words in effect at the end of the
    page and the list of entries completed on this page.
  """
  if ministry is None:
    ministry = []
  page_rage = []
  budgetary_u = []
  tempdict = {}
  newBU = 0  # 1 while collecting the words of a budgetary-unit name
  newM = 0   # 1 while collecting the words of a ministry name
  stw = [ministr[:int(len(ministr)*0.6)] for ministr in curr_doc.ministry]
  # Blocks must start/end left of (page x-range - 10% of page width) to count
  # as part of a name.
  width = page.whpage[0]
  m = (width[1] - width[0]) * 0.1
  for line in page.Lines:
    bpt = get_patern_of_bullet(line[0]['text'])
    if bpt and bpt[1] == 1:
      # "N." lines: section 7 gives the first page, section 8 the last page.
      newM = 0
      if line[0]['text'] == '7.':
        s = int(line[-1]['text'])
      elif line[0]['text'] == '8.':
        # NOTE(review): ``s`` is unbound if no '7.' line preceded this one.
        tempdict['budget_unit'] = ' '.join(budgetary_u)
        tempdict['ministry'] = ' '.join(ministry)
        tempdict['range'] = (s, int(line[-1]['text']))
        page_rage.append(tempdict)
        tempdict = {}
        budgetary_u = []
      newBU = 0
      continue
    if [sw for sw in stw if ' '.join([b['text'] for b in line]).startswith(sw)]:
      ministry = []
      newM = 1
    for b in line:
      if (b['sim_coord'][0] < width - m).any():
        # isthaichar(...) == 0 means the block has no character outside the Thai range.
        if newBU and isthaichar(b['text']) == 0:
          budgetary_u += [b['text']]
        if newM and isthaichar(b['text']) == 0:
          ministry += [b['text']]
        if bpt and bpt[1] == 50:
          # A "(n)" bullet: the following blocks name a budgetary unit.
          newBU = 1
          newM = 0
  return ministry, page_rage

def get_range():
  """Collect the range entries of every index page of the global ``curr_doc``.

  Returns:
    list: concatenation of the entries produced by ``get_range_in_page`` for
    each index page. (The original ended with a bare expression and therefore
    always returned ``None``.)
  """
  ministry = []
  page_rage = []
  for ipage in curr_doc.index_page:
    # The ministry in effect carries over from one index page to the next.
    ministry, page_rage_ = get_range_in_page(curr_doc.page(ipage), ministry)
    page_rage += page_rage_
  return page_rage

### clean prefix

In [None]:
def clean_prefix(textlev, labellev):
  """Drop leading tokens that precede the meaningful start of each line.

  For every (tokens, labels) pair:
    * if a 'BULLET' label exists, everything before the first one is removed;
    * then, if an 'FBAHT' label remains and the token 'ปี' is present,
      everything before the first 'ปี' is removed.

  The inputs are not modified; sliced copies are returned.

  Args:
    textlev: list of token lists.
    labellev: list of label lists, parallel to ``textlev``.

  Returns:
    ``(otext, olabel)``: the trimmed token lists and label lists.
  """
  otext = []
  olabel = []
  for txt, lbel in zip(textlev, labellev):
    if 'BULLET' in lbel:
      start = lbel.index('BULLET')
      txt, lbel = txt[start:], lbel[start:]
    if 'FBAHT' in lbel and 'ปี' in txt:
      start = txt.index('ปี')
      txt, lbel = txt[start:], lbel[start:]
    otext.append(txt)
    olabel.append(lbel)
  return otext, olabel

### labelling

In [None]:
offer_x = 0

In [None]:
def labelling(coord, text, whpage):
    """Assign one label to every block of a single line.

    Relies on module-level names defined elsewhere: ``offer_x`` (read and
    written), ``SKIP_LINE``, ``UNIT_NOUN`` and ``curr_doc``.

    Args:
        coord: per-block coordinates; ``crd[0]`` is used as the x-range and
            ``crd[1]`` as the y-range of a block.
        text: block texts, parallel to ``coord``.
        whpage: page box; ``whpage[0]`` is the x-range, ``whpage[1]`` the y-range.

    Returns:
        ``None`` when the line is skipped (see the early returns below),
        otherwise ``(x, text, labels, bltx)`` where ``x`` is the left x of the
        bullet if one was found, else the minimum x seen on the line, and
        ``bltx`` is 0 or a list of up to two x-ranges (bullet/heading block and
        the block following it).
    """
    global offer_x
    labels = []
    xpos = []
    xbltpos = 0
    bltx = 0
    blt_s = ''

    # Non-empty when the line contains one of the fiscal-year marker words
    # (several spellings are listed, presumably to tolerate recognition errors).
    fscy = [txt for txt in text if txt in ['ตั้งงบประมาณ', 'ผูกพันงบประมาณ', 'ตั้งงบประมา', 'งงบประมาณ', 'ตั้งบประมาณ']]

    jtxt = ''.join([txt for txt in text])
    
    # Skip lines that start with a configured prefix or are exactly this heading.
    if [txt for txt in SKIP_LINE if jtxt.startswith(txt)] or 'รายการผูกพัน' == jtxt:
        return None

    # Document '2022.3.15' uses different margins.
    opmargin = 270 if curr_doc.ref_doc == '2022.3.15' else 220
    amout_margin = 270
    # offer_x starts as the int 0 and becomes an x-range once a marker block
    # has been seen (see the first branch of the loop).
    if not isinstance(offer_x, int):
      amout_margin = 270 if curr_doc.ref_doc != '2022.3.15' else whpage[0][1] - offer_x[0]
    for crd, txt in zip(coord, text):
        if txt in ['ขอจดสรร', 'ขอจัดสรร']:
          # Remember the x-range of this column header globally and drop the line.
          offer_x = crd[0]
          return
        xpos += crd[0].tolist()
        # An all-digit line within 25 units of the top of the page is a page number.
        if (crd[1] < whpage[1][0] + 25).all() and ''.join(text).isdigit():
            labels.append('PAGENUM')
        # Lines whose blocks are all shorter than 5 characters are dropped.
        # (The condition depends only on the whole line, not on the current block.)
        elif (np.array([len(txt) for txt in text]) < 5).all():
            return None
        # Only the first labelled block of a line can be a bullet.
        elif get_patern_of_bullet(txt) and not labels and not fscy and not ',' in txt:
            labels.append('BULLET')
            xbltpos = crd[0].tolist()[0]
            bltx = [crd[0]]
            blt_s = txt
        elif bltx == 0 and (crd[1] < whpage[1][0] + opmargin).all() and (crd[0] < whpage[0][0] + 200).all() and txt == 'ผลผลิต':
            labels.append('OUTPUT')
            bltx = [crd[0]]
        elif bltx == 0 and (crd[1] < whpage[1][0] + opmargin).all() and (crd[0] < whpage[0][0] + 200).all() and txt == 'โครงการ':
            labels.append('PROJECT')
            bltx = [crd[0]]
        # Blocks reaching into the right-hand margin are amounts or their unit text.
        elif (crd[0] > whpage[0][1] - amout_margin).any() and not fscy and not txt.startswith('แผนงาน'):
            if re.match(r'[0-9]{1,3}(,[0-9]{1,3})*', txt) and txt.replace(',','').isdigit():
                labels.append('AMOUNT')
            else:
                labels.append('BAHT')
        # On fiscal-year lines every remaining block is a year, an amount or other text.
        elif fscy:
            if txt.isdigit():
                labels.append('FYEAR')
            elif re.match(r'[0-9]{1,3}(,[0-9]{1,3})*', txt):
                labels.append('FAMOUNT')
            else:
                labels.append('FBAHT')
        else:
            labels.append('NOTMATCH')
            # A "bullet"/heading directly followed by a unit noun (or a word
            # starting with 'และ') is demoted to plain text, unless the bullet
            # has the form "(n)" or "n)".
            if not re.match(r'(\(\d+\)|\d+\))$', blt_s) and len(labels) > 1 and labels[-2] in ['BULLET', 'PROJECT', 'OUTPUT'] and (txt in UNIT_NOUN or txt.startswith('และ')):
                labels[-2] = 'NOTMATCH'
                xbltpos = 0
                bltx = 0
                blt_s = ''
            # Record the x-range of the first plain block after the bullet/heading.
            if bltx and len(bltx) < 2:
                bltx.append(crd[0])

    # NOTE(review): np.min(xpos) raises on an empty line (no blocks) - confirm
    # callers never pass one.
    return xbltpos if xbltpos else np.min(xpos), text, labels, bltx


In [None]:
re.match(r'(\(\d+\)|\d+\))$', '11+400')

In [None]:
def get_budget_plan(plist):
  """Extract the budget-plan name from a list of header pages.

  Lines are scanned in order. A line containing a block that starts with
  'แผนงาน' contributes its blocks lying left of x = 1200 (except blocks that
  look like a '7.n' number); following lines are appended whole until a
  line with a '7.n.n' block is reached, at which point the text collected so
  far is returned.

  Args:
    plist: pages exposing ``xpos_text_lines()`` returning ``(coords, texts)``.

  Returns:
    str: the collected plan name (possibly '') when a '7.n.n' line was found;
    ``None`` when no such line exists.
  """
  poslist, lineslist = [], []
  for ipge in plist:
    tmppos, tmptext = ipge.xpos_text_lines()
    poslist += tmppos
    lineslist += tmptext

  budget_plan = ''
  for coord, line in zip(poslist, lineslist):
    if any(re.match(r'7\.[1-9]\.[1-9]', block) for block in line):
      return budget_plan
    if any(block.startswith('แผนงาน') for block in line):
      for bcrd, btext in zip(coord, line):
        if (bcrd[0] < 1200).all() and not re.match(r'7\.[1-9]', btext):
          budget_plan += btext
    elif budget_plan:
      # Continuation of a plan name that wraps over several lines.
      budget_plan += ' '.join(line)
  # No terminating '7.n.n' line: the partial text is deliberately not returned.
  return None

In [None]:
def xpos_text_lines(Lines):
  """Split lines of blocks into parallel coordinate and text lists.

  Returns:
    ``(xpos_lines, text_lines)`` where entry ``i`` of each list holds the
    ``sim_coord`` values / ``text`` values of the blocks of line ``i``.
  """
  pairs = [([block['sim_coord'] for block in line],
            [block['text'] for block in line])
           for line in Lines]
  xpos_lines = [coords for coords, _ in pairs]
  text_lines = [texts for _, texts in pairs]
  return xpos_lines, text_lines

In [None]:
def clean_entry(plist, pl=False):
  """Label the lines of a group of pages and merge wrapped lines into entries.

  Uses the global ``curr_doc`` and the module-level helpers ``labelling``,
  ``clean_prefix``, ``xpos_text_lines`` and ``get_patern_of_bullet``.

  Args:
    plist: list of pages (objects exposing ``pdfpage``, ``index_page``,
      ``xSigPos``, ``Lines`` and ``whpage``).
    pl: when true, print intermediate values for debugging.

  Returns:
    ``(oxposlev, otextlev, olabellev, onpagelev)``: four parallel lists with,
    per merged entry, its x position, its tokens, their labels and its page.
  """
  # NOTE(review): pstart is never used in this function.
  pstart = curr_doc.start
  if pl:
    print([ipage.pdfpage for ipage in plist])
    print([ipage.index_page + 1 for ipage in plist])
  # xSigPos is None for a page without a right-hand 'บาท' block, in which case
  # indexing raises; the bare except only prints the page numbers.
  # NOTE(review): after an exception ``allxpos`` is unbound, so the next
  # statement raises NameError whenever len(plist) > 1.
  try:
    allxpos = [ipage.xSigPos[0] for ipage in plist] 
  except:
    print([ipage.pdfpage for ipage in plist])
  # Horizontal offset of each page relative to the first page.
  if len(plist) > 1:
    xdiffl = np.subtract(allxpos[0], allxpos)
  else:
    xdiffl = [0]
  
  if pl:
    print(xdiffl)

  npagelist, poslist, textlist, whplist = [], [], [], []

  # Flatten all pages into per-line lists, shifting x coordinates by the
  # page's offset (y coordinates are left unchanged).
  for ipge, xdiff in zip(plist, xdiffl):
      tmppos, tmptext = xpos_text_lines(ipge.Lines)
      poslist += [np.add(tmpos, [[xdiff, xdiff], [0, 0]]) for tmpos in tmppos]
      textlist += tmptext
      npagelist += [ipge.pdfpage] * len(tmptext)
      whplist += [ipge.whpage] * len(tmptext)

  xposlev = []
  textlev = []
  labellev = []
  npagelev = []
  bltxlev = []

  # Label every line; skipped lines (None) and page-number lines are dropped.
  for npage, coord, text, whp in zip(npagelist, poslist, textlist, whplist):
    # Special case for document '2022.3.1', pages 5-7: keep only lines that
    # start with a bullet other than '3.'.
    if curr_doc.ref_doc == '2022.3.1' and npage in range(5, 8):
      if not get_patern_of_bullet(text[0]) or text[0] == '3.':
        continue
    olabel = labelling(coord, text, whp)

    if olabel == None:
      continue

    minxpos, textin, labels, bltx = olabel
    if 'PAGENUM' in labels:
      continue
    else:
      xposlev.append(minxpos)
      textlev.append(text)
      labellev.append(labels)
      npagelev.append(npage)
      bltxlev.append(bltx)

  textlev, labellev = clean_prefix(textlev, labellev)

  bltwdt = 0
  aftbltwdt = 0
  tbltx = 0
  oxposlev, otextlev, olabellev, onpagelev, obfbltxlev = [], [], [], [], []
  # Second pass: decide for each line whether it starts a new entry or is a
  # continuation of the previous one.
  for idx, xpos, text, label, npage, bltx in zip(range(len(xposlev)), xposlev, textlev, labellev, npagelev, bltxlev):
    if isinstance(bltx, list):
      # NOTE(review): when bltx has fewer than two elements this prints a
      # diagnostic and then bltx[1] below raises IndexError.
      if len(bltx) < 2:
        print([p.pdfpage for p in plist])
        print(bltx, text, label, npage)
      # Both values are "left x of the bullet minus another x", i.e. the
      # negated bullet width and the negated distance to the following block.
      bltwdt = bltx[0][0] - bltx[0][1]
      aftbltwdt = bltx[0][0] - bltx[1][0]
      tbltx = bltx
    if oxposlev:
      # A line starting with 'รวม' is always appended to the previous entry.
      if text[0] == 'รวม':
        otextlev[-1] += text
        olabellev[-1] += label
      # Conditions under which the line starts a new entry.
      elif (('FBAHT' in label or 'FAMOUNT' in label or 'FYEAR' in label) or
          (('BAHT' in olabellev[-1] or 'FBAHT' in olabellev[-1]) and ('BULLET' in label or 'BAHT' in label)) or
          ('BULLET' in olabellev[-1] and 'BULLET' in label) or
          (len(labellev) > idx+1 and idx-1 >= 0 and 'BAHT' in labellev[idx-1] and 'BAHT' in labellev[idx+1] and 'BULLET' not in labellev[idx+1])):
        oxposlev.append(xpos)
        otextlev.append(text)
        olabellev.append(label)
        onpagelev.append(npage)
        obfbltxlev.append(tbltx)
      else:
        if ('BULLET' in olabellev[-1] and 'BULLET' in label):
          print('curr', xpos, text, label, npage)
        # NOTE(review): when the stored value is still the int 0 this prints
        # and the subscript on the next line raises TypeError.
        if isinstance(obfbltxlev[-1], int):
          print(' - ', xpos, text, label, npage)
        # Offset of this line relative to the left x of the last bullet.
        difblt = obfbltxlev[-1][0][0] - xpos
        difblt_tolerance = 12 if curr_doc.ref_doc != '2022.3.5' else 17
        # Continuation line: merge into the previous entry when its indentation
        # is consistent with wrapped text, or when it sits between a non-amount
        # line and a line carrying an amount / fiscal-year label.
        if (difblt < difblt_tolerance and not ('BULLET' in olabellev[-1] and 'BULLET' in label)  and (bltwdt - 15 < difblt or aftbltwdt - 20 < difblt or
                            (('OUTPUT' in olabellev[-1] and -113 < difblt) or 
                             ('PROJECT' in olabellev[-1] and -130 < difblt))) or
            len(labellev) > idx+1 and idx-1 >= 0 and 'BAHT' not in labellev[idx-1] and 
            ('BAHT' in labellev[idx+1] or 'FBAHT' in labellev[idx+1] or 'FYEAR' in labellev[idx+1])):
          otextlev[-1] += text
          olabellev[-1] += label
        else:
          # The line is neither a new entry nor a continuation: it is only
          # printed for inspection and not kept in the output.
          print('‣ prev', oxposlev[-1], otextlev[-1], olabellev[-1], onpagelev[-1])
          print('‣ curr', xpos, text, label, npage)
          print(obfbltxlev[-1])
          print(bltwdt - 15, difblt, bltwdt - 15 < difblt, aftbltwdt - 20)
    else:
      # The very first line always starts an entry.
      oxposlev.append(xpos)
      otextlev.append(text)
      olabellev.append(label)
      onpagelev.append(npage)
      obfbltxlev.append(tbltx)
  if pl:
    for npge, x_, text, labl in zip(onpagelev, oxposlev, otextlev, olabellev):
      print(npge, x_, [(tx, lb) for tx, lb in zip(text, labl)])

  return oxposlev, otextlev, olabellev, onpagelev

### get entry

In [None]:
garbage = [chr(x) for x in range(3585, 3654)] + ['|']

In [None]:
 def get_entry(cleaned_entry, fp=False):
  """Turn the merged lines of ``clean_entry`` into structured budget entries.

  Uses the globals ``curr_doc`` and ``garbage`` and the helpers
  ``get_patern_of_bullet`` and ``to_gregorian``. The lists inside
  ``cleaned_entry`` are modified in place during the first pass.

  Args:
    cleaned_entry: ``(xposlev, textlev, labellev, npagelev)`` parallel lists.
    fp: when true, print the two level stacks for every entry.

  Returns:
    ``(prjopt, entry, semi_raw)``: the last 'Project'/'Output' names seen, the
    list of entry dicts, and one diagnostic dict per input line.
  """
  xposlev, textlev, labellev, npagelev = cleaned_entry

  # First pass: lines without a bullet/heading/fiscal label are either
  # dropped (marked None), relabelled as bullets, or reported.
  for text, label, idx in zip(textlev, labellev, range(len(textlev))):
    if 'BULLET' not in label and 'FBAHT' not in label and 'OUTPUT' not in label and 'PROJECT' not in label:
      tj = ''.join(textlev[idx])
      if '(ชดเชยงบ' in tj or '(ชุดเชยงบประมาณ' in tj or tj.startswith('กิจกรรม') or tj.startswith('กิจรรม'):
        xposlev[idx] = None
        textlev[idx] = None
        labellev[idx] = None
        npagelev[idx] = None
      elif get_patern_of_bullet(text[0]):
        label[0] = 'BULLET'
      else:
        print('🚨 [1] is not \'BULLET\': ', text)

  xposlev = [x for x in xposlev if x != None]
  textlev = [x for x in textlev if x != None]
  labellev = [x for x in labellev if x != None]
  npagelev = [x for x in npagelev if x != None]

  iserrorlev = [False] * len(xposlev)
  loglev = [''] * len(xposlev)

  # get level dict
  # Cluster the lines by x position: a line joins the cluster whose mean x is
  # nearest if that distance is within level_tolerance, else starts a new one.
  lev = []
  level_tolerance = 11 
  for i in range(len(xposlev)):
    if lev == []:
      lev.append({'xPos': [xposlev[i]], 'index': [i]})
    else:
      diff = abs(np.subtract([np.mean(lv['xPos']) for lv in lev], xposlev[i]))
      if np.amin(diff) > level_tolerance:
        lev.append({'xPos': [xposlev[i]], 'index': [i]})
      else:
        idx = np.where(diff == np.amin(diff))[0][0]
        lev[idx]['xPos'].append(xposlev[i])
        lev[idx]['index'].append(i)

  # get level list of int
  # levs[i] is the rank of line i's cluster when clusters are ordered by
  # mean x (0 = leftmost).
  level_dict = lev
  srtd_idx = sorted([(lv['index'], np.mean(lv['xPos'])) for lv in level_dict],
          key=lambda tup: tup[1])
  srtd_idx = [idx[0] for idx in srtd_idx]
  levs = []
  t_levs = len(srtd_idx)
  for idx in range(len(xposlev)):
    mask = np.array([idx in tst for tst in srtd_idx])
    n = np.arange(t_levs)[mask]
    levs.append(n[0])
 
  entry = []
  prjopt = {'Project': '', 'Output': ''}

  # crdstack tracks nesting by x-position rank, bltstack by bullet style code;
  # the depth of each stack is the level it proposes for the current entry.
  crdstack = []
  bltstack = []
  for pnum, xpos, level, text, label, idx in zip(npagelev, xposlev, levs, textlev, labellev, range(len(loglev))):
    if 'รายการบุคลากรภาครัฐ' in text or 'รายการบุคลากรภาครััฐ' in text:
      iserrorlev[idx], loglev[idx] = True, 'LINE SKIPED'
      continue

    if 'วงเงินทั้งสิ้น' in text:
      continue
    if 'วงเงินทั้งสิน' in text:
      continue

    if '(1) รายการไม่ผูกพัน' in ' '.join(text):
      continue

    context = ' '.join([txt for lbel, txt in zip(label, text) if lbel == 'NOTMATCH'])

    # Heading lines only update the current Output / Project name.
    if 'OUTPUT' in label or 'PROJECT' in label:
      context = ' '.join([txt for lbel, txt in zip(label, text) 
                    if lbel == 'NOTMATCH' and txt != ':' and not txt in garbage])
      if 'OUTPUT' in label:
        prjopt['Output'] = context
      else:
        prjopt['Project'] = context
      continue
    # Fiscal-year lines attach a per-year amount to the previous entry.
    elif 'FAMOUNT' in label or 'FBAHT' in label:
      amount = text[label.index('FAMOUNT')] if 'FAMOUNT' in label else '0'
      if re.match('\d{1,3}(,\d{3})*(\.\d+)?$', amount):
        amount = float(''.join([x for x in amount if x != ',']))
      else:
        # NOTE(review): ``temp`` is the dict of the previously built entry; it
        # is unbound (NameError) if no entry has been built yet.
        temp['DEBUG_LOG'] += 'AMOUNT FORMAT IS WORONG'
        print(amount, pnum, level, text, label)

      if entry:
        entry[-1]['is_obliged'] = 1
        if label.count('FYEAR') == 2:
          # A year span: the amount is divided evenly over the years.
          # NOTE(review): ``amount`` is still a str here when the format check
          # above failed, which would make the division raise.
          year = [txt for lbel, txt in zip(label, text) if lbel == 'FYEAR']
          nyr = int(year[1]) - int(year[0]) + 1
          fiscal_year = {i : '{:,}'.format(amount / nyr) for i in range(to_gregorian(year[0]), to_gregorian(year[1])+1)}
        elif label.count('FYEAR') == 1:
          if isinstance(amount, float):
            amount = f'{amount:,}'
          fiscal_year = {to_gregorian(text[label.index('FYEAR')]): amount}
        else:
          # NOTE(review): is_obliged was already set to 1 above while
          # fiscal_year of the entry may still be an int.
          print('🚨 no \'FYEAR\'', pnum, level, text, label)
          iserrorlev[idx], loglev[idx] = True, '\'FYEAR\' NOT FOUND'
          continue

        # The first fiscal-year line replaces the int placeholder with a dict;
        # later ones are merged into it.
        if isinstance(entry[-1]['fiscal_year'], int):
          entry[-1]['fiscal_year'] = fiscal_year
        elif isinstance(entry[-1]['fiscal_year'], dict):
          entry[-1]['fiscal_year'].update(fiscal_year)
        else:
          print('🚨 type not match', entry[-1]['fiscal_year'])
          iserrorlev[idx], loglev[idx] = True, 'TYPE NOT MATCH'
      else:
        print('🚨 entry is empty', pnum, level, text, label)
        iserrorlev[idx], loglev[idx] = True, 'ENTRY IS EMPTY'
        
      continue

    # Regular entry line.
    context = ' '.join([txt for lbel, txt in zip(label, text) if lbel == 'NOTMATCH'])
    if curr_doc.ref_doc == '2022.3.15':
      # For this document a diagnostic is printed unless the line carries
      # exactly two AMOUNT labels; the first one is used and normalised.
      if len([x for x in label if x == 'AMOUNT']) != 2:
        print('ONLY ONE AMOUNT ON PAGE {} {}'.format(pnum, text), label)
      amount = text[label.index('AMOUNT')] if 'AMOUNT' in label else '0'

      if re.match('\d{1,3}(,\d{3})*(\.\d+)?$', amount):
        amount = float(''.join([x for x in amount if x != ',']))
      else:
        print(amount,': ', pnum, level, text, label)
      if isinstance(amount, float):
          amount = f'{amount:,}'
        
    else:
      amount = text[label.index('AMOUNT')] if 'AMOUNT' in label else '0'
    # Fallback: take the token just before the first 'BAHT' token if it looks
    # like a number.
    if not 'AMOUNT' in label:
        if 'BAHT' in label and re.match(r'[0-9]{1,3}(,[0-9]{1,3})*', text[label.index('BAHT') - 1]):
            amount = text[label.index('BAHT') - 1]
        else:
            print('No amount found', pnum, level, text, label)
    
    temp = {'context': context,
            'ref_line': text,
            'amount': amount,
            'coord_level': level,
            'page': pnum,
            'fiscal_year': int(curr_doc.fiscal_year),
            'is_obliged': 0,
            'DEBUG_LOG': ''}

    blt = 0
    bltlevel = -1
    clevel = level
    bltfound = 0
    if 'BULLET' in label:
      bltfound = 1
      blt = text[label.index('BULLET')]
      bltlevel = get_patern_of_bullet(text[label.index('BULLET')])[1]
      temp['bullet_level'] = (bltlevel, blt)
    else:
      temp['DEBUG_LOG'] = 'NO BULLET FOUND'
      iserrorlev[idx], loglev[idx] = True, 'NO BULLET FOUND'

    # Update the position stack: push when deeper, otherwise unwind to the
    # matching level (replacing the top first if that level is not on the stack).
    db = crdstack[:]
    if crdstack:
      if crdstack[-1] < clevel:
        crdstack.append(clevel)
      elif crdstack[-1] > clevel:
        if not clevel in crdstack:
          crdstack[-1] = clevel
        while crdstack[-1] != clevel:
          crdstack.pop()
    else:
      crdstack.append(clevel)
    temp['crdstack_level'] = len(crdstack)

    # Same update rule for the bullet-style stack.
    if bltlevel != -1:
      db = bltstack[:]
      if bltstack:
        if bltstack[-1] < bltlevel:
          bltstack.append(bltlevel)
        elif bltstack[-1] > bltlevel:
          if not bltlevel in bltstack:
            bltstack[-1] = bltlevel
          while bltstack[-1] != bltlevel:
            bltstack.pop()
      else:
        bltstack.append(bltlevel)
      temp['bltstack_level'] = len(bltstack)
    else:
      temp['bltstack_level'] = len(bltstack)
      print('🚨 bulltet_lebel is -1: ', text, label)

    if fp:
      print(crdstack, bltstack)

    # When the two stacks disagree, trust the bullet stack if a bullet was
    # found, otherwise the position stack, and record the mismatch.
    if temp['bltstack_level'] != temp['crdstack_level']:
      if bltfound:
        temp['level'] = temp['bltstack_level']
      else:
        temp['level'] = temp['crdstack_level']

      iserrorlev[idx], loglev[idx] = True, 'LEVEL NOT MATCH'
      log = 'LEVEL NOT MATCH level: by x position {}, by pattern {}'.format(temp['crdstack_level'], temp['bltstack_level'])
      temp['DEBUG_LOG'] = temp['DEBUG_LOG'] + '/ ' + log if temp['DEBUG_LOG'] else log
    else:
      temp['level'] = temp['bltstack_level']

    if temp['DEBUG_LOG']:
      print(temp['DEBUG_LOG'])
      print('page: {}, [\'level\']: {}, xPos: {}, text: {}'.format(pnum, temp['level'], xpos, temp['ref_line']))
    entry.append(temp)
  
  # One diagnostic record per (non-dropped) input line.
  semi_raw = []
  for pnum, xpos, level, text, label, iserror, log in zip(npagelev, xposlev, levs, textlev, labellev, iserrorlev, loglev):
    semi_raw.append({'text': text,
          'minxpos': xpos,
          'label': label,
          'page_number': pnum,
          'error_found': iserror,
          'log': log})
    
  # Special case for document '2022.3.1' when page 5 is present: the level is
  # taken from the bullet style code.
  # NOTE(review): entries built without a bullet have no 'bullet_level' key.
  if 5 in npagelev and curr_doc.ref_doc == '2022.3.1':
    for aent in entry:
      aent['level'] = aent['bullet_level'][0]

  return prjopt, entry, semi_raw

In [None]:
def get_categorys_lv(list_of_dict):
  """Spread a chain of entries over six category columns plus an item description.

  Each entry's ``context`` is stored in the slot named by its ``level``
  (1-6 = category levels, 7 = item description); later entries overwrite
  earlier ones at the same level. Any other level only prints a message.
  If no level-7 entry exists, the deepest non-empty category is moved into
  the item description.

  Returns:
    An 8-tuple: CATEGORY_LV1 ... CATEGORY_LV6, ITEM_DESCRIPTION, DEBUG_LOG.
    DEBUG_LOG concatenates the entries' logs and, when non-empty, ends with
    the list of all ``ref_line`` values.
  """
  item_slot = 7
  slots = {lv: '' for lv in range(1, item_slot + 1)}
  debug_log = ''

  for entry in list_of_dict:
    if entry['DEBUG_LOG']:
      debug_log += entry['DEBUG_LOG']
    lv = entry['level']
    if lv in slots:
      slots[lv] = entry['context']
    else:
      print('LEVEL MORE THAN 7')

  if not slots[item_slot]:
    # Promote the deepest filled category to the item description.
    for lv in range(item_slot - 1, 0, -1):
      if slots[lv]:
        slots[item_slot] = slots[lv]
        slots[lv] = ''
        break

  if debug_log:
    debug_log += '/ {}'.format([entry['ref_line'] for entry in list_of_dict])
  return tuple(slots[lv] for lv in range(1, item_slot + 1)) + (debug_log,)

In [None]:
def get_rows_of_budg_unit(running_id, budget_plan_details, bud_u):
  """Flatten the hierarchical entries of one budgetary unit into table rows.

  Entries are walked in order while a stack holds the current chain of
  ancestors (strictly increasing ``level``). Whenever the next entry is not
  deeper than the top of the stack, the current chain is emitted as a row
  (its last element is a leaf) and the stack is unwound until its top is
  shallower than the new entry. The original popped at most two elements, so
  a drop of more than one level left stale ancestors in later rows.

  Uses the global ``curr_doc`` (for ``ref_doc``) and ``get_categorys_lv``.

  Args:
    running_id: next free numeric id; used to build ``ITEM_ID``.
    budget_plan_details: dict with keys 'ministry', 'budget_unit',
      'is_cross_func', 'budget_plan', 'Output' and 'Project'.
    bud_u: list of entry dicts (keys used here: 'level', 'page',
      'is_obliged', 'fiscal_year', 'amount').

  Returns:
    ``(running_id, rows_of_lists)``: the id after the last one used and the
    rows, each a list of 20 column values. Obliged entries produce one row
    per fiscal year of their ``fiscal_year`` dict.
  """
  REF_DOC = curr_doc.ref_doc
  MINISTRY = budget_plan_details['ministry']
  BUDGETARY_UNIT = budget_plan_details['budget_unit']
  CROSS_FUNC = budget_plan_details['is_cross_func']
  BUDGET_PLAN = budget_plan_details['budget_plan']
  OUTPUT = budget_plan_details['Output']
  PROJECT = budget_plan_details['Project']

  stack = []
  rows = []
  for item in bud_u:
    if stack and stack[-1]['level'] >= item['level']:
      # The top of the stack has no children: emit its chain, then drop every
      # element that is not an ancestor of the new item.
      rows.append(list(stack))
      while stack and stack[-1]['level'] >= item['level']:
        stack.pop()
    stack.append(item)
  rows.append(list(stack))

  rows_of_lists = []
  for row in rows:
    if not row:
      continue
    itms = row[-1]
    REF_PAGE_NO = itms['page']
    OBLIGED = itms['is_obliged']
    (CATEGORY_LV1, CATEGORY_LV2, CATEGORY_LV3, CATEGORY_LV4, CATEGORY_LV5,
     CATEGORY_LV6, ITEM_DESCRIPTION, DEBUG_LOG) = get_categorys_lv(row)
    if MINISTRY == 'งบกลาง':
      # For this ministry level 2 carries the budget plan and level 3 becomes
      # the first category.
      BUDGET_PLAN = CATEGORY_LV2[:]
      CROSS_FUNC = BUDGET_PLAN.startswith('แผนงานบูรณาการ')
      CATEGORY_LV1 = CATEGORY_LV3[:]
      CATEGORY_LV2 = ''
      CATEGORY_LV3 = ''

    if OBLIGED:
      year_amounts = list(itms['fiscal_year'].items())
    else:
      year_amounts = [(itms['fiscal_year'], itms['amount'])]

    for FISCAL_YEAR, AMOUNT in year_amounts:
      ITEM_ID = '{}.{}'.format(REF_DOC, running_id)
      running_id += 1
      rows_of_lists.append([ITEM_ID, REF_DOC, REF_PAGE_NO, MINISTRY, BUDGETARY_UNIT, np.bool_(CROSS_FUNC), BUDGET_PLAN, OUTPUT,
                            PROJECT, CATEGORY_LV1, CATEGORY_LV2, CATEGORY_LV3, CATEGORY_LV4,
                            CATEGORY_LV5, CATEGORY_LV6, ITEM_DESCRIPTION,
                            FISCAL_YEAR, AMOUNT, np.bool_(OBLIGED), DEBUG_LOG])
  return running_id, rows_of_lists

### get data frame

In [None]:
def get_data_frame(doc, to_csv=False):
  """Build the flat budget-item table for one document.

  Every page listed in ``doc.range`` is visited in order. A page whose text
  contains one of the marker phrases in ``header_markers`` is treated as a
  "header" page; every other page is a "detail" page. Each run of header
  pages followed by a run of detail pages forms one section: the header run
  yields the section's ministry / budget unit / budget plan, the detail run
  is parsed with ``get_entry(clean_entry(...))``. All sections are finally
  expanded into rows by ``get_rows_of_budg_unit``.

  Args:
    doc: document object exposing ``range`` (a list of dicts with ``'range'``,
      ``'ministry'`` and ``'budget_unit'`` keys), ``start``, ``ref_doc`` and
      ``page()``.
    to_csv: when truthy, the frame is also written to
      ``/content/drive/MyDrive/data/24-Jul/<ref_doc with '.' replaced by '-'>.csv``.

  Returns:
    A pandas DataFrame whose columns are the module-level ``cols``.

  Raises:
    IndexError: if no entry of ``doc.range`` contains the first page of a
      header run (the range list is exhausted while searching).
  """
  page_ranges = doc.range
  page_offset = doc.start

  # Materialise every page named by the ranges; ``pdfpage`` keeps the
  # range-relative page number that the ranges themselves are expressed in.
  listpage = []
  for lrange in page_ranges:
    startp, endp = lrange['range']
    for ipage in range(startp, endp):
      page = doc.page(ipage + page_offset)
      page.pdfpage = ipage
      listpage.append(page)

  # Header[i] and Detail[i] always describe the same section; they are only
  # ever appended to together so the final zip() cannot drift out of step.
  Header = []
  Detail = []

  def _close_section(header, detail_pages):
    """Parse a finished run of detail pages and record it with its header."""
    prjopt, entry, _semi_raw = get_entry(clean_entry(detail_pages))
    header.update(prjopt)
    Header.append(header)
    Detail.append(entry)

  # Use the ``doc`` argument here, not a notebook-level global: the function
  # must work for whatever document the caller passes in.
  if doc.ref_doc == '2022.3.1':
    # Pages before pdf page 8 of this document have no header page of their
    # own, so they are parsed as one section under a hand-written header.
    tmppages = []
    while listpage and listpage[0].pdfpage < 8:
      tmppages.append(listpage.pop(0))
    prjopt, entry, _semi_raw = get_entry(clean_entry(tmppages))
    tmph = {'ministry': 'งบกลาง',
            'budget_unit': 'งบกลาง',
            'is_cross_func': 0,
            'budget_plan': '',
            'Output': '',
            'Project': '',
            'ref_page': []}
    # Queue the section like any other one. It is first in the lists, so it
    # still receives the first running ids when rows are generated below.
    Header.append(tmph)
    Detail.append(entry)

  header_markers = ['หน่วย : ล้านบาท', 'หน่วยเล้านบาท', 'วัตถุประสงค์', 'หน่วย:ล้านบาท', 'หน่วยนับ', 'หน่วยล้านบาท']
  idtpages = []   # detail pages of the section currently being collected
  ihdpages = []   # header pages seen since the last detail page
  tmphdict = {}   # header dict of the section currently being collected
  bgplan = ''     # last non-empty budget plan; reused when a header run has none
  lemrange = page_ranges[:]  # consumed from the front as pages advance

  for ipage in tqdm(listpage, desc=doc.ref_doc):
    text = '\n'.join(ipage.get_text_lines())
    is_header_page = any(hw in text for hw in header_markers)

    if is_header_page:
      # A header page after detail pages closes the previous section.
      if idtpages:
        _close_section(tmphdict, idtpages)
        idtpages = []
      ihdpages.append(ipage)
    else:
      # First detail page after a header run: derive the new section header.
      if ihdpages:
        tbgplan = get_budget_plan(ihdpages)
        bgplan = tbgplan if tbgplan else bgplan
        first_header_page = ihdpages[0].pdfpage
        # Drop ranges that lie before the header page until the containing
        # one is at the front. Stopping on the range match itself (rather
        # than on a non-empty budget unit) avoids looping forever when the
        # matching range carries an empty 'budget_unit'.
        while True:
          current = lemrange[0]
          if current['range'][0] <= first_header_page < current['range'][1]:
            ministry = current['ministry']
            budget_unit = current['budget_unit']
            break
          lemrange.pop(0)
        tmphdict = {'ministry': ministry, 'budget_unit': budget_unit,
                    'budget_plan': bgplan, 'ref_page': [i.pdfpage for i in ihdpages],
                    'is_cross_func': bgplan.startswith('แผนงานบูรณาการ')}
        ihdpages = []
      idtpages.append(ipage)

  # The document may end on detail pages without a following header page.
  if idtpages:
    _close_section(tmphdict, idtpages)

  # Collect plain row lists and build the frame once: DataFrame.append was
  # removed in pandas 2.0, and growing a frame per section is quadratic.
  all_rows = []
  running_id = 0
  for header, detail in zip(Header, Detail):
    running_id, rows = get_rows_of_budg_unit(running_id, header, detail)
    all_rows.extend(rows)
  df = pd.DataFrame(all_rows, columns=cols)

  if to_csv:
    df.to_csv('/content/drive/MyDrive/data/24-Jul/{}.csv'.format(doc.ref_doc.replace('.', '-')))
  return df

### Global variables

In [None]:
# Column order of every budget-item row; get_data_frame builds its DataFrame
# with exactly these columns, in this order.
cols = (
    ['ITEM_ID', 'REF_DOC', 'REF_PAGE_NO', 'MINISTRY', 'BUDGETARY_UNIT',
     'CROSS_FUNC?', 'BUDGET_PLAN', 'OUTPUT', 'PROJECT']
    # CATEGORY_LV1 .. CATEGORY_LV6
    + ['CATEGORY_LV{}'.format(level) for level in range(1, 7)]
    + ['ITEM_DESCRIPTION', 'FISCAL_YEAR', 'AMOUNT', 'OBLIGED?', 'DEBUG_LOG']
)

In [None]:
# Thai (and a few Latin) unit / classifier words such as rai, kilometre, cc,
# kilowatt, "items", "million baht".
# NOTE(review): the code that consumes this list is outside this section —
# presumably it is used to recognise a unit token next to a number; confirm.
# NOTE(review): 'ชีซี' looks like a misread variant of 'ซีซี' kept on purpose
# so garbled input still matches — do not "correct" it without checking.
UNIT_NOUN = ['ไร่', 'กิโลเมตร', 'กม.','มิลลิกรัม','ชีซี', 'ซีซี', 'แห่ง', 'ระบบ', 'กิจกรรม', 'โครงการ','ฟุต','กิโลวัตต์', 'รายการ', 'ช่องสัญญาณ',
             'เตียง', 'องศา', 'ต้นแบบ', 'เครื่อง', 'ชุด', 'คัน', 'ลิตร','ล้อ', 'เมตร', 'จังหวัดริมน้ำโขง', 'งาน', 'ไมครอน',
             'คัน/ลำ/เครื่อง', 'แรงม้า', 'ชั้น', 'พรรษา', 'ตารางเมตร', 'ลูกบาศก์เมตร/วัน', 'ถุง', 'สายทาง', 'kW',
             'อาคาร', 'หลัง', 'ห้อง', 'สถานการณ์', 'กิโลกรัม', 'เกลียวผสม', 'ฉบับ','ล้านบาท', 'ไมโครกรัม']

In [None]:
# Phrases that mark boilerplate lines (totals, budget / off-budget captions,
# "obligated" / "non-obligated" captions, section titles).
# Most entries are spelling variants of the same "total amount" caption; they
# look like misreadings of the source text and some differ only in the order
# of combining marks, so entries that render identically may not be equal.
# NOTE(review): the consumer of this list is outside this section — presumably
# lines containing one of these phrases are skipped during parsing; confirm.
SKIP_LINE = ['วงเงินทั้งิ้น', 'เงินนอกงบประมาณ', 'เงินงบประมาณ', 'รายการบุคลากรภาครัฐ', 'รายการบุคลากรภาครั้ัฐ',
             'วงเงินทั้งสิ้น', 'วงเงินทั้งสิ้น','วงเงินทั่งสิ้น', 'วงเงินทั้งส้น', 'วงเงินทิ้งส้น','วงเงินทั่้งสิ้น', 'วงเงินทั้งสิ้',
             'วงเงินทิ่้งสิ้น','วงเงินทิ้งสิ้น', 'วงเงินทิ่งสิ้น','วงเงินทั้งสิน', 'งงบประมาณ', 'วงเงินทิ้งสิน', 'วงเงินท้งสิ้น',
             'วงเงินทิ่งสิน', 'วงเงินทั้งิน', 'วงเงินทุ่งสิน',
             
             'รายการผูกพัน', 'รายการไม่ผูกพัน', 'ประกอบด้วย', # 3.15
             
             'รายละเอียดงบประมาณจำแนกตามงบรายจ่าย', 'งบประมาณทั้งสิ้น', 'วงเงินทั้งสน', 'วงเงินทั้งี']

# implementation

In [None]:
%%time
# Load the whole 2022 budget JSON from the mounted Google Drive; ThaiBudget
# reads the file eagerly in its constructor, hence the timing magic above.
tbg = ThaiBudget('/content/drive/MyDrive/data/thaiBudget2022.json')

CPU times: user 8.98 s, sys: 2.61 s, total: 11.6 s
Wall time: 17 s


In [None]:
# Export the last eight documents of the collection to CSV, one file each.
# `curr_doc` is deliberately a notebook-level name: later cells inspect it.
for rd in tbg.docs[-8:]:
  curr_doc = tbg.get_doc(rd)
  df = get_data_frame(curr_doc, to_csv=True)

In [None]:
# Re-run a single document from a standalone JSON file and export it to CSV.
curr_doc = ThaiBudget('/content/2022-3-16(3).json').get_doc('2022.3.16(3)')
df = get_data_frame(curr_doc, to_csv=True)

HBox(children=(FloatProgress(value=0.0, description='2022.3.16(3)', max=326.0, style=ProgressStyle(description…

🚨 [1] is not 'BULLET':  ['ค่าที่ดิน/สิ่งก่อสร้าง', '5,896,900', 'บาท']
🚨 bulltet_lebel is -1:  ['ค่าที่ดิน/สิ่งก่อสร้าง', '5,896,900', 'บาท'] ['NOTMATCH', 'AMOUNT', 'BAHT']
NO BULLET FOUND/ LEVEL NOT MATCH level: by x position 3, by pattern 4
page: 71, ['level']: 3, xPos: 186.0, text: ['ค่าที่ดิน/สิ่งก่อสร้าง', '5,896,900', 'บาท']
LEVEL NOT MATCH level: by x position 5, by pattern 4
page: 102, ['level']: 4, xPos: 240.0, text: ['(11)', 'เงินอุดหนุนสำหรับสนับสนุนอาหารกลางวัน', '6,813,500', 'บาท']
LEVEL NOT MATCH level: by x position 5, by pattern 4
page: 111, ['level']: 4, xPos: 230.0, text: ['(2)', 'ค่าก่อสร้างอื่นๆที่มีราคาต่อหน่วยต่ำกว่า', '10', 'ล้านบาท', '5,893,400', 'บาท', 'รวม', '2', 'รายการ', '(รวม', '2', 'หน่วย)']
🚨 [1] is not 'BULLET':  ['ค่าที่ดิน/สิ่งก่อสร้าง', '10,044,800', 'บาท']
🚨 bulltet_lebel is -1:  ['ค่าที่ดิน/สิ่งก่อสร้าง', '10,044,800', 'บาท'] ['NOTMATCH', 'AMOUNT', 'BAHT']
NO BULLET FOUND
page: 119, ['level']: 4, xPos: 206.0, text: ['ค่าที่ดิน/สิ่งก่อสร้าง', '10,044

In [None]:
# Parse one document from the already-loaded collection, without writing a CSV.
curr_doc = tbg.get_doc('2022.3.16(3)')
df = get_data_frame(curr_doc, to_csv=False)

In [None]:
# Debug: run the cleaning + entry extraction on page 102 only.
# NOTE(review): `pl=1` presumably turns on line printing in clean_entry
# (the cell output lists labelled tokens) — confirm against its definition.
a = get_entry(clean_entry([curr_doc.page(102, True)], pl=1))

[102]
[162]
[0]
102 139.0 [('ผลผลิต', 'OUTPUT'), (':', 'NOTMATCH'), ('ผลผลิตการจัดบริการสาธารณะ', 'NOTMATCH'), ('163,525,800', 'AMOUNT'), ('บาท', 'BAHT')]
102 139.0 [('1.', 'BULLET'), ('งบเงินอุดหนุน', 'NOTMATCH'), ('163,525,800', 'AMOUNT'), ('บาท', 'BAHT')]
102 156.0 [('1.1', 'BULLET'), ('เงินอุดหนุนทั่วไป', 'NOTMATCH'), ('155,466,300', 'AMOUNT'), ('บาท', 'BAHT')]
102 172.0 [('1)', 'BULLET'), ('ค่าใช้จ่ายบุคลากร', 'NOTMATCH'), ('62,070,000', 'AMOUNT'), ('บาท', 'BAHT')]
102 223.0 [('(1)', 'BULLET'), ('เงินอุดหนุนสำหรับการจัดการศึกษาภาคบังคับ', 'NOTMATCH'), ('(เงินเดือนครู', 'NOTMATCH'), ('ค่าจ้างประจำ)', 'NOTMATCH'), ('60,183,300', 'AMOUNT'), ('บาท', 'BAHT')]
102 222.0 [('(2)', 'BULLET'), ('เงินอุดหนุนสำหรับสนับสนุนการถ่ายโอนบุคลากร', 'NOTMATCH'), ('835,200', 'AMOUNT'), ('บาท', 'BAHT')]
102 222.0 [('(3)', 'BULLET'), ('เงินอุดหนุนสำหรับสนับสนุนศูนย์พัฒนาเด็กเล็ก', 'NOTMATCH'), ('1,051,500', 'AMOUNT'), ('บาท', 'BAHT')]
102 171.0 [('2)', 'BULLET'), ('ค่าใช้จ่ายดำเนินงาน', 'NOTMATCH'), ('7

In [None]:
# Debug: dump page 767 line by line, pairing every text token with the first
# value of its first position entry (presumably the token's left x — confirm).
page = curr_doc.page(767, True)
print(page.whpage)
xposl, textl = page.xpos_text_lines()
for xpos, line in zip(xposl, textl):
    print([(txt, pos[0][0]) for pos, txt in zip(xpos.tolist(), line)])

[[ 225. 1525.]
 [ 107.  791.]]
[('767', 1482.5)]
[('(9)', 316.0), ('เงินอุดหนุนสำหรับการดำเนินงานตามแนวทางโครงการพระราชดำริ', 350.0)]
[('ด้านสาธารณสุข', 316.0), ('460,000', 1376.0), ('บาท', 1467.0)]
[('(10)', 316.0), ('เงินอุดหนุนสำหรับสนับสนุนอาหารเสริม', 362.0), ('(นม)', 727.0), ('1,855,100', 1359.0), ('บาท', 1467.0)]
[('3)', 247.0), ('เงินอุดหนุนดำเนินการตามอำนาจหน้าที่และภารกิจถ่ายโอน', 274.0), ('บ', 592.0), ('ส4่', 611.0), ('19,430,500', 1348.0), ('บาท', 1467.0)]
[('1.2', 225.0), ('เงินอุดหนุนเฉพาะกิจ', 263.0), ('5,955,000', 1359.0), ('บาท', 1467.0)]
[('ค่าครุภัณ์', 246.0), ('2,400,000', 1359.0), ('บาท', 1467.0)]
[('(1)', 293.0), ('รถบรรทุกขยะ', 326.0), ('ขนาด', 461.0), ('6', 522.0), ('ตัน', 543.0), ('6', 579.0), ('ล้อ', 599.0), ('ปริมาตรกระบอกสูบไม่ต่ำกว่า', 635.0)]
[('6,000', 292.0), ('ซีซี', 352.0), ('หรือกำลังเครื่องยนต์สูงสุดไม่ต่ำกว่า', 388.0), ('170', 707.0), ('กิโลวัตต์', 745.0)]
[('แบบอัดท้าย', 294.0), ('ตำบลท่าโรง', 410.0), ('อำเภอวิเชียรบุรี', 523.0), ('จังหวัดเพชรบูรณ์

In [None]:
# Rebuild the page-range table from the document's index pages.
# `ministry` is threaded through get_range so each call sees the value
# returned by the previous one.
ministry = []
page_rage = []
for ipage in curr_doc.index_page:
  ministry, page_rage_ = get_range(curr_doc.page(ipage), ministry)
  page_rage.extend(page_rage_)
page_rage

In [None]:
# Spot check: show every row extracted from page 103.
df.loc[df['REF_PAGE_NO'] == 103]

Unnamed: 0,ITEM_ID,REF_DOC,REF_PAGE_NO,MINISTRY,BUDGETARY_UNIT,CROSS_FUNC?,BUDGET_PLAN,OUTPUT,PROJECT,CATEGORY_LV1,CATEGORY_LV2,CATEGORY_LV3,CATEGORY_LV4,CATEGORY_LV5,CATEGORY_LV6,ITEM_DESCRIPTION,FISCAL_YEAR,AMOUNT,OBLIGED?,DEBUG_LOG
343,2022.3.16(3).343,2022.3.16(3),103,องค์กรปกครองส่วนท้องถิ่น,เทศบาลเมืองในพื้นที่จังหวัดปัตตานี เทศบาลเมือง...,False,แผนงานยุทธศาสตร์ส่งเสริมการกระจายอำนาจให้แก่อง...,ผลผลิตการจัดบริการสาธารณะ,,งบเงินอุดหนุน,เงินอุดหนุนทั่วไป,ค่าใช้จ่ายดำเนินงาน,,,,เงินอุดหนุนสำหรับขับเคลื่อนโครงการสัตว์ปลอดโรค...,2022,27000,False,"LEVEL NOT MATCH level: by x position 6, by pat..."
344,2022.3.16(3).344,2022.3.16(3),103,องค์กรปกครองส่วนท้องถิ่น,เทศบาลเมืองในพื้นที่จังหวัดปัตตานี เทศบาลเมือง...,False,แผนงานยุทธศาสตร์ส่งเสริมการกระจายอำนาจให้แก่อง...,ผลผลิตการจัดบริการสาธารณะ,,งบเงินอุดหนุน,เงินอุดหนุนทั่วไป,ค่าใช้จ่ายดำเนินงาน,,,,เงินอุดหนุนสำหรับสำรวจข้อมูลจำนวนสัตว์และขึ้นท...,2022,5400,False,"LEVEL NOT MATCH level: by x position 6, by pat..."
345,2022.3.16(3).345,2022.3.16(3),103,องค์กรปกครองส่วนท้องถิ่น,เทศบาลเมืองในพื้นที่จังหวัดปัตตานี เทศบาลเมือง...,False,แผนงานยุทธศาสตร์ส่งเสริมการกระจายอำนาจให้แก่อง...,ผลผลิตการจัดบริการสาธารณะ,,งบเงินอุดหนุน,เงินอุดหนุนทั่วไป,ค่าใช้จ่ายดำเนินงาน,,,,เงินอุดหนุนสำหรับการดำเนินงานตามแนวทางโครงการพ...,2022,400000,False,"LEVEL NOT MATCH level: by x position 6, by pat..."
346,2022.3.16(3).346,2022.3.16(3),103,องค์กรปกครองส่วนท้องถิ่น,เทศบาลเมืองในพื้นที่จังหวัดปัตตานี เทศบาลเมือง...,False,แผนงานยุทธศาสตร์ส่งเสริมการกระจายอำนาจให้แก่อง...,ผลผลิตการจัดบริการสาธารณะ,,งบเงินอุดหนุน,เงินอุดหนุนทั่วไป,ค่าใช้จ่ายดำเนินงาน,,,,เงินอุดหนุนชดเชยรายได้ที่ลดลงจากเหตุการณ์ความไ...,2022,5793600,False,"LEVEL NOT MATCH level: by x position 6, by pat..."
347,2022.3.16(3).347,2022.3.16(3),103,องค์กรปกครองส่วนท้องถิ่น,เทศบาลเมืองในพื้นที่จังหวัดปัตตานี เทศบาลเมือง...,False,แผนงานยุทธศาสตร์ส่งเสริมการกระจายอำนาจให้แก่อง...,ผลผลิตการจัดบริการสาธารณะ,,งบเงินอุดหนุน,เงินอุดหนุนทั่วไป,ค่าใช้จ่ายดำเนินงาน,,,,เงินอุดหนุนการจัดการศึกษาขององค์กรปกครองส่วนท้...,2022,2280000,False,"LEVEL NOT MATCH level: by x position 6, by pat..."
348,2022.3.16(3).348,2022.3.16(3),103,องค์กรปกครองส่วนท้องถิ่น,เทศบาลเมืองในพื้นที่จังหวัดปัตตานี เทศบาลเมือง...,False,แผนงานยุทธศาสตร์ส่งเสริมการกระจายอำนาจให้แก่อง...,ผลผลิตการจัดบริการสาธารณะ,,งบเงินอุดหนุน,เงินอุดหนุนทั่วไป,ค่าใช้จ่ายดำเนินงาน,,,,เงินอุดหนุนสำหรับสนับสนุนอาหารเสริม (นม),2022,3087100,False,"LEVEL NOT MATCH level: by x position 6, by pat..."
349,2022.3.16(3).349,2022.3.16(3),103,องค์กรปกครองส่วนท้องถิ่น,เทศบาลเมืองในพื้นที่จังหวัดปัตตานี เทศบาลเมือง...,False,แผนงานยุทธศาสตร์ส่งเสริมการกระจายอำนาจให้แก่อง...,ผลผลิตการจัดบริการสาธารณะ,,งบเงินอุดหนุน,เงินอุดหนุนทั่วไป,,,,,เงินอุดหนุนดำเนินการตามอำนาจหน้าที่และภารกิจถ่...,2022,15699200,False,
350,2022.3.16(3).350,2022.3.16(3),103,องค์กรปกครองส่วนท้องถิ่น,เทศบาลเมืองในพื้นที่จังหวัดปัตตานี เทศบาลเมือง...,False,แผนงานยุทธศาสตร์ส่งเสริมการกระจายอำนาจให้แก่อง...,ผลผลิตการจัดบริการสาธารณะ,,งบเงินอุดหนุน,เงินอุดหนุนเฉพาะกิจ,ค่าที่ดิน/สิ่งก่อสร้าง,,,,ค่าก่อสร้างทางและสะพานที่มีร ละสะพานที่มีราคาต...,2022,8059500,False,


In [None]:
# Sanity check: rows whose amount is the string '0.0'.
# NOTE(review): this compares against a string; if AMOUNT holds numbers the
# filter can never match — confirm the column's type.
df.loc[df['AMOUNT'] == '0.0']

In [None]:
# Rows for which the parser recorded a non-empty debug message.
df.loc[df['DEBUG_LOG'] != '']

In [None]:
# Sanity check: rows with a negative fiscal year.
# NOTE(review): presumably catches years that went through the 543-year
# Buddhist-to-Gregorian subtraction when they should not have — confirm.
df.loc[df['FISCAL_YEAR'] < 0]

In [None]:
# Write the current frame to the working directory as <ref_doc with '.' -> '-'>.csv.
df.to_csv('{}.csv'.format(curr_doc.ref_doc.replace('.', '-')))

In [None]:
# Dump every section header together with its parsed entries as one JSON file.
# NOTE(review): `Header`, `Detail` and `NpEncoder` are not defined at notebook
# level in the visible part of this file (`Header`/`Detail` are locals of
# get_data_frame) — on a fresh kernel this cell likely fails; confirm.
OJSON = []
for header, detail in zip(Header, Detail):
  # The header dict itself is extended in place and then collected.
  header['entry'] = detail
  OJSON.append(header)

with open("{}.json".format(curr_doc.ref_doc.replace('.', '-')), "w", encoding="utf8") as file:
  json.dump(OJSON, file, ensure_ascii=False, cls=NpEncoder)