<a href="https://colab.research.google.com/github/clarkde5/jams-pub/blob/main/colabs/jams-env-doctr-lease.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Environment**

## **Bootstrap public repo**

In [None]:
from pathlib import Path

if not Path(f"/content").is_dir():
  %cd ~
else:
  %cd /content

if not Path(f"src/jams-pub").is_dir():
  !git clone https://github.com/clarkde5/jams-pub src/jams-pub
else:
  print("src/jams-pub already exists")

## **Pull private resources**

In [None]:
# Cell switch: the guard at the bottom of this cell only calls
# CallPrivateEnvSetup() when this is True.
runOptional = True
def CallPrivateEnvSetup():
  from pathlib import Path

  home = str(Path.home())

  try:
    from google.colab import userdata
    try:
      rsa_private_key = userdata.get('PrivateKey')
    except:
      rsa_private_key = ""
      print("PrivateKey is required, but not found in secrets / colab userdata")
      return
  except:
    print("google.colab could not be loaded")
    if not Path(f"{home}/.ssh/id_rsa").is_file():
      print("rsa key does not exist, please create manually")
      return
    else:
      print("rsa key already exists")

  if not Path(f"{home}/.ssh/id_rsa").is_file():
    !python src/jams-pub/env-setup.py "{rsa_private_key}"

  if not Path(f"{home}/.ssh/id_rsa").is_file():
    print("env-setup failed to create rsa key")
    return
  else:
    #!apt-get install dos2unix -y
    #!dos2unix ~/.ssh/id_rsa
    !sed -i 's/\r$//' ~/.ssh/id_rsa

  if not Path(f"src/jams").is_dir():
    !git clone git@github.com:clarkde5/jams.git src/jams
  else:
    print("src/jams already exists")

# Cell entry point: run the private setup only when the cell switch is on.
if __name__ == "__main__":
  if not (runOptional == True):
    print("Not running optional cell")
  else:
    CallPrivateEnvSetup()

# **Parse Lease Output**

**NOTE:** This section requires the [Pull private resources](https://colab.research.google.com/github/clarkde5/jams-pub/blob/main/colabs/jams-env-doctr.ipynb#scrollTo=CyQkr3hoftql) section to have been run first.

In [None]:
# Cell switch: the guard at the bottom of this cell only calls main() when
# this is True.
runOptional = True
def getInvoiceNumber(page_idx,page):
  """Return the word that follows an "INVOICE NUMBER" label on a page.

  Args:
    page_idx: Index of the page. Not used by the search itself.
    page: Page dict with "blocks" -> "lines" -> "words"; every word has a
      "value" string and a "geometry" entry.

  Returns:
    The "value" of the word right after the label, or "" when the page has
    no such label or nothing follows it.
  """
  import re

  for block in page["blocks"]:
    for line in block["lines"]:
      # Order the words by geometry[0][0] (presumably the left x coordinate
      # -- confirm against the OCR export) so neighbours are adjacent.
      sorted_words = sorted(line["words"], key = lambda x: x["geometry"][0][0])

      # Look at every window of three consecutive words. Lines with fewer
      # than three words yield no windows, so they can no longer raise
      # IndexError, and the label may sit anywhere in the line.
      for word_idx in range(len(sorted_words) - 2):
        label, qualifier, candidate = sorted_words[word_idx:word_idx + 3]
        # "NUMBER." needs one extra character after NUMBER (e.g. "NUMBER:").
        if re.search("INVOICE",label["value"]) and re.search("NUMBER.",qualifier["value"]):
          if candidate["value"] != "":
            return candidate["value"]

  return ""

def getContractsForPage(page_idx,page):
  """Collect the contract numbers found on one page.

  On the first page (page_idx == 0) words are ignored until one containing
  "CURRENT" has been seen; on later pages collection starts immediately.

  Args:
    page_idx: Zero-based page index.
    page: Page dict with "blocks" -> "lines" -> "words"; every word has a
      "value" string and a "geometry" entry.

  Returns:
    A list of dicts with "contract_number" (the whole word value), "pdf_y"
    (geometry[0][1] + page_idx, so later pages get larger values) and
    "page" (1-based page number).
  """
  import re

  # Raw string: "\d" in a normal literal is an invalid escape sequence.
  contract_pattern = re.compile(r"\d{3}-\d{7}-\d{3}")
  current_found = page_idx != 0
  contracts = []

  for block in page["blocks"]:
    for line in block["lines"]:
      for word in sorted(line["words"], key = lambda x: x["geometry"][0][1]):
        value = word["value"]
        if "CURRENT" in value:
          current_found = True

        if current_found and contract_pattern.search(value):
          contracts.append({"contract_number": value, "pdf_y": word["geometry"][0][1]+page_idx, "page": page_idx+1})

  return contracts

def getSerialNumbersForPage(page_idx,page):
  """Collect the serial numbers found on one page.

  On the first page (page_idx == 0) words are ignored until one containing
  "CURRENT" has been seen; on later pages collection starts immediately.
  For every word containing "SERIAL", the third word of that line (in the
  line's stored order, not the sorted order) is taken as the serial number.

  Args:
    page_idx: Zero-based page index.
    page: Page dict with "blocks" -> "lines" -> "words"; every word has a
      "value" string and a "geometry" entry.

  Returns:
    A list of dicts with "serial_number", "pdf_y" (geometry[0][1] of the
    serial-number word + page_idx) and "page" (1-based page number).
  """
  current_found = page_idx != 0
  serialNumbers = []

  for block in page["blocks"]:
    for line in block["lines"]:
      line_words = line["words"]
      for word in sorted(line_words, key = lambda x: x["geometry"][0][1]):
        if "CURRENT" in word["value"]:
          current_found = True

        if not current_found or "SERIAL" not in word["value"]:
          continue

        # A "SERIAL" line with fewer than three words has no value slot;
        # skip it instead of raising IndexError.
        if len(line_words) < 3:
          continue

        serial_number_word = line_words[2]
        serialNumbers.append({"serial_number": serial_number_word["value"], "pdf_y": serial_number_word["geometry"][0][1]+page_idx, "page": page_idx+1})

  return serialNumbers

def getPaymentDue(page_idx,page):
  """Collect the price-like words found on one page.

  Collection is gated twice, in this order: nothing is considered until a
  word containing "PLEASE" has been seen on this page, and on the first page
  (page_idx == 0) a word containing "CURRENT" must also be seen after that.
  A "CURRENT" word that appears before "PLEASE" is not registered.

  Args:
    page_idx: Zero-based page index.
    page: Page dict with "blocks" -> "lines" -> "words"; every word has a
      "value" string and a "geometry" entry.

  Returns:
    A list of dicts with "price" (the whole word value, unparsed), "pdf_y"
    (geometry[0][1] + page_idx) and "page" (1-based page number).
  """
  import re

  # Raw string: "\d" and "\." in a normal literal are invalid escapes.
  price_pattern = re.compile(r"-{0,1}\d+\.\d{2}")
  current_found = page_idx != 0
  please_found = False
  prices = []

  for block in page["blocks"]:
    for line in block["lines"]:
      for word in sorted(line["words"], key = lambda x: x["geometry"][0][1]):
        value = word["value"]

        if "PLEASE" in value:
          please_found = True
        if not please_found:
          continue

        if "CURRENT" in value:
          current_found = True

        if current_found and price_pattern.search(value):
          prices.append({"price": value, "pdf_y": word["geometry"][0][1]+page_idx, "page": page_idx+1})

  return prices

def convertToJson(sortedData):
  """Group a flat, ordered list of extracted entries into pages and items.

  Each entry carries a "page" number plus exactly one of "contract_number",
  "serial_number" or "price". A contract entry opens a new item on the
  current page; serial and price entries attach to the most recently opened
  item. The current item is not reset when the page changes.

  Args:
    sortedData: Entries in the order they should be processed.

  Returns:
    A list of {"page": n, "items": [...]} dicts, where every item is
    {"contract_number": ..., "price": [...]} plus "serial_number" once a
    serial entry has been attached.

  Raises:
    KeyError: If a price entry arrives before any contract entry, because
      there is no item with a "price" list yet.
  """
  currentPage = 0

  response_list = []
  current_page_dict = {}
  current_contract_number_item = {}

  for contractSerialPair in sortedData:
    if contractSerialPair["page"] != currentPage:
      # First entry of a new page: open a fresh page record.
      currentPage = contractSerialPair["page"]
      current_page_dict = {"page": currentPage, "items": []}
      response_list.append(current_page_dict)

    if "contract_number" in contractSerialPair:
      current_contract_number_item = {"contract_number": contractSerialPair["contract_number"], "price": []}
      current_page_dict["items"].append(current_contract_number_item)
    elif "serial_number" in contractSerialPair:
      current_contract_number_item["serial_number"] = contractSerialPair["serial_number"]
    elif "price" in contractSerialPair:
      current_contract_number_item["price"].append(contractSerialPair["price"])
    else:
      # Format the entry instead of concatenating str + dict, which raised
      # TypeError and hid the message.
      print(f"Error unknown pair: {contractSerialPair}")

  return response_list

def calculateTotals(response_list):
  """Add a "total_price" string to every item that has at least one price.

  The total is built up one price at a time: after each addition it is
  rounded to two decimals and stored back on the item as a string. An
  existing "total_price" on an item is used as the starting value. Items
  with an empty "price" list are left without a "total_price" key.

  Args:
    response_list: Dict with a "pages" list; each page has an "items" list
      and each item a "price" list of values accepted by float().

  Returns:
    The same response_list object, updated in place.
  """
  for page_entry in response_list["pages"]:
    for item in page_entry["items"]:
      for raw_price in item["price"]:
        running = (
          float(item["total_price"]) + float(raw_price)
          if "total_price" in item
          else float(raw_price)
        )
        item["total_price"] = str(round(running, 2))

  return response_list

def main(lease_json_path='src/jams/output/Aug 23 Lease-docTR.json'):
  """Parse a lease OCR export and print the extracted data as JSON.

  Loads the JSON file, pulls the invoice number from the first page,
  gathers contracts, serial numbers and prices from every page, orders them
  by their "pdf_y" value, groups them per page and item, adds totals and
  prints the result with two-space indentation.

  Args:
    lease_json_path: Path of the JSON export to read. Defaults to the file
      this notebook was written against.
  """
  import json

  # Context manager so the file handle is closed even if parsing fails.
  with open(lease_json_path, encoding="utf-8") as f:
    data = json.load(f)

  pages = data["pages"]

  contracts = []
  serialNumbers = []
  paymentDue = []

  # An export without pages yields an empty invoice number, not IndexError.
  invoice_number = getInvoiceNumber(0,pages[0]) if pages else ""

  for page_idx,page in enumerate(pages):
    contracts += getContractsForPage(page_idx,page)
    serialNumbers += getSerialNumbersForPage(page_idx,page)
    paymentDue += getPaymentDue(page_idx,page)

  # "pdf_y" includes the page index, so one sort orders entries across all
  # pages; the sort is stable, so ties keep contracts before serials before
  # prices.
  contractSerials = sorted(contracts + serialNumbers + paymentDue, key = lambda x: x["pdf_y"])

  response_list = {"pages" : convertToJson(contractSerials), "invoice_number": invoice_number}
  response_list = calculateTotals(response_list)

  print(json.dumps(response_list, indent=2))

# Cell entry point: parse the lease output only when the cell switch is on.
if __name__ == "__main__":
  if not (runOptional == True):
    print("Not running optional cell")
  else:
    main()