<a href="https://colab.research.google.com/github/clarkde5/jams-pub/blob/main/colabs/jams-env-doctr-copies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Environment**

## **Bootstrap public repo**

In [1]:
from pathlib import Path

# Pick the working directory: use /content when that directory exists,
# otherwise fall back to the user's home directory.
if not Path(f"/content").is_dir():
  %cd ~
else:
  %cd /content

# Clone the public repo into src/jams-pub (relative to the directory chosen
# above) unless a directory with that name is already there.
if not Path(f"src/jams-pub").is_dir():
  !git clone https://github.com/clarkde5/jams-pub src/jams-pub
else:
  print("src/jams-pub already exists")

/content
Cloning into 'src/jams-pub'...
remote: Enumerating objects: 157, done.
remote: Counting objects: 100% (157/157), done.
remote: Compressing objects: 100% (138/138), done.
remote: Total 157 (delta 98), reused 32 (delta 16), pack-reused 0
Receiving objects: 100% (157/157), 116.30 KiB | 5.54 MiB/s, done.
Resolving deltas: 100% (98/98), done.


## **Pull private resources**

In [2]:
# Switch for this cell: the entry guard at the bottom of the cell only calls
# CallPrivateEnvSetup() when this is True.
runOptional = True
def CallPrivateEnvSetup():
  """Make sure an SSH key is present, then clone the private jams repo.

  Steps, as implemented below:
    1. Try to read the 'PrivateKey' secret from google.colab userdata.
    2. If google.colab cannot be imported, require an existing
       ~/.ssh/id_rsa instead.
    3. If ~/.ssh/id_rsa is missing, run src/jams-pub/env-setup.py with the
       key as its only argument (presumably that script writes the key
       file -- it is not visible from here; confirm).
    4. Clone git@github.com:clarkde5/jams.git into src/jams unless that
       directory already exists.

  Returns None in every case; problems are reported with print() only.
  All repo paths are relative to the current working directory.
  """
  from pathlib import Path

  home = str(Path.home())

  try:
    from google.colab import userdata
    try:
      rsa_private_key = userdata.get('PrivateKey')
    # NOTE(review): a bare except also swallows KeyboardInterrupt/SystemExit;
    # consider narrowing it to the errors userdata.get() actually raises.
    except:
      # This assignment is never read: the function returns right below.
      rsa_private_key = ""
      print("PrivateKey is required, but not found in secrets / colab userdata")
      return
  # Reached when the google.colab import above fails (the inner handler
  # already catches everything raised by userdata.get()).
  except:
    print("google.colab could not be loaded")
    # Outside Colab no secret is available, so an existing key file is the
    # only way to continue; rsa_private_key stays undefined on this path.
    if not Path(f"{home}/.ssh/id_rsa").is_file():
      print("rsa key does not exist, please create manually")
      return
    else:
      print("rsa key already exists")

  # Only call the setup script when the key file is not there yet.
  # NOTE(review): the secret is interpolated into a shell command line, so it
  # may be visible in process listings, and quote characters inside the key
  # would break the quoting -- confirm this is acceptable.
  if not Path(f"{home}/.ssh/id_rsa").is_file():
    !python src/jams-pub/env-setup.py "{rsa_private_key}"


  # Re-check: without the key file the SSH clone below cannot be attempted.
  if not Path(f"{home}/.ssh/id_rsa").is_file():
    print("env-setup failed to create rsa key")
    return

  # Clone the private repo over SSH unless src/jams is already present.
  if not Path(f"src/jams").is_dir():
    !git clone git@github.com:clarkde5/jams.git src/jams
  else:
    print("src/jams already exists")

if __name__ == "__main__":
  # Cell entry point: the private setup runs only while the runOptional
  # switch is on; otherwise just report that the cell was skipped.
  if runOptional != True:
    print("Not running optional cell")
  else:
    CallPrivateEnvSetup()

PrivateKey acquired
Cloning into 'src/jams'...
remote: Enumerating objects: 242, done.
remote: Counting objects: 100% (21/21), done.
remote: Compressing objects: 100% (19/19), done.
remote: Total 242 (delta 5), reused 9 (delta 1), pack-reused 221
Receiving objects: 100% (242/242), 76.54 MiB | 15.81 MiB/s, done.
Resolving deltas: 100% (7/7), done.


# **Parse Lease Output**

**NOTE:** Requires the [Pull private resources](https://colab.research.google.com/github/clarkde5/jams-pub/blob/main/colabs/jams-env-doctr.ipynb#scrollTo=CyQkr3hoftql) cell to have been run first.

In [None]:
# Switch for this cell: the entry guard at the bottom of the cell only calls
# main() when this is True.
runOptional = True
def getInvoiceNumber(page_idx,page):
	"""Return the invoice number found on one OCR page, or "" when none is found.

	Every line of every block is inspected. The words of a line are ordered by
	the first coordinate of their geometry (presumably the left edge, i.e.
	reading order -- confirm against the OCR export format). A line matches
	when any of its words contains "Invoice" and its second word contains
	"Nbr:"; the value of the third word is then the invoice number.

	Args:
		page_idx: Index of the page. Not used by this function.
		page: Dict with a "blocks" list; each block has a "lines" list and each
			line a "words" list of dicts carrying "value" and "geometry".

	Returns:
		The first non-empty invoice number found, otherwise "".
	"""
	import re

	for block in page["blocks"]:
		for line in block["lines"]:
			sorted_words = sorted(line["words"], key = lambda x: x["geometry"][0][0])

			# The match needs a label, the "Nbr:" marker and the number itself.
			# Shorter lines used to raise IndexError on the fixed positions below.
			if len(sorted_words) < 3:
				continue
			if not re.search("Nbr:",sorted_words[1]["value"]):
				continue
			if not any(re.search("Invoice",word["value"]) for word in sorted_words):
				continue

			invoice_number = sorted_words[2]["value"]
			# An empty value does not count as a hit; keep scanning.
			if invoice_number != "":
				return invoice_number

	return ""

def getInvoiceTotal(page_idx,page):
	"""Return the invoice total found on one OCR page, or "" when none is found.

	A line qualifies when it has exactly four words and the second word
	(after ordering the words by the first coordinate of their geometry)
	contains "This"; the value of the fourth word is the total.

	Args:
		page_idx: Index of the page. Does not influence the result.
		page: Dict with "blocks" -> "lines" -> "words"; each word carries
			"value" and "geometry".

	Returns:
		The first non-empty total found, otherwise "".
	"""
	import re

	every_line = (line for block in page["blocks"] for line in block["lines"])
	for line in every_line:
		ordered = sorted(line["words"], key = lambda w: w["geometry"][0][0])
		if len(ordered) != 4:
			continue
		if re.search("This",ordered[1]["value"]) is None:
			continue
		total = ordered[3]["value"]
		# An empty value is not treated as a hit; keep looking.
		if total != "":
			return total

	return ""

def getContractsForPage(page_idx,page):
	"""Collect the contract numbers listed on one OCR page.

	On the first page (page_idx == 0) nothing is collected until a word
	containing "Unit" has been seen; on later pages collection starts at once.
	For every word containing "Contract", the third word of that line (in the
	line's stored order) is taken as the contract number when it contains
	eight consecutive digits.

	Args:
		page_idx: Zero-based page index. It is added to the word's vertical
			coordinate so positions from different pages sort in page order.
		page: Dict with "blocks" -> "lines" -> "words"; each word carries
			"value" and "geometry".

	Returns:
		List of dicts with keys "contract_number", "pdf_y" and "page"
		(page is page_idx + 1).
	"""
	import re

	# Raw string: "\d" in a plain string literal is an invalid escape sequence.
	contract_pattern = re.compile(r"\d{8}")
	StartingPlaceFound = page_idx != 0
	contracts = []

	for block in page["blocks"]:
		for line in block["lines"]:
			# Words are visited by the second coordinate of their geometry.
			for word in sorted(line["words"], key = lambda x: x["geometry"][0][1]):
				if re.search("Unit",word["value"]):
					StartingPlaceFound = True

				if not StartingPlaceFound:
					continue
				if not re.search("Contract",word["value"]):
					continue
				# The number is read from a fixed position, so the line must
				# hold at least three words.
				if len(line["words"]) < 3:
					continue

				contract_number_word = line["words"][2]
				if contract_pattern.search(contract_number_word["value"]):
					contracts.append({"contract_number": contract_number_word["value"], "pdf_y": contract_number_word["geometry"][0][1]+page_idx, "page": page_idx+1})

	return contracts

def getSerialNumbersForPage(page_idx,page):
	"""Collect the serial numbers that follow contract lines on one OCR page.

	Contract lines are recognised as follows: on the first page
	(page_idx == 0) nothing counts until a word containing "Unit" has been
	seen; a line then qualifies when one of its words contains "Contract" and
	its third word contains eight consecutive digits. The serial number is the
	first word of the NEXT line in the same block.

	If the fifth character of the serial is the letter 'O' it is replaced by
	the digit '0'. That correction is also written back into the word dict
	inside `page`, as before.

	Args:
		page_idx: Zero-based page index. It is added to the word's vertical
			coordinate so positions from different pages sort in page order.
		page: Dict with "blocks" -> "lines" -> "words"; each word carries
			"value" and "geometry".

	Returns:
		List of dicts with keys "serial_number", "pdf_y" and "page"
		(page is page_idx + 1). Contract lines without a following line, or
		whose following line has no words, contribute nothing.
	"""
	import re

	# Raw string: "\d" in a plain string literal is an invalid escape sequence.
	contract_pattern = re.compile(r"\d{8}")
	StartingPlaceFound = page_idx != 0
	serialNumbers = []

	for block in page["blocks"]:
		lines = block["lines"]
		for line_idx,line in enumerate(lines):
			sorted_words = sorted(line["words"], key = lambda x: x["geometry"][0][1])
			for word in sorted_words:
				if re.search("Unit",word["value"]):
					StartingPlaceFound = True

				if not StartingPlaceFound:
					continue
				if not re.search("Contract",word["value"]):
					continue
				if len(line["words"]) < 3:
					continue
				if not contract_pattern.search(line["words"][2]["value"]):
					continue

				# The serial sits on the following line; a contract on the last
				# line of a block used to raise IndexError here.
				if line_idx + 1 >= len(lines):
					continue
				next_line_words = lines[line_idx+1]["words"]
				if not next_line_words:
					continue

				serial_number_word = next_line_words[0]
				serial_value = serial_number_word["value"]
				# Fix the letter 'O' at position 5; the length check keeps
				# short values from raising IndexError.
				if len(serial_value) > 4 and serial_value[4] == 'O':
					serial_value = serial_value[:4] + "0" + serial_value[5:]
					serial_number_word["value"] = serial_value
				serialNumbers.append({"serial_number": serial_value, "pdf_y": serial_number_word["geometry"][0][1]+page_idx, "page": page_idx+1})

	return serialNumbers

def getPaymentDue(page_idx,page):
	"""Collect every dollar amount found on one OCR page.

	A word counts as an amount when it contains "$", an optional "-", one or
	more digits, a "." and two digits (for example "$12.34" or "$-1.00").

	Args:
		page_idx: Zero-based page index. It is added to the word's vertical
			coordinate so positions from different pages sort in page order.
		page: Dict with "blocks" -> "lines" -> "words"; each word carries
			"value" and "geometry".

	Returns:
		List of dicts with keys "price", "pdf_y" and "page" (page is
		page_idx + 1). Within a line, words are visited in order of the
		second coordinate of their geometry.
	"""
	import re

	# Raw string: the plain-string form held several invalid escape sequences
	# (\$, \-, \d, \.). The pattern text itself is unchanged.
	price_pattern = re.compile(r"\$\-{0,1}\d+\.\d{2}")

	return [
		{"price": word["value"], "pdf_y": word["geometry"][0][1]+page_idx, "page": page_idx+1}
		for block in page["blocks"]
		for line in block["lines"]
		for word in sorted(line["words"], key = lambda x: x["geometry"][0][1])
		if price_pattern.search(word["value"])
	]


def convertToJson(sortedData):
	"""Group a flat, position-sorted list of parsed entries into per-page items.

	Args:
		sortedData: Iterable of dicts. Each has a "page" key plus one of
			"contract_number", "serial_number" or "price". Entries are consumed
			in the given order, so they must already be sorted by position.

	Returns:
		List of {"page": <page>, "items": [...]} dicts, one per run of
		consecutive entries sharing a page. Each item is
		{"contract_number": ..., "serial_number": ..., "price": [...]}.
		"serial_number" starts as an empty list and is replaced by the serial
		string once one is seen.

	Notes:
		* A serial number or price is attached to the most recent contract,
		  even when that contract was opened on an earlier page.
		* Serial numbers and prices that appear before the first contract have
		  nothing to attach to and are skipped (they used to raise KeyError).
		* Entries with none of the three keys are reported with print().
	"""
	currentPage = None
	response_list = []
	current_page_dict = None
	# Stays None until the first contract entry is seen.
	current_contract_number_item = None

	for contractSerialPair in sortedData:

		# Open a new page bucket whenever the page number changes.
		if current_page_dict is None or contractSerialPair["page"] != currentPage:
			currentPage = contractSerialPair["page"]
			current_page_dict = {"page": currentPage, "items": []}
			response_list.append(current_page_dict)

		if "contract_number" in contractSerialPair:
			current_contract_number_item = {"contract_number": contractSerialPair["contract_number"], "serial_number": [], "price": []}
			current_page_dict["items"].append(current_contract_number_item)
		elif "serial_number" in contractSerialPair:
			if current_contract_number_item is not None:
				current_contract_number_item["serial_number"] = contractSerialPair["serial_number"]
		elif "price" in contractSerialPair:
			if current_contract_number_item is not None:
				current_contract_number_item["price"].append(contractSerialPair["price"])
		else:
			# f-string: concatenating str + dict raised TypeError here.
			print(f"Error unknown pair: {contractSerialPair}")

	return response_list

def main(input_path=r'C:\Vaidhy\JAMS\Solution\Aug23copies-docTR.json', output_path=r"C:\Vaidhy\JAMS\Solution\Aug 23 Copies-docTR.parsed.csv"):
	"""Parse an OCR JSON export of an invoice and write the result as CSV.

	Reads the JSON file, extracts contracts, serial numbers and dollar amounts
	from every page plus the first invoice number and invoice total found,
	merges them in position order, prints the result as indented JSON and
	writes it to a CSV file through pandas.

	Args:
		input_path: Path of the OCR JSON file; must hold a top-level "pages"
			list. Defaults to the path that used to be hard-coded.
		output_path: Path of the CSV file to write. Defaults to the path that
			used to be hard-coded.
	"""
	import json
	import pandas as pd
	from io import StringIO

	# Context manager so the input file is closed once it has been read.
	with open(input_path, encoding="utf-8") as f:
		data = json.load(f)

	contracts = []
	serialNumbers = []
	paymentDue = []
	invoice_number = ""
	invoice_total = ""

	for page_idx,page in enumerate(data["pages"]):
		contracts += getContractsForPage(page_idx,page)
		serialNumbers += getSerialNumbersForPage(page_idx,page)
		paymentDue += getPaymentDue(page_idx,page)
		# Only the first invoice number / total found in the document is kept.
		if invoice_number == "":
			invoice_number = getInvoiceNumber(page_idx,page)
		if invoice_total == "":
			invoice_total = getInvoiceTotal(page_idx,page)

	# "pdf_y" already includes the page index, so one sort gives document order.
	contractSerials = contracts + serialNumbers + paymentDue
	contractSerials = sorted(contractSerials, key = lambda x: x["pdf_y"])

	response_list = {"pages" : convertToJson(contractSerials), "invoice_number": invoice_number, "invoice_total": invoice_total}

	json_formatted_str = json.dumps(response_list, indent=2)
	print(json_formatted_str)

	df = pd.read_json(StringIO(json_formatted_str))
	df.to_csv(output_path)


if __name__ == "__main__":
	# Cell entry point: the parser runs only while the runOptional switch is
	# on; otherwise just report that the cell was skipped.
	if runOptional != True:
		print("Not running optional cell")
	else:
		main()