# Total Amount Extraction

#### Approach:

- Step 1: Iterate through the Azure OCR analysis JSON line by line
- Step 2: Match line with "Total"
- Step 3: Determine Total Amount by Iterating through the line
    - Approach 1: Create a list of lines sorted from top to bottom, merge lines whose top edges are vertically close enough, then estimate the total by scanning the matching line for a number

#### Bugs/edgecases to deal with
- "Total" text and amount found in different lines/regions
    - Solution: merge_lines combines lines whose top edges are within a calculated "slack" number of pixels of each other
- "Total" text and amount may not be correctly read by the OCR
    - Solution: Apply some text cleaning to increase accuracy
- Try to return numbers close to the total
    - Solution: use a Levenshtein similarity ratio (string similarity) threshold of 0.7, so that if "Total" is not captured, "Subtotal" will still match
    - Solution: Iterate from the bottom lines first

#### Imports

In [15]:
import copy
import json
import math
import re

from Levenshtein import ratio

#### Load the OCR Analysis JSON

In [None]:
# Strip punctuation from a string.
# Every character that is neither a word character (letter, digit,
# underscore) nor whitespace is deleted.
# param text (str) - text to clean
# returns (str) - the text without punctuation
def remove_punc(text):
    punctuation = re.compile(r'[^\w\s]')
    return punctuation.sub('', text)

# Remove every run of digits from a string.
# param text (str) - text to clean
# returns (str) - the text with all digits removed
def remove_digit(text):
    # Raw string: '\d' in a plain literal is an invalid escape sequence, which
    # newer Python versions warn about when the code is compiled.
    return re.sub(r'\d+', '', text)


# Load the Azure OCR analysis (JSON) of one receipt and return its text lines
# ordered from top to bottom.
# The analysis is read from json/<text_id>.txt. The lines of all regions are
# pooled into one list and sorted by the second field of their "boundingBox"
# string, which is used here as the Y coordinate.
# param: text_id (int) - respective text id for the receipt
# returns (list) - line dictionaries sorted by ascending Y coordinate
def extract_lines(text_id):
    def top_edge(line):
        # boundingBox is a comma separated string; field 1 is the Y coordinate
        return int(line["boundingBox"].split(",")[1])

    path = "json/" + str(text_id) + ".txt"
    with open(path, "r") as fin:
        analysis = json.load(fin)

    # Flatten region -> lines into a single list, keeping document order
    lines = [line for region in analysis["regions"] for line in region["lines"]]

    # sort by Y coordinates (stable, so ties keep their document order)
    lines.sort(key=top_edge)
    return lines
        
# Merge word lists in line dictionaries.
# Used to merge lines that were not identified in the analysis json.
# The words of line2 are appended to line1 in place; line2 is left untouched.
# param: line1 (dict) - line that receives the words (modified in place)
# param: line2 (dict) - line whose words are appended
# returns (dict) - line1, now holding the words of both lines
def merge_words(line1, line2):
    line1["words"] += line2["words"]
    return line1

# Iterate through sorted lines and merge lines that were reported separately.
# A line absorbs every other line that lies to its right (larger X) and whose
# top edge is within the slack of its own top edge.
# This is used to identify lines that could be angled or shifted as seen below
#
# Total:
#                50.00
#
# The input list and its line dictionaries are not modified: merging happens
# on shallow copies. Each line ends up in the output exactly once, either on
# its own or as part of the line that absorbed it. A merged line keeps the
# boundingBox of the absorbing (left-most) line.
#
# param: sorted_lines (list) - sorted lines output from extract lines
# param: slack (int) - the number of pixels above and below the top edge of text
#        that will be considered for merging lines. When None, a quarter of
#        the absorbing line's own fourth boundingBox field (taken to be its
#        height) is used, computed separately for every line.
# returns (list) - merged line dictionaries in their original top-to-bottom order
def merge_lines(sorted_lines, slack=None):
    # Parse every bounding box once instead of re-splitting it per pair
    boxes = [[int(value) for value in line["boundingBox"].split(",")]
             for line in sorted_lines]
    # Shallow copies with their own word list, so merging never touches the
    # caller's data
    lines = [dict(line, words=list(line["words"])) for line in sorted_lines]
    # absorbed[k] is True once line k has been folded into another line.
    # Tracking this by index (rather than removing from the list being
    # iterated) guarantees that no line is skipped or emitted twice.
    absorbed = [False] * len(lines)

    for i, box_i in enumerate(boxes):
        if absorbed[i]:
            continue
        x_i, y_i = box_i[0], box_i[1]
        limit = box_i[3] / 4 if slack is None else slack

        for j, box_j in enumerate(boxes):
            if j == i or absorbed[j]:
                continue
            x_j, y_j = box_j[0], box_j[1]
            if abs(y_i - y_j) < limit and x_i < x_j:
                # lines[j] already carries anything it absorbed earlier
                merge_words(lines[i], lines[j])
                absorbed[j] = True

    return [line for line, gone in zip(lines, absorbed) if not gone]
 
# One-call pipeline producing the final list of text lines for a receipt:
# the lines are extracted in top-to-bottom order by extract_lines and then
# passed through merge_lines with its default slack.
# param text_id (int) - id for the desired analysis JSON file
# returns - the list returned by merge_lines
def get_lines(text_id):
    return merge_lines(extract_lines(text_id))

# Given a line dictionary, iterate through the words for the first number.
# Every "$" is removed from a word before it is parsed with float().
# param line (dict) - line dictionary containing a "words" list of word dicts,
#       each with a "text" entry
# returns (float) - the first word that parses as a finite number,
#         or None when the line contains no such word
def num_search(line):
    for word in line["words"]:
        candidate = word["text"].replace("$", "")
        try:
            total = float(candidate)
        except ValueError:
            # Not a number - keep looking through the remaining words
            continue
        # float() also accepts "nan", "inf" and "infinity" (any case); such
        # words are text, not an amount, so they are skipped
        if math.isfinite(total):
            return total

    return None

# Given a line dictionary, look for a word that resembles "total".
# Each word is lower-cased and stripped of digits and punctuation before its
# Levenshtein ratio against "total" is computed.
# param: line (dict) - line dictionary containing words
# param: similarity (float) - ratio a word has to exceed to count as a match
# returns - True at the first matching word, None when no word matches
def string_search(line, similarity=0.7):
    target = "total"
    # Lazy generator: words are cleaned one at a time, stopping at the first hit
    cleaned_words = (
        remove_punc(remove_digit(entry["text"].lower()))
        for entry in line["words"]
    )
    for cleaned in cleaned_words:
        if ratio(cleaned, target) > similarity:
            return True
    return None
        
# Final pipeline: given a text id, collect the lines and search for the total
# amount. Lines are visited from the bottom of the receipt upwards; the first
# line that both contains a "total"-like word and a number wins.
# param text_id (int) - the id for the json file
# returns - the number found by num_search, or None when no line qualifies
def extract_total(text_id):
    for candidate in reversed(get_lines(text_id)):
        if not string_search(candidate):
            continue
        amount = num_search(candidate)
        if amount is None:
            # "total" matched but the line has no number - keep moving up
            continue
        return amount
    return None
