# IBM Receipt Recognition

# Step 1: Image-to-Text - Azure

Requirements:
- Microsoft Azure Subscription key and corresponding server
- URL of Images


### Imports

In [1]:
### Imports
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from PIL import Image
from io import BytesIO
import json
import requests

### Functions

In [2]:
def extract_word_info(line_infos):
    """Flatten per-region OCR lines into a single list of word entries.

    param line_infos (list) - one element per region; each element is a list
        of line dictionaries that carry a "words" list
    return (list) - every entry of every "words" list, in encounter order
    """
    return [
        word
        for region_lines in line_infos
        for line in region_lines
        for word in line["words"]
    ]

In [3]:
def save_output(word_infos, file_name="output"):
    """Write the string form of every word entry to text/<file_name>.txt.

    Each element of word_infos is converted with str() and written on its own
    line. The "text" directory must already exist.

    param word_infos (iterable) - word entries to persist
    param file_name (str) - file stem, without directory or extension
    """
    # Render everything before the file is opened, as the original did.
    rendered = [str(info) for info in word_infos]
    target = "text/" + file_name + ".txt"
    with open(target, "w") as handle:
        handle.writelines(entry + "\n" for entry in rendered)

In [4]:
def save_json(json_data, file_name="json"):
    """Serialise json_data as JSON text into json/<file_name>.txt.

    The "json" directory must already exist.

    param json_data - any JSON-serialisable object (e.g. the OCR response)
    param file_name (str) - file stem, without directory or extension
    raises TypeError - if json_data cannot be serialised by json.dumps
    """
    # Serialise first so a failure cannot truncate an existing file.
    payload = json.dumps(json_data)
    with open('json/' + file_name + '.txt', 'w') as outfile:
        outfile.write(payload)

In [5]:
def create_image(image_url, word_infos, save=False, img_name="img"):
    """Draw the image at image_url with each word's box and text overlaid.

    param image_url (str) - URL the image is downloaded from
    param word_infos (list) - word dictionaries with a comma-separated
        "boundingBox" string and a "text" label
    param save (bool) - when true, also write the figure to img/<img_name>.jpg
    param img_name (str) - file stem used when saving
    """
    plt.figure(figsize=(30, 30))
    downloaded = requests.get(image_url).content
    picture = Image.open(BytesIO(downloaded))
    # The image is drawn half transparent and annotations are added on its axes.
    axes = plt.imshow(picture, alpha=0.5).axes
    for entry in word_infos:
        coords = [int(value) for value in entry["boundingBox"].split(",")]
        label = entry["text"]
        left, top = coords[0], coords[1]
        # The third and fourth fields are passed as the rectangle's width and height.
        outline = Rectangle((left, top), coords[2], coords[3],
                            fill=False, linewidth=2, color='y')
        axes.add_patch(outline)
        plt.text(left, top, label, fontsize=20, weight="bold", va="top")
    plt.axis("off")
    if save:
        plt.savefig('img/' + img_name + '.jpg')

In [6]:
# ENTER YOUR MICROSOFT SUBSCRIPTION KEY
# SECURITY NOTE(review): a literal key is committed in this file. It should be
# rotated and supplied through the environment instead. The environment
# variable AZURE_VISION_SUBSCRIPTION_KEY, when set and non-empty, takes
# precedence; the literal is kept only as a fallback so existing runs behave
# exactly as before.
import os

subscription_key = (
    os.environ.get("AZURE_VISION_SUBSCRIPTION_KEY")
    or "af3d8afcd1a64257aeed676d27958a8a"
)
# Module-level placeholder; receipt_ocr builds its own local image_url.
image_url = ""

def receipt_ocr(img_id, sub_key=subscription_key, save=True,
                image_base_url="http://kunalsharma.net/reciept_project/",
                vision_base_url="https://eastus.api.cognitive.microsoft.com/vision/v2.0/",
                timeout=None):
    """Run Azure OCR on one receipt image and persist every artefact.

    The image URL is image_base_url + <img_id> + ".jpg". The OCR response is
    saved with save_json, the flattened word entries with save_output, and the
    annotated picture is rendered with create_image.

    param img_id - identifier of the receipt image (converted with str())
    param sub_key (str) - Azure subscription key sent in the request header
    param save (bool) - forwarded to create_image to save the annotated image
    param image_base_url (str) - prefix of the hosted receipt images
    param vision_base_url (str) - Azure Vision endpoint; "ocr" is appended
    param timeout - forwarded to requests.post; None (the default) keeps the
        previous behaviour of waiting indefinitely
    raises AssertionError - if sub_key is empty (only while asserts are enabled)
    raises requests.HTTPError - from raise_for_status() on an error response
    """
    img_id = str(img_id)
    print("Currently Processing: " + img_id + ".jpg")
    # Refuse to call the service without a key.
    assert sub_key
    ocr_url = vision_base_url + "ocr"
    # Set up API request parameters
    image_url = image_base_url + img_id + ".jpg"
    headers = {'Ocp-Apim-Subscription-Key': sub_key}
    params = {'language': 'unk', 'detectOrientation': 'true'}
    data = {'url': image_url}
    response = requests.post(ocr_url, headers=headers, params=params,
                             json=data, timeout=timeout)
    response.raise_for_status()
    analysis = response.json()
    save_json(analysis, img_id)

    # Extract word information from the JSON response
    line_infos = [region["lines"] for region in analysis["regions"]]
    word_infos = extract_word_info(line_infos)
    # Save a text file with the word info
    save_output(word_infos, img_id)
    # Save the annotated image
    create_image(image_url, word_infos, save=save, img_name=img_id)

# Step 3: Natural Language Processing

### Imports

In [7]:
# Import Standard Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns; sns.set()

In [8]:
# Import Spacy
import re
import string
import spacy
from spacy import displacy
# Prereq: python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')

In [9]:
# Import NLTK
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /Users/DJ/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/DJ/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Text Formatting and Cleaning

In [10]:

def remove_punc(text):
    """Drop every character that is neither a word character nor whitespace.

    param text (str) - text to clean
    return (str) - text without punctuation; underscores count as word
        characters and are kept
    """
    punctuation = re.compile(r'[^\w\s]')
    return punctuation.sub('', text)

def remove_digit(text):
    """Remove every run of digits from text.

    param text (str) - text to clean
    return (str) - text with all digit characters removed
    """
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    return re.sub(r'\d+', '', text)

def remove_num(text):
    """Drop the words that consist solely of digits.

    param text (list) - list of words
    return (list) - the words for which str.isdigit() is false, order kept
    """
    kept = []
    for word in text:
        if word.isdigit():
            continue
        kept.append(word)
    return kept

def tokenize_words(text):
    """Join the words with single spaces and tokenize them with word_tokenize.

    param text (list) - list of words
    return - the result of word_tokenize on the joined sentence
    """
    sentence = ' '.join(text)
    return word_tokenize(sentence)

def remove_stop(text):
    """Filter out the words listed in NLTK's English stop-word corpus.

    param text (list) - list of words
    return (list) - words that are not English stop words, order kept
    """
    english_stops = set(stopwords.words('english'))
    kept = []
    for word in text:
        if word in english_stops:
            continue
        kept.append(word)
    return kept

# Receipt-specific words filtered out by remove_custom_stop.
# Built once at import time; a frozenset gives constant-time membership tests.
_CUSTOM_STOP_WORDS = frozenset([
    'visit', 'back', 'visa', 'change', 'thanks', 'survey', 'total', 'tax',
    'coupon', 'invalid', 'details', 'your', 'emp', 'rewards', 'please',
    'see', 'register', 'order', 'take', 'again', 'our', 'subtotal', 'for',
    'rebate', 'cash', 'website', 'id', 'come', 'tendered', 'building', 'street',
])


def remove_custom_stop(text):
    """Drop the receipt-specific stop words from a list of words.

    The comparison is exact and case-sensitive.

    param text (list) - list of words (strings)
    return (list) - words that are not in the custom stop list, order kept
    """
    return [w for w in text if w not in _CUSTOM_STOP_WORDS]

def remove_non_nouns(text):
    """Keep the words whose first spaCy token is tagged as a noun.

    param text (list) - list of words
    return (list) - words whose first token has pos_ == 'NOUN', order kept
    """
    nouns = []
    for word in text:
        first_token = nlp(word)[0]
        if first_token.pos_ == 'NOUN':
            nouns.append(word)
    return nouns

def remove_meaningless(text, report=True):
    """Drop the words whose spaCy vector norm is exactly zero.

    param text (list) - list of words
    param report (bool) - accepted for compatibility; not read by the code
    return (list) - words with a non-zero vector norm, order kept
    """
    kept = []
    for word in text:
        if nlp(word).vector_norm != 0:
            kept.append(word)
    return kept
    

def extract_text(text_id):
    """Read text/<text_id>.txt and return its words, cleaned and lower-cased.

    Each line is expected to be the str() of a word dictionary with a "text"
    key, as written by save_output. The line is parsed as a Python literal so
    that words containing ':' or '}' are no longer cut apart; lines that do
    not parse fall back to the previous split-based extraction.

    param text_id (str) - ID of the text (index)
    return (list) - non-empty words with punctuation removed, lower-cased
    """
    import ast

    words = []
    # The context manager closes the file; it was previously left open.
    with open('text/' + str(text_id) + '.txt', 'r') as file:
        for line in file:
            try:
                word = str(ast.literal_eval(line.strip())["text"])
            except (ValueError, SyntaxError, KeyError, TypeError):
                # Legacy parsing for lines that are not a dict literal.
                word = line.split(':')[-1].split('}')[0].strip()
            word = remove_punc(word)
            if word != '':
                words.append(word.lower())
    return words

def format_text(text_id):
    """Load the words of a receipt and run the whole cleaning pipeline.

    Steps, in order: extract_text, remove_num, tokenize_words, remove_stop,
    remove_custom_stop, remove_meaningless.

    param text_id (str) - ID of the text
    return (list) - cleaned list of words
    """
    words = extract_text(text_id)
    pipeline = (remove_num, tokenize_words, remove_stop,
                remove_custom_stop, remove_meaningless)
    for step in pipeline:
        words = step(words)
    return words


### Expense Types

In [11]:
# Candidate expense categories a receipt can be matched against.
expense_types = [
    'Airfare',
    'Car Rental',
    'Hotel',
    'Fuel',
    'Parking',
    'Taxi',
    'Toll',
    'Train',
    'Maintenance',
    'Breakfast',
    'Meal',
    'Dinner',
    'Lunch',
    'Groceries',
    'Coffee',
    'Entertainment',
    'Office Supplies',
    'Software',
    'Online Fees',
    'Mobile',
    'Cellular Phone',
]

### Categorization by Average Vector Functions

In [12]:
def avg_vect(text):
    """Run the spaCy pipeline over the words joined into one sentence.

    param text (list) - list of words
    return - the object produced by nlp() for the joined text (not a raw
        vector); callers compare it to others through its similarity() method
    """
    sentence = ' '.join(text)
    return nlp(sentence)

def most_similar_by_avg(types, text):
    """Pick the expense type most similar to the text.

    param types (list) - list of expense types
    param text (list) - list of words
    return (tuple) - (best type, similarity as a percentage rounded to three
        decimals); ('', 0) when no type scores above zero
    """
    document = avg_vect(text)
    best_label, best_score = '', 0
    for label in types:
        score = document.similarity(nlp(label))
        # Strict comparison: on a tie the earlier type wins.
        if score > best_score:
            best_label, best_score = label, score
    return best_label, round(best_score * 100, 3)

#

def plot_similar_by_avg(types, text, text_id=''):
    """Show a bar plot of the similarity between the text and each type.

    param types (list) - list of expense types (used as x-axis categories)
    param text (list) - list of words
    param text_id - identifier appended to the plot title
    """
    avg_vector = avg_vect(text)
    sim = [avg_vector.similarity(nlp(label)) for label in types]
    # x and y are passed by keyword: recent seaborn releases no longer accept
    # them positionally, while the keyword form works on older ones too.
    sns.barplot(x=types, y=sim, alpha=0.9).set_xticklabels(types, rotation=90)
    plt.xlabel('Category')
    plt.ylabel('Similarity')
    plt.title('Similarities by Expense Types for Receipt ' + str(text_id))
    plt.show()

### Date Extraction

In [13]:
import datefinder
import json
import re
from Levenshtein import ratio

In [14]:

def date_find(text):
    """Return the first date datefinder detects in the words, if any.

    param text (list) - list of words
    return - the first match yielded by datefinder.find_dates on the joined
        words, or None when nothing is found
    """
    sentence = ' '.join(text)
    # All matches are materialised, exactly as before, then the first is taken.
    found = list(datefinder.find_dates(sentence))
    if not found:
        return None
    return found[0]

## Total Extraction

In [15]:
import json
import re
import copy
from Levenshtein import ratio

In [15]:
def remove_punc(text):
    """Drop every character that is neither a word character nor whitespace.

    param text (str) - text to clean
    return (str) - text without punctuation; underscores count as word
        characters and are kept
    """
    punctuation = re.compile(r'[^\w\s]')
    return punctuation.sub('', text)

def remove_digit(text):
    """Remove every run of digits from text.

    param text (str) - text to clean
    return (str) - text with all digit characters removed
    """
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    return re.sub(r'\d+', '', text)


def extract_lines(text_id):
    """Load json/<text_id>.txt and return all OCR lines ordered top to bottom.

    param text_id (int) - respective text id of the receipt
    return (list) - the line dictionaries of every region, sorted by the
        second field of their "boundingBox" (the Y coordinate); lines with
        equal Y keep their original relative order
    """
    with open("json/" + str(text_id) + ".txt", "r") as fin:
        analysis = json.load(fin)

    lines = [line for region in analysis["regions"] for line in region["lines"]]

    def top_edge(entry):
        return int(entry["boundingBox"].split(",")[1])

    lines.sort(key=top_edge)
    return lines
        
def merge_words(line1, line2):
    """Append the words of line2 to line1 and return line1.

    line1 is modified in place; the returned object is line1 itself.

    param line1 (dict) - line that receives the words
    param line2 (dict) - line whose words are appended
    return (dict) - line1, now holding the words of both lines
    """
    line1["words"] += line2["words"]
    return line1
    
def merge_lines(sorted_lines, slack=None):
    """Merge lines that sit on the same visual row of the receipt.

    A line absorbs every other line whose top edge is within the slack of its
    own top edge and whose left edge lies further right. This joins text that
    OCR reported as separate lines because it was angled or shifted, e.g.

        Total:
                       50.00

    The absorbing line's "words" list is extended in place (the same effect
    as merge_words). An absorbed line is dropped from the result and never
    absorbs or gets absorbed again. The input list itself is not modified.

    Changes from the earlier implementation: the list is no longer mutated
    while it is being iterated (which skipped candidates and could emit an
    absorbed line twice), and the default slack is computed per line instead
    of being fixed to the first line's height.

    param sorted_lines (list) - lines sorted top to bottom, each with a
        comma-separated "boundingBox" string and a "words" list
    param slack (int) - number of pixels above and below the top edge that
        still counts as the same row; None means a quarter of the fourth
        bounding-box field (the height) of the absorbing line
    return (list) - the surviving lines, in their original order
    """
    def _origin(line):
        fields = line["boundingBox"].split(",")
        return int(fields[0]), int(fields[1])

    origins = [_origin(line) for line in sorted_lines]
    absorbed = set()  # indices of lines already merged into another line

    for i, line_i in enumerate(sorted_lines):
        if i in absorbed:
            continue
        x_i, y_i = origins[i]
        if slack is None:
            tolerance = int(line_i["boundingBox"].split(",")[3]) / 4
        else:
            tolerance = slack

        for j, line_j in enumerate(sorted_lines):
            if j == i or j in absorbed:
                continue
            x_j, y_j = origins[j]
            if abs(y_i - y_j) < tolerance and x_i < x_j:
                line_i["words"].extend(line_j["words"])
                absorbed.add(j)

    return [line for k, line in enumerate(sorted_lines) if k not in absorbed]
 
def get_lines(text_id):
    """Load the OCR lines of a receipt and merge those on the same row.

    param text_id (int) - id of the desired analysis JSON file
    return (list) - result of merge_lines applied to extract_lines(text_id)
    """
    return merge_lines(extract_lines(text_id))

def num_search(line):
    """Return the first word of the line that parses as a number.

    Dollar signs are removed from each word before it is parsed with float().

    param line (dict) - line dictionary containing a "words" list of word
        dictionaries with a "text" string
    return (float) - the first parsable value, or None when no word parses
    """
    for word in line["words"]:
        candidate = word["text"].replace("$", "")
        try:
            return float(candidate)
        except ValueError:
            # Not a number; keep scanning. Only the parse failure is caught,
            # so unrelated errors and interrupts are no longer swallowed.
            continue
    return None

def string_search(line, similarity=0.7):
    """Tell whether any word of the line looks like "total".

    Each word is lower-cased and stripped of digits and punctuation, then
    compared to "total" with the Levenshtein ratio.

    param line (dict) - line dictionary containing words
    param similarity (float) - ratio that must be exceeded to count as a match
    return - True on the first match, otherwise None
    """
    for entry in line["words"]:
        lowered = entry["text"].lower()
        candidate = remove_punc(remove_digit(lowered))
        score = ratio(candidate, "total")
        if score > similarity:
            return True
    return None
        
def extract_total(text_id):
    """Search the receipt's lines, bottom to top, for the total amount.

    The first line (from the bottom) that contains a word similar to "total"
    and a parsable number yields the result.

    param text_id (int) - the id of the json file
    return (float) - the amount found, or None when no line qualifies
    """
    for line in reversed(get_lines(text_id)):
        if not string_search(line):
            continue
        amount = num_search(line)
        if amount is not None:
            return amount
    return None


In [16]:
def getEverything(text_id):
    """Collect the total, purchase date and expense type of one receipt.

    param text_id - id of the receipt's text and json files
    return (tuple) - (text_id, total, date, expense type)
    """
    words = format_text(text_id)
    purchase_date = date_find(words)
    total = extract_total(text_id)
    # The similarity score is not part of the result.
    category, _score = most_similar_by_avg(expense_types, words)
    return text_id, total, purchase_date, category