In [3]:
from PIL import Image
import matplotlib.pyplot as plt
import cv2 
import numpy as np
import pandas as pd
import re
import ocr_parser
import DateCleaner
from fuzzywuzzy import fuzz

In [4]:
userData = pd.read_csv('Bill.com/Users.csv')
OCR_files = ocr_parser.parse_files()
OCR_keys = sorted(list(OCR_files))
date_mapping = DateCleaner.get_reformatted_strings()
date_df = DateCleaner.create_pd_df(date_mapping)
date_df = date_df.set_index("Document ID")
ocr_prices = pd.read_csv('ocr_totals.csv', index_col="Document")[["Total"]]
user_locations = pd.read_csv("user_data_coords.csv", index_col="Address")[["Latitude", "Longitude"]]
ocr_locations = pd.read_csv("ocr_coords.csv", index_col="Document")[["Latitude", "Longitude", "Header", "Location"]]

In [84]:
def scoring_func(ocr_file_id):
    scores_date = []
    scores_price = []
    for row in range(len(userData)):
        entry_row = userData.iloc[row]

        #Look for matching dates
        ocr_date = date_df.loc[ocr_file_id].Dates
        if (ocr_date != None):
            match_dates = max(
                fuzz.ratio(ocr_date[0], entry_row.date),
                fuzz.ratio(ocr_date[1], entry_row.date)
            )
        else:
            match_dates = 0     #Might need to change?

        scores_date.append(match_dates)
        
        #Look for matching price amounts
        ocr_price = ocr_prices.loc[ocr_file_id].Total
        match_prices = 0
        if (ocr_price != None):
            if entry_row.amount != 0:
                percentage_diff = abs(ocr_price - entry_row.amount) / entry_row.amount
                if percentage_diff < 0.005:
                    match_prices = percentage_diff
            # ocr_price_formatted = format(np.around(ocr_price, decimals=2), '.2f').replace('.','')
            # entry_price_formatted = format(np.around(entry_row.amount), '.2f').replace('.','')
            # match_prices = fuzz.ratio(ocr_price_formatted, entry_price_formatted)
        scores_price.append(match_prices)
    
    #Add distance measurement
    distances = []
    for row in range(len(userData)):
        addressUsed = entry_row.vendor_address
        user_coords = user_locations.loc[addressUsed].values
        ocr_coords = ocr_locations.loc[ocr_file_id].values
        readability = user_coords[0:1] - ocr_coords[0:1]
        distances.append(np.dot(readability, readability))

    #convert to arrays and deal with null points
    scores_date = np.array(scores_date)
    scores_date = np.nan_to_num(scores_date, nan = 0)
    scores_price = np.array(scores_price)
    scores_price = np.nan_to_num(scores_price, nan = 0)
    distances = np.array(distances)  
    distances = np.nan_to_num(distances, nan = np.inf)
    
    normalized_price = scores_price / np.linalg.norm(scores_price + 0.000001)
    normalized_date = scores_date / np.linalg.norm(scores_date + 0.000001)
    # distance_temp = np.log(distances)
    distance_temp = 1/(distances + 0.00000001)
    normalized_distance = distance_temp / (np.linalg.norm(distance_temp)+0.000001)
    confidence_vector = normalized_price + normalized_date + normalized_distance + 2 * np.amax([normalized_date, normalized_distance], axis=0)
    return confidence_vector

In [85]:
#CREATE CONFIDENCE MATRIX

confidence_matrix_raw = []

for OCR_file in OCR_keys: 
    OCR = OCR_files[OCR_file]
    date = date_df.loc[OCR_file].Dates
    price = ocr_prices.loc[OCR_file].Total
    confidence_vector = scoring_func(OCR_file)
    confidence_matrix_raw.append(confidence_vector)

In [86]:
confidence_matrix = np.array(confidence_matrix_raw)
docIDs = OCR_keys[:]
entries = list(userData.paymentid.values)
entry2docID = {}

In [87]:
while min(confidence_matrix.shape) != 0:
    loc = np.unravel_index(np.argmax(confidence_matrix), confidence_matrix.shape)
    docID = docIDs.pop(loc[0])
    entry = entries.pop(loc[1])
    entry2docID[entry] = docID
    confidence_matrix = np.delete(confidence_matrix, loc[0], 0)
    confidence_matrix = np.delete(confidence_matrix, loc[1], 1)

In [88]:
counter = 0
total = 0
for entry, docID in entry2docID.items():
    if userData[userData.paymentid == entry].iloc[0].documentid == docID:
        counter += 1
    total += 1 
    
counter/total

0.22044088176352705

In [9]:
counter = 0
total = 0
for entry, docID in entry2docID.items():
    if userData[userData.paymentid == entry].iloc[0].documentid == docID:
        counter += 1
    total += 1 
    
counter/total

0.35070140280561124