In [1]:
import os
import re

import cv2
import geopandas as gpd
import geopy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from geopy.extra.rate_limiter import RateLimiter

import ocr_parser

In [2]:
# Load the user table; the cell that builds the address vocabularies reads its
# `vendor_address` column. The path is relative to the notebook's working directory.
user_data = pd.read_csv("Bill.com/Users.csv")

In [3]:
# Parse the OCR output via the project-local `ocr_parser` helper.
# Later cells use the result as a mapping: `.keys()` gives the document ids and each
# value is indexed with `.iloc` and exposes a `TEXT` column.
# NOTE(review): presumably one DataFrame per document - confirm against ocr_parser.
ocr_files = ocr_parser.parse_files()

In [4]:
# Document ids in sorted order; every per-document list built below follows this order.
keys = sorted(ocr_files.keys())

In [5]:
# Build two word -> weight vocabularies from the known vendor addresses:
#   address_starters: words seen in the first 30% of an address (earlier position => larger weight)
#   address_enders:   words seen in the last 30% of an address (later position => larger weight)
# The OCR address-detection cell uses these weights to score candidate lines.
address_starters = {}
address_enders = {}
for address in set(user_data.vendor_address):
    # Missing CSV cells are loaded as float NaN, which has no .lower(); skip anything
    # that is not text instead of crashing.
    if not isinstance(address, str):
        continue
    # Commas become spaces; splitting on a single space keeps empty tokens, which still
    # count towards len(words) and the position index (they are skipped for scoring only).
    words = address.lower().replace(",", " ").split(" ")
    for idx, word in enumerate(words):
        word = word.strip()
        if word == "":
            continue
        # Relative position of the token inside the address, in (0, 1].
        ratio = (idx + 1) / len(words)
        if ratio < .3:
            address_starters[word] = address_starters.get(word, 0) + (4 / ratio)
        elif ratio > .7:
            address_enders[word] = address_enders.get(word, 0) + (ratio * 10)

In [6]:
def _weighted_word_sum(words, weights):
    """Add up the weight of every word in `words` that appears in `weights`."""
    total = 0
    for word in words:
        if word in weights:
            total += weights[word]
    return total


def extract_vibe_address(text_lines, starters, enders,
                         start_threshold=100, end_threshold=100,
                         comma_bonus=10, comma_base=4):
    """Pick the run of OCR lines that most looks like a postal address.

    Parameters
    ----------
    text_lines : list of str
        The document's OCR lines, in reading order.
    starters, enders : dict
        Word -> weight vocabularies for words typical of the beginning / end
        of an address.
    start_threshold, end_threshold : number
        A line opens (closes) the address when its score exceeds this value.
    comma_bonus, comma_base : number
        Each comma adds `comma_bonus` to the start score, and the score is then
        multiplied by `comma_base ** comma_count`.

    Returns
    -------
    str or None
        The selected lines joined with ", " (doubled commas collapsed), or None
        when no line scores high enough to open an address.
    """
    start_line = -1
    end_line = -1
    gst_idx = -1
    for idx, line in enumerate(text_lines):
        line_lower = line.lower()
        if "gst" in line_lower:
            # Lines mentioning GST are never scored; remember the most recent one.
            gst_idx = idx
            continue
        words = line_lower.replace(",", " ").split(" ")

        if start_line == -1:
            comma_count = line.count(",")
            start_score = _weighted_word_sum(words, starters)
            start_score += comma_count * comma_bonus
            start_score = start_score * (comma_base ** comma_count) / len(words)
            if start_score > start_threshold:
                start_line = idx

        # Deliberately not an `else`: the line that opens the address may also close it.
        if start_line != -1:
            end_score = _weighted_word_sum(words, enders) / len(words)
            if end_score > end_threshold:
                end_line = idx
                break

    # A start without a detected end yields a single-line address.
    if start_line != -1 and end_line == -1:
        end_line = start_line

    # If a GST line sits inside the span, begin right after it.
    if start_line < gst_idx < end_line:
        start_line = gst_idx + 1

    if end_line == -1:
        return None
    return ", ".join(text_lines[start_line:end_line + 1]).replace(",,", ",")


# One candidate (or None) per document, in the same order as `keys`.
vibe_candidate = [
    extract_vibe_address(list(ocr_files[ocr_doc].TEXT), address_starters, address_enders)
    for ocr_doc in keys
]

In [7]:
# For every document, find the first run of consecutive text lines that keep the same
# alignment and store its text (lines joined with "\n") in header_blocks.
# One entry is appended per key, in the same order as `keys`.
header_blocks = []
for ocr_file_id in keys:
    ocr_file = ocr_files[ocr_file_id]
    if len(ocr_file) == 0:
        # iloc[-1] below raises on an empty frame. Emit the same empty block that a
        # document without a detected header produces, so the list stays aligned with keys.
        header_blocks.append("")
        continue

    # 360 lies outside np.angle's degree range, so the first line compared can never
    # look aligned with a (non-existent) predecessor.
    prev_thetas = np.array([360, 360, 360])
    # Every field except the last one (TEXT) is treated as a coordinate.
    # NOTE(review): presumably (left x, left y, right x, right y) - confirm against ocr_parser.
    prev_coords = np.array(ocr_file.iloc[-1][:-1])
    block_start = 0
    block_end = 0
    align = None
    for idx in range(len(ocr_file)):
        cur_line = ocr_file.iloc[idx]
        plain_line = cur_line.TEXT.strip()
        # Blank and purely numeric lines are ignored and do not update the previous line.
        if plain_line == "" or plain_line.isnumeric():
            continue

        cur_coords = np.array(cur_line[:-1])
        dir_vec = cur_coords - prev_coords
        # Displacement of the left and right anchor points since the previous line, encoded
        # as complex numbers so np.angle yields a direction. The second component of each
        # pair is negated (presumably to flip a downward image y-axis - TODO confirm).
        cur_left_num = dir_vec[0] - (dir_vec[1] * 1j)
        cur_right_num = dir_vec[2] - (dir_vec[3] * 1j)
        cur_center_num = (cur_left_num + cur_right_num) / 2
        cur_nums = np.array([cur_left_num, cur_center_num, cur_right_num])
        cur_thetas = np.angle(cur_nums, deg=True)

        theta_deltas = np.abs(cur_thetas - prev_thetas)

        if align is None:
            if theta_deltas.min() < 25:
                # NOTE(review): the trigger looks at all three anchors, but the anchor that
                # is tracked afterwards is chosen among left (0) and centre (1) only.
                align = theta_deltas[:2].argmin()
                block_start = max(0, idx - 2)
        else:
            if theta_deltas[align] > 30:
                block_end = idx
                break

        prev_thetas = cur_thetas
        prev_coords = cur_coords

    # NOTE(review): when the alignment is never broken, block_end is still 0 and this slice
    # is empty, so the document gets an empty header - confirm that this is intended.
    header_blocks.append("\n".join(ocr_file.iloc[block_start:block_end].TEXT))

In [8]:
# Reduce every header block to the lines that could belong to an address.
# A line is dropped when it contains one of the excluded words, matches `regexp`
# (four or more digits followed by "-" and a letter), or looks like a URL.
# Kept lines are upper-cased; a block with nothing left becomes None.
block_candidate = []
regexp = re.compile(r'[^a-zA-Z]*\d\d\d\d+-[a-zA-Z]|^\d\d\d\d+-[a-zA-Z]')
excluded_words = ["TAX", "GST", "TEL", "SUN", "MON", "FRI", "HOURS", "FAX", "MOBILE", "WHATSAPP", "INVOICE"]
for header_text in header_blocks:
    kept_lines = []
    for raw_line in header_text.split("\n"):
        upper_line = raw_line.upper()
        # Tokenise on anything that is not a letter or digit before the word check.
        tokens = re.sub('[^0-9a-zA-Z]+', ' ', upper_line).split()
        if any(token in excluded_words for token in tokens):
            continue
        if regexp.search(upper_line):
            continue
        if "HTTP://" in upper_line or "WWW." in upper_line or ".COM" in upper_line:
            continue
        kept_lines.append(upper_line)

    filtered_block = "\n".join(kept_lines)
    block_candidate.append(None if filtered_block == "" else filtered_block)

In [9]:
# One row per document: its id plus the two address candidates found above.
c_df = pd.DataFrame(
    {
        "document_id": keys,
        "block_candidate": block_candidate,
        "vibe_candidate": vibe_candidate,
    }
)

In [10]:
# The Google Maps API key is read from the GOOGLE_MAPS_API_KEY environment variable so
# that it never ends up in the notebook or its version history. The key that used to be
# hard-coded in this cell must be considered leaked and should be revoked.
# Keyless alternative: geopy.geocoders.Nominatim(user_agent="mygeocoder")
locator = geopy.geocoders.GoogleV3(api_key=os.environ["GOOGLE_MAPS_API_KEY"])
# Wrap the geocoder in geopy's rate limiter; `geocode` is what the cells below call.
geocode = RateLimiter(locator.geocode)

In [16]:
# Spot-check the geocoded "vibe" location of the first document.
# `c_df[]` was a syntax error. `.get()` is used so this cell displays None instead of
# raising when it runs before the `vibe_loc` column has been created.
c_df.iloc[0].get("vibe_loc")

Location(Ijok, 45000 Bestari Jaya, Selangor, Malaysia, (3.320926, 101.4142529, 0.0))

In [12]:
# Geocode the header-block candidate of every document. block_candidate is None when no
# usable header line survived filtering; there is nothing to look up for those rows, so
# they stay None instead of being passed to the geocoder (same guard as for vibe_loc).
c_df["block_loc"] = c_df["block_candidate"].apply(lambda block: geocode(block) if block else None)

In [13]:
# Geocode the "vibe" candidate of every document; rows without a candidate stay None.
c_df["vibe_loc"] = c_df["vibe_candidate"].apply(
    lambda candidate: None if not candidate else geocode(candidate)
)

In [20]:
# Choose the final location per document. The number of commas in the resolved address
# (element 0 of the geocoder result) is used as a proxy for how specific the match is.
# Both masks are strictly boolean: a missing location counts as False in each of them
# (previously the block mask returned None for missing rows, the vibe mask False).
block_worse_commas = c_df.block_loc.apply(
    lambda loc: loc[0].count(",") < 3 if loc else False
).astype(bool)
vibe_better_commas = c_df.vibe_loc.apply(
    lambda loc: loc[0].count(",") > 2 if loc else False
).astype(bool)
# Prefer the vibe result when the block result is missing, or when the block address is
# coarse (< 3 commas) while the vibe address is detailed (> 2 commas).
use_vibe_loc = (block_worse_commas & vibe_better_commas) | c_df.block_loc.isnull()
c_df["location"] = np.where(use_vibe_loc, c_df.vibe_loc, c_df.block_loc)

In [24]:
# Keep the first two components of the location's point as a tuple; None when no location.
c_df["coordinates"] = c_df["location"].apply(
    lambda place: None if not place else tuple(place.point[:2])
)

In [26]:
# Split the coordinate tuples into two float columns, element by element.
# Building a DataFrame from the raw list is fragile when entries are None (no geocoding
# result): the constructor infers its shape from the entries, so e.g. a leading None can
# yield a single column and break the two-column assignment. Missing rows become NaN.
c_df["latitude"] = c_df["coordinates"].apply(
    lambda pair: pair[0] if isinstance(pair, tuple) else np.nan
)
c_df["longitude"] = c_df["coordinates"].apply(
    lambda pair: pair[1] if isinstance(pair, tuple) else np.nan
)

In [27]:
# Element 0 of a geocoder result is its address text; rows without a location stay None.
c_df["location_str"] = c_df["location"].apply(
    lambda place: None if not place else place[0]
)

In [28]:
# Select the export columns, give them their CSV header names and write the file.
# The DataFrame index is written as the first (unnamed) CSV column.
pre_csv = (
    c_df.loc[:, ["document_id", "latitude", "longitude", "block_candidate", "location_str"]]
    .rename(
        columns={
            "document_id": "Document",
            "latitude": "Latitude",
            "longitude": "Longitude",
            "block_candidate": "Header",
            "location_str": "Location",
        }
    )
)
pre_csv.to_csv("ocr_coords.csv")

In [30]:
# Inspect one document (positional row 112) for which both geocoding attempts returned
# nothing, as the output below shows. The position is hard-coded, so this raises
# IndexError when c_df has fewer than 113 rows.
c_df.iloc[112]

document_id                            00d0292723088
block_candidate    23, JIN BURUNG JENTAYU, TMN BUKIT
vibe_candidate     23, JIN BURUNG JENTAYU, TMN BUKIT
block_loc                                       None
vibe_loc                                        None
location                                        None
coordinates                                     None
latitude                                         NaN
longitude                                        NaN
location_str                                    None
Name: 112, dtype: object