In [82]:
# NOTEBOOK IMPORTS
import os, glob, warnings, pickle, re
import numpy as np
from shutil import copyfile, rmtree
from datetime import datetime
from fuzzywuzzy import process

import traceback

# IMAGE IMPORTS
from PIL import Image
import cv2

# GIS IMPORTS
from affinetransformation import *
from affine import Affine
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon, LineString, Point, MultiPoint, box
import rasterio as rio
import contextily as cx

# IMAGE IMPORTS
from skimage.morphology import skeletonize
from skimage import img_as_bool, img_as_ubyte


# PLOTTING IMPORTS
import matplotlib.pyplot as plt

# CUSTOM UTILITIES
from IndexUtils import * 
from TileUtils import *

# TILED INFERENCE
import sahi
from sahi import AutoDetectionModel
from sahi.predict import get_sliced_prediction, predict
sahi.utils.cv.IMAGE_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.tiff', '.bmp', '.tif']

Image.MAX_IMAGE_PIXELS = 933120000
warnings.filterwarnings("ignore")
initialize = False

%matplotlib widget

from fuzzywuzzy import process
from rapidfuzz import fuzz
import easyocr
from tqdm.autonotebook import tqdm

In [9]:
proc_dir = r"D:\RECTDNN\processing\2024-10-11_15-08-46\\"
outputs_dir  = os.path.join(proc_dir, "Outputs")
output_struct_dir = os.path.join(outputs_dir, "IndexCoords.pkl")
with open(output_struct_dir, 'rb') as f:
    gen_dict = pickle.load(f)

In [10]:
def resize_image(image, size=512):
    
    if image.ndim == 3:
        image = image[:, :, 0]

    # Get the current dimensions of the image
    height, width = image.shape
    
    # Calculate the scaling factor to resize the longest side to 512 pixels
    if height > width:
        scale_factor = size / height
    else:
        scale_factor = size / width
    
    # Compute new dimensions
    new_height = int(height * scale_factor)
    new_width = int(width * scale_factor)
    
    # Resize the image using OpenCV
    resized_img = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
    
    return resized_img

In [11]:
def prepImageForReading(img):
    image = np.asarray(img, dtype=np.uint8)
    if np.max(image) < 255:
        image = image * 255
    return image

In [12]:
import re
from dateutil import parser

'FEBRUARY  2, 2006'

In [66]:
parser.parse(re.sub(",", ", ", "'FEBRWARY 16 , 1995'").replace("|", ""))

ParserError: Unknown string format: 'FEBRWARY 16 ,  1995'

In [65]:
def extract_date(text):
    """
    # Step 1: Use regular expressions to look for date-like patterns
    # Looking for both numeric dates (e.g. 9/10/92) and written dates (e.g. April 2, 1990)
    date_regex = r'(\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4})|([A-Za-z]+\s+\d{1,2},\s+\d{4})'
    
    # Step 2: Search the text for any potential date patterns
    match = re.search(date_regex, text)
    
    if match:
        date_str = match.group(0)
        try:
            # Step 3: Parse the matched date string
            return parser.parse(date_str)
        except ValueError:
            pass  # If parsing fails, it will return None later on
    """
    # Step 4: Fallback: Try fuzzy parsing on the entire string if no match is found
    try:
        return parser.parse(re.sub(",", ", ", text).replace("|", ""), fuzzy=True)
    except:
        return None  # Return None if no valid date is found
    

def extractDate(img, filter_values=None, verbose=False, year_cutoff=2020, ini_year=1965):
    image = prepImageForReading(img)
    texts = reader.readtext(cv2.cvtColor(resize_image(image, size=1024), cv2.COLOR_GRAY2BGR), detail=0, width_ths=50)
    texts_search = [s if re.search(r'[a-zA-Z]', s) else "" for s in texts]
    dates = [extract_date(a) for a in texts_search]

    if len(dates) == 0:
        return None, texts

    if filter_values is None:
        return dates, texts
    
    filtered_dates = []
    for a in np.where(np.array(dates) != None)[0].tolist():
        for b in filter_values:
            _, v = process.extractOne(b, texts[a-1].split(" "))
            if verbose:
                print(texts[a], texts[a-1], b, v)
            v = np.max([a, v])        
        if v > 0.7:
            filtered_dates.append(dates[a])

    filtered_dates = [a for a in filtered_dates if a.year < year_cutoff]
    filtered_dates = [a for a in filtered_dates if a.year > ini_year]

    if len(filtered_dates) == 0:
        return None, texts

    return filtered_dates, texts
    
    # dates = [a for a in dates if a is not None]
    
    

    return dates, " ".join(texts)
'''
for i, (k, v) in tqdm(enumerate(gen_dict.items()), total=len(gen_dict.items())):
    if len(v['legend']) == 0:
        continue
    dates_found, text_found = extractDate(v['legend'][0]['data'], filter_values=['revised', 'effective'])

    print(dates_found)
    if dates_found is None:
        print(text_found)
'''    

  0%|          | 0/171 [00:00<?, ?it/s]

[datetime.datetime(1990, 4, 2, 0, 0)]
[datetime.datetime(1989, 7, 17, 0, 0)]
[datetime.datetime(1990, 9, 28, 0, 0)]
None
['NATiONal FLOOD INSURANCE PROGRAm', 'FIRM', 'FLOOD INSURANCE RATE MAP', 'BEXAR COUNTY, TEXAS', '(UNINCORPORATED AREAS) #', 'MAP INDEX', 'PANELS PRintedr2s. 50. 66.76: 100.125. 150.', '156.160.105,470.180.186. 190-195.226.240 ,', '245.250 3275.286.295.305.310,315.220.350.', '375. 3805385, 390, 395, 426. 450. 476. 480 ,', '485. 4903495. 525. 550. 575,800. 625. 650, 676', 'CommunITy-PaNEL  NUMBERS', '480035 0001-0675', 'EFFECTIVE DATE:', 'GTOBER  16,1984', 'Federal Emergency Management Agency', '2', '450,']
[datetime.datetime(1991, 10, 16, 0, 0)]
[datetime.datetime(1985, 11, 15, 0, 0)]
[datetime.datetime(1986, 1, 3, 0, 0)]
[datetime.datetime(1989, 1, 5, 0, 0)]
[datetime.datetime(1986, 4, 2, 0, 0)]
[datetime.datetime(1978, 6, 27, 0, 0)]
[datetime.datetime(1991, 7, 2, 0, 0)]
[datetime.datetime(1990, 8, 2, 0, 0)]
[datetime.datetime(1984, 11, 15, 0, 0)]
[datetime.datetime(

In [72]:
k

'480035IND0_0490.jpg'

In [84]:

out_dir = r"D:\RECTDNN\TNNN\images2\\"

# key = "48201"

smart_dict = {}

unsureCounter = 0
try:
    reader
except:
    reader = easyocr.Reader(['en']) 

for i, (k, v) in tqdm(enumerate(gen_dict.items()), total=len(gen_dict.items())):
    key = findIndexKey(k)

    dates_found, text_found = extractDate(v['legend'][0]['data'], filter_values=['revised', 'effective'])
    affine   = Affine(*v["output_transform"].flatten()[:6])
    
    for ii, (kk, vv) in tqdm(enumerate(v['tile'].items()), total=len(v['tile'].items()), leave=False):
        try:
            image = prepImageForReading(vv['data'])

            texts = reader.readtext(cv2.cvtColor(resize_image(image, size=1024), cv2.COLOR_GRAY2BGR), detail=0, width_ths=50)
            
            texts = [a for a in texts if len(a) >= len(key)]
            if not len(texts) == 0:
                text, val = process.extractOne(key, texts)
            else: 
                val = 0
            if val < 60:
                text = f"Unsure{unsureCounter}"
                unsureCounter = unsureCounter + 1
            else:
                # text= text.split(" ")[0]
                text = re.sub('[^A-Za-z0-9]+', '', text)
                text = re.sub(f'^.*?{key}', key, text)
                text = text.upper()
                text = re.sub('O', '0', text)
                text = re.sub('T', '1', text)
                text = re.sub('Q', '0', text)
                text = re.sub('Z', '7', text)
                if text[-1] == '1':
                    text = text[-1]


                score, s1_start, s1_end, s2_start, s2_end = fuzz.partial_ratio_alignment(text, key)

                text = key + text[s1_end:]
                
            # CALCULATE COORDS FROM AFFINE 
            bbox = vv['bbox']
            left, bottom = affine * (bbox[0], bbox[1])
            right, top   = affine * (bbox[2], bbox[3])
            
            if text[-1].isalpha():
                tilename = text[:-1]
                inner_key = text[-1]
            else:
                tilename = text
                inner_key = 0
            
            currKeyCheck = smart_dict.get(tilename, None)
            if currKeyCheck is None:
                smart_dict[tilename] = {}

            currInnerKeyCheck = smart_dict[tilename].get(inner_key, None)
            if currInnerKeyCheck is None:
                smart_dict[tilename][inner_key] = {}
                smart_dict[tilename][inner_key]['coords'] = []
                smart_dict[tilename][inner_key]['indexes'] = []
                smart_dict[tilename][inner_key]['pix'] = []
                smart_dict[tilename][inner_key]['dates'] = []
            
            smart_dict[tilename][inner_key]['pix'].append(bbox)
            smart_dict[tilename][inner_key]['coords'].append(np.array([left, bottom, right, top]))
            smart_dict[tilename][inner_key]['indexes'].append(k)
            smart_dict[tilename][inner_key]['dates'].append(dates_found[0])
        except Exception:
            print(traceback.format_exc())
            continue

    try:
        with open("save.p", "wb" ) as f:
            pickle.dump(smart_dict, f)
    except Exception:
        print(traceback.format_exc())
        continue


'''
new_tile_idx = 0
out_fn = os.path.join(out_dir, text) + ".png"
if os.path.exists(out_fn):
    while os.path.exists(out_fn):
        extension = f"_{new_tile_idx}.png"
        out_fn = os.path.join(out_dir, text) + extension
        new_tile_idx = new_tile_idx + 1



# plt.imsave(out_fn, image, cmap='Greys_r')'''

  0%|          | 0/171 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

  0%|          | 0/53 [00:00<?, ?it/s]

In [80]:
smart_dict

{'480035': {0: {'coords': [array([ -1.098e+07,  3.4399e+06, -1.0965e+07,  3.4242e+06]),
    array([-1.0982e+07,  3.4727e+06, -1.0967e+07,  3.4568e+06]),
    array([-1.0965e+07,  3.4244e+06,  -1.095e+07,  3.4088e+06]),
    array([-1.0968e+07,  3.4734e+06, -1.0953e+07,  3.4576e+06]),
    array([-1.0951e+07,  3.4252e+06, -1.0936e+07,  3.4097e+06]),
    array([ -1.098e+07,  3.4388e+06, -1.0965e+07,  3.4227e+06]),
    array([-1.0982e+07,  3.4721e+06, -1.0967e+07,   3.456e+06]),
    array([-1.0968e+07,  3.4728e+06, -1.0953e+07,  3.4568e+06]),
    array([-1.0981e+07,  3.4725e+06, -1.0966e+07,  3.4565e+06])],
   'indexes': ['480035IND0_0490.jpg',
    '480035IND0_0490.jpg',
    '480035IND0_0490.jpg',
    '480035IND0_0490.jpg',
    '480035IND0_0490.jpg',
    '480035IND0_0789.jpg',
    '480035IND0_0789.jpg',
    '480035IND0_0789.jpg',
    '480035IND0_0990.jpg'],
   'pix': [array([     970.77,      1510.4,      1377.3,      1972.4]),
    array([     973.61,      597.52,      1379.3,      1063.6]),

In [None]:
"""
SNIPPET BELOW SHOWS GAUSSIAN BLUR MAY BE BENEFICIAL
key = "48201"
for i in range(10, 50):
    myimage = cv2.imread(r"D:\RECTDNN\TNNN\images1\Unsure6" + str(i) + ".png")
    test = cv2.GaussianBlur(cv2.cvtColor(resize_image(myimage, size=1024), cv2.COLOR_GRAY2BGR), (5,5), 1, 1)
    texts = reader.readtext(test, detail=0, width_ths=50)
    texts = [a for a in texts if len(a) >= len(key)]
    print(texts)
    if not len(texts) == 0:
        text, val = process.extractOne(key, texts)
    else: 
        val = 0
    if val < 60:
        text = f"Unsure"
        unsureCounter = unsureCounter + 1
    else:
        # text= text.split(" ")[0]
        text = re.sub('[^A-Za-z0-9]+', '', text)
        text = re.sub(f'^.*?{key}', key, text)
        text = text.upper()
        text = re.sub('O', '0', text)
        text = re.sub('T', '1', text)
        text = re.sub('Q', '0', text)
        text = re.sub('Z', '7', text)

        score, s1_start, s1_end, s2_start, s2_end = fuzz.partial_ratio_alignment(text, key)

        text = key + text[s1_end:]
    print(text)

"""