In [1]:
# PYTHON IMPORTS
import os, copy, math, re, ultralytics, zipfile, shutil
from tqdm.notebook import trange, tqdm
from datetime import datetime

# IMAGE IMPORTS 
from PIL import Image

# DATA IMPORTS 
import random, h5py, glob
import numpy as np

# PLOTTING
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# SHAPES IMPORTS
import shapely
import geopandas as gpd
from shapely.ops import unary_union
from shapely.geometry import LineString, Polygon, Point

# MY OWN CLASSES
from FindGrid import *

# OCR libraries
import pytesseract
from fuzzywuzzy import fuzz

# INITIALIZE
# Path of the Tesseract executable handed to pytesseract. The TESSERACT_CMD
# environment variable, when set, overrides the machine-specific default below
# so the notebook can run on other computers without editing this cell.
t_path = os.environ.get(
    "TESSERACT_CMD",
    r'C:\Users\fhacesga\AppData\Local\Programs\Tesseract-OCR\tesseract.exe',
)
pytesseract.pytesseract.tesseract_cmd = t_path

# PREFERENCES
# Pixel-count limit Pillow applies when opening images; set high for large scans
Image.MAX_IMAGE_PIXELS = 933120000

In [3]:
# r'C:\Users\fhacesga\Desktop\FIRMsDigitizing\data\runs031524\runs\detect\train_1024px_30tilt\weights\best.pt'
# Weights file loaded by ultralytics.YOLO in the inference cells
model_weights = r"data_local/curr_weights.pt"

# Source archives and the folder they are extracted into.
# (The doubled trailing backslash is needed because a raw string cannot end in a single one.)
zip_folder    = r"D:\FloodChange\AAA_HistoricalDownload\ZIP\\"
input_folder  = r"D:\FloodChange\AAA_HistoricalDownload\Files\\"

# Every run writes into its own timestamped directory under output_root.
# Joining with a final "" keeps the trailing separator that later cells rely on
# when they concatenate file names / glob patterns onto these folders.
output_root   = r"D:\FloodChange\Outputs"
run_stamp     = datetime.now().strftime('%m-%d-%Y_%H-%M-%S')
output_folder = os.path.join(output_root, run_stamp, "")
index_folder  = os.path.join(output_folder, "00_identifiedIndices", "")
resize_folder = os.path.join(output_folder, "01_resized", "")
infer_folder  = os.path.join(output_folder, "02_infered", "")

# Create the per-stage output folders (parents are created as needed)
for stage_folder in (index_folder, resize_folder, infer_folder):
    os.makedirs(stage_folder, exist_ok=True)

Unzip every archive in `zip_folder` into `input_folder`, but only if `input_folder` does not exist yet.

In [4]:
def unzip_all_zips(input_dir, output_dir):
    """
    Extract every ZIP archive found directly inside `input_dir` into `output_dir`.

    Parameters:
        input_dir (str): Directory that is scanned (non-recursively) for archives.
                         The ".zip" extension is matched case-insensitively.
        output_dir (str): Directory that receives the extracted contents of all
                          archives; it is created if it does not exist.

    Returns:
        None. Progress is reported with print().

    Notes:
        An archive that cannot be read as a ZIP file is reported and skipped so
        that one corrupt download does not abort the rest of the batch.
    """
    # Create the output directory (and parents) if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the extraction order is the same on every run
    for filename in sorted(os.listdir(input_dir)):
        zip_file_path = os.path.join(input_dir, filename)

        # Only regular files with a .zip/.ZIP extension are treated as archives
        if not filename.lower().endswith(".zip") or not os.path.isfile(zip_file_path):
            continue

        try:
            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                # Extract all contents to the output directory
                zip_ref.extractall(output_dir)
        except zipfile.BadZipFile as err:
            print(f"Skipping {zip_file_path}: {err}")
            continue

        print(f"Contents of {zip_file_path} extracted to {output_dir}")

# Extract only when the input folder is absent; an existing folder is presumably
# already populated from an earlier run and is left untouched.
if not os.path.exists(input_folder):
    unzip_all_zips(zip_folder, input_folder)

Read through all the images in the input directory. Some of them are saved as multi-page TIFF files, which need to be exported as individual single-page images, so we do that first.

In [5]:
# Split every multi-page TIFF into one file per page ("<name>_<page><ext>", pages
# numbered from 1). The original is deleted ONLY when it really was multi-page and
# every page was written; single-page files and partially exported files are kept,
# which also makes this cell safe to re-run.
for tiff_file in tqdm(glob.glob(input_folder + "\\*.tif*")):
    # glob already returns paths that include input_folder, so no further join is needed
    all_pages_saved = False
    try:
        with Image.open(tiff_file) as img:
            # Single-frame images need no splitting and must stay in place
            if not getattr(img, "is_animated", False):
                continue

            stem, ext = os.path.splitext(tiff_file)
            failed_pages = 0
            for i in range(img.n_frames):
                try:
                    img.seek(i)
                    img.save(f"{stem}_{i+1}{ext}", format=img.format)
                except Exception as err:
                    print(f"Error with {tiff_file} page {i}: {err}")
                    failed_pages += 1
            all_pages_saved = failed_pages == 0
    except Exception as err:
        print(f"Error opening {tiff_file}: {err}")
        continue

    # Removal happens after the file handle is closed
    if all_pages_saved:
        os.remove(tiff_file)

  0%|          | 0/710 [00:00<?, ?it/s]

Error opening D:\FloodChange\AAA_HistoricalDownload\Files\485479B.tif
Error opening D:\FloodChange\AAA_HistoricalDownload\Files\485479C.tif
Error opening D:\FloodChange\AAA_HistoricalDownload\Files\485481A.tif
Error opening D:\FloodChange\AAA_HistoricalDownload\Files\485510.tif
Error opening D:\FloodChange\AAA_HistoricalDownload\Files\485510B.tif
Error opening D:\FloodChange\AAA_HistoricalDownload\Files\485514B.tif
Error opening D:\FloodChange\AAA_HistoricalDownload\Files\485516A.tif


Now we identify the tile-index images using known filename heuristics: any file whose name contains `IND` or `_1.` is treated as an index.

In [6]:
# Substrings that mark a file name as an index image
patterns = ["IND", "_1."]

# Gather the matches pattern by pattern (a file matching both patterns is listed twice)
index_tiles = []
for pattern in patterns:
    index_tiles.extend(glob.glob(input_folder + "\\*" + pattern + "*"))

# Copy each match into the index folder under its original file name
for tile_path in tqdm(index_tiles):
    destination = os.path.join(index_folder, os.path.basename(tile_path))
    shutil.copy(tile_path, destination)

  0%|          | 0/55 [00:00<?, ?it/s]

These are the parameters we define to run BBNN

In [7]:
# Size in pixels that every index image is resized to before inference
target_size = (1024, 1024) 
# Filled by the resize loop with the array shape of each index image it opens
original_shapes = []

# COCO DATASET PARAMS
# Mapping from integer class id to a human-readable label
category_labels = {
    0 : "County",
    1 : "Tile",
    2 : "Box",
    3 : "Legend"
}

# Class ids of interest (0 and 1 in the table above); not referenced in the cells shown here.
# NOTE(review): the inference log in this notebook lists "tiles" before "county",
# so the model's class order may differ from category_labels — confirm against model.names.
categories=[0, 1]

Resize all indices to BBNN target size

In [8]:
# Resize every index image to target_size and store it as PNG in resize_folder.
# Files that fail are reported (not silently dropped) and the loop moves on.
for filename in tqdm(os.listdir(index_folder)):
    # DEFINE DIRS
    stem, ext   = os.path.splitext(filename)
    input_path  = os.path.join(index_folder, filename)
    output_path = os.path.join(resize_folder, filename)
    png_path    = os.path.join(resize_folder, stem + ".png")

    # ONLY REGULAR IMAGE FILES ARE PROCESSED
    if not os.path.isfile(input_path):
        continue
    if ext.lower() not in ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff'):
        continue

    # SKIP IF EXISTS
    if os.path.exists(png_path):
        continue

    try:
        # OPEN IMAGE (the file handle is released as soon as the pixels are copied)
        with Image.open(input_path) as src:
            img = np.asarray(src)
        original_shapes.append(img.shape)

        # RESCALE BW IMAGES TO UINT8 TO BE ABLE TO USE BILINEAR INTERPOLATION
        # An all-zero image is left as is to avoid dividing by zero.
        peak = np.max(img)
        if peak != 255 and peak != 0:
            img = 255 * img / peak
        img = Image.fromarray(img.astype(np.uint8))

        # RESIZE AND SAVE
        img_resized = img.resize(target_size, Image.BILINEAR)
        img_resized.save(png_path)
    except Exception as err:
        print(f"Could not resize {input_path}: {err}")
        continue

  0%|          | 0/127 [00:00<?, ?it/s]

In [10]:
# Load the detector on the CPU and run it over every resized index image
model = ultralytics.YOLO(model_weights).to("cpu")
resized_images = glob.glob(resize_folder+"\\*")
results = model(resized_images, imgsz=1024)

# Save each result into the inference folder under the name of its input image
for result in results:
    fn = os.path.join(infer_folder, os.path.basename(result.path))
    result.save(filename=fn)


0: 1024x1024 43 tiles, 1 county, 1: 1024x1024 41 tiles, 1 county, 2: 1024x1024 27 tiles, 1 county, 3: 1024x1024 35 tiles, 1 county, 4: 1024x1024 31 tiles, 1 county, 5: 1024x1024 26 tiles, 1 county, 6: 1024x1024 1 tile, 7: 1024x1024 4 tiles, 2 countys, 8: 1024x1024 3 tiles, 9: 1024x1024 3 tiles, 1 county, 10: 1024x1024 (no detections), 11: 1024x1024 1 tile, 12: 1024x1024 1 tile, 1 county, 13: 1024x1024 (no detections), 14: 1024x1024 3 tiles, 15: 1024x1024 4 tiles, 3 countys, 16: 1024x1024 2 tiles, 17: 1024x1024 28 tiles, 1 county, 18: 1024x1024 31 tiles, 1 county, 19: 1024x1024 25 tiles, 1 county, 20: 1024x1024 17 tiles, 1 county, 21: 1024x1024 32 tiles, 1 county, 22: 1024x1024 23 tiles, 1 county, 23: 1024x1024 26 tiles, 1 county, 24: 1024x1024 31 tiles, 1 county, 25: 1024x1024 3 tiles, 1 county, 26: 1024x1024 2 tiles, 27: 1024x1024 2 tiles, 28: 1024x1024 1 tile, 1 county, 29: 1024x1024 6 tiles, 2 countys, 30: 1024x1024 5 tiles, 1 county, 31: 1024x1024 3 tiles, 32: 1024x1024 4 tiles, 1

In [8]:
# NOTE(review): this cell reloads `model` and overwrites `results` with detections on the
# images in index_folder (the copies of the source files, not the resized PNGs), still
# with imgsz=1024. Its execution count is out of sequence with the surrounding cells, so
# which `results` the later cells see depends on the order the cells were run — confirm
# which of the two inference cells is intended and remove the other.
model = ultralytics.YOLO(model_weights).to("cpu")
results = model(glob.glob(index_folder+"\\*"), imgsz=1024,)


0: 1024x1024 61 tiles, 1 county, 1: 1024x1024 67 tiles, 1 county, 2: 1024x1024 65 tiles, 1 county, 3: 1024x1024 61 tiles, 1 county, 4: 1024x1024 62 tiles, 1 county, 5: 1024x1024 2 tiles, 1 county, 6: 1024x1024 61 tiles, 1 county, 7: 1024x1024 57 tiles, 1 county, 8: 1024x1024 57 tiles, 1 county, 9: 1024x1024 20 tiles, 1 county, 10: 1024x1024 57 tiles, 1 county, 11: 1024x1024 57 tiles, 1 county, 12: 1024x1024 57 tiles, 1 county, 13: 1024x1024 60 tiles, 1 county, 14: 1024x1024 2 tiles, 1 county, 15: 1024x1024 4 tiles, 1 county, 16: 1024x1024 9 tiles, 3 countys, 17: 1024x1024 9 tiles, 2 countys, 18: 1024x1024 2 tiles, 2 countys, 19: 1024x1024 4 tiles, 2 countys, 20: 1024x1024 22 tiles, 1 county, 21: 1024x1024 78 tiles, 1 county, 22: 1024x1024 22 tiles, 1 county, 23: 1024x1024 77 tiles, 1 county, 24: 1024x1024 39 tiles, 1 county, 25: 1024x1024 39 tiles, 1 county, 26: 1024x1024 39 tiles, 1 county, 27: 1024x1024 39 tiles, 1 county, 28: 1024x1024 106 tiles, 1 county, 29: 1024x1024 5 tiles, 3 

In [11]:
# Inspect the detected boxes of the first result in (x1, y1, x2, y2) corner form
results[0].boxes.xyxy

tensor([[1763.9381, 2422.2461, 2169.5525, 2891.5251],
        [2164.4939, 1519.8820, 2567.7900, 1975.1213],
        [ 573.3839, 2420.8589,  973.0733, 2892.2678],
        [1771.9183,  596.1050, 2173.3445, 1068.3162],
        [2165.1152, 1970.2655, 2567.1863, 2429.7595],
        [2165.1943, 1058.0609, 2566.7073, 1525.0452],
        [1371.1106, 1061.3557, 1774.7196, 1517.5161],
        [ 970.0963, 2421.8728, 1379.3400, 2891.1653],
        [ 576.4122, 1747.0842,  773.3394, 1970.4717],
        [1766.6050, 1969.5414, 2169.6074, 2426.6318],
        [1370.7255,  600.4040, 1776.8030, 1062.4729],
        [1972.1722, 1289.7322, 2172.6816, 1522.6820],
        [ 771.6378, 1744.9929,  979.3391, 1969.6783],
        [1365.1526, 1966.4033, 1772.3239, 2426.7263],
        [1363.3843, 2421.0940, 1768.9821, 2890.8765],
        [ 770.4731, 1517.9115,  976.2397, 1749.9775],
        [ 178.6360, 1053.1951,  584.5432, 1515.1360],
        [ 180.5558,  594.4118,  586.3154, 1060.3407],
        [1769.6377, 1749.378

In [21]:
def extract_bounded_area(image, bbox):
    """
    Crop the region of an image described by a normalized corner-format box.

    Parameters:
        image (PIL.Image.Image): Already opened image to crop. Only its `.size`
                      attribute and `.crop()` method are used.
        bbox (sequence of 4 floats): Box corners in the order
                      (x_min, y_min, x_max, y_max), where x values are fractions
                      of the image width and y values fractions of the image
                      height. NOTE: this is NOT (x, y, width, height) — the four
                      values are scaled and handed to `image.crop` as
                      (left, upper, right, lower).

    Returns:
        cropped_image: The result of `image.crop` for the scaled box.
    """
    # Get image width and height
    image_width, image_height = image.size

    # Convert the normalized corners to absolute pixel coordinates (truncated to int)
    x_min, y_min, x_max, y_max = bbox
    crop_box = (
        int(x_min * image_width),
        int(y_min * image_height),
        int(x_max * image_width),
        int(y_max * image_height),
    )

    # Crop the image
    return image.crop(crop_box)

# For every inference result: pick the confident class-0 detections, crop each one
# out of the matching source image and OCR the crop to look for its ID.
for result in results:

    # boxes.data is read once; its last column is used as the class id and the
    # second-to-last as the confidence
    detections = result.boxes.data.numpy()
    classes = detections[:, -1]
    conf    = detections[:, -2]

    # Boolean mask of the detections to OCR: class 0 with confidence above 0.92
    # NOTE(review): category_labels maps 0 to "County" while the comments below speak
    # of tiles — confirm which class id the model uses for tiles.
    keep_mask = np.logical_and(classes==0, conf > 0.92)

    # GOTTA FIND CORRECT FILE BC RESIZED WERE SAVED WITH PNG EXTENSION
    basen = os.path.basename(result.path)[:-4]
    matches = glob.glob(os.path.join(input_folder,  basen + '*'))
    if not matches:
        # Without the source image there is nothing to crop; report and move on
        print(f"No source image found for {basen}")
        continue
    # If several files share the prefix, the first one returned by glob is used
    in_fn = matches[0]
    key = findKey(basen)

    # Normalized corner boxes, converted once per result
    boxes_norm = result.boxes.xyxyn.numpy()

    # The context manager closes the source image once all crops are done
    with Image.open(in_fn) as image:

        for box_idx in np.where(keep_mask)[0]:

            # GET TILE DATA
            bbox = boxes_norm[box_idx]
            data = extract_bounded_area(image, bbox)

            # GET ID FROM TILE
            # (a digit whitelist could be added: -c tessedit_char_whitelist=0123456789)
            text = pytesseract.image_to_string(data, config='--psm 12 --oem 3')
            word = find_word_with_key(text, key, threshold=80, verbose=False)

48023300104
4802330005
48023300204
4802330015
4802870250
4802870050
4802870300
4802870275
4802870225
4802870175
4802870400
4802870150
4302870550
4802870200
4802870325
4802870350
1802870
4802870075
4802870425
4802870100
4802870375
4802870450
None
No ID Found!
None
No ID Found!
None
No ID Found!
4802870475
4802870460
4802870470
4802870035
4802870055
4802870045
480287792530
4802870370
4802870160
4802870015
4802879305
4802870380
4802870095
48028701450


KeyboardInterrupt: 