In [1]:
# PYTHON IMPORTS
import os, ultralytics, zipfile, shutil
from tqdm.notebook import tqdm
from datetime import datetime

# IMAGE IMPORTS 
from PIL import Image

# DATA IMPORTS 
import glob
import numpy as np

# PLOTTING
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# MY OWN CLASSES
from FindGrid import *

# OCR libraries
import pytesseract
from fuzzywuzzy import fuzz

# INITIALIZE
t_path = r'C:\Users\fhacesga\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
pytesseract.pytesseract.tesseract_cmd = t_path

# PREFERENCES
Image.MAX_IMAGE_PIXELS = 933120000

In [2]:
# r'C:\Users\fhacesga\Desktop\FIRMsDigitizing\data\runs031524\runs\detect\train_1024px_30tilt\weights\best.pt'
model_weights = r"C:\Users\fhacesga\Downloads\best (1).pt"

zip_folder    = r"D:\FloodChange\AAA_HistoricalDownload\ZIP\\"
input_folder  = r"D:\FloodChange\AAA_HistoricalDownload\Files\\"


output_folder = r"D:\FloodChange\Outputs\\"
output_folder = f"{output_folder}\\{datetime.now().strftime('%m-%d-%Y_%H-%M-%S')}\\"
index_folder  = f"{output_folder}00_identifiedIndices\\"
infer_folder  = f"{output_folder}01_infered\\"

os.makedirs(index_folder, exist_ok=True)
os.makedirs(infer_folder,  exist_ok=True)

Unzip all the files, and undo TIFF multi-paging

In [3]:
def undoMultiPageTIFFs(input_folder):
    '''
    Some images are saved as Multi-page TIFF files. These need to be exported into individual images, which is what this function does
    '''
    for filename in tqdm(glob.glob(input_folder + "\\*.tif*")):
        tiff_file = os.path.join(input_folder, filename)
        try:
        # Check if the file is a multi-page TIFF
            with Image.open(tiff_file) as img:
                if img.is_animated:    
                    for i in range(img.n_frames):
                        try:
                            img.seek(i)
                            output_filename = f"{os.path.splitext(tiff_file)[0]}_{i+1}{os.path.splitext(tiff_file)[1]}"
                            img.save(output_filename, format=img.format)
                        except:
                            print(f"Error with {tiff_file} page {i}")
                            continue
        except:
            print(f"Error opening {tiff_file}")
            continue

        os.remove(tiff_file)

def unzip_all_zips(input_dir, output_dir):
    # Create the output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Iterate through all files in the input directory
    for filename in os.listdir(input_dir):
        if filename.endswith(".zip"):
            zip_file_path = os.path.join(input_dir, filename)
            # Open the ZIP file
            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                # Extract all contents to the output directory
                zip_ref.extractall(output_dir)
                print(f"Contents of {zip_file_path} extracted to {output_dir}")

    

if not os.path.exists(input_folder):
    unzip_all_zips(zip_folder, input_folder)
    undoMultiPageTIFFs(input_folder)

Now we identify the TileIndices through known heuristics

In [4]:
patterns = ["IND", "_1."]
index_tiles = [file for pattern in patterns for file in glob.glob(input_folder + "\\*" + pattern + "*")]

for file in tqdm(index_tiles):
    shutil.copy(file, os.path.join(index_folder, os.path.basename(file)))

  0%|          | 0/127 [00:00<?, ?it/s]

These are the parameters we define to run BBNN

In [5]:
target_size = 1024
original_shapes = []

# COCO DATASET PARAMS
category_labels = {
    0 : "County",
    1 : "Tile",
    2 : "Box",
    3 : "Legend"
}

categories=[0, 1]

In [6]:
model = ultralytics.YOLO(model_weights).to("cpu")

In [7]:
results = model(glob.glob(index_folder+"\\*"), imgsz=(1920, 1920),)


0: 1024x1024 82 tiles, 1: 1024x1024 35 tiles, 2: 1024x1024 35 tiles, 3: 1024x1024 37 tiles, 4: 1024x1024 34 tiles, 5: 1024x1024 34 tiles, 6: 1024x1024 2 tiles, 7: 1024x1024 2 tiles, 8: 1024x1024 2 tiles, 9: 1024x1024 2 tiles, 10: 1024x1024 1 tile, 11: 1024x1024 1 tile, 12: 1024x1024 2 tiles, 13: 1024x1024 1 tile, 14: 1024x1024 2 tiles, 15: 1024x1024 5 tiles, 16: 1024x1024 4 tiles, 17: 1024x1024 61 tiles, 18: 1024x1024 63 tiles, 19: 1024x1024 64 tiles, 20: 1024x1024 18 tiles, 21: 1024x1024 59 tiles, 22: 1024x1024 62 tiles, 23: 1024x1024 44 tiles, 24: 1024x1024 60 tiles, 25: 1024x1024 2 tiles, 26: 1024x1024 1 tile, 27: 1024x1024 1 tile, 28: 1024x1024 2 tiles, 29: 1024x1024 7 tiles, 30: 1024x1024 18 tiles, 31: 1024x1024 17 tiles, 32: 1024x1024 4 tiles, 33: 1024x1024 7 tiles, 34: 1024x1024 8 tiles, 35: 1024x1024 8 tiles, 36: 1024x1024 3 tiles, 37: 1024x1024 2 tiles, 38: 1024x1024 7 tiles, 39: 1024x1024 4 tiles, 40: 1024x1024 5 tiles, 41: 1024x1024 110 tiles, 42: 1024x1024 22 tiles, 43: 10

In [8]:
# Process results list
for result in results:
    fn = os.path.join(infer_folder, os.path.basename(result.path))
    result.save(filename=os.path.join(infer_folder, os.path.basename(result.path)))

In [9]:
model = ultralytics.YOLO(model_weights).to("cpu")
results = model(glob.glob(index_folder+"\\*"), imgsz=1024,)




RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 3196059648 bytes.

In [None]:
indexes_to_copy = ["480035A_1", "480036A_1", "480037A_1", "480038A_1", "480039_1", "480040A_1", "480041_1", "480041A_1", "480043_1", "480045IND0_0186", "480045IND0_0791", "480046_1", "480047A_1", "480049A_1", "480077_1", "480077A_1", "480243IND0_0583", "480269_1", "480287B_1", "480289_1", "480290A_1", "480293_1", "480293A_1", "480296A_1", "480297A_1", "480298A_1", "480303A_1", "480304_1", "480305A_1", "480307A_1", "480311A_1", "480424B_1", "480710_1", "481141_1", "485466B_1", "485468B_1", "485469B_1", "485470B_1", "485470IND0_1190", "485487C_1", "485491C_1", "485516_1"]
copy_folder = r"D:\FloodChange\Outputs\redo\\"

for i in indexes_to_copy:
    fn = glob.glob(f'{input_folder}/{i}.*')[0]
    shutil.copy(fn, os.path.join(copy_folder, os.path.basename(fn)))

In [None]:
def extract_bounded_area(image, bbox):
    """
    Extracts the bounded area from the image defined by the bounding box.

    Parameters:
        image_path (str): Path to the image file.
        bbox (tuple): Bounding box coordinates in the format (x, y, width, height),
                      normalized by the total image width and height.

    Returns:
        cropped_image (PIL.Image): Cropped region of the image.
    """
    # image = Image.fromarray(image)

    # Get image width and height
    image_width, image_height = image.size
    
    # Convert normalized bounding box to absolute coordinates
    x, y, width, height = bbox
    x_abs = int(x * image_width)
    y_abs = int(y * image_height)
    width_abs = int(width * image_width)
    height_abs = int(height * image_height)
    
    # Define bounding box region
    bbox_region = (x_abs, y_abs, width_abs, height_abs)

    # Crop the image
    cropped_image = image.crop(bbox_region)
    
    return cropped_image

for i, result in enumerate(results):

    classes = result.boxes.data.numpy()[:, -1]
    conf    = result.boxes.data.numpy()[:, -2]

    slice = np.logical_and(classes==0, conf > 0.92)

    # GOTTA FIND CORRECT FILE BC RESIZED WERE SAVED WITH PNG EXTENSION
    basen = os.path.basename(result.path)[:-4]
    in_fn = glob.glob(os.path.join(input_folder,  basen + '*'))[0]
    key = findKey(basen)

    image = Image.open(in_fn)

    for i in np.where(slice)[0]: 
        
        # GET TILE DATA
        bbox = result.boxes.xyxyn.numpy()[i]
        data = extract_bounded_area(image, bbox)

        # GET ID FROM TILE
        text = pytesseract.image_to_string(data, config='--psm 12 --oem 3') # -c tessedit_char_whitelist=0123456789
        word = find_word_with_key(text, key, threshold=80, verbose=False)
        


        # if len(ocr_text) == 0:
        #     continue
            
        # text = find_word_with_key(ocr_text, key, verbose=verbose)

        # plt.imshow(test)
        # plt.show()
    # print(result.boxes.xyxyn.numpy()[slice])
    # error

48023300104
4802330005
48023300204
4802330015
4802870250
4802870050
4802870300
4802870275
4802870225
4802870175
4802870400
4802870150
4302870550
4802870200
4802870325
4802870350
1802870
4802870075
4802870425
4802870100
4802870375
4802870450
None
No ID Found!
None
No ID Found!
None
No ID Found!
4802870475
4802870460
4802870470
4802870035
4802870055
4802870045
480287792530
4802870370
4802870160
4802870015
4802879305
4802870380
4802870095
48028701450


KeyboardInterrupt: 

In [None]:
from DataUtils import *
from IndexUtils import *
from FindGrid import *

In [None]:
image_fn = r"C:\Users\fhacesga\OneDrive - University Of Houston\AAA_RECTDNN\data\AAA_Images\TileIndicesStore\480304IND0_1077.jpg"

outputs, model = findTiles(image_fn, model=model)


image 1/1 C:\Users\fhacesga\OneDrive - University Of Houston\AAA_RECTDNN\data\AAA_Images\TileIndicesStore\480304IND0_1077.jpg: 576x1024 2 tiles, 1 county, 535.3ms
Speed: 9.0ms preprocess, 535.3ms inference, 1.0ms postprocess per image at shape (1, 3, 576, 1024)


In [None]:
outputs['4803040001']

{'bbox': array([   0.046571,    0.058673,     0.37079,     0.50163], dtype=float32),
 'data': <PIL.Image.Image image mode=L size=704x540>}

In [None]:
outputs['4803040001']['bbox']

array([   0.046571,    0.058673,     0.37079,     0.50163], dtype=float32)