In [11]:
import pandas as pd
import numpy as np
from osgeo import gdal
import sys
import os
sys.path.append(os.path.join('..'))
from counting_boats.boat_utils.spatial_helpers import get_array_from_tif, use_udm_2



In [12]:
# Machine-specific input locations used by the cells below.
# NOTE(review): these are absolute Windows paths; adjust them when running
# the notebook on another machine.

# Directory that is listed for Usable Data Mask (UDM) files.
UDM_DIR = "U:\\Research\\Projects\\sef\\livingplayingmb\\Boat Detection TMBF\\UDM"
# CSV of boat detections that is read and then filtered against the UDMs.
DETECTION_FILE = "C:\\ML_Software\\All_Results\\boat_detections.csv"
# GeoJSON file; only referenced by the commented-out polygon filter cell.
GEOJSON = "C:\\ML_Software\\data\\moreton-geojson-format.geojson"

None


In [None]:
# Join the results of every run folder under DIR into three combined CSVs
# (boat_detections.csv, coverage.csv, orders.csv) written into DIR itself.
DIR = "C:\\ML_Software\\All_Results"


def _read_dated_csv(csv_path):
    """Read a CSV and parse its 'date' column into datetimes.

    The same day-first parsing is applied to every file so that the dates of
    detections, coverage and orders can be compared with each other, and so
    that they round-trip with the "dd/mm/yyyy" format written below.
    """
    frame = pd.read_csv(csv_path)
    frame['date'] = pd.to_datetime(frame['date'], dayfirst=True)
    return frame


# get all folders in the folder
folders = [f for f in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, f))]
print(folders)

# Per-folder frames are collected and concatenated once after the loop rather
# than growing a DataFrame on every iteration.
detection_parts = []
coverage_parts = []
order_parts = []
# Order dates accepted so far; used to spot orders that were already merged.
seen_order_dates = set()

for folder in folders:
    print(folder)
    if folder == "UDM":
        continue
    path = os.path.join(DIR, folder)
    new_detections = _read_dated_csv(os.path.join(path, "boat_detections.csv"))
    new_coverage = _read_dated_csv(os.path.join(path, "coverage.csv"))
    new_orders = _read_dated_csv(os.path.join(path, "orders.csv"))

    # Important: check if any rows are duplicates, i.e. new_orders contains an
    # order (identified by its date) that an earlier folder already provided.
    duplicate_dates = new_orders[new_orders['date'].isin(seen_order_dates)]
    if not duplicate_dates.empty:
        print(f"Duplicate orders found in {folder}")
        print(duplicate_dates['date'])
        repeated = duplicate_dates['date']
        # Remove the duplicate orders together with the detections and
        # coverage rows recorded for the same dates.
        new_orders = new_orders[~new_orders['date'].isin(repeated)]
        new_detections = new_detections[~new_detections['date'].isin(repeated)]
        new_coverage = new_coverage[~new_coverage['date'].isin(repeated)]

    detection_parts.append(new_detections)
    coverage_parts.append(new_coverage)
    order_parts.append(new_orders)
    seen_order_dates.update(new_orders['date'])

if order_parts:
    # sort the dataframes by date
    detections = pd.concat(detection_parts, ignore_index=True).sort_values(by='date')
    coverage = pd.concat(coverage_parts, ignore_index=True).sort_values(by='date')
    orders = pd.concat(order_parts, ignore_index=True).sort_values(by='date')

    # change date back to "dd/mm/yyyy"
    detections['date'] = detections['date'].dt.strftime('%d/%m/%Y')
    coverage['date'] = coverage['date'].dt.strftime('%d/%m/%Y')
    orders['date'] = orders['date'].dt.strftime('%d/%m/%Y')

    # save the dataframes
    detections.to_csv(os.path.join(DIR, "boat_detections.csv"), index=False)
    coverage.to_csv(os.path.join(DIR, "coverage.csv"), index=False)
    orders.to_csv(os.path.join(DIR, "orders.csv"), index=False)
else:
    # Nothing to merge: leave empty frames behind instead of failing on the
    # missing 'date' column, and do not overwrite any existing combined CSVs.
    detections = pd.DataFrame()
    coverage = pd.DataFrame()
    orders = pd.DataFrame()
    print(f"No result folders found in {DIR}; nothing was written")



In [8]:
# Removes points outside the geojson

# import json
# from shapely.geometry import shape, Point

# with open(GEOJSON) as f:
#     gj = json.load(f)

# polygons = []
# for feature in gj['features']:
#     polygon = shape(feature['geometry'])
#     polygons.append(polygon)

# ids_to_remove = []
# detections = pd.read_csv(DETECTION_FILE)

# for i, row in detections.iterrows():
#     point = Point(row['longitude'], row['latitude'])
#     for j, polygon in enumerate(polygons):
#         if polygon.contains(point):
#             ids_to_remove.append(i)
#             break
#     if i % 1000 == 0:
#         print(f"Processed {i} detections", end="\r")
    
# print(f"Removing {len(ids_to_remove)} detections")




Removing 1049407 detectionss


In [11]:
# ids_to_keep = np.array(ids_to_remove) # actually we want to keep these ones

# detections_keep = detections.iloc[ids_to_keep]

# print(f"Writing {len(detections_keep)}/{len(detections)} detections to {DETECTION_FILE}")

# detections_keep.to_csv(DETECTION_FILE, index=False)


Writing 1049407/1060870 detections to C:\ML_Software\All_Results\boat_detections.csv


For each Usable Data Mask (UDM), we want to:
- Check whether it is a udm_2 file; if not, skip it for now
- Grab the cloud mask
- Grab the detections for that day
- Filter out detections that fall inside the cloud mask, since these are treated as false positives

In [14]:
from tqdm import tqdm

# Presumably the band numbers of the cloud and haze layers in a udm_2 file
# — TODO confirm. Neither constant is referenced by the code visible in this
# notebook.
CLOUD_BAND = 6
HAZE_BAND = 4
# Pixel size that process_udm divides coordinate offsets by to obtain pixel
# row/column indices (presumably metres per pixel — TODO confirm).
RESOLUTION = 3


def process_udm(udm_data, day_detections, day, resolution=None):
    """
    Return the index labels of the detections for ``day`` that land on a mask
    pixel whose value is 1.

    Parameters
    ----------
    udm_data : tuple
        ``(mask, top_x, top_y)``: a 2-D array and the x / y coordinate of its
        top-left origin, expressed in the same coordinates as the
        ``epsg32756_x`` / ``epsg32756_y`` columns.
    day_detections : pandas.DataFrame
        Detections with ``date``, ``epsg32756_x`` and ``epsg32756_y`` columns.
        Rows whose ``date`` does not equal ``day`` are ignored.
    day
        Value compared against the ``date`` column with ``==``.
    resolution : float or (float, float), optional
        Pixel size in coordinate units. A pair is read as (x size, y size);
        signs are ignored. Defaults to the module-level ``RESOLUTION``.

    Returns
    -------
    pandas.Index
        Index labels (not positions) of the matching rows of
        ``day_detections``. Detections outside the mask extent, or with
        non-finite coordinates, are left out.

    Raises
    ------
    ValueError
        If a pixel size of zero is given.
    """
    if resolution is None:
        resolution = RESOLUTION
    if np.isscalar(resolution):
        res_x = res_y = abs(float(resolution))
    else:
        res_x, res_y = (abs(float(value)) for value in resolution)
    if res_x == 0 or res_y == 0:
        raise ValueError("resolution must be non-zero")

    day_detections = day_detections[day_detections["date"] == day]

    mask, top_x, top_y = udm_data
    n_rows, n_cols = mask.shape

    xs = day_detections["epsg32756_x"].to_numpy(dtype=float)
    ys = day_detections["epsg32756_y"].to_numpy(dtype=float)

    # Rows grow downwards from top_y, columns grow rightwards from top_x.
    # NOTE(review): rounding treats (top_x, top_y) as the centre of pixel
    # (0, 0); if it is the outer corner of the raster, floor would be the
    # matching conversion — confirm against how the masks are georeferenced.
    rows = np.round((top_y - ys) / res_y)
    cols = np.round((xs - top_x) / res_x)

    # Bounds are checked on the float values so that NaN / inf coordinates are
    # dropped here instead of failing in the integer cast below.
    within_bounds = (
        np.isfinite(rows)
        & np.isfinite(cols)
        & (rows >= 0)
        & (rows < n_rows)
        & (cols >= 0)
        & (cols < n_cols)
    )

    rows = rows[within_bounds].astype(int)
    cols = cols[within_bounds].astype(int)

    on_clear_pixel = mask[rows, cols] == 1

    return day_detections.index[within_bounds][on_clear_pixel]


# Filter the detections against every UDM file and write the detections that
# land on a clear mask pixel to "<DETECTION_FILE>.filtered.csv".
detections = pd.read_csv(DETECTION_FILE)
detections["date"] = pd.to_datetime(detections["date"], dayfirst=True)
print(f"Loaded {len(detections)} detections")

all_udm = [udm for udm in os.listdir(UDM_DIR) if udm.endswith(".tif")]

print(f"Preparing to process {len(all_udm)} UDM files")

# Resume from a previous run only when both checkpoint files are present;
# otherwise start from scratch.
# NOTE(review): the checkpoint is not tied to DETECTION_FILE. The stored ids
# are row labels of the detections that were loaded when the checkpoint was
# written, so delete completed.npy / valid_ids.npy after regenerating the
# detections file.
if os.path.exists("completed.npy") and os.path.exists("valid_ids.npy"):
    completed = list(np.load("completed.npy"))
    all_valid_ids = list(np.load("valid_ids.npy"))
else:
    completed = []
    all_valid_ids = []

already_done = set(completed)

for udm in tqdm(all_udm):
    if udm in already_done:
        continue
    udm_path = os.path.join(UDM_DIR, udm)
    # NOTE(review): files are selected by the ".tif" suffix but are read with
    # np.load and accessed by key like an .npz archive — confirm the on-disk
    # format of the UDM files.
    with np.load(udm_path) as data:
        mask = data["mask"]
        minx = data["minx"].item()
        maxy = data["maxy"].item()
    date = udm.split("_")[1]  # date is in the format "YYYYMMDD"
    date = f"{date[:4]}-{date[4:6]}-{date[6:]}"
    valid_ids = process_udm((mask, minx, maxy), detections[detections['date'] == date], date)
    all_valid_ids.extend(valid_ids)
    completed.append(udm)
    already_done.add(udm)
    # Checkpoint after every file so an interrupted run can be resumed.
    np.save("completed.npy", completed)
    np.save("valid_ids.npy", all_valid_ids)

# Several UDM files can share a date, so the same detection may have been
# collected more than once: keep each id once, in first-seen order. The
# explicit integer dtype keeps an empty result usable as an indexer.
filtered_detection_ids = pd.unique(np.array(all_valid_ids, dtype=np.int64))
# process_udm returns index labels, so select by label rather than position.
filtered_detections = detections.loc[filtered_detection_ids]
filtered_detections.to_csv(DETECTION_FILE + ".filtered.csv", index=False)

  detections["date"] = pd.to_datetime(detections["date"], dayfirst=True)


Loaded 1086201 detections
Preparing to process 2378 UDM files


100%|██████████| 2378/2378 [00:00<00:00, 45797.50it/s]
