# Given a 1-to-1 image-to-descriptor mapping, find a smaller subset of images that covers the data via patching

In [2]:
import pandas as pd
import math

In [22]:
def calculate_bounding_box(centroid_lon, centroid_lat, zoom_level, patch_size_pixels=256):
    """
    Return the lon/lat extent of a square patch centred on a point.

    The centroid is projected to Web Mercator meters, padded by half of the
    patch's ground size in every direction, and the resulting corners are
    projected back to degrees.

    Args:
        centroid_lon (float): Longitude of the patch centre, in degrees.
        centroid_lat (float): Latitude of the patch centre, in degrees.
        zoom_level (int): Zoom level used to derive the ground resolution.
        patch_size_pixels (int): Side length of the patch in pixels (default 256).

    Returns:
        tuple: (min_lon, min_lat, max_lon, max_lat)
    """
    earth_circumference = 40075017  # meters
    tile_size = 256  # pixels per tile side
    origin_shift = earth_circumference / 2.0

    def meters_to_lon(x_meters):
        # Inverse of the linear longitude projection
        return (x_meters / origin_shift) * 180.0

    def meters_to_lat(y_meters):
        # Inverse of the Mercator latitude projection
        return 180.0 / math.pi * (2 * math.atan(math.exp((y_meters / origin_shift) * math.pi)) - math.pi / 2.0)

    # Ground resolution (meters per pixel) at this zoom, then half the patch side in meters
    meters_per_pixel = earth_circumference / (tile_size * (2 ** zoom_level))
    half_side = patch_size_pixels * meters_per_pixel / 2

    # Project the centroid into Web Mercator meters
    center_x = (centroid_lon * origin_shift) / 180.0
    center_y = math.log(math.tan((90 + centroid_lat) * math.pi / 360.0)) / (math.pi / 180.0)
    center_y = (center_y * origin_shift) / 180.0

    # Pad around the centre and project each edge back to degrees
    west = meters_to_lon(center_x - half_side)
    east = meters_to_lon(center_x + half_side)
    south = meters_to_lat(center_y - half_side)
    north = meters_to_lat(center_y + half_side)

    return west, south, east, north


In [21]:
def get_patch_for_coordinate(lat, lon, min_lon, min_lat, patch_width_lon, patch_height_lat):
    """
    Locate the grid cell that contains a coordinate.

    Cells are counted from the (min_lon, min_lat) corner; each cell spans
    patch_width_lon degrees of longitude and patch_height_lat degrees of
    latitude. No upper bound is applied to the returned indices.

    Returns:
        tuple: (row, col), where row is the latitude-wise index and col is
        the longitude-wise index.
    """
    # Offsets from the grid origin, floor-divided by the cell size
    lon_offset = lon - min_lon
    grid_col = int(lon_offset // patch_width_lon)

    lat_offset = lat - min_lat
    grid_row = int(lat_offset // patch_height_lat)

    return grid_row, grid_col

In [100]:
# Load the training split from disk
data = pd.read_csv(filepath_or_buffer="/data/cher/Sat2Habitat/data/crisp-data-split/train.csv")

In [None]:
def assign_patches(data, random_state=None):
    """
    Greedily group rows under shared images and locate each row in a 3x3 grid.

    Until every row has been handled, a random unassigned row is picked as an
    image centroid, its zoom-18 bounding box is computed with
    ``calculate_bounding_box``, and every unassigned row inside that box is
    assigned to the centroid's image together with the (row, col) grid cell
    returned by ``get_patch_for_coordinate``.

    Args:
        data (pd.DataFrame): Table with 'lon', 'lat' and 'key' columns. It is
            copied, never modified.
        random_state (int | numpy.random.RandomState | None): Seed or generator
            for the centroid sampling. None (the default) leaves the sampling
            unseeded; pass an int to make the assignment reproducible.

    Returns:
        dict: ``{centroid_key: {member_key: (patch_row, patch_col)}}``. A
        sampled row whose bounding box matches no rows at all (e.g. its
        lon/lat is NaN) is skipped and gets no entry.
    """
    import numpy as np  # function-scope import so this cell does not depend on another cell's imports

    grid_size = 3  # every image is divided into a grid_size x grid_size grid

    # Build one generator up front so an int seed is consumed once rather than
    # re-seeding every draw (which would tie each pick to the frame length).
    if random_state is None or isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    patches_dict = {}
    remaining_data = data.copy()

    while not remaining_data.empty:
        # Randomly select a row (centroid) for bounding box calculation
        centroid = remaining_data.sample(1, random_state=rng)
        centroid_lon = centroid['lon'].values[0]
        centroid_lat = centroid['lat'].values[0]
        key = centroid['key'].values[0]

        min_lon, min_lat, max_lon, max_lat = calculate_bounding_box(centroid_lon, centroid_lat, zoom_level=18)

        # Unassigned rows that fall inside the box (edges inclusive)
        filtered_data = remaining_data[
            (remaining_data['lon'] >= min_lon) & (remaining_data['lon'] <= max_lon) &
            (remaining_data['lat'] >= min_lat) & (remaining_data['lat'] <= max_lat)
        ]

        if filtered_data.empty:
            # Nothing matched, not even the centroid itself (NaN coordinates
            # compare False). Drop it; otherwise it stays in the pool and the
            # loop can never terminate.
            remaining_data = remaining_data.drop(centroid.index)
            continue

        # Size of one grid cell in degrees
        patch_width_lon = (max_lon - min_lon) / grid_size
        patch_height_lat = (max_lat - min_lat) / grid_size

        # setdefault keeps an earlier group intact if the same key is sampled again
        assigned = patches_dict.setdefault(key, {})
        for lat, lon, filtered_key in zip(filtered_data['lat'], filtered_data['lon'], filtered_data['key']):
            patch_row, patch_col = get_patch_for_coordinate(
                lat, lon, min_lon, min_lat, patch_width_lon, patch_height_lat
            )
            # A point on the inclusive max edge floor-divides to grid_size;
            # clamp it into the last cell so indices stay within the grid.
            assigned[filtered_key] = (min(patch_row, grid_size - 1), min(patch_col, grid_size - 1))

        # Remove the assigned rows from remaining_data to avoid reassignment
        remaining_data = remaining_data.drop(filtered_data.index)

    return patches_dict


In [None]:
def convert_nested_dict_to_df(patches_dict):
    """
    Flatten a nested patch-assignment dictionary into a DataFrame.

    Args:
        patches_dict (dict): Mapping of the form
            ``{assigned_image: {key: patch}}``.

    Returns:
        pd.DataFrame: One row per (assigned_image, key) pair with the columns
        'assigned_image', 'key' and 'patch'. An empty mapping yields an empty
        frame with the same three columns.
    """
    records = [
        (assigned_image, inner_key, patch)
        for assigned_image, inner_dict in patches_dict.items()
        for inner_key, patch in inner_dict.items()
    ]
    return pd.DataFrame(records, columns=['assigned_image', 'key', 'patch'])

In [None]:
# Assign every training row to an image and to a cell of that image's grid
patches_dict = assign_patches(data)
# Turn the nested assignment mapping into a table for the analysis cells below
df = convert_nested_dict_to_df(patches_dict)

In [None]:
total_rows = df.shape[0]

# Calculate the percentage of data assigned to a patch that is not the middle patch
patch_not_middle = df[df['patch'] != (1, 1)].shape[0] / total_rows * 100

# Calculate the percentage of data assigned to a different image (inner_key != key)
different_image = df[df['key'] != df['assigned_image']].shape[0] / total_rows * 100

# How many images do we actually need? One per distinct assigned image.
# (Counting distinct 'key' values would count the rows themselves, not the images kept.)
unique_image = df['assigned_image'].nunique()
images_necessary = unique_image / total_rows * 100

print(f"Percentage of data assigned to a patch not the middle patch: {patch_not_middle:.2f}%")
print(f"Percentage of data assigned to a different image: {different_image:.2f}%")
print(f"Percentage of images necessary: {images_necessary:.2f}%")

Percentage of data assigned to a patch not the middle patch: 1.07%
Percentage of data assigned to a different image: 96.20%


In [None]:
# Attach the assigned image and patch to each row; the left join keeps every row of `data`
data_w_patches = pd.merge(data, df, how='left', on='key')

Unnamed: 0,key,species,occurrenceID,level2Gid,lat,lon,habitat,habitat_wiki,distribution and habitat_wiki,description_wiki,ecology_wiki,distribution_wiki,header_wiki,assigned_image,patch
0,269738,Trapeliopsis flexuosa,3280ae61-e5ee-4778-a912-a49b9f4247c9,USA.43.5_1,35.613780,-83.972000,"Acidic mixed hardwood (Acer rubrum, Carya, Nys...",,,,,,Trapeliopsis is a genus of lichenized fungi in...,270963,"(1, 1)"
1,225514,Fellhanera bouteillei,cb3e766e-a776-49ad-927c-5d28ddd42ab7,USA.34.44_1,35.620800,-83.116100,Humid Rhododendron thicket around seep,,,,,,Fellhanera bouteillei is a species of leaf-dwe...,203339,"(1, 1)"
2,90561,Sphagnum papillosum,67204cd8-1a9a-47eb-b297-9349cf03d597,USA.15.47_1,41.570000,-86.830000,ombotrophic bog mat,,,,,S. papillosum is widely distributed throughout...,,90536,"(1, 1)"
3,275710,Solanum elaeagnifolium,f42f769f-208c-4a7c-a75f-86bfe958363a,USA.44.253_1,26.797446,-99.015807,Part shade; next to building,,,It is a perennial 10 cm (3.9 in) to 1 m in hei...,It can grow in poor soil with very little wate...,,,275710,"(1, 1)"
4,136417,Astragalus gracilis,84491bc3-9463-4061-aa12-88438b0d8227,USA.28.79_1,41.741378,-103.712313,Plants in grassy meadow in low canyon area; wi...,,,,,,There are at least 174 members of the pea fami...,136417,"(1, 1)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199547,119879,Inoderma byssaceum,650676f4-089a-48fb-bb65-5874d44281e3,USA.24.1_1,46.296958,-93.310086,in mesic hardwood (MHn35) community,,,,,,Inoderma is a genus of lichen-forming fungi in...,119879,"(1, 1)"
199548,259178,Buellia stillingiana,1f5fa29e-f0b1-42fe-8bac-9a8cb628242d,USA.41.8_1,33.195300,-79.522800,pine forest with scattered oaks,,,Genus Buellia consists of lichens that are eit...,,,,255428,"(1, 1)"
199549,131932,Punctelia rudecta,9ea00f75-04f3-4417-b124-243bd677d93c,USA.26.101_1,37.320000,-91.430000,"Oak woods, disturbed area near road",,,The thallus of Punctelia rudecta ranges in col...,"Punctelia rudecta has an annual radial (i.e., ...",,,126065,"(1, 1)"
199550,146867,Astragalus purshii,7993d366-2ddf-40d0-b909-5a9c7b6b10b7,USA.29.8_1,41.130000,-117.740000,Dry hilltop with volcanic tuff in sagebrush hills,,The plant is native to much of western North A...,Astragalus purshii is a small perennial herb f...,,,,145852,"(1, 1)"


In [3]:
# Load the patch-annotated training split from disk; this rebinds `data_w_patches`
data_w_patches = pd.read_csv(filepath_or_buffer="/data/cher/Sat2Habitat/data/crisp-data-split/train_w_patches.csv")

In [9]:
# Take the 'patch' value of the row labelled 0 as a sample to inspect
patch_key = data_w_patches.loc[0, 'patch']

In [11]:
# Split the underscore-delimited patch string into its two integer components
a, b = (int(piece) for piece in patch_key.split('_'))

In [13]:
# Display the second integer parsed from the patch string
b

1