In [4]:
import math

def calculate_zoom_level(min_lon, min_lat, max_lon, max_lat, resolution):
    # Earth's circumference in meters
    earth_circumference = 40075017
    tile_size = 256  # Tile size in pixels
    
    # Bounding box dimensions in degrees
    delta_x = max_lon - min_lon
    delta_y = max_lat - min_lat
    
    # Convert dimensions to meters
    width_m = delta_x * (earth_circumference / 360)
    height_m = delta_y * (earth_circumference / 360)
    
    # Calculate the zoom level based on desired resolution (meters per pixel)
    zoom_level = math.log2(earth_circumference / (tile_size * resolution))
    
    # Round to the nearest integer zoom level
    return int(round(zoom_level))

# Example usage
min_lon, min_lat, max_lon, max_lat = -122.5, 37.5, -121.5, 38.5
desired_resolution = 0.6  # meters per pixel
zoom = calculate_zoom_level(min_lon, min_lat, max_lon, max_lat, desired_resolution)
print(f"Recommended Zoom Level: {zoom}")

Recommended Zoom Level: 18


Zoom 18 is consistent with NAIP imagery.

In [22]:
from gbif_utils import build_geodataframe
from sklearn.model_selection import train_test_split
import pandas as pd

dimension = 256
meters_per_pixel = 0.6
dimension_distance = dimension * meters_per_pixel


### Input file downloaded from gbif -- occurrences
gbif_path = "/data/cher/Sat2Habitat/data/occurrence.txt"
gdf = build_geodataframe(gbif_path)

train_data, test_data = train_test_split(gdf, test_size=0.4, random_state=42)
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)

train_and_val_data = pd.concat([train_data, val_data])

  habitat_info = pd.read_csv(gbif_path, sep="\t", on_bad_lines='skip')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['stateProvince'] = data['stateProvince'].replace({'New jersey' : 'New Jersey',


In [39]:
# Data split for clustering
train_bbox_pt=f'/data/cher/Sat2Habitat/data/cluster/clusters_{dimension}m_{meters_per_pixel}ppixel_centroid.csv'

imagery = pd.read_csv(train_bbox_pt)

train_data, val_data = train_test_split(imagery, test_size=0.25, random_state=42)

In [40]:
train_data

Unnamed: 0,key,lon,lat
41512,41512,-75.70769,40.897867999999995
2561,2561,-73.957901,40.79634899999999
12710,12710,-115.6061,38.62576
51303,51303,-102.640112,36.16196699999999
32081,32081,-152.5,62.67000000000001
...,...,...,...
37194,37194,-74.769054,41.020375
6265,6265,-85.183432,44.866352000000006
54886,54886,-95.307111,38.584807
860,860,-76.593056,36.61833299999999


In [41]:
train_data.to_csv("/data/cher/Sat2Habitat/data/cluster-data-split/train_imagery.csv", index=False)
val_data.to_csv("/data/cher/Sat2Habitat/data/cluster-data-split/val_imagery.csv", index=False)

In [None]:
# New easier idea: (instead of concatenating descriptors or anything like that) -- keep them seperate, just smaller images.

# 1. Patchify images beforehand.
# 2. Save in a folder called "patches"
# 3. Get the patch indices (row2patchid) for each row in train & val dataset.
# 4. Run the model using the patches instead of the whole image.

In [6]:
# Read in training data
train = pd.read_csv("/data/cher/Sat2Habitat/data/crisp-data-split/train.csv")
train_imagery = pd.read_csv("/data/cher/Sat2Habitat/data/cluster-data-split/train_imagery.csv")
val_imagery = pd.read_csv("/data/cher/Sat2Habitat/data/cluster-data-split/val_imagery.csv")

all_imagery = pd.concat([train_imagery, val_imagery])

key = pd.read_csv("/data/cher/Sat2Habitat/data/cluster/clusters_key_256m.csv")

In [7]:
all_imagery

Unnamed: 0,key,lon,lat
0,41512,-75.707690,40.897868
1,2561,-73.957901,40.796349
2,12710,-115.606100,38.625760
3,51303,-102.640112,36.161967
4,32081,-152.500000,62.670000
...,...,...,...
16647,45054,-107.955092,43.606623
16648,52223,-84.506400,31.237200
16649,6720,-120.220000,41.890000
16650,21937,-111.540000,39.940000


In [20]:
set(train['occurrenceID']) - set(key['occurrenceID'])

{'d773bac6-77d0-426f-bbc8-3bcd127e893d',
 '68c79f7e-1b53-4cbf-99a8-e612532b624c',
 '2e6014b8-f16a-4ad7-8ad1-c3275ee18047',
 '898c797a-b35e-48db-9a5e-2e5ea2da7a47',
 '9396038d-1483-481c-9ece-bd4a733dc006',
 '354c30f3-6cf3-4cc7-9863-bfaf6c7f3835',
 '19f5bbb1-e2d3-41c3-85e6-11844beb5234',
 '46b4ef1d-ab8e-40b7-935e-5a42e8ce0cad',
 'a8ca0c58-84b2-4733-b67c-44b073434907',
 '284782c8-e4cc-4f9e-b54a-37e41408606a',
 '57f6ace2-df0f-49a0-b31a-9817ca2efd8b',
 'df5c1a3a-61c5-4eb4-bbd7-e81e1d0a3f92',
 '8bbe4ef6-b2af-4911-a6b8-52fd2c405ece',
 'a5244d88-84f5-43cf-b599-84e2fafa406d',
 '0a306411-b7cb-436e-a562-ea58f413293a',
 'fdabb472-1764-45a4-b907-f50eefcb1d5e',
 '7194bfd9-3f69-4b6a-b539-6afe40afc46b',
 'f686705f-f185-45f4-a77c-eb1a63e9ead0',
 'e994499e-5080-42d6-bad1-cd4cc305ccb5',
 'dbadb287-5182-4bd1-9757-1e3dcb44e204',
 'f8c1c2ae-2a1e-4d42-9356-779eeb646b58',
 '50e04379-c720-433f-bcb3-6d0c5570e033',
 '99ac9777-2acb-464d-9467-470b23de853f',
 '87c8413f-0914-4984-a8f3-ea0308f8ba7a',
 '160fb867-b7be-

In [22]:
set(key['occurrenceID']) - set(train['occurrenceID'])

{'3c5fbd8f-ab5b-4f5e-916a-f5cc58266d7f',
 'cf2c2360-c8df-4311-886b-ca27a7a833f2',
 '4c7fa1fd-083c-4e28-9038-023c64c2bfa1',
 '842e2e4c-4deb-4b24-a6ef-2cf3ed8c6c0c',
 '9538e490-1e2e-44c7-9c59-f6d338ee7005',
 'b6f9c958-b49d-41e6-8e9b-828bb6177a72',
 'ccfa65e5-bac3-403b-84df-08445236b2a6',
 '9feb3854-f5bc-4c16-8be9-0919a9fcc7e1',
 'b2c39572-ff84-4ceb-a68c-a0b8f05a061d',
 '20e7243a-8aef-415d-ab1b-0abb97ae8787',
 'f0f0beef-ab07-476d-9f07-8d3bb9b6f0d8',
 '1cb371df-0a99-4d9a-a118-eea648812857',
 '993a29f3-d46c-4c0e-925c-8dbef690564c',
 '179b1a38-79bd-4a68-acea-630ef509ff7a',
 '70a924a9-61ed-40d3-a435-397ac6ce7c4f',
 '18c17bb9-c286-480c-a47c-9b1b981c6aac',
 'f5a6f96e-5236-4765-8299-29340de644c1',
 '3febfc4f-7440-444c-8127-9edfd8318aa6',
 '42c3ddec-87d4-4a96-8b1b-70515682114d',
 'e6c23f9a-2b49-4665-ac41-d8b842c3eac5',
 '76bbb27a-7659-4444-b83c-babb1d7e23c4',
 '45947c39-ef9c-4c59-a4f3-83bdf30ea7e6',
 '4279610e-196c-4478-84af-3ecc57fedf4a',
 'aacd1df8-33ac-454b-be61-e00013479fcd',
 '9fad6d4a-c248-

In [21]:
set(train['occurrenceID'])

{'1213630e-87df-4e94-bf17-b163f880c1f1',
 '859ecdbb-1633-4c03-b4ed-586e3568464e',
 '05182f98-3ed2-437b-a297-96f6cf36f901',
 'b982a17b-c411-4a06-80ff-8fc2d0fde999',
 '5be71f21-e232-4974-9293-4ab186d53be2',
 '416e6f74-f6d9-4f8a-b7e8-e393a368afc3',
 '3da2bf4d-8e18-4e65-854a-b577df1dd6e8',
 '24a5c3a2-2309-4b15-a5b8-2032e71a1005',
 '19f5bbb1-e2d3-41c3-85e6-11844beb5234',
 'b32cb2a2-bc09-4bf6-8052-59a238097124',
 '74544bbc-8ac9-4af7-be65-493e63f917d7',
 'dc79e3b4-f5c4-4e08-9f0d-600a604dbf90',
 '57f6ace2-df0f-49a0-b31a-9817ca2efd8b',
 'df5c1a3a-61c5-4eb4-bbd7-e81e1d0a3f92',
 '1a926dc0-93cd-48e7-97e7-c7c8cb39a3dd',
 '0bfcda4e-5b5f-4a9f-b501-8751bf0e49d3',
 '0e1e5e9e-cf1d-4f98-9979-db95e42c4f49',
 'b9ee7546-2cc4-406c-ab0b-e22d1cd3dc41',
 'd80ec4a3-dc77-42ee-8cd4-5806212cf0fc',
 '0a306411-b7cb-436e-a562-ea58f413293a',
 'c4856c78-fff2-4f0a-9232-205ea80fe71a',
 '904b462c-15c3-4f4c-8b64-ccf474e8665e',
 '7ed3f0d2-6bd2-4b73-9840-4bbd8115e74d',
 '09332e18-d23e-4989-a484-98908240f6c7',
 'e820dc4e-bdd7-

In [45]:
key

Unnamed: 0,occurrenceID,key
0,c9e4bcda-79c1-45b9-992c-8bdc445c77ac,0
1,c439dd20-8df7-4982-b122-7e1b7bbadb3c,1
2,1e4b7085-940a-44e1-8d2e-b221c4987d88,2
3,1b8623b8-e8f6-49c5-ab34-1fef6bf08087,3
4,fd712e65-fcf3-49d8-869b-97df3746f450,4
...,...,...
266064,77f2ee7f-6d2a-43db-b1fb-380e3e7ac04e,66637
266065,e03f0da6-a744-4d09-97b9-0c404200868e,66659
266066,22251231-575a-4820-8c8e-6cb2250bf6a5,66636
266067,a9845ba5-2f1c-493f-bb15-63a6b0ad9df2,66636


In [2]:
for idx, row in train.head(5).iterrows():
    cnt = 0
    try:
        im_key = key[key["occurrenceID"].str.strip() == row['occurrenceID'].str.strip()]['key'].values[0]
    except:
        cnt+=1
        continue
    im_info = train_imagery[train_imagery['key'] == im_key]
    

    

NameError: name 'train' is not defined

In [None]:
cnt

In [78]:
row['occurrenceID']

'67204cd8-1a9a-47eb-b297-9349cf03d597'

In [75]:
key[key["occurrenceID"] == row['occurrenceID']]['key']

Series([], Name: key, dtype: int64)

In [None]:
import pyproj

# Define the projection (assuming UTM Zone 30N, EPSG:32630, for example)
projection = pyproj.CRS("EPSG:32630")  # Change this based on your projection
wgs84 = pyproj.CRS("EPSG:4326")  # WGS84 lat-lon (assuming input is in lat-lon)

# Transformer to convert lat-lon to UTM
transformer = pyproj.Transformer.from_crs(wgs84, projection, always_xy=True)

# Example: Given habitat coordinate and image centroid
lat, lon = 37.43337799999999, -79.604968  # Habitat coordinates
centroid_lat, centroid_lon = 37.433000, -79.604000  # Centroid coordinates

# Convert lat-lon to projected coordinates (meters)
x, y = transformer.transform(lon, lat)  # Coordinate of habitat in meters
centroid_x, centroid_y = transformer.transform(centroid_lon, centroid_lat)  # Centroid in meters

# Calculate the pixel offset (image resolution is 0.6m per pixel)
pixel_x = (x - centroid_x) / 0.6  # Offset in pixels
pixel_y = (y - centroid_y) / 0.6  # Offset in pixels

# Convert to patch (divide by 85 to get 3x3 grid)
patch_x = int(pixel_x // 85)
patch_y = int(pixel_y // 85)

# Create patch key (for example, patch 5 means row 2, column 2)
patch_key = f"Image_{patch_x + 1}_{patch_y + 1}"

print(f"Patch key: {patch_key}")


In [None]:
# Current thought process is to develop the patch indices prior to computation (strictly say I will be doing X by X patching) but then allow for getting the
# other stuff during the data loading process

# In Data loader:
# 1. Get image
# 2. Patchify image
# 3. Get the patch indices (im2patchid)
# 4. Get the habitat descriptors (patchid2occurrenceid)
# 5. 
# 6. Model

In [None]:
# Code to determine the patch indices related to the underlying training data

In [None]:
gbif_path = "/data/cher/Sat2Habitat/data/occurrence.txt"
species_wiki_path = "/data/cher/Sat2Habitat/data/species_wiki.csv"

occurrences = build_geodataframe(gbif_path)
species_wiki_df = pd.read_csv(species_wiki_path)

# Data cleaning
occurrences = occurrences.sort_values(by='stateProvince').reset_index(drop=True)
occurrences.index.name = 'key'
occurrences.reset_index(inplace=True)
occurrences.rename(columns={'decimalLatitude': 'lat', 'decimalLongitude': 'lon'}, inplace=True)

## Create the training and validation datasets
species_wiki_df.rename(columns={col: f"{col}_wiki" if col != 'species' else col for col in species_wiki_df.columns}, inplace=True)

habitat_info_w_wiki = occurrences.merge(species_wiki_df, on='species', how='left')
habitat_info_w_wiki = habitat_info_w_wiki[all_cols]

Unnamed: 0,occurrenceID,level2Gid,species,habitat,stateProvince,decimalLatitude,decimalLongitude,geometry
0,604cee1e-de47-4bc1-995a-16a184ae23d0,USA.47.62_1,Myelochroa galbina,Pinus rigida dominated forest with sparse mixe...,Virginia,38.371696,-79.573741,POINT (-79.57374 38.3717)
7,607fd920-5ed0-4d87-a5e1-b32ecf24e33f,USA.33.10_1,Cladonia chlorophaea,Acer saccharinum-Fraxinus pennsylvanica swamp,New York,44.623600,-73.427200,POINT (-73.4272 44.6236)
8,6081e4e4-e585-4636-a3ae-60a8db3bb0c5,USA.23.85_1,Brachythecium rutabulum,River bank,Michigan,42.280566,-83.711555,POINT (-83.71156 42.28057)
10,608366a9-4891-47d6-b5d4-8fa76d982193,USA.5.27_1,Salvia mellifera,Growing in coastal sagebrush scrub,California,36.650000,-121.770000,POINT (-121.77 36.65)
12,608e24ec-c54e-4fd0-9de7-b12e81a4880f,USA.4.27_1,Sphagnum lescurii,soil in ditch,Arkansas,34.205485,-92.651656,POINT (-92.65166 34.20548)
...,...,...,...,...,...,...,...,...
942149,6fa14344-afa4-4124-95ef-e215b7eb64af,USA.39.67_1,Pseudosagedia cestrensis,"Humid hemlock (Tsuga) - hardwood (Magnolia, Be...",Pennsylvania,39.875000,-76.394400,POINT (-76.3944 39.875)
942153,6fa75b43-69b4-48ad-ab16-db202823db4b,USA.21.22_1,Ceratodon purpureus,In woods,Maryland,38.336787,-75.847154,POINT (-75.84715 38.33679)
942154,970e30e0-f25e-4431-aa9e-09206a885741,USA.10.64_1,Phycomyces nitens,In marsh,Florida,29.290305,-81.126270,POINT (-81.12627 29.2903)
942155,97112994-d66e-40b5-b279-ea12464e6358,USA.18.67_1,Cetrelia olivetorum,"Mixed hardwood (Acer saccharum, Carya, Liriode...",Kentucky,37.084820,-82.793080,POINT (-82.79308 37.08482)


In [None]:
train_data

Unnamed: 0,key,lon,lat
52322,52322,-83.50030000000001,35.6261
49049,49049,-94.79087,35.63145399999999
45279,45279,-106.759696,44.34829100000001
16828,16828,-116.9203,38.099450000000004
22160,22160,-109.15000000000002,38.43
...,...,...,...
37194,37194,-74.769054,41.020375
6265,6265,-85.183432,44.866352000000006
54886,54886,-95.307111,38.584807
860,860,-76.593056,36.61833299999999
