In [2]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from joblib import Parallel, delayed
from tqdm import tqdm

def get_geoid(lat, lon, counties):
    # Create a Point object from latitude and longitude
    point = Point(lon, lat)
    
    # Ensure the counties shapefile is in the correct projection (usually WGS84, EPSG:4326)
    counties = counties.to_crs(epsg=4326)
    
    # Use spatial operations to find the county containing the point
    matching_county = counties[counties.contains(point)]
    
    if not matching_county.empty:
        # Extract the GEOID (replace 'GEOID' with the correct column name if different)
        geoid = matching_county.iloc[0]['GEOID']
        return geoid
    
    return None  # Return None if no match found

# Function to process each row and extract GEOID
def process_row(row, counties):
    lat, lon = row['lat'], row['lon']
    geoid = get_geoid(lat, lon, counties)
    return geoid

def add_geoid_column_parallel(test_csv, counties_shapefile, num_jobs=-1):
    # Read the counties shapefile
    counties = gpd.read_file(counties_shapefile)
    
    # Create a tqdm-enabled list for progress tracking
    rows = list(test_csv.iterrows())
    
    # Parallelize the row processing using joblib, wrapping it in tqdm
    geoid_results = Parallel(n_jobs=num_jobs)(
        delayed(process_row)(row, counties) for _, row in tqdm(rows, desc="Processing rows", total=len(rows))
    )
    
    # Add the GEOID as a new column in the DataFrame
    test_csv['GEOID'] = geoid_results
    
    return test_csv


In [19]:
# Load your test CSV into a DataFrame
test_csv = pd.read_csv("/data/cher/Sat2Habitat/data/crisp-data-split/test.csv")

# Path to the shapefile containing county polygons
counties_shapefile = '/data/cher/Sat2Habitat/data/tl_2023_us_county.zip!tl_2023_us_county'

# Add county and state columns
updated_test_csv = add_geoid_column_parallel(test_csv, counties_shapefile)

# Save the updated DataFrame (optional)
updated_test_csv.to_csv('updated_test_csv.csv', index=False)

# View the updated DataFrame with county and state
print(updated_test_csv.head())

Processing rows:   0%|          | 240/66517 [00:59<5:55:40,  3.11it/s]

KeyboardInterrupt: 

In [3]:
gbif_path = "/data/cher/Sat2Habitat/data/occurrence.txt"
txt = pd.read_csv(gbif_path, sep="\t", on_bad_lines='skip')

  txt = pd.read_csv(gbif_path, sep="\t", on_bad_lines='skip')


KeyboardInterrupt: 

In [4]:
test = pd.read_csv("/data/cher/Sat2Habitat/data/crisp-data-split/test.csv")

In [8]:
test[~test['level2Gid'].isna()]

Unnamed: 0,key,species,occurrenceID,level2Gid,lat,lon,habitat,habitat_wiki,distribution and habitat_wiki,description_wiki,ecology_wiki,distribution_wiki,header_wiki
0,178260,Danthonia spicata,5002ff11-2f9f-4882-9d69-6d2c18ea6e02,USA.33.3_1,40.861100,-73.875800,"Lightly shaded, mesic, thin soil on rock outc...",,,,,,Danthonia spicata is a species of grass known ...
1,328494,Fraxinus pennsylvanica,5fd9eca5-ec03-4f8b-8085-9fa29aa58f41,USA.50.25_1,43.186900,-90.291400,"Edge of woods at gravel road and fields, upper...",,,Fraxinus pennsylvanica is a medium-sized decid...,It is the most widely distributed of all the A...,,
2,116593,Campylium protensum,87b042a4-244b-434d-a139-05f0afddf0dd,USA.23.44_1,45.494700,-83.972800,wet sand swales on lake shore,,,,,,Campylium protensum is a species of moss belon...
3,206300,Cladonia petrophila,0685c77a-b558-43f0-974d-f6f10ed62838,USA.34.44_1,35.378170,-82.868930,"Mature northern hardwood (Amelanchier, Betula ...",,,,,,Cladonia is a large genus of lichens in the fa...
4,320356,Cladophora sakaii,e7fd06b9-c94e-4099-aef6-1a31a1a127fe,USA.48.5_1,48.368122,-124.624960,Growing on rocks in shallow tidepools; middle ...,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
66513,170707,Cenchrus alopecuroides,f22feeb1-de0b-4ae6-9ede-674380874ba1,USA.33.61_1,41.112627,-73.866214,15 plants growing in large clumps along upper ...,,,,,,Cenchrus is a widespread genus of plants in th...
66514,100103,Pseudosagedia cestrensis,d6fbec8c-3465-4c15-8fb2-b2cb7863b740,USA.19.24_1,30.296100,-91.116700,open woodland,,,,,,Pseudosagedia is a genus of corticolous (bark-...
66515,120145,Bathelium carolinianum,ac968faa-3b85-41cf-ac44-e00657746d9d,USA.25.77_1,31.460929,-88.766928,"Xeric sandhill with pine (Pinus palustris, P. ...",,,Bathelium carolinianum features a greenish-bro...,,,
66516,228940,Leptogium isidiosellum,f8294d0e-3139-4a87-ae8c-be530153780d,USA.34.22_1,35.085300,-83.616900,"Riparian forest (Betula, Tsuga, Acer) with Rho...",,,,,,


In [13]:
len(counties['GEOID'].unique())

3235

In [5]:
# only keep images in the test set that exist
import pandas as pd
test = pd.read_csv("/data/cher/Sat2Habitat/data/crisp-data-split/test_imagery.csv")

In [9]:
im_dir = "/data/cher/Sat2Habitat/data/crisp-imagery/bing_test"

# TODO: Remove all image files that are not in the test set
file_paths = [f"/data/cher/Sat2Habitat/data/crisp-imagery/bing_test/{i}.jpg" for i in test['key']]


    

In [18]:
import os

# Directory containing images
im_dir = "/data/cher/Sat2Habitat/data/crisp-imagery/bing_test"

# List of image files that are part of the test set (based on test['key'])
test_keys = test['key'].tolist()  # Assuming 'test' is a DataFrame and 'key' contains the image identifiers

# List all files in the image directory
all_files = os.listdir(im_dir)

# Iterate over the files and remove those not in the test set
for file in all_files:
    # Extract the base name (assuming the file name format is like '123.jpg')
    base_name = os.path.splitext(file)[0]  # Remove the extension
    if int(base_name) not in test_keys:
        # Full path of the file to delete
        file_path = os.path.join(im_dir, file)
        print(f"Removing file: {file_path}")
        os.remove(file_path)  # Remove the file


Removing file: /data/cher/Sat2Habitat/data/crisp-imagery/bing_test/18128.jpg
Removing file: /data/cher/Sat2Habitat/data/crisp-imagery/bing_test/112433.jpg
Removing file: /data/cher/Sat2Habitat/data/crisp-imagery/bing_test/18865.jpg
Removing file: /data/cher/Sat2Habitat/data/crisp-imagery/bing_test/17905.jpg
Removing file: /data/cher/Sat2Habitat/data/crisp-imagery/bing_test/11699.jpg
Removing file: /data/cher/Sat2Habitat/data/crisp-imagery/bing_test/186382.jpg
Removing file: /data/cher/Sat2Habitat/data/crisp-imagery/bing_test/43581.jpg
Removing file: /data/cher/Sat2Habitat/data/crisp-imagery/bing_test/312504.jpg
Removing file: /data/cher/Sat2Habitat/data/crisp-imagery/bing_test/211494.jpg
Removing file: /data/cher/Sat2Habitat/data/crisp-imagery/bing_test/206622.jpg
Removing file: /data/cher/Sat2Habitat/data/crisp-imagery/bing_test/21828.jpg
Removing file: /data/cher/Sat2Habitat/data/crisp-imagery/bing_test/17819.jpg
Removing file: /data/cher/Sat2Habitat/data/crisp-imagery/bing_test/2346

In [22]:
test[test['key'] == 274251]

Unnamed: 0,key,lon,lat
36153,274251,-103.65,30.35


In [17]:
int(base_name) in test_keys

True