In [248]:
import re

import pandas as pd

In [249]:
# Raw export header -> snake_case column name used in the rest of the notebook.
column_map = {
    'Sr. No': 'pond_serial_no',
    'Date of data collection': 'pond_data_date',
    'Pond ID': 'pond_id',
    'Farmer': 'farmer',
    'Enrollment mechanism': 'enrollment_mechanism',
    'Culture change': 'culture_change',
    'Existing practices': 'existing_practices',
    'Notes (existing practices)': 'notes_existing_practices',
    'Fertilizers used': 'fertilizers_used',
    'Fish source (e.g. hatchery name)': 'fish_source',
    'Notes': 'notes',
    'Location': 'location',
    'Village': 'village',
    'Added by': 'added_by',
    'Property area in acres': 'property_area_acres',
    'Pond area in acres': 'pond_area_acres',
    'Depth in meters': 'pond_depth_meters',
    # Prefer the treatment group recorded with the measurements, since that
    # hopefully reflects historical status.
    'Status': 'treatment_group',
    'Measurements': 'measurements',
    'Equipment': 'equipment',
    'Feed type': 'feed_type',
    'Feed source': 'feed_source',
    'Feed brand or name': 'feed_brand',
}

In [250]:
# Load the raw pond export, parsing the collection-date column as datetimes.
ponds_raw = pd.read_csv(
    "../data/raw/ara_exports/2024_11_27/ponds.csv",
    parse_dates=['Date of data collection'],
)
# `ponds` starts out as a second name for the same frame (no copy is made here).
ponds = ponds_raw

In [251]:
ponds = ponds.rename(columns=column_map)
# Every column must now carry one of the mapped names; a failure means the
# export contains a header that column_map does not cover.
assert set(ponds.columns).issubset(column_map.values())

In [252]:
# Number of rows whose pond_id occurs more than once (keep=False marks every occurrence).
len(ponds[ponds['pond_id'].duplicated(keep=False)])

0

No duplicate pond IDs

In [253]:
# Standardise the casing, then repair the "N0" (digit zero) typo for "No".
ponds['culture_change'] = (
    ponds['culture_change']
    .str.capitalize()
    .str.replace('N0', 'No')
)

In [254]:
# Normalise the free-text fish source: unify casing, collapse plural and
# misspelt variants of "farmer"/"nursery", and drop the word "pond".
#
# regex=False is passed explicitly so every pattern is matched literally on any
# pandas version. In particular 'pond.' means "pond" followed by a full stop;
# read as a regular expression the '.' would match any character.
#
# NOTE(review): the patterns are lower-case and run after capitalize(), so a
# match at the very start of a value (e.g. "Nursary ...") is left untouched.
# Confirm whether that is intended before relying on this column.
ponds['fish_source'] = (
    ponds['fish_source']
    .str.capitalize()
    .str.replace('farmers', 'farmer', regex=False)
    .str.replace('nurseries', 'nursery', regex=False)
    .str.replace('nursary', 'nursery', regex=False)
    .str.replace('nursury', 'nursery', regex=False)
    .str.replace('pond.', 'pond', regex=False)
    .str.replace('ponds', 'pond', regex=False)
    .str.replace('pond', '', regex=False)
)
# ponds['fish_source'].value_counts()

TODO: Clean the `fish_source` values further if this column is ever needed for analysis.

Now parse location data

In [255]:
# Report how many ponds have no location value at all.
print(f"{ponds['location'].isna().sum()} / {len(ponds)} ponds without locations")

12 / 240 ponds without locations


In [256]:
def dms_to_decimal(dms_string):
    """
    Convert a degrees/minutes/seconds coordinate string to decimal degrees.

    The string must start with ``<deg>°<min>'<sec>`` (the closing double quote
    after the seconds is optional) followed by a hemisphere letter N, S, E or
    W. Southern and western hemispheres give negative values.

    Raises:
        ValueError: if the string does not start with that pattern.
    """
    parsed = re.match(r"(\d+)°(\d+)'([\d.]+)\"?([NSEW])", dms_string)
    if parsed is None:
        raise ValueError(f"Invalid DMS format: {dms_string}")

    deg, mins, secs, hemisphere = parsed.groups()
    magnitude = float(deg) + float(mins) / 60 + float(secs) / 3600
    # Only the four hemisphere letters can reach this point.
    return -magnitude if hemisphere in ("S", "W") else magnitude


In [257]:
import requests
from urllib.parse import unquote

def get_coords_from_gmaps_bitly(short_url: str, timeout: float = 10.0) -> tuple:
  """
  Follow a shortened Google Maps link and read coordinates from the final URL.

  Two redirect targets are understood:

  * ``.../place/<DMS>+<DMS>/...`` - the path segment is percent-decoded twice,
    split on ``+`` and each part is converted with ``dms_to_decimal``.
  * ``.../search/<float>,+<float>?...`` - the decimal values are read directly.

  Args:
    short_url: The shortened link to expand.
    timeout: Seconds to wait for the HTTP request. Without a timeout
      ``requests.get`` can block indefinitely on an unresponsive server.

  Returns:
    A tuple of floats in the order they appear in the URL. If the expanded URL
    matches neither form, the string ``"Coordinates not found in URL."`` is
    returned instead, so callers should check ``isinstance(result, tuple)``.

  Raises:
    requests.exceptions.RequestException: if the request fails or times out.
    ValueError: if a coordinate in the URL cannot be parsed.
  """
  # Expand the short URL
  response = requests.get(short_url, allow_redirects=True, timeout=timeout)
  full_url = response.url

  # Test for the exact separator that is split on below, so a URL that merely
  # contains "/place" or "/search" falls through instead of raising IndexError.
  if '/place/' in full_url:
    # The path segment after /place/ holds a DMS pair joined by '+'.
    dms = full_url.split('/place/')[1].split('/')[0]
    dms = unquote(unquote(dms))
    return tuple(dms_to_decimal(coord) for coord in dms.split('+'))
  elif '/search/' in full_url:
    # The path segment after /search/ holds "<float>,+<float>".
    parsed_url = unquote(full_url)
    str_coords = parsed_url.split('/search/')[1].split('?')[0]
    return tuple(float(coord) for coord in str_coords.split(",+"))
  else:
    return "Coordinates not found in URL."

# Smoke test on one short link; this performs a live HTTP request.
short_url = "https://goo.gl/maps/o5xQfRYisfhLqMh27"
print(get_coords_from_gmaps_bitly(short_url))

(16.641444444444446, 81.13280555555555)


In [258]:
def is_coordinate_string(s: str):
  """
  Return True if ``s`` is a string of the form ``"<digits>.<digits>, <digits>.<digits>"``.

  Exactly one whitespace character must follow the comma, and signs are not
  accepted. Non-string input (e.g. NaN for a missing value) returns False
  rather than raising, matching ``is_gmaps_bitly``.
  """
  if not isinstance(s, str):
    return False
  pattern = r"^\d+\.\d+,\s\d+\.\d+$"
  return bool(re.match(pattern, s))

def is_gmaps_bitly(s: str):
  """Return True only for strings that begin with the goo.gl Google Maps short-link prefix."""
  return isinstance(s, str) and s.startswith("https://goo.gl/maps/")

def load_coords(x: str) -> tuple:
  """
  Turn a raw location value into a tuple of floats where possible.

  * Non-string values (e.g. NaN for a missing location) are returned unchanged.
  * Strings accepted by ``is_coordinate_string`` are split on the comma and
    converted to floats.
  * Strings accepted by ``is_gmaps_bitly`` are resolved with
    ``get_coords_from_gmaps_bitly``.
  * Any other string is returned unchanged.
  """
  if not isinstance(x, str):
    return x

  if is_coordinate_string(x):
    # Split on the comma alone: float() ignores surrounding whitespace, so any
    # whitespace separator admitted by the "\s" in is_coordinate_string's
    # pattern parses, not only a plain space.
    return tuple(float(coord) for coord in x.split(","))

  if is_gmaps_bitly(x):
    return get_coords_from_gmaps_bitly(x)

  return x # For now

In [None]:
ponds['location_parsed'] = ponds['location'].apply(load_coords)

# A short link that could not be resolved comes back as something other than a
# tuple, so parsing counts as successful only if every Google Maps row now
# holds a tuple.
not_blocked = (
  ponds
  .loc[ponds['location'].apply(is_gmaps_bitly), 'location_parsed']
  .apply(lambda parsed: isinstance(parsed, tuple))
  .all()
)
print(
  "Successfully parsed locations from Google maps."
  if not_blocked
  else "Parsing Gmaps hyperlinks did not work."
)

Successfully parsed locations from Google maps.


Map the pond locations if parsing was successful.

In [260]:
import geopandas as gpd
import numpy as np
from shapely.geometry import Point
from fwi_predict.geo.viz import create_map

In [261]:
# Load in geopandas dataframe
# Build one geometry per pond. Point takes (x, y), so the two elements of each
# parsed tuple are swapped; anything that is not a tuple becomes a missing
# geometry.
points = [
  Point(loc[1], loc[0]) if isinstance(loc, tuple) else np.nan
  for loc in ponds['location_parsed'].tolist()
]
ponds = gpd.GeoDataFrame(ponds, geometry=points, crs=4326)

# Centroid of all ponds together, computed in the estimated UTM CRS and then
# converted back to EPSG:4326.
center = (
  ponds
  .to_crs(ponds.estimate_utm_crs())
  .dissolve()
  .centroid
  .to_crs(4326)
)

In [262]:
# `center` is a one-row GeoSeries, so `center.y` / `center.x` are one-element
# Series. Extract the scalars explicitly: relying on implicit float(Series)
# conversion is deprecated in pandas and emits a FutureWarning.
m = create_map(center.y.iloc[0], center.x.iloc[0], map_kwargs={'zoom_start': 7})

# Flag ponds whose location came from a Google Maps short link so the map can
# colour them separately from directly entered coordinates.
ponds['gmaps_link'] = ponds['location'].apply(is_gmaps_bitly)

ponds.explore(
  m=m,
  column='gmaps_link',
  tooltip='pond_id',
  style_kwds={'opacity': 0.5, 'fillOpacity': 0.5}
)

  float(coord)
  if math.isnan(float(coord)):
  return [float(x) for x in coords]


The pond locations mostly fall within identifiable ponds. Some of the Google Maps links are slightly off, but they are still close enough to be usable.

It would be useful to overlay these points with the weather data locations.

In [263]:
# Farm ID = pond ID with its trailing digits removed.
ponds['farm_id'] = ponds['pond_id'].str.replace(r"\d+$", "", regex=True)

# Collect the farm IDs recorded at each distinct geometry, then order the
# groups from most to fewest entries.
farms_with_id = ponds.groupby('geometry')['farm_id'].apply(list)
size_order = farms_with_id.apply(len).sort_values(ascending=False).index
farms_with_id = farms_with_id.loc[size_order]
farms_with_id.head(10)

geometry
POINT (81.02833 16.76433)    [WG-SKD, WG-SKD, WG-ASR]
POINT (81.13356 16.61653)    [WG-BKR, WG-NAR, WG-VPS]
POINT (81.02086 16.74069)            [WG-UCU, WG-UCU]
POINT (81.02719 16.72392)            [WG-SPD, WG-SPD]
POINT (81.01494 16.73467)            [WG-BRM, WG-BRM]
POINT (80.1168 14.44769)             [NL-GMS, NL-VAS]
POINT (81.13375 16.61714)            [WG-KKR, WG-VPS]
POINT (81.126 16.65644)              [WG-RRU, WG-RRU]
POINT (80.10511 14.48241)                    [NL-JUB]
POINT (80.0963 14.4735)                      [NL-VEN]
Name: farm_id, dtype: object

A handful of farms appear to share exactly the same location.

We will flag this to FWI, but it is unlikely to be critical until we rely on pond-specific satellite locations. We can also limit this concern to cases where specific point lat/lons are used, and try using the water body mask as well.

In [None]:
# Persist the cleaned pond metadata. to_csv is called without index=False, so
# the row index is written as well.
ponds.to_csv("../data/clean/pond_metadata_clean.csv")