# Data Loading

Load the raw GBIF parquet (~314M records) and extract India's subset (~200K records). Results are cached to `_cache/` for use by subsequent notebooks.

In [1]:
import sys; sys.path.insert(0, ".")
from _shared import *
import dask.dataframe as dd
# setup_plotting() is provided by the star-import from _shared
# (presumably applies the project's plotting defaults -- confirm in _shared).
setup_plotting()

# Ensure the cache directory exists before any later cell writes into it.
# Assumes CACHE_DIR is a pathlib.Path; parents=True also creates missing
# intermediate directories, so a nested cache location does not raise
# FileNotFoundError, and exist_ok=True keeps re-runs harmless.
CACHE_DIR.mkdir(parents=True, exist_ok=True)

In [2]:
# Global scan: read only the countrycode column and tally records per country.
# Note: value_counts() skips missing values by default, so rows without a
# countrycode are not part of these counts (nor of the total derived below).
countrycode_col = dd.read_parquet(PARQUET_PATH, columns=["countrycode"])["countrycode"]
per_country = countrycode_col.value_counts().compute()
global_country_counts = per_country.sort_values(ascending=False).rename("n_records")

total_global = global_country_counts.sum()
india_n = global_country_counts.get("IN", 0)
# Rank = 1 + number of countries with strictly more records than India.
india_rank = (global_country_counts > india_n).sum() + 1
n_countries = len(global_country_counts)

print(f"Total global records: {total_global:,}")
print(f"Number of countries: {n_countries}")
print(f"\nIndia (IN): {india_n:,} records \u2014 rank {india_rank} of {n_countries} countries")
print(f"India's share: {india_n / total_global:.4%}")
print(f"\nTop 10 countries:")
top10 = global_country_counts.head(10)
for position, (code, count) in enumerate(zip(top10.index, top10), start=1):
    print(f"  {position:>2}. {code}: {count:,}")

Total global records: 314,644,916
Number of countries: 249

India (IN): 200,175 records — rank 36 of 249 countries
India's share: 0.0636%

Top 10 countries:
   1. FR: 71,030,169
   2. SE: 32,864,971
   3. GB: 27,719,916
   4. US: 20,265,969
   5. AU: 19,913,966
   6. NL: 18,028,571
   7. DE: 15,104,474
   8. CH: 15,031,124
   9. DK: 14,338,390
  10. BE: 12,212,275


In [3]:
# Load the full India subset.
# Only the columns listed in INDIA_COLS are read from the raw parquet.
INDIA_COLS = [
    "countrycode", "stateprovince", "basisofrecord",
    "year", "month",
    "publishingorgkey", "datasetkey", "institutioncode",
    "species", "specieskey",
    "decimallatitude", "decimallongitude",
    "coordinateuncertaintyinmeters",
    "family", "genus", "taxonrank",
]

# Keep only rows whose countrycode is exactly 'IN', then materialise the
# result as an in-memory pandas DataFrame.
india_df = (
    dd.read_parquet(PARQUET_PATH, columns=INDIA_COLS)
    .query("countrycode == 'IN'")
    .compute()
)

# Sanity check: the subset must contain exactly as many rows as the global
# scan counted for 'IN' (india_n, computed in the previous cell). A mismatch
# would otherwise be cached silently and propagate to downstream notebooks.
assert len(india_df) == india_n, (
    f"India subset has {len(india_df):,} rows but the global scan counted {india_n:,}"
)

print(f"India records loaded: {len(india_df):,}")
india_df.head(3)

India records loaded: 200,175


Unnamed: 0,countrycode,stateprovince,basisofrecord,year,month,publishingorgkey,datasetkey,institutioncode,species,specieskey,decimallatitude,decimallongitude,coordinateuncertaintyinmeters,family,genus,taxonrank
3244,IN,Delhi,PRESERVED_SPECIMEN,1957.0,10.0,814cdfb5-d4f8-4453-815f-ea5df98e76bf,e05fd4c2-4d77-4c50-880e-1b22096af300,ASU,Ammi majus,3034182,28.584252,77.161639,500.0,Apiaceae,Ammi,SPECIES
11105,IN,,PRESERVED_SPECIMEN,,,814cdfb5-d4f8-4453-815f-ea5df98e76bf,e05fd4c2-4d77-4c50-880e-1b22096af300,ASU,Gamochaeta pensylvanica,3103082,28.666,77.1,301.0,Asteraceae,Gamochaeta,SPECIES
16537,IN,Delhi,PRESERVED_SPECIMEN,1970.0,11.0,814cdfb5-d4f8-4453-815f-ea5df98e76bf,e05fd4c2-4d77-4c50-880e-1b22096af300,ASU,Foeniculum vulgare,3034922,28.58642,77.16256,600.0,Apiaceae,Foeniculum,SPECIES


In [4]:
# Write the computed artifacts into CACHE_DIR for use by other notebooks.
india_cache_path = CACHE_DIR / "india_df.parquet"
counts_cache_path = CACHE_DIR / "global_country_counts.parquet"
metadata_path = CACHE_DIR / "metadata.json"

india_df.to_parquet(india_cache_path)
# The counts are a Series; convert to a one-column frame before writing parquet.
global_country_counts.to_frame().to_parquet(counts_cache_path)

# Scalar metadata is stored as JSON via a single-entry Series.
scalar_metadata = pd.Series({"total_global": total_global})
scalar_metadata.to_json(metadata_path)

# Report what was written.
for message in (
    f"Cached india_df: {len(india_df):,} rows",
    f"Cached global_country_counts: {len(global_country_counts)} countries",
    f"Cached total_global: {total_global:,}",
):
    print(message)

Cached india_df: 200,175 rows
Cached global_country_counts: 249 countries
Cached total_global: 314,644,916
