# Data Loading

Load the raw GBIF parquet (~339M records) and extract India's subset (~244K records). The results are cached to `_cache/` for use by subsequent notebooks.

In [12]:
import sys; sys.path.insert(0, ".")
from _shared import *
import dask.dataframe as dd
# Apply the project's plotting configuration (helper presumably provided by
# the star-import from `_shared` — verify there).
setup_plotting()

# Ensure the cache directory exists before anything is written into it.
# parents=True also creates missing intermediate directories, so a nested
# CACHE_DIR does not raise FileNotFoundError on a fresh checkout;
# exist_ok=True keeps the cell idempotent on re-runs.
CACHE_DIR.mkdir(parents=True, exist_ok=True)

In [13]:
# Global scan: request only the `countrycode` column and tally records per
# country. The tally is computed lazily by dask and materialised by .compute().
countrycode_col = dd.read_parquet(PARQUET_PATH, columns=["countrycode"])["countrycode"]
raw_country_counts = countrycode_col.value_counts().compute()
global_country_counts = (
    raw_country_counts
    .sort_values(ascending=False)
    .rename("n_records")
)

n_countries = len(global_country_counts)
total_global = global_country_counts.sum()
india_n = global_country_counts.get("IN", 0)  # falls back to 0 if "IN" is absent
# Rank = 1 + number of countries with strictly more records than India.
india_rank = global_country_counts.gt(india_n).sum() + 1

print(f"Total global records: {total_global:,}")
print(f"Number of countries: {n_countries}")
print(f"\nIndia (IN): {india_n:,} records \u2014 rank {india_rank} of {n_countries} countries")
print(f"India's share: {india_n / total_global:.4%}")
print("\nTop 10 countries:")
for position, (code, count) in enumerate(global_country_counts.head(10).items(), start=1):
    print(f"  {position:>2}. {code}: {count:,}")

Total global records: 338,978,088
Number of countries: 249

India (IN): 244,100 records — rank 36 of 249 countries
India's share: 0.0720%

Top 10 countries:
   1. FR: 75,133,454
   2. SE: 33,495,468
   3. GB: 33,284,163
   4. US: 22,522,238
   5. AU: 21,763,948
   6. NL: 18,260,970
   7. DE: 16,016,735
   8. ES: 14,998,730
   9. CH: 14,767,833
  10. DK: 14,511,771


In [14]:
# Columns to keep for the India subset (order is preserved in the result).
INDIA_COLS = [
    "countrycode",
    "stateprovince",
    "basisofrecord",
    "year",
    "month",
    "publishingorgkey",
    "datasetkey",
    "institutioncode",
    "species",
    "specieskey",
    "decimallatitude",
    "decimallongitude",
    "coordinateuncertaintyinmeters",
    "family",
    "genus",
    "taxonrank",
]

# Build the lazy pipeline step by step, then materialise it as a pandas frame:
# read only the selected columns, keep rows whose countrycode is 'IN'.
all_records_lazy = dd.read_parquet(PARQUET_PATH, columns=INDIA_COLS)
india_lazy = all_records_lazy.query("countrycode == 'IN'")
india_df = india_lazy.compute()

print(f"India records loaded: {len(india_df):,}")
india_df.head(3)

India records loaded: 244,100


Unnamed: 0,countrycode,stateprovince,basisofrecord,year,month,publishingorgkey,datasetkey,institutioncode,species,specieskey,decimallatitude,decimallongitude,coordinateuncertaintyinmeters,family,genus,taxonrank
5630,IN,,PRESERVED_SPECIMEN,1899.0,4.0,605e7170-1123-11d9-8433-b8a03c50a862,064508e2-255e-4d82-9f13-05d73476cc03,UZH:Z,Dendrobium ochreatum,5316374,22.887376,92.743093,1186.0,Orchidaceae,Dendrobium,SPECIES
8554,IN,,PRESERVED_SPECIMEN,1936.0,8.0,605e7170-1123-11d9-8433-b8a03c50a862,064508e2-255e-4d82-9f13-05d73476cc03,UZH:Z,Gentiana nubigena,3839972,30.662852,80.15807,3002.0,Gentianaceae,Gentiana,SPECIES
9295,IN,,PRESERVED_SPECIMEN,1981.0,1.0,605e7170-1123-11d9-8433-b8a03c50a862,064508e2-255e-4d82-9f13-05d73476cc03,UZH:Z,Diplazium polypodioides,7289271,8.751194,77.125,392.0,Athyriaceae,Diplazium,SPECIES


In [15]:
# Persist the results under CACHE_DIR so other notebooks can load them
# instead of re-scanning the raw parquet.
india_cache_path = CACHE_DIR / "india_df.parquet"
counts_cache_path = CACHE_DIR / "global_country_counts.parquet"
metadata_cache_path = CACHE_DIR / "metadata.json"

india_df.to_parquet(india_cache_path)
# The counts are a Series; convert to a one-column frame before writing parquet.
global_country_counts.to_frame().to_parquet(counts_cache_path)

# Scalar metadata goes into a small JSON file, serialised via a pandas Series.
scalar_metadata = pd.Series({"total_global": total_global})
scalar_metadata.to_json(metadata_cache_path)

print(f"Cached india_df: {len(india_df):,} rows")
print(f"Cached global_country_counts: {len(global_country_counts)} countries")
print(f"Cached total_global: {total_global:,}")

Cached india_df: 244,100 rows
Cached global_country_counts: 249 countries
Cached total_global: 338,978,088
