In [11]:
# Import Libs
import os
import sys
import time
import json
import requests
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Build df_collections: one row per SEDAC collection in CMR, with its granule count.

# Define the CMR URLs and parameters
cmr_url = "https://cmr.earthdata.nasa.gov/search/collections.umm_json"
cmr_granules_url = "https://cmr.earthdata.nasa.gov/search/granules.umm_json"
params = {
    "provider": "SEDAC",
    "page_size": 2000  # Adjust the page size as needed
}

# (connect, read) timeouts in seconds. requests applies no timeout by default,
# so without this a stalled connection would block the notebook indefinitely.
cmr_timeout = (10, 120)

data = []            # one dict per collection, becomes df_collections
granule_counts = {}  # concept-id -> number of granules reported by CMR

# A single Session reuses the HTTPS connection across the per-collection
# requests instead of opening a new one for every call.
with requests.Session() as session:
    # Make the request to the CMR API
    response = session.get(cmr_url, params=params, timeout=cmr_timeout)
    response.raise_for_status()  # Raise an error for bad status codes

    # Parse the JSON response
    collections = response.json()["items"]

    # Extract the required fields and the granule count in a single pass
    for item in collections:
        meta = item["meta"]
        collection = item["umm"]
        concept_id = meta.get("concept-id", "")

        # Only the first related URL is kept; collections without any get ""
        related_urls = collection.get("RelatedUrls", [])
        url = related_urls[0].get("URL", "") if related_urls else ""

        data.append({
            "Entry Title": collection.get("EntryTitle", ""),
            "Concept ID": concept_id,
            "Short Name": collection.get("ShortName", ""),
            "Native ID": meta.get("native-id", ""),
            "URL": url
        })

        granules_params = {
            "echo_collection_id": concept_id,
            "page_size": 1  # We only need the count, so limit to 1
        }
        granules_response = session.get(
            cmr_granules_url, params=granules_params, timeout=cmr_timeout
        )
        granules_response.raise_for_status()  # Raise an error for bad status codes

        # The total number of matching granules is read from the CMR-Hits header
        granule_counts[concept_id] = int(granules_response.headers.get("CMR-Hits", 0))

# Create a DataFrame
df_collections = pd.DataFrame(data, columns=["Entry Title", "Concept ID", "Short Name", "Native ID", "URL"])

# Add granule count to the DataFrame
df_collections["Granule Count"] = df_collections["Concept ID"].map(granule_counts)

In [3]:
# Display the collections table (one row per collection, including "Granule Count")
df_collections

Unnamed: 0,Entry Title,Concept ID,Short Name,Native ID,URL,Granule Count
0,2000 Pilot Environmental Sustainability Index ...,C179001887-SEDAC,CIESIN_SEDAC_ESI_2000,2000 Pilot Environmental Sustainability Index ...,https://sedac.ciesin.columbia.edu/downloads/ma...,2
1,2001 Environmental Sustainability Index (ESI),C1000000220-SEDAC,CIESIN_SEDAC_ESI_2001,2001 Environmental Sustainability Index (ESI),https://sedac.ciesin.columbia.edu/downloads/ma...,4
2,2002 Environmental Sustainability Index (ESI),C179001967-SEDAC,CIESIN_SEDAC_ESI_2002,2002 Environmental Sustainability Index (ESI),https://sedac.ciesin.columbia.edu/downloads/ma...,4
3,2005 Environmental Sustainability Index (ESI),C179001889-SEDAC,CIESIN_SEDAC_ESI_2005,2005 Environmental Sustainability Index (ESI),https://sedac.ciesin.columbia.edu/downloads/ma...,14
4,2008 Environmental Performance Index (EPI),C179001707-SEDAC,CIESIN_SEDAC_EPI_2008,2008 Environmental Performance Index (EPI),https://sedac.ciesin.columbia.edu/downloads/ma...,6
...,...,...,...,...,...,...
295,West Africa Coastal Vulnerability Mapping: Sub...,C1577568049-SEDAC,CIESIN_SEDAC_WACVM_SMAMAMP_RICH,sedac_CIESIN_SEDAC_WACVM_SMAMAMP_RICH_1.0,https://sedac.ciesin.columbia.edu/downloads/ma...,1
296,West Africa Coastal Vulnerability Mapping: Sub...,C1577573428-SEDAC,CIESIN_SEDAC_WACVM_SUBSETACE2,sedac_CIESIN_SEDAC_WACVM_SUBSETACE2_1.0,https://sedac.ciesin.columbia.edu/downloads/ma...,1
297,West Africa Coastal Vulnerability Mapping: Sub...,C1577578302-SEDAC,CIESIN_SEDAC_WACVM_JRCMA,sedac_CIESIN_SEDAC_WACVM_JRCMA_1.0,https://sedac.ciesin.columbia.edu/downloads/ma...,1
298,West Africa Coastal Vulnerability Mapping: Sub...,C1577579590-SEDAC,CIESIN_SEDAC_WACVM_OSM_ROADS,sedac_CIESIN_SEDAC_WACVM_OSM_ROADS_1.0,https://sedac.ciesin.columbia.edu/downloads/ma...,1


In [None]:
# Total number of granules across all collections, shown as a plain Python int
granule_count_total = df_collections["Granule Count"].sum()
int(granule_count_total)

61926

In [5]:
# Write the collections table to CSV without the DataFrame index column
df_collections.to_csv(
    "sedac_collections.csv",
    index=False,
)

In [None]:
# Build df_granules: one row per granule of every SEDAC collection in CMR.

# Define the CMR URL and parameters for collections
cmr_collections_url = "https://cmr.earthdata.nasa.gov/search/collections.umm_json"
collections_params = {
    "provider": "SEDAC",
    "page_size": 2000  # Adjust the page size as needed
}

# Define the CMR URL and page size for granules
cmr_granules_url = "https://cmr.earthdata.nasa.gov/search/granules.umm_json"
granules_page_size = 2000  # Adjust the page size as needed

# (connect, read) timeouts in seconds. requests applies no timeout by default,
# so without this a stalled connection would block the notebook indefinitely.
cmr_timeout = (10, 120)

# Factors converting a granule "Size" to MB for each "SizeUnit".
# Binary multiples, consistent with the MB -> GB division by 1024 used later.
# NOTE(review): unit names assumed from the UMM-G schema (KB/MB/GB/TB/PB) - confirm.
size_unit_to_mb = {
    "KB": 1 / 1024,
    "MB": 1,
    "GB": 1024,
    "TB": 1024 ** 2,
    "PB": 1024 ** 3,
}


def granule_size_mb(granule):
    """Return the total archive size of one granule in MB.

    Sums every entry of DataGranule.ArchiveAndDistributionInformation,
    converting each "Size" from its "SizeUnit" to MB. Entries whose unit is
    missing or not in ``size_unit_to_mb`` contribute nothing, and a missing
    or null "Size" counts as 0.

    Args:
        granule: The "umm" dict of a granule item from the CMR response.

    Returns:
        The summed size in MB (0 when no usable size information exists).
    """
    archive_info = granule.get("DataGranule", {}).get("ArchiveAndDistributionInformation", [])
    total_mb = 0
    for info in archive_info:
        factor = size_unit_to_mb.get(info.get("SizeUnit"))
        if factor is None:
            continue
        total_mb += (info.get("Size") or 0) * factor
    return total_mb


def granule_get_data_url(granule):
    """Return the URL of the first RelatedUrls entry of Type "GET DATA", or ""."""
    related_urls = granule.get("RelatedUrls", [])
    return next(
        (entry.get("URL", "") for entry in related_urls if entry.get("Type") == "GET DATA"),
        "",
    )


# Initialize an empty list to store the data (one dict per granule)
data = []

# A single Session reuses the HTTPS connection across all requests
with requests.Session() as session:
    # Make the request to the CMR API for collections
    collections_response = session.get(
        cmr_collections_url, params=collections_params, timeout=cmr_timeout
    )
    collections_response.raise_for_status()  # Raise an error for bad status codes

    # Parse the JSON response for collections
    collections = collections_response.json()["items"]

    # Iterate over each collection to get the required fields and granules
    for item in collections:
        meta = item["meta"]
        collection = item["umm"]

        # Collection-level fields repeated on every granule row
        collection_fields = {
            "Entry Title": collection.get("EntryTitle", ""),
            "Concept ID": meta.get("concept-id", ""),
            "Short Name": collection.get("ShortName", ""),
            "Native ID": meta.get("native-id", ""),
        }

        granules_params = {
            "echo_collection_id": collection_fields["Concept ID"],
            "page_size": granules_page_size,
        }

        # Page through the collection's granules
        page_num = 1
        while True:
            granules_params["page_num"] = page_num
            granules_response = session.get(
                cmr_granules_url, params=granules_params, timeout=cmr_timeout
            )
            granules_response.raise_for_status()  # Raise an error for bad status codes

            # Parse the JSON response for granules
            granules = granules_response.json().get("items", [])

            # Extract the required fields for each granule. Spatial extent,
            # temporal extent and measured parameters are not extracted.
            for granule_item in granules:
                granule_meta = granule_item["meta"]
                granule = granule_item["umm"]
                data.append({
                    **collection_fields,
                    "Granule UR": granule.get("GranuleUR", ""),
                    "Granule ID": granule_meta.get("concept-id", ""),
                    "Granule Revision ID": granule_meta.get("revision-id", ""),
                    "Granule Native ID": granule_meta.get("native-id", ""),
                    "Granule Collection Concept ID": granule_meta.get("collection-concept-id", ""),
                    "Granule URL": granule_get_data_url(granule),
                    "Granule Size (MB)": granule_size_mb(granule),
                })

            # A short (or empty) page is the last one, so stop here instead of
            # issuing one more request just to receive an empty page.
            if len(granules) < granules_page_size:
                break

            # Increment the page number for the next iteration
            page_num += 1

# Create a DataFrame
df_granules = pd.DataFrame(data, columns=[
    "Entry Title", "Concept ID", "Short Name", "Native ID", "Granule UR", "Granule ID", "Granule Revision ID", "Granule Native ID",
    "Granule Collection Concept ID", "Granule URL", "Granule Size (MB)"
])

In [7]:
# Count number of rows for each collection in df_granules and store in column "Row Count"
row_counts = df_granules["Concept ID"].value_counts()

# Collections with no rows in df_granules are absent from row_counts and would
# map to NaN (turning the column into floats); record them as 0 so the column
# stays integer and can be compared directly with "Granule Count".
df_collections["Row Count"] = (
    df_collections["Concept ID"].map(row_counts).fillna(0).astype("int64")
)

In [9]:
# Date stamp for today in YYYYMMDD form, used to version the export file name
today_date_stamp = format(datetime.now(), '%Y%m%d')

# Write the granule table to a dated CSV file, omitting the DataFrame index
granules_csv_name = f'sedac_collections_and_granules_{today_date_stamp}.csv'
df_granules.to_csv(granules_csv_name, index=False)

In [None]:
# Total archive volume in GB: sum the per-granule MB sizes, divide by 1024, truncate
archive_volume_mb = df_granules["Granule Size (MB)"].sum()
int(archive_volume_mb / 1024)

6044

In [12]:
# Record when the notebook was last executed (local time, asctime format)
last_ran_stamp = time.asctime()
print("Last ran on: ", last_ran_stamp)

Last ran on:  Thu Feb  6 20:00:24 2025
