## Imports

In [1]:
from aind_data_access_api.document_db import MetadataDbClient
import pandas as pd

# Lift the column cap so wide, flattened metadata frames render every column
pd.options.display.max_columns = None

## Connect to the metadata database

In [2]:
# Connection settings for the metadata document store, named so they are easy to spot and tune
DOCDB_HOST = "api.allenneuraldynamics.org"
DOCDB_DATABASE = "metadata_index"
DOCDB_COLLECTION = "data_assets"

# Build the client used by every query below
client = MetadataDbClient(
    host=DOCDB_HOST,
    database=DOCDB_DATABASE,
    collection=DOCDB_COLLECTION,
)

In [3]:
# Select records whose funding_source entries name "PGA" as the funder
query = {"data_description.funding_source.funder": "PGA"}

# Only the data_description sub-document is requested for each match
data_description_only = {"data_description": 1}

results = client.retrieve_docdb_records(
    filter_query=query,
    projection=data_description_only,
)

# Number of matching records
len(results)

81

In [4]:
# Flatten each record's nested data_description dict into dot-separated columns.
# funding_source is a list of dicts, so it survives as a single list-valued column.
FUNDING_COL = 'data_description.funding_source'
records_df = pd.json_normalize(results, sep='.', max_level=None)

# Give every funding entry its own row. ignore_index=True renumbers the rows 0..n-1
# so they line up with funding_df built below. Exploding only the column and then
# concatenating onto the un-exploded frame misaligns rows as soon as any record has
# more than one funding entry (or an empty list).
exploded_df = records_df.explode(FUNDING_COL, ignore_index=True)

# Records with an empty funding list explode to NaN; map anything that is not a dict
# to {} so it becomes an all-NaN funding row instead of breaking normalization.
funding_entries = [
    entry if isinstance(entry, dict) else {}
    for entry in exploded_df[FUNDING_COL]
]

# json_normalize cannot prefix the resulting columns in this call form, so the
# prefix is added after flattening.
funding_df = pd.json_normalize(funding_entries, sep='.').add_prefix(FUNDING_COL + '.')

# Replace the list-valued column with its flattened per-entry columns.
# Both frames share the same 0..n-1 index, so the column-wise concat is row-aligned.
df = pd.concat([exploded_df.drop(columns=FUNDING_COL), funding_df], axis=1)

df.sample(10)

Unnamed: 0,_id,data_description.creation_date,data_description.creation_time,data_description.data_level,data_description.describedBy,data_description.group,data_description.institution,data_description.license,data_description.modality,data_description.name,data_description.project_id,data_description.project_name,data_description.restrictions,data_description.schema_version,data_description.subject_id,data_description.data_summary,data_description.experiment_type,data_description.input_data_name,data_description.institution.abbreviation,data_description.institution.name,data_description.investigators,data_description.related_data,data_description.ror_id,data_description.funding_source.fundee,data_description.funding_source.funder,data_description.funding_source.grant_number
59,e24263d6-f792-4ec2-bcb9-71aad0c0a62a,2023-04-04,13:33:00,raw data,https://raw.githubusercontent.com/AllenNeuralD...,,,CC-BY-4.0,"[{'abbreviation': 'SPIM', 'name': 'Selective p...",SmartSPIM_656702_2023-04-04_13-33-00,102-01-014-10,- CTY GenTools Mouse,,0.4.0,656702,,SmartSPIM,,AIBS,Allen Institute for Brain Science,,[],,,PGA,
50,9d24787d-cb1b-40cd-8500-59e6899f75b5,2023-02-28,19:26:13,raw data,https://raw.githubusercontent.com/AllenNeuralD...,,AIBS,CC-BY-4.0,SmartSPIM,SmartSPIM_654145_2023-02-28_19-26-13,102-01-014-10,MGT,,0.3.0,654145,,,,,,,,,,PGA,
29,ab2c5322-f2f1-4579-80a3-5c5187873ece,2023-03-30,21:06:38.037254,derived data,https://raw.githubusercontent.com/AllenNeuralD...,,,CC-BY-4.0,"[{'abbreviation': 'SPIM', 'name': 'Selective p...",SmartSPIM_650562_2023-03-27_20-02-47_stitched_...,,,,0.4.0,650562,,SmartSPIM,SmartSPIM_650562_2023-03-27_20-02-47,AIBS,Allen Institute for Brain Science,,[],,,PGA,
78,5bd72287-8808-4e41-ba87-039442bd035c,2023-02-27,14:03:57,raw data,https://raw.githubusercontent.com/AllenNeuralD...,,AIBS,CC-BY-4.0,SmartSPIM,SmartSPIM_658357_2023-02-27_14-03-57,102-01-014-10,MGT,,0.3.0,658357,,,,,,,,,,PGA,
18,f42aa199-ca24-40ae-a5dd-db5e9029417e,2023-03-09,11:58:15,raw data,https://raw.githubusercontent.com/AllenNeuralD...,,AIBS,CC-BY-4.0,SmartSPIM,SmartSPIM_660935_2023-03-09_11-58-15,102-01-064-10,CTY Genetic Tools,,0.3.0,660935,,,,,,,,,,PGA,
30,e4d9f654-7cda-48d0-b7b4-b7b78be32ea3,2023-03-22,18:37:30,raw data,https://raw.githubusercontent.com/AllenNeuralD...,,AIBS,CC-BY-4.0,SmartSPIM,SmartSPIM_650632_2023-03-22_18-37-30,102-01-014-10,MGT,,0.3.2,650632,,,,,,,[],,,PGA,
11,6751f876-3050-4e34-804a-4c70d1ffe504,2023-03-01,13:05:45,raw data,https://raw.githubusercontent.com/AllenNeuralD...,,AIBS,CC-BY-4.0,SmartSPIM,SmartSPIM_657999_2023-03-01_13-05-45,102-01-014-10,MGT,,0.3.0,657999,,,,,,,,,,PGA,
20,ecfabdd6-466c-44c0-8912-653cf8f38492,2023-01-10,10:55:59,raw data,https://raw.githubusercontent.com/AllenNeuralD...,,AIBS,CC-BY-4.0,SmartSPIM,SmartSPIM_645131_2023-01-10_10-55-59,,,,0.3.0,645131,,,,,,,,,,PGA,
60,3ecea2ba-ff15-46c5-b168-6e77b3e9bf23,2023-04-06,16:55:38,raw data,https://raw.githubusercontent.com/AllenNeuralD...,,,CC-BY-4.0,"[{'abbreviation': 'SPIM', 'name': 'Selective p...",SmartSPIM_656425_2023-04-06_16-55-38,102-01-014-10,- CTY GenTools Mouse,,0.4.0,656425,,SmartSPIM,,AIBS,Allen Institute for Brain Science,,[],,,PGA,
25,f367e4ee-f1dc-41f2-a6f7-b5a3ce7152a3,2023-03-30,22:11:58.166466,derived data,https://raw.githubusercontent.com/AllenNeuralD...,,,CC-BY-4.0,"[{'abbreviation': 'SPIM', 'name': 'Selective p...",SmartSPIM_650554_2023-03-23_18-02-34_stitched_...,,,,0.4.0,650554,,SmartSPIM,SmartSPIM_650554_2023-03-23_18-02-34,AIBS,Allen Institute for Brain Science,,[],,,PGA,


In [14]:
# List the distinct funder values present in the flattened frame
funder_column = df['data_description.funding_source.funder']
print(funder_column.unique())

['PGA']


In [12]:
# Inspect the raw funding_source entry of the first returned record
first_description = results[0]['data_description']
first_description['funding_source']

[{'fundee': None, 'funder': 'PGA', 'grant_number': None}]

In [13]:
from aind_data_schema_models.organizations import Organization
# Dump the Allen Institute organization entry to a plain dict for inspection
allen_institute = Organization.AI
new_funder = allen_institute.model_dump()
new_funder

{'name': 'Allen Institute',
 'abbreviation': 'AI',
 'registry': {'name': 'Research Organization Registry', 'abbreviation': 'ROR'},
 'registry_identifier': '03cpe7c52'}