In [3]:
#install missing packages
!pip install pandas
!pip install pangaeapy



In [4]:
# import packages
import pandas as pd
import pangaeapy as pan
from pangaeapy.pandataset import PanDataSet


### Search for the list of datasets belonging to a specific project

In [5]:
# Run a PANGAEA search for everything carrying the project label "CDRmare".
# NOTE(review): `limit=100` presumably caps the number of returned results --
# confirm against the pangaeapy documentation if the project grows beyond that.
query = pan.PanQuery(
    "project:label:CDRmare",
    limit=100,
)

In [6]:
# Number of hits reported for the query
# (presumably the overall match count, independent of `limit` -- TODO confirm)
query.totalcount

30

In [7]:
# Show the search string stored on the query object
query.query

'project:label:CDRmare'

In [8]:
 # Show the first entry of the query result (query.result is a list of dictionaries)
query.result[0]

{'URI': 'doi:10.1594/PANGAEA.964092',
 'score': 6.32072,
 'html': '<li><div class="citation"><a href="https://doi.pangaea.de/10.1594/PANGAEA.964092" target="_self" class="dataset-link"><strong>Esposito, M; Bach, W; Achterberg, EP (2023):</strong> Physical oceanography and hydrochemistry measurements along ROV-Squid dive M183_24-1 during METEOR cruise M183</a></div><table class="result" summary="Dataset reference and size" cellspacing="0" cellpadding="0"><tr><td class="title">Related to:</td><td class="content"><strong>Bach, W (2023):</strong> Master track of METEOR cruise M183 in 1 sec resolution (zipped, 12.2 MB). <em>University of Bremen</em></td></tr><tr><td class="title">Size:</td><td class="content">16562 data points</td></tr></table><div class="datasetid"><a href="https://doi.pangaea.de/10.1594/PANGAEA.964092" target="_self" class="dataset-link doi-link">https://doi.org/10.1594/PANGAEA.964092</a> – <span title="The score is a measurement of relevancy (see TF-IDF algorithm). The v

In [9]:
# List the fields available in a single result entry
query.result[0].keys() 

dict_keys(['URI', 'score', 'html', 'type', 'position'])

In [10]:
# Pull the 'URI' field (the dataset DOI) out of every entry of the query result.
# `.get` yields None for an entry without a 'URI' key instead of raising.
l_dois = [
    result_entry.get('URI')
    for result_entry in query.result
]

In [11]:
# Inspect the collected DOIs
l_dois

['doi:10.1594/PANGAEA.964092',
 'doi:10.1594/PANGAEA.964072',
 'doi:10.1594/PANGAEA.964096',
 'doi:10.1594/PANGAEA.964090',
 'doi:10.1594/PANGAEA.964089',
 'doi:10.1594/PANGAEA.964095',
 'doi:10.1594/PANGAEA.964076',
 'doi:10.1594/PANGAEA.964078',
 'doi:10.1594/PANGAEA.964094',
 'doi:10.1594/PANGAEA.964097',
 'doi:10.1594/PANGAEA.964086',
 'doi:10.1594/PANGAEA.964083',
 'doi:10.1594/PANGAEA.964093',
 'doi:10.1594/PANGAEA.964091',
 'doi:10.1594/PANGAEA.964069',
 'doi:10.1594/PANGAEA.963428',
 'doi:10.1594/PANGAEA.963781',
 'doi:10.1594/PANGAEA.951200',
 'doi:10.1594/PANGAEA.963467',
 'doi:10.1594/PANGAEA.963462',
 'doi:10.1594/PANGAEA.954527',
 'doi:10.1594/PANGAEA.956371',
 'doi:10.1594/PANGAEA.954852',
 'doi:10.1594/PANGAEA.963541',
 'doi:10.1594/PANGAEA.963589',
 'doi:10.1594/PANGAEA.954531',
 'doi:10.1594/PANGAEA.962782',
 'doi:10.1594/PANGAEA.951417',
 'doi:10.1594/PANGAEA.963468',
 'doi:10.1594/PANGAEA.963590']

In [12]:
len(l_dois)  # number of DOIs collected from the query result

30

### Use case 1: Print a full list of dataset citations to a .txt file

In [13]:
# Write the citation of every dataset to a text file, one citation per line.
# The `with` block guarantees the file is closed even if a metadata request
# raises part-way through the loop; the explicit UTF-8 encoding keeps
# non-ASCII characters (e.g. in author names) intact on every platform.
with open("citations_CDRmare.txt", "w", encoding="utf-8") as file:
    for doi in l_dois:
        # include_data=False: only the metadata is requested, not the data table
        ds = PanDataSet(doi, include_data=False)
        citation = ds.citation
        file.write(citation + "\n")

Data set is protected - 
Dataset is either restricted or of type "collection" - https://doi.org/10.1594/PANGAEA.964086
Data set is protected - 
Dataset is either restricted or of type "collection" - https://doi.org/10.1594/PANGAEA.963467
Data set is protected - 
Dataset is either restricted or of type "collection" - https://doi.org/10.1594/PANGAEA.963462
Data set is protected - 
Dataset is either restricted or of type "collection" - https://doi.org/10.1594/PANGAEA.963541
Data set is protected - 
Dataset is either restricted or of type "collection" - https://doi.org/10.1594/PANGAEA.963589
Data set is protected - 
Dataset is either restricted or of type "collection" - https://doi.org/10.1594/PANGAEA.963468
Data set is protected - 
Dataset is either restricted or of type "collection" - https://doi.org/10.1594/PANGAEA.963590


### Use case 2: Extract the DOIs, publication dates and project names in table format for further analysis

In [15]:
# Read DOI, publication date and project labels from the metadata of every
# dataset (accessed via pangaeapy) and collect them in one table.
#
# Rows are gathered as plain dicts and converted to a DataFrame in a single
# step; this avoids enlarging the frame cell-by-cell with `.loc` inside the
# loop, which is slow and can produce unexpected dtypes.
metadata_records = []
for doi in l_dois:
    # include_data=False: only the metadata is requested, not the data table
    ds = PanDataSet(doi, include_data=False)
    metadata_records.append({
        'DOI': doi,
        'Publication_Date': ds.date,
        # a dataset can belong to several projects -> join their labels
        'Project': ', '.join(pro.label for pro in ds.projects),
    })

# Passing `columns` fixes the column order and keeps the expected columns
# even when the DOI list is empty.
df_metadata = pd.DataFrame(
    metadata_records,
    columns=['DOI', 'Publication_Date', 'Project'],
)

df_metadata

Data set is protected - 
Dataset is either restricted or of type "collection" - https://doi.org/10.1594/PANGAEA.964086
Data set is protected - 
Dataset is either restricted or of type "collection" - https://doi.org/10.1594/PANGAEA.963467
Data set is protected - 
Dataset is either restricted or of type "collection" - https://doi.org/10.1594/PANGAEA.963462
Data set is protected - 
Dataset is either restricted or of type "collection" - https://doi.org/10.1594/PANGAEA.963541
Data set is protected - 
Dataset is either restricted or of type "collection" - https://doi.org/10.1594/PANGAEA.963589
Data set is protected - 
Dataset is either restricted or of type "collection" - https://doi.org/10.1594/PANGAEA.963468
Data set is protected - 
Dataset is either restricted or of type "collection" - https://doi.org/10.1594/PANGAEA.963590


Unnamed: 0,DOI,Publication_Date,Project
0,doi:10.1594/PANGAEA.964092,2023-12-12T12:33:11,"AIMS3, CDRmare"
1,doi:10.1594/PANGAEA.964072,2023-12-11T13:54:47,"AIMS3, CDRmare"
2,doi:10.1594/PANGAEA.964096,2023-12-12T12:37:50,"AIMS3, CDRmare"
3,doi:10.1594/PANGAEA.964090,2023-12-12T12:30:50,"AIMS3, CDRmare"
4,doi:10.1594/PANGAEA.964089,2023-12-12T12:11:33,"AIMS3, CDRmare"
5,doi:10.1594/PANGAEA.964095,2023-12-12T12:36:45,"AIMS3, CDRmare"
6,doi:10.1594/PANGAEA.964076,2023-12-11T14:31:38,"AIMS3, CDRmare"
7,doi:10.1594/PANGAEA.964078,2023-12-11T15:04:08,"AIMS3, CDRmare"
8,doi:10.1594/PANGAEA.964094,2023-12-12T12:35:41,"AIMS3, CDRmare"
9,doi:10.1594/PANGAEA.964097,2023-12-12T12:38:47,"AIMS3, CDRmare"


In [16]:
# Save the metadata table as a tab-separated text file, without the row index
df_metadata.to_csv(
    'Metadata_datapub_CDRmare.txt',
    sep='\t',
    index=False,
)