# CGSN Metadata Communications
Author: Andrew Reed  
Date: 2019-08-21  
Ver: 1.02

This notebook lays out the development process for querying the relevant data fields necessary for CGSN to fill out the UW Metadata Changes & Communications spreadsheet. The goal is to explore the possible sources of the necessary information by M2M calls to OOINet based upon the available information recorded in CGSN's Metadata Tracking Spreadsheet. Eventually, the exploratory and development process laid out below will be transitioned into an automated function which fills out the requisite information with manual execution of the relevant scripts and code.

In [19]:
import os, shutil, sys, time, re, requests, csv, datetime, pytz
import pandas as pd
import numpy as np
import netCDF4 as nc
import xarray as xr

Set my OOINet username, token, and the base url for querying the system via M2M:

In [20]:
username = 'OOIAPI-C9OSZAQABG1H3U'
token = 'JA48WUQVG7F'

In [21]:
base_url = 'https://ooinet.oceanobservatories.org/api/m2m'
sensor_url = '12576/sensor/inv'
asset_url = '12587/asset'

The goal in this code is to, for a given asset. Let's start with the SPKIRs because of the small number of different individual instruments:

In [4]:
# Specify some functions to convert timestamps
ntp_epoch = datetime.datetime(1900, 1, 1)
unix_epoch = datetime.datetime(1970, 1, 1)
ntp_delta = (unix_epoch - ntp_epoch).total_seconds()

def ntp_seconds_to_datetime(ntp_seconds):
    return datetime.datetime.utcfromtimestamp(ntp_seconds - ntp_delta).replace(microsecond=0)
  
def convert_ooi_time(ms):
    if ms is None:
        return None
    elif np.isnan(ms):
        return None
    else:
        return datetime.datetime.utcfromtimestamp(ms/1000)

In [5]:
def get_calData(uid):
    """
    Function to call the m2m interface and return the calibration
    data for a given instrument uid
    """
    cal_url = '/'.join((base_url, '12587/asset/cal?uid=')) + uid
    calData = requests.get(cal_url, auth=(username, token)).json()
    return calData

In [6]:
def get_deployData(uid):
    """
    Function to call the m2m interface and get the deployment
    data for a given instrument uid
    """
    deploy_url = '/'.join((base_url, '12587/asset/deployments', uid)) + '?editphase=ALL'
    deployData = requests.get(deploy_url, auth=(username, token)).json()
    return deployData

In [7]:
def reformat_deployData(deployData):
    """
    Reformats the deployment information for a given instrument
    into a dataframe with the following fields: subsite, node,
    deployment number, start time, and end time.
    """
    df = pd.DataFrame(columns=['subsite','node','deploymentNumber','startTime','endTime'])
    for d in deployData:
        df = df.append({
            'subsite':d['subsite'],
            'node':d['node'],
            'deploymentNumber':d['deploymentNumber'],
            'startTime':d['startTime'],
            'endTime':d['endTime'],
        }, ignore_index=True)
        
    df['startTime'] = df['startTime'].apply(convert_time)
    df['endTime'] = df['endTime'].apply(convert_time)
    df.sort_values(by=['deploymentNumber'], inplace=True)
    
    return df

In [8]:
def reformat_calData(calData):
    caldict = {}
    for coeff in calData['calibration']:
        name = coeff['name']
        cdata = coeff['calData']
        eventStart = []
        eventStop = []
        eventMod = []
        dataSource = []
        for event in cdata:
            eventStart.append(convert_time(event['eventStartTime']))
            eventMod.append(convert_time(event['lastModifiedTimestamp']))
            dataSource.append(event['dataSource'])
        caldict.update({name:{'eventStartTime':eventStart, 'lastModifiedTimestamp':eventMod, 'dataSource':dataSource}})
    dfCal = pd.DataFrame(caldict).transpose()
    
    return dfCal

In [9]:
os.listdir(os.getcwd())

['PHSENE_metadata_communications.csv',
 'PHSEND_metadata_communications.csv',
 'CTDBPE_metadata_communications.csv',
 'NUTNRB_metadata_communications.csv',
 'PRESFC_metadata_communications.csv',
 'FLORDG_metadata_communications.csv',
 'PCO2WC_metadata_communications.csv',
 'PCO2WB_metadata_communications.csv',
 '.~lock.CTDBPE_metadata_communications.csv#',
 'CTDBPP_metadata_communications.csv',
 'CGSN Metadata Review.xlsx',
 'Asset_Management_Instrument_Metadata.ipynb',
 'PHSENF_metadata_communications.csv',
 'Metadata_Communications_Spreadsheet.ipynb',
 'Stream_identifier.ipynb',
 'CTDBPD_metadata_communications.csv',
 '.~lock.CTDBPC_metadata_communications.csv#',
 'FLORTD_metadata_communications.csv',
 'SPKIRB_metadata_communications.csv',
 'DOSTAD_metadata_communications.csv',
 'CTDBPC_metadata_communications.csv',
 '.ipynb_checkpoints',
 'CTDBPF_metadata_communications.csv',
 'PRESFB_metadata_communications.csv']

In [10]:
metadata_review = pd.read_excel('CGSN Metadata Review.xlsx',sheet_name='Cal Review Log')

In [11]:
metadata_review

Unnamed: 0,CLASS-SERIES,S/N,Cal Date,Original Calibration CSV,Vendor Docs exist,Cal coeff match,Filename correct,In progress?,Duplicate,Notes,Pull request #,Date pull request submitted,Pull request verified primary,Pull request verified secondary,Unnamed: 14
0,PCO2W-B,C0050,2012-08-28 00:00:00,CGINS-PCO2WB-C0050__20150315,Yes,Yes,"No, need to change date","SW, AP",No,changed date (RGT),659,2019-03-28 00:00:00,CD,AR,
1,PCO2W-B,C0050,2016-05-20 00:00:00,CGINS-PCO2WB-C0050__20161125,Yes,Yes,"No, need to change date","SW, AP",No,changed date (RGT),659,2019-03-28 00:00:00,CD,AR,
2,PCO2W-B,C0051,2012-08-28 00:00:00,CGINS-PCO2WB-C0051__20140910,Yes,Yes,"No, need to change date","SW, AP",No,changed date (RGT),659,2019-03-28 00:00:00,CD,AR,
3,PCO2W-B,C0051,2015-12-10 00:00:00,CGINS-PCO2WB-C0051__20160513,Yes,Yes,"No, need to change date","SW, AP",No,changed date (RGT),659,2019-03-28 00:00:00,CD,AR,
4,PCO2W-B,C0051,2016-12-21 00:00:00,CGINS-PCO2WB-C0051__20161221,Yes,Yes,Yes,"SW, AP",No,,,,,,
5,PCO2W-B,C0051,2017-11-14 00:00:00,CGINS-PCO2WB-C0051__20171114,Yes,Yes,Yes,"SW, AP",No,,,,,,
6,PCO2W-B,C0069,2013-08-02 00:00:00,CGINS-PCO2WB-C0069__20131121,Yes,Yes,"No, need to change date","SW, AP",No,changed date (RGT),659,2019-03-28 00:00:00,CD,AR,
7,PCO2W-B,C0069,2015-04-28 00:00:00,CGINS-PCO2WB-C0069__20151021,Yes,Yes,"No, need to change date","SW, AP",No,changed date (RGT),659,2019-03-28 00:00:00,CD,AR,
8,PCO2W-B,C0069,2016-06-10 00:00:00,CGINS-PCO2WB-C0069__20160610,Yes,Yes,Yes,"SW, AP",No,,,,,,
9,PCO2W-B,C0069,2017-07-19 00:00:00,CGINS-PCO2WB-C0069__20170719,Yes,Yes,Yes,"SW, AP",No,,,,,,


In [12]:
metadata_review.dropna(subset=['CLASS-SERIES'], inplace=True)

In [13]:
metadata_review = metadata_review[metadata_review['Original Calibration CSV'] != 'Bad']

In [14]:
def generate_uid(inst, sn, whoi_inst=True):
    """
    Function which takes in instrument class - series and serial number to generate an instrument uid. The exception
    to the rule is the METBK instruments, which are classified as Loggers, and thus are recorded as METLGR
    """
    
    # Clean the names of the class-series
    if '-' in inst:
        inst = inst.replace('-','')
        
    # Clean the serial numbers
    sn = str(sn)
    if '-' in sn:
        ind = sn.index('-')
        sn = sn[ind+1:].zfill(5)
    elif len(sn) < 5:
        sn = sn.zfill(5)
    else:
        pass
    
    # If the instrument is a METBK, have to handle differently
    if 'METBKA' in inst:
        inst = 'METLGR'
        if 'UNKNOWN' in sn:
            sn = sn.split('\n')[-1]
        else:
            sn = sn[3:].zfill(5)   
        
    # Generate the UID
    if whoi_inst == True:
        uid = '-'.join(('CGINS',inst,sn))
        
    return uid

In [18]:
metadata_review['UID'] = metadata_review.apply(lambda x: generate_uid(x['CLASS-SERIES'], x['S/N']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


**====================================================================================================================**
Next, we need to classify the different metadata reviews based on the results with the following classes:
1. Good - this is the best result
2. Wrong calibration date - the date in the csv name differed from the calibration date and has been corrected to the calibration date of the instrument
3. Wrong calibration value - a calibration coefficient was identified as being incorrect, by either an exact match or by a comparative difference of > 0.001%
4. Missing - a csv is missing from asset management
5. Duplicate - a csv was identified as a duplicate or unneeded and removed

Individual csv files may fall into several categories. We

In [None]:
def wrong_cal_date(x):
    if type(x) == str:
        if 'no' in x.lower():
            return True
        else:
            return False
    else:
        return False

In [None]:
metadata_review['Wrong Date'] = metadata_review['Filename correct'].apply(wrong_cal_date)

In [None]:
def wrong_cal_coef(x):
    if type(x) == str:
        if 'yes' in x.lower():
            return False
        else:
            return True
    elif np.isnan(x):
        return False
    else:
        return False

In [None]:
metadata_review['Wrong cal'] = metadata_review['Cal coeff match'].apply(wrong_cal_coef)

In [None]:
def is_missing(x):
    if type(x) is str:
        if x.lower() == 'new':
            return True
        else:
            return False
    else:
        return False

In [None]:
metadata_review['Is missing'] = metadata_review['Duplicate'].apply(is_missing)

In [None]:
def is_duplicate(x):
    if type(x) is str:
        if x.lower() == 'yes':
            return True
        else:
            return False
    else:
        return False        

In [None]:
metadata_review['Is duplicate'] = metadata_review['Duplicate'].apply(is_duplicate)

Now that I have functions to test for four of the five situations, I can check against the fifth situation to determine if the file is "good," i.e. it didn't fail any of the other tests.

In [None]:
def is_good(x):
    if any(x) == True:
        return False
    else:
        return True
    

In [None]:
metadata_review['Is good'] = metadata_review[['Wrong Date', 'Wrong cal', 'Is missing', 'Is duplicate']].apply(is_good, axis=1)

In [None]:
metadata_review[['Wrong Date','Wrong cal','Is missing','Is duplicate','Is good']].head(10)

**====================================================================================================================**
Next we want to construct the new CSV filenames based on the calibration date corrections. These are the names which will show up in the OOINet system. We can build the new CSV filenames from the instrument UID, which was previously bilt from the instrument class-series and serial number, and the correct/corrected calibration date.


In [None]:
def convert_df_time(x):
    if type(x) == datetime.datetime:
        return x.strftime('%Y%m%d')
    elif type(x) == int:
        return pd.to_datetime(x).strftime('%Y%m%d')
    else:
        pass

In [None]:
metadata_review['Cal Date'] = metadata_review['Cal Date'].apply(convert_df_time)

In [None]:
def new_csv_filename(x):
    og_csv = x['Original Calibration CSV']
    if not og_csv.endswith('.csv') and og_csv != None:
        og_csv = og_csv + '.csv'
        x['Original Calibration CSV'] = og_csv
        
    if x['Is duplicate']:
        return np.nan
    elif x['Cal Date'] == None:
        return og_csv
    elif x['Wrong Date'] or x['Is missing']:
        new_csv = x['UID'] + '__' + x['Cal Date'] + '.csv'
        if new_csv == x['Original Calibration CSV'] and not x['Is missing']:
            print("Check calibration date for {} for errors.".format(x['Original Calibration CSV']))
        else:
            return new_csv
    else:
        return x['Original Calibration CSV']

In [None]:
metadata_review['New Calibration CSV'] = metadata_review.apply(new_csv_filename, axis=1)

In [None]:
metadata_review[metadata_review['New Calibration CSV'] == 'None.csv']

In [None]:
set(metadata_review['CLASS-SERIES'])

**====================================================================================================================**
### Demonstration: SPKIRBs
This section is to work through the approach to generating the requisite information need to fill out the UW metadata communication spreadsheet based upon CGSN's metadata review approach. I need to gather the following information for the spreadsheet:
* Array
* Platform
* Node
* Instrument
* RefDes
* Asset UID -  
* Serial
* Deployment(s)
* Github Change Date - I don't think this is necessary, since it doesn't affect the end user until a release and ingestion to OOINet
* OOI Change Date - Question on 
* CSV file name - this is the filename which is in the system (so changes which have not been pushed to OOI net should not be put on the spreadsheet?)
* Github URL - Is this also necessary, when they can directly call (via M2M) or download the calibration information from the Portal
* Change type - This is my 5 categories from above
* dateRange Start
* dateRange End
* Annotation

Starting with my Metadata tracking spreadsheet above, I want to be able to use a series of M2M calls to the OOI API in order to get the data necessary to fill out the spreadsheet above. To test the possible approaches, I'm going to use the SPKIR-B instrument series, whose calibration changes have been pushed to OOI-Integration and Released into Production. 

In [None]:
spkirb = metadata_review[metadata_review['CLASS-SERIES'] == 'SPKIRB']
spkirb

Get the first instance of the spkir-b:

In [None]:
spkirb[spkirb['UID'] == 'CGINS-SPKIRB-00229']

**====================================================================================================================** 
### Deployment Information
Try querying the deployment information for a given instrument UID.

In [None]:
url = '/'.join((base_url,'12587','asset','deployments','CGINS-SPKIRB-00229?editphase=ALL'))
url

In [None]:
data = requests.get(url, auth=(username, token)).json()

In [None]:
deploydf = pd.DataFrame(data)
deploydf.sort_values(by='deploymentNumber', inplace=True)
deploydf.reset_index(drop=True, inplace=True)
deploydf

In [None]:
deploydf.columns

Okay, querying the Deployment information for the instrument UID gives me the following key information:
* Deployment Number
* Node
* Subsite
* Sensor
* Start Time
* End Time

However, we are lacking the following information:
* Instrument
* Reference Designator
* CSV File

Thus, we still cannot directly link a particular CSV file to a given deployment yet. Now, we want to try to query the calibration information associated with a particular deployment for the given instrument:

In [None]:
subsite = deploydf['subsite']
node = deploydf['node']
sensor = deploydf['sensor']
depnum = deploydf['deploymentNumber']
startTime = deploydf['startTime']
endTime = deploydf['endTime']

In [None]:
url = '/'.join((base_url,'12587','events','deployment','inv',subsite[0],node[0],sensor[0],str(depnum[0])))
url

In [None]:
data = requests.get(url, auth=(username, token)).json()

In [None]:
data[0].keys()

In [None]:
data[0]['sensor'].keys()

In [None]:
data[0]['sensor']['calibration'][0]

So, querying the calibration data for a specific UID for a specific deployment number does not, in fact, return just that deployment information. It returns the _all_ of the calibration info, separated based on the calibration coefficient name. This is a problem, because I want _only_ the calibration info for a specific deployment without having to dig into each and every calibration coefficient checking for the closest startTime and endTime.

We're going to double check the deployment info by just querying for the deployment numbers for a specified instrument:

In [None]:
url = '/'.join((base_url,'12587','events','deployment','inv',subsite[0],node[0],sensor[0]))
url

In [None]:
data = requests.get(url, auth=(username, token)).json()
data

Okay, now I am confused. Requesting this specific deployment data returns [1,2,3]. This is probably because a new deployment starts before an old deployment stops, returning deployment 2. Additionally, this only returns the time that the instrument was located on that specific array, site, and node. It does not return all the calibration coefficients for a specific UID.

In [None]:
startTime.apply(convert_ooi_time)

In [None]:
# Add one day to the start time to limit to a single deployment
dt = 1*24*60*60*1000
dt

In [None]:
T1 = convert_ooi_time(startTime[0]).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
T2 = convert_ooi_time(startTime[0]+dt).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
(T1, T2)

In [None]:
url = '/'.join((base_url,'12587','asset','cal?uid=CGINS-SPKIRB-00229&beginDT={}&endDT={}'.format(T1,T2)))
url

In [None]:
data = requests.get(url, auth=(username, token)).json()

In [None]:
data

This provides me with the calibration information for a single deployment based on the instrument UID rather than having to construct the full reference designator, which is what I want. Now, I need to loop through all of the deployments and return the "data source" along with the "last modified time stamp"

In [None]:
dataSource = ()
lastModifiedTimestamp = ()
instrument = ()
serialNum = ()
for i in startTime:
    T1 = convert_ooi_time(i).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
    T2 = convert_ooi_time(i+dt).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
    url = '/'.join((base_url,'12587','asset','cal?uid=CGINS-SPKIRB-00229&beginDT={}&endDT={}'.format(T1,T2)))
    calData = requests.get(url, auth=(username, token)).json()
    inst = calData['description']
    instrument = instrument + (inst,)
    sn = calData['serialNumber']
    serialNum = serialNum + (sn,)
    Source = calData['calibration'][0]['calData'][0]['dataSource']
    dataSource = dataSource + (Source,)
    lastMod = calData['calibration'][0]['calData'][0]['lastModifiedTimestamp']
    lastModifiedTimestamp = lastModifiedTimestamp + (lastMod,)
deploydf['dataSource'] = dataSource
deploydf['lastModifiedTimestamp'] = lastModifiedTimestamp
deploydf['instrument'] = instrument
deploydf['serialNum'] = serialNum

In [None]:
deploydf.columns

**====================================================================================================================**
Now, I want to construct a dataframe which contains the relevant information to fill out the Metadata Changes & Communications spreadsheet. Start by building up a dictionary, then put it into a dataframe.

In [None]:
def reformat_dataSource(x):
    new = x.replace('_Cal_Info.xlsx','.csv')
    return new

In [None]:
reformat_dataSource(deploydf['dataSource'].iloc[0])

In [None]:
deploydf['dataSource'] = deploydf['dataSource'].apply(reformat_dataSource)
deploydf['dataSource']

In [None]:
deploydf['RefDes'] = deploydf['subsite'] + '-' + deploydf['node'] + '-' + deploydf['sensor']
deploydf['RefDes']

Create a subselection of columns from the deploy dataframe which contains the relevant data 

In [None]:
calData = deploydf[['subsite','node','instrument','RefDes','sensor_uid','deploymentNumber','lastModifiedTimestamp','dataSource','startTime','endTime']]

In [None]:
calData

Now, I can merge the subselected dataframe above with the metadata review based on the key of 'New Calibration CSV'::'dataSource'. This should provide us with all the necessary data to fill the spreadsheet (that can filled out from M2M calls)

In [None]:
spkirb229 = spkirb[spkirb['UID'] == 'CGINS-SPKIRB-00229']

In [None]:
spkirb229 = spkirb229.merge(deploydf, left_on='New Calibration CSV', right_on='dataSource')


In [None]:
spkirb229.columns

In [None]:
cols = ('Array','Platform','Node','Instrument','RefDes','Asset ID','Serial Number','deployment','gitHub changeDate',
        'OOI changeDate','file','URL','changeType','dateRangeStart','dateRangeEnd','annotation','Wrong Date',
       'Wrong cal','Is missing','Is duplicate','Is good')

In [None]:
name_map = {
    'Array':None,
    'Platform':'subsite',
    'Node':'node',
    'Instrument':'instrument',
    'RefDes':'RefDes',
    'Asset ID':'UID',
    'Serial Number':'serialNum',
    'deployment':'deploymentNumber',
    'gitHub changeDate':'Pull request #',
    'OOI changeDate':'lastModifiedTimestamp',
    'file':'dataSource',
    'dateRangeStart':'startTime',
    'dateRangeEnd':'endTime',
    'annotation':None,
    'Wrong Date':'Wrong Date',
    'Wrong cal':'Wrong cal',
    'Is missing':'Is missing',
    'Is duplicate':'Is duplicate',
    'Is good':'Is good'
}

In [None]:
comdf = pd.DataFrame(columns=cols)
comdf

In [None]:
for i in cols:
    if name_map.get(i) is not None:
        comdf[i] = spkirb229[name_map.get(i)]

In [None]:
comdf

In [None]:
def generate_arrayName(x):
    if 'GA' in x:
        arrayName = 'Global Argentine Basin'
    elif 'GI' in x:
        arrayName = 'Global Irminger Sea'
    elif 'GP' in x:
        arrayName = 'Global Station Papa'
    elif 'GS' in x:
        arrayName = 'Global Southern Ocean'
    elif 'CP' in x:
        arrayName = 'Coastal Pioneer'
    else:
        arrayName = np.nan
    return arrayName

In [None]:
comdf['Array'] = comdf['Platform'].apply(generate_arrayName)
comdf

In [None]:
comdf['OOI changeDate'] = comdf['OOI changeDate'].apply(convert_ooi_time)
comdf['dateRangeStart'] = comdf['dateRangeStart'].apply(convert_ooi_time)
comdf['dateRangeEnd'] = comdf['dateRangeEnd'].apply(convert_ooi_time)

In [None]:
comdf

In [None]:
def generate_gitHub_url(x):
    base_url = 'https://github.com/ooi-integration/asset-management/blob/master/calibration'
    inst = x.split('-')[1]
    full_url = '/'.join((base_url,inst,x))
    return full_url

In [None]:
comdf['URL'] = comdf['file'].apply(generate_gitHub_url)
comdf

In [None]:
def classify_changeType(x):
    statement = ''
    if x['Is good'] == True:
        return 'No errors found'
    elif x['Is missing'] == True:
        return 'Missing file added'
    elif x['Is duplicate'] == True:
        return 'File deleted'
    else:
        if x['Wrong Date'] == True:
            statement = statement + 'File renamed with correct date '
        if x['Wrong cal'] == True:
            statement = statement + 'Calibration coefficients were modified'
        return statement

In [None]:
comdf['changeType'] = comdf[['Wrong Date','Wrong cal','Is missing','Is duplicate','Is good']].apply(classify_changeType, axis=1)

In [None]:
comdf.drop(columns=['Wrong Date','Wrong cal','Is missing','Is duplicate','Is good'], inplace=True)
comdf

**====================================================================================================================**
Now, I have successfully generated the spreadsheet with only a few wrinkles:
1. gitHub changeDate: I have put in the Pull Request (PR) number here instead of a date. I can hopefully match those numbers up at a later date by identifying when the PR was merged with master and pushed to ooi-integration
2. annotation: I have not populated the annotation format yet, and part of the reason is:
3. downstream: I have not determined downstream impacts for a given file change. This can be added later.

Next, I want to structure this into a scriptable, automated format that makes use of some key identifiers that avoid doing this on a instrument-by-instrument basis

Start by selecting an instrument class-series, preferably one which where the review has been finished and pushed to ooi-integration.

In [None]:
metadata_review['CLASS-SERIES'] = metadata_review['CLASS-SERIES'].apply(lambda x: x.replace('-',''))

In [None]:
instrument = 'SPKIRB'

Select from the metadata review the relevant instruments:

In [None]:
np.unique(metadata_review['CLASS-SERIES'])

In [None]:
meta = metadata_review[metadata_review['CLASS-SERIES'] == instrument]
meta

In [None]:
uids = np.unique(meta['UID'])
uids

In [None]:
uid = uids[3]
uid

In [None]:
udf = meta[meta['UID'] == uid]

Get the relevant deployment info:

In [None]:
def get_deployData(uid):
    url = '/'.join((base_url,'12587','asset','deployments',uid+'?editphase=ALL'))
    data = requests.get(url, auth=(username, token)).json()
    df = pd.DataFrame(data)
    df.sort_values(by='deploymentNumber', inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

In [None]:
deploydf = get_deployData(uid)
deploydf

In [None]:
def get_calData(uid, deployData):
    """
    This function takes in the instrument uid and a dataframe of the
    deployment information for the uid, and loops through all of the
    instrument deployments to return the calibration data for the
    instrument for each individual deployment.
    """
    
    startTime = deployData['startTime']
    dt = 8.64E10 # microseconds in a day
    
    # Initialize tuples for non-mutable storage of data
    dataSource = ()
    lastModifiedTimestamp = ()
    instrument = ()
    serialNumber = ()
    
    # Loop over the deployment startTime and get the data
    for t in startTime:
        T1 = convert_ooi_time(t).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
        T2 = convert_ooi_time(t+dt).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
        # Generate the url and get the calibration data for a single deployment
        url = '/'.join((base_url,'12587','asset','cal?uid='+uid+'&beginDT={}&endDT={}'.format(T1,T2)))
        calData = requests.get(url, auth=(username, token)).json()
        # Fill out the data tuples
        instrument = instrument + (calData['description'],)
        serialNumber = serialNumber + (calData['serialNumber'],)
        dataSource = dataSource + (calData['calibration'][0]['calData'][0]['dataSource'],)
        lastModifiedTimestamp = lastModifiedTimestamp + (calData['calibration'][0]['calData'][0]['lastModifiedTimestamp'],)
        
    # Now, put the data tuples into the deploy data dataframe
    deployData['dataSource'] = dataSource
    deployData['lastModifiedTimestamp'] = lastModifiedTimestamp
    deployData['instrument'] = instrument
    deployData['serialNumber'] = serialNumber
    
    # Return the expanded deployment data
    return deployData

In [None]:
deploydf = get_calData(uid, deploydf)
deploydf

In [None]:
def reformat_dataSource(x):
    new = x.replace('_Cal_Info.xlsx','.csv')
    return new

In [None]:
deploydf['dataSource'] = deploydf['dataSource'].apply(reformat_dataSource)
deploydf['RefDes'] = deploydf['subsite'] + '-' + deploydf['node'] + '-' + deploydf['sensor']

In [None]:
udf = udf.merge(deploydf, left_on='New Calibration CSV', right_on='dataSource')

In [None]:
udf

In [None]:
# Function to make an API request and print the results
def get_and_print_api(url):
    r = requests.get(url, auth=(username, token))
    data = r.json()
    for d in data:
        print(d)

Now to track down all of the streams with a particular instrument...or is that doing this the wrong way?

In [None]:
get_and_print_api('/'.join((base_url,sensor_url,'CP01CNSM','RID27')))

In [None]:
# Put together a walk function which will return all of the reference designators for a particular instrument class
inst = 'SPKIRB'
url = '/'.join((base_url, sensor_url))
r = requests.get(url, auth=(username, token))
arrays = r.json()
arrays = [a for a in arrays if a.startswith('CP') or a.startswith('G')]
for array in arrays:
    url = '/'.join((base_url, sensor_url, array))
    nodes = requests.get(url, auth=(username, token)).json()
    for node in nodes:
        url = '/'.join((base_url, sensor_url, array, node))
        sensors = requests.get(url, auth=(username, token)).json()
        sensors = [s for s in sensors if 'CTDBP' in s]
        for sensor in sensors:
            url = '/'.join((base_url, sensor_url, array, node, sensor, 'metadata'))
            metadata = requests.get(url, auth=(username, token)).json()
            print(array + '/' + node + '/' + sensor + ': ' + metadata)

In [None]:
arrays

In [None]:
node