# SPKIR Metadata Review

This notebook describes the process for reviewing the calibration coefficients for the SPKIR instruments.

In [1]:
import csv
import re
import os
import numpy as np
import pandas as pd

In [2]:
from utils import *

**=========================================================================================================================**
Define some useful functions for the metadata review (in the future these will be moved to a utilities file):

In [699]:
def whoi_asset_tracking(spreadsheet, sheet_name, instrument_class='All', whoi=True, series=None):
    """
    Read the asset tracking sheet and narrow it down to the requested
    instruments by applying up to three successive row filters.

    Args:
        spreadsheet - directory path and name of the excel spreadsheet with
            the WHOI asset tracking information.
        sheet_name - name of the sheet in the spreadsheet to load
        instrument_class - the type (i.e. CTDBP, CTDMO, PCO2W, etc). Defaults
            to 'All', which skips the instrument class filter
        whoi - when True (the default), drop every row whose
            'Deployment History' entry equals 'EA'
        series - keep only rows whose 'Series' equals this value. Defaults
            to None, which skips the series filter
    Returns:
        sensors - a pandas dataframe with the rows that passed every
            requested filter
    """

    # The column names are taken from the second row of the sheet (header=1)
    sensors = pd.read_excel(spreadsheet, sheet_name=sheet_name, header=1)

    # Filter on the instrument class unless everything was requested
    if instrument_class != 'All':
        sensors = sensors[sensors['Instrument\nClass'] == instrument_class]

    # Filter out the rows flagged 'EA' in the deployment history
    if whoi:
        sensors = sensors[sensors['Deployment History'] != 'EA']

    # Filter on a specific series of the instrument
    if series is not None:
        sensors = sensors[sensors['Series'] == series]

    return sensors

In [700]:
def load_asset_management(instrument, filepath):
    """
    Loads the calibration csv files from a local repository containing
    the asset management information.

    The input dataframe is only read, never modified.

    Args:
        instrument - a pandas dataframe with the asset tracking information
            for a specific instrument. Must contain a 'UID' column.
        filepath - the directory path pointing to where the csv files are
            stored.
    Raises:
        TypeError - if the instrument input is not a pandas dataframe
    Returns:
        csv_dict - a dictionary with keys of the UIDs from the instrument dataframe
            which correspond to lists of the file names in filepath that
            contain that UID. UIDs without any matching file are omitted.
    """
    # Imported locally so the function does not rely on the notebook
    # namespace providing fnmatch
    import fnmatch

    # Check that the input is a pandas DataFrame
    if not isinstance(instrument, pd.DataFrame):
        raise TypeError('instrument must be a pandas DataFrame, got '
                        f'{type(instrument).__name__}')

    uids = sorted(set(instrument['UID']))
    if not uids:
        return {}

    # List the directory once instead of once per UID
    available = os.listdir(filepath)

    csv_dict = {}
    for uid in uids:
        # Get all the files from asset management for this particular UID
        csv_files = [name for name in available
                     if fnmatch.fnmatch(name, '*' + uid + '*')]
        # Only record UIDs which actually have files
        if csv_files:
            csv_dict[uid] = csv_files

    return csv_dict

In [3]:
def get_qct_files(df, qct_directory):
    """
    Function which gets all the QCT document numbers associated with
    each instrument UID.

    The input dataframe is only read, never modified.

    Args:
        df - a pandas dataframe with the asset tracking information. Must
            contain the 'UID' and 'QCT Testing' columns.
        qct_directory - path to the directory containing the QCT files.
            Not used by this function; kept so existing calls still work.
    Returns:
        qct_dict - a dictionary of instrument uids with the associated list
            of entries from the 'QCT Testing' cell, split on newlines. A
            missing (NaN/None) cell gives an empty list.
    """

    qct_dict = {}
    for uid in set(df['UID']):
        # Same substring match on the UID column as before, but kept in a
        # local mask instead of being written back into df
        matches = df['UID'].apply(lambda x: uid in x)
        # Only the first matching row is used
        entry = df.loc[matches, 'QCT Testing'].iloc[0]
        if pd.isna(entry):
            # No QCT recorded for this instrument
            qct_dict[uid] = []
        else:
            qct_dict[uid] = str(entry).split('\n')
    return qct_dict

In [698]:
def get_calibration_files(serial_nums, cal_directory):
    """
    Look up, for every UID, the files in the calibration directory whose
    name contains both 'calibration_file' (case-insensitive) and the
    serial number of that instrument.

    Args:
        serial_nums - a dictionary mapping each instrument uid to its serial
            number. A value which is not a string is indexed and the string
            form of its first element is used.
        cal_directory - path to the directory containing the calibration files
    Returns:
        matched - a dictionary of instrument uids with associated
            calibration file names (an empty list when nothing matches)
    """
    matched = {}
    for uid, serial in serial_nums.items():
        # Normalize the serial number to a single string
        serial = serial if type(serial) is str else str(serial[0])
        # Keep the directory entries naming both the file type and the serial
        matched[uid] = [
            name for name in os.listdir(cal_directory)
            if 'calibration_file' in name.lower() and serial in name
        ]

    return matched

In [6]:
def splitDataFrameList(df,target_column):
    """
    Expand a column of iterables so that each element gets its own row.

    Args:
        df = dataframe to split
        target_column = the column containing the values to split. Each
            entry must be iterable (e.g. a list).
    Returns:
        new_df - a new dataframe with each entry for the target column
            separated, with each element moved into a new row. The
            values in the other columns are duplicated across the
            newly divided rows. Rows whose entry is empty produce no
            output rows. The input dataframe is left untouched.
    """

    # Iterate explicitly instead of using df.apply for its side effects:
    # apply gives no guarantee on how many times it calls the function,
    # which can duplicate the rows collected for the first record.
    new_rows = []
    for _, row in df.iterrows():
        for element in row[target_column]:
            new_row = row.to_dict()
            new_row[target_column] = element
            new_rows.append(new_row)

    return pd.DataFrame(new_rows)

In [7]:
# Helper to stage calibration sources in a local working area
def copy_to_local(cal_path):
    """
    Stage every file listed in cal_path under
    <current working directory>/temp/cal_data/<name>, where <name> is the
    file name up to its first '.'. Zip archives are unpacked member by
    member into that folder; any other file is copied into it.

    Args:
        cal_path - an iterable of '/'-separated file paths
    """

    for source in cal_path:
        # The folder is named after the file, dropping everything from the first dot
        basename = source.split('/')[-1]
        folder = basename.split('.')[0]
        savedir = '/'.join((os.getcwd(),'temp','cal_data',folder))
        # Make sure the destination is available before writing to it
        ensure_dir(savedir)

        # Anything which is not a zip archive is copied as-is
        if not source.endswith('.zip'):
            shutil.copy(source, savedir)
            continue

        with ZipFile(source,'r') as archive:
            for member in archive.namelist():
                archive.extract(member,path=savedir)

**====================================================================================================================**
Define the directories where the QCT, Pre, and Post deployment document files are stored, where the vendor documents are stored, where asset tracking is located, and where the calibration csvs are located.

In [8]:
# NOTE(review): these are absolute, machine-specific paths; they must be edited before the
# notebook can run anywhere else.
# Directory searched for the QCT document files
doc_directory = '/media/andrew/OS/Users/areed/Documents/Project_Files/Records/Instrument_Records/SPKIR/SPKIR_Results/'
# Directory searched for the vendor calibration files
cal_directory = '/media/andrew/OS/Users/areed/Documents/Project_Files/Records/Instrument_Records/SPKIR/SPKIR_Cal/'
# Directory holding the existing asset management calibration csvs
asset_management_directory = '/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/SPKIRB/'

In [9]:
# Asset tracking workbook and the sheet within it that is passed to whoi_asset_tracking
excel_spreadsheet = '/media/andrew/OS/Users/areed/Documents/Project_Files/Documentation/System/System Notebook/WHOI_Asset_Tracking.xlsx'
sheet_name = 'Sensors'

In [11]:
# Load the asset tracking rows for the series B SPKIR instruments (whoi filter left at its default)
SPKIR = whoi_asset_tracking(spreadsheet=excel_spreadsheet,sheet_name=sheet_name,instrument_class='SPKIR',series='B')
SPKIR

Unnamed: 0,Instrument Class,Series,Supplier Serial Number,WHOI #,OOI #,UID,Model,CGSN PN,Firmware Version,Supplier,...,QCT Testing,PreDeployment,Post Deployment,Refurbishment/ Repair,DO Number,Date Received,Deployment History,Current Deployment,Instrument Location on Current Deployment,Notes
1066,SPKIR,B,229,115140,A00092,CGINS-SPKIRB-00229,OCR-507 with Midrange Bioshutter,1336-00006-00002,3.0A,Satlantic,...,3305-00114-00001\n3305-00114-00025\n3305-00114...,,,3305-00900-00092\n3305-00900-00325,WH-SC12-15-SPKIR-1001,2012-11-19 00:00:00,GA01SUMO-00001\nGA01SUMO-00003\nCP03ISSM-00010,CP03ISSM-00010,NSIF,"Bioshutter 171\nReturned on 12/3, back on 12/1..."
1067,SPKIR,B,230,115141,A00093,CGINS-SPKIRB-00230,OCR-507 with Midrange Bioshutter,1336-00006-00002,3.0A,Satlantic,...,3305-00114-00002\n3305-00114-00056\n3305-00114...,,,3305-00900-00092\n3305-00900-00325,WH-SC12-15-SPKIR-1001,2012-11-19 00:00:00,GI Spare\nGA01SUMO-00001\nGA01SUMO-00003\nGS 5...,Pioneer 12 spare,,"Bioshutter 172\nReturned on 12/3, back on 12/12\n"
1068,SPKIR,B,238,115714,A00405,CGINS-SPKIRB-00238,OCR-507 with Midrange Bioshutter,1336-00006-00002,3.0A,Satlantic,...,3305-00114-00008\n3305-00114-00031\n3305-00114...,,,3305-00900-00008\n3305-00900-00206\n3305-00900...,WH-SC12-15-SPKIR-1005,2013-09-19 00:00:00,CP01CNSM-00001\nGA01SUMO-00002\nCP03ISSM-00007,GI01SUMO-00006,Halo,Bioshutter 184
1069,SPKIR,B,239,115715,A00406,CGINS-SPKIRB-00239,OCR-507 with Midrange Bioshutter,1336-00006-00002,3.0A,Satlantic,...,3305-00114-00009\n3305-00114-00052\n3305-00114...,,,3305-00900-00075\n3305-00900-00277,WH-SC12-15-SPKIR-1005,2013-09-19 00:00:00,CP1 Spare\nGI01SUMO-00001\nGI01SUMO-00003\nGI0...,GI01SUMO-00005,Halo,Bioshutter 185
1070,SPKIR,B,240,115716,A00407,CGINS-SPKIRB-00240,OCR-507 with Midrange Bioshutter,1336-00006-00002,3.0A,Satlantic,...,3305-00114-00010\n3305-00114-00049\n3305-00114...,,,3305-00900-00067\n3305-00900-00277,WH-SC12-15-SPKIR-1005,2013-09-19 00:00:00,GI01SUMO-00001\nGI01SUMO-00003\nGI01SUMO-00005,GI01SUMO-00005,NSIF,Bioshutter 186 \n(NSIF)
1071,SPKIR,B,269,116534,A00857,CGINS-SPKIRB-00269,OCR-507 with Midrange Bioshutter,1336-00006-00002,3.2A,Satlantic,...,3305-00114-00019,,,3305-00900-00109,WH-SC12-15-SPKIR-1009,2014-06-12 00:00:00,GS01SUMO-00001\nGA/GS spare\nCP8 spare\nGI Spa...,,,Bioshutter 216
1072,SPKIR,B,270,116535,A00858,CGINS-SPKIRB-00270,OCR-507 with Midrange Bioshutter,1336-00006-00002,3.2A,Satlantic,...,3305-00114-00020\n3305-00114-00050\n3305-00114...,,,3305-00900-00071\n3305-00900-00173\n3305-00900...,WH-SC12-15-SPKIR-1009,2014-06-12 00:00:00,CP03ISSM-00001\nCP01CNSM-00005\nCP01CNSM-00007...,,NSIF,Bioshutter 217\nVendor was contacted regarding...
1073,SPKIR,B,271,116536,A00859,CGINS-SPKIRB-00271,OCR-507 with Midrange Bioshutter,1336-00006-00002,3.2A,Satlantic,...,3305-00114-00018\n3305-00114-00043\n3305-00114...,,,3305-00900-00042\n3305-00900-00173\n3305-00900...,WH-SC12-15-SPKIR-1009,2014-06-12 00:00:00,CP04OSSM-00001\nCP04OSSM-00004\nCP04OSSM-00006...,,NSIF,Bioshutter 218
1074,SPKIR,B,272,116537,A00860,CGINS-SPKIRB-00272,OCR-507 with Midrange Bioshutter,1336-00006-00002,3.2A,Satlantic,...,3305-00114-00017\n3305-00114-00051,,,3305-00900-00071,WH-SC12-15-SPKIR-1009,2014-06-12 00:00:00,CP01CNSM-00002\nCP01CNSM-00003\nGS01SUMO-00003,,,Bioshutter 219
1075,SPKIR,B,273,116684,A00976,CGINS-SPKIRB-00273,OCR-507 with Midrange Bioshutter,1336-00006-00002,3.2A,Satlantic,...,3305-00114-00021\n3305-00114-00053,,,3305-00900-00071,WH_SC12-15-SPKIR-1010,2014-08-15 00:00:00,CP3a Spare\nCP03ISSM-00002\nGS01SUMO-00003,,,Bioshutter 221


**======================================================================================================================**
Now, I want to load all the calibration csvs and group them by UID:

In [12]:
# Unique UIDs in sorted order; later cells select an instrument by its position in this list
uids = sorted( list( set(SPKIR['UID']) ) )
uids

['CGINS-SPKIRB-00229',
 'CGINS-SPKIRB-00230',
 'CGINS-SPKIRB-00238',
 'CGINS-SPKIRB-00239',
 'CGINS-SPKIRB-00240',
 'CGINS-SPKIRB-00269',
 'CGINS-SPKIRB-00270',
 'CGINS-SPKIRB-00271',
 'CGINS-SPKIRB-00272',
 'CGINS-SPKIRB-00273',
 'CGINS-SPKIRB-00274',
 'CGINS-SPKIRB-00275',
 'CGINS-SPKIRB-00276',
 'CGINS-SPKIRB-00288',
 'CGINS-SPKIRB-00289',
 'CGINS-SPKIRB-00290',
 'CGINS-SPKIRB-00291',
 'CGINS-SPKIRB-00299',
 'CGINS-SPKIRB-00300',
 'CGINS-SPKIRB-00301',
 'CGINS-SPKIRB-00302']

In [13]:
# Map each UID to the sorted asset management file names that contain it
csv_dict = {}
# List the directory once and reuse the listing for every UID
asset_management = os.listdir(asset_management_directory)
for uid in uids:
    files = [file for file in asset_management if uid in file]
    csv_dict.update({uid: sorted(files)})

In [14]:
# Map each UID to the paths returned by generate_file_path for its csv files
csv_paths = {}
for uid in sorted(csv_dict.keys()):
    paths = []
    for file in csv_dict.get(uid):
        # NOTE(review): '.ext' looks like a placeholder extension - confirm against generate_file_path
        path = generate_file_path(asset_management_directory, file, ext=['.csv','.ext'])
        paths.append(path)
    csv_paths.update({uid: paths})

In [15]:
# Display the UID -> csv path mapping for a visual check
csv_paths

{'CGINS-SPKIRB-00229': ['/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/SPKIRB/CGINS-SPKIRB-00229__20150315.csv',
  '/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/SPKIRB/CGINS-SPKIRB-00229__20161027.csv',
  '/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/SPKIRB/CGINS-SPKIRB-00229__20180420.csv'],
 'CGINS-SPKIRB-00230': ['/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/SPKIRB/CGINS-SPKIRB-00230__20150315.csv',
  '/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/SPKIRB/CGINS-SPKIRB-00230__20161027.csv',
  '/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/SPKIRB/CGINS-SPKIRB-00230__20180418.csv'],
 'CGINS-SPKIRB-00238': ['/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/SPKIRB/CGINS-SPKIRB-00238__20131121.csv',
  '/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibratio

**=======================================================================================================================**
The SPKIR QCT capture files are stored with the following Document Control Numbers (DCNs): 3305-00114-XXXXX. Most are stored as **.txt** or **.log** files. The problem is that it is not clear how the data in the QCT files is encoded. Consequently, the QCT files aren't going to be used to check the SPKIR instrument calibration (for now).




In [18]:
# Collect the QCT document numbers for each UID from the asset tracking dataframe
qct_dict = get_qct_files(SPKIR, doc_directory)
# Map each UID to the paths returned by generate_file_path for its QCT documents
qct_paths = {}
for uid in sorted(qct_dict.keys()):
    paths = []
    for file in qct_dict.get(uid):
        path = generate_file_path(doc_directory, file)
        paths.append(path)
    qct_paths.update({uid: paths})

**=======================================================================================================================** Find and return the calibration files which contain vendor supplied calibration information. This is achieved by searching the calibration directories and matching serial numbers to UIDs:

In [19]:
# get_serial_nums is not defined in this notebook (presumably provided by utils - confirm);
# its result is used below as a uid -> serial number mapping
serial_nums = get_serial_nums(SPKIR, uids)

In [693]:
# Trailing semicolon suppresses the cell output
serial_nums;

In [21]:
# Map each UID to the vendor calibration file names that contain its serial number
cal_dict = get_calibration_files(serial_nums, cal_directory)

In [22]:
# Retrieve and save the full directory path to the calibration files
# Retrieve and save the full directory path to the calibration files, keyed by UID
cal_paths = {}
for uid in sorted(cal_dict.keys()):
    paths = []
    for file in cal_dict.get(uid):
        # NOTE(review): the zip extraction cell further down opens every path with ZipFile,
        # so any non-zip path returned here would fail at that point - confirm
        path = generate_file_path(cal_directory, file, ext=['.zip','.cap', '.txt', '.log'])
        paths.append(path)
    cal_paths.update({uid: paths})

In [692]:
# Trailing semicolon suppresses the cell output
cal_paths;

**=======================================================================================================================**
# Parsing Calibration Coefficients
Above, we have worked through identifying and mapping the calibration files and QCT check-ins to the individual instruments through their UIDs and serial numbers. The next step is to open the relevant files and parse out the calibration coefficients. This will require writing a parser for the SPKIR.

In [24]:
class SPKIRCalibration():
    """
    Class that stores calibration values for a single SPKIR.

    Attributes (set in __init__ and filled in by the load/parse methods):
        uid - the instrument UID; validated when it is set
        serial - the last dash-separated field of the UID with leading
            zeros removed
        date - list of calibration dates (YYYYMMDD strings) found so far
        coefficients - dictionary of coefficient name -> list of values,
            stored as the strings read from the calibration file
        notes - dictionary of coefficient name -> free text note
        source - description of the file the coefficients came from;
            an empty string until a calibration file has been opened
    """

    def __init__(self, uid):
        self.serial = None
        # Initialized here so write_csv also works when parse_cal was fed
        # data directly, without going through open_cal
        self.source = ''
        self.uid = uid
        self.date = []
        self.coefficients = {
            'CC_immersion_factor': [],
            'CC_offset': [],
            'CC_scale': []
        }
        self.notes = {
            'CC_immersion_factor': '',
            'CC_offset': '',
            'CC_scale': '',
        }

    @property
    def uid(self):
        return self._uid

    @uid.setter
    def uid(self, d):
        """
        Validate the UID against the 5-6-5 character pattern and derive
        the serial number from it.

        Raises:
            ValueError - if the UID does not match the pattern
        """
        r = re.compile('.{5}-.{6}-.{5}')
        if r.match(d) is None:
            # ValueError is a subclass of Exception, so callers which
            # catch Exception keep working
            raise ValueError(f"The instrument uid {d} is not a valid uid. Please check.")
        self._uid = d
        self.serial = d.split('-')[-1].lstrip('0')

    def load_cal(self, filepath):
        """
        Wrapper function to load all of the calibration coefficients

        Args:
            filepath - path to the directory with filename which has the
                calibration coefficients to be parsed and loaded
        Calls:
            open_cal
            parse_cal
        """

        data = self.open_cal(filepath)

        self.parse_cal(data)

    def open_cal(self, filepath):
        """
        Function that opens and reads in cal file
        information for a SPKIR. Zipfiles are acceptable inputs.

        Args:
            filepath - path to either a .cal file or a .zip archive which
                contains exactly one .cal file
        Raises:
            FileExistsError - if the archive holds more than one .cal file
            FileNotFoundError - if the archive holds no .cal file, or the
                path is neither a .zip nor a .cal file
        Returns:
            data - the text content of the calibration file
        """
        # Imported locally so the class does not depend on the notebook
        # namespace providing ZipFile
        from zipfile import ZipFile

        lowered = filepath.lower()
        if lowered.endswith('.zip'):
            with ZipFile(filepath) as zfile:
                # Find the .cal members of the archive
                cal_names = [name for name in zfile.namelist() if name.lower().endswith('.cal')]

                if len(cal_names) > 1:
                    raise FileExistsError(f"Multiple .cal files found in {filepath}.")
                if not cal_names:
                    raise FileNotFoundError(f"No .cal file found in {filepath}.")

                data = zfile.read(cal_names[0]).decode('ascii')
            self.source_file(filepath, cal_names[0])

        elif lowered.endswith('.cal'):
            with open(filepath) as file:
                data = file.read()
            # Pass the file name rather than the (already closed) file object
            self.source_file(filepath, filepath.split('/')[-1])

        else:
            raise FileNotFoundError(f"No .cal file found in {filepath}.")

        return data

    def parse_cal(self, data):
        """
        Parse the text of a calibration file into the object's attributes.

        Line handling:
            '#' lines - when split on '|' into more than five fields with
                'Calibration' in the last one, the first field is read as
                the calibration date
            'SN' lines - the second token is the serial number, which must
                match the serial number derived from the UID
            'ED' lines - mark that the next line that is not a '#', 'SN'
                or 'ED' line holds "offset scale immersion_factor"

        Args:
            data - the text content of a calibration file
        Raises:
            ValueError - if the serial number in the file does not match
                the UID
        """

        expect_coefficients = False
        for line in data.splitlines():
            if line.startswith('#'):
                parts = line.split('|')
                if len(parts) > 5 and 'Calibration' in parts[-1].strip():
                    cal_date = parts[0].replace('#','').strip()
                    self.date.append(pd.to_datetime(cal_date).strftime('%Y%m%d'))

            elif line.startswith('SN'):
                _, sn, *ignore = line.split()
                sn = sn.lstrip('0')
                if self.serial != sn:
                    raise ValueError(f'Instrument serial number {sn} does not match UID {self.uid}')

            elif line.startswith('ED'):
                expect_coefficients = True

            elif expect_coefficients:
                # Values are kept as the strings found in the file
                offset, scale, immersion_factor = line.split()
                self.coefficients['CC_immersion_factor'].append(immersion_factor)
                self.coefficients['CC_offset'].append(offset)
                self.coefficients['CC_scale'].append(scale)
                expect_coefficients = False

    def source_file(self, filepath, filename):
        """
        Routine which parses out the source file and filename
        where the calibration coefficients are sourced from.

        Args:
            filepath - '/'-separated path of the file which was opened
            filename - name of the calibration file; only used when
                filepath is not itself a .cal file (it is then the
                member name inside the archive)
        """

        parts = filepath.split('/')
        if filepath.lower().endswith('.cal'):
            # Use the containing folder as the document identifier; a bare
            # file name has no folder
            dcn = parts[-2] if len(parts) > 1 else ''
            filename = parts[-1]
        else:
            dcn = parts[-1]

        self.source = f'Source file: {dcn} > {filename}'

    def write_csv(self, outpath):
        """
        This function writes the correctly named csv file for the SPKIR to the
        specified directory, after asking for confirmation on the console.

        The file is named <uid>__<latest calibration date>.csv and has the
        columns serial, name, value and notes, sorted by coefficient name.
        The source file description, when known, is appended to the notes
        of the first row.

        Args:
            outpath - directory path of where to write the csv file
        Raises:
            ValueError - raised if any entry of the coefficient dictionary
                has not been populated, or if no calibration date was found
        """

        # Run a check that the coefficients have actually been loaded. The
        # dictionary always has its three keys, so the lists are checked.
        if not all(len(values) > 0 for values in self.coefficients.values()):
            raise ValueError('No calibration coefficients have been loaded.')
        if not self.date:
            raise ValueError('No calibration date has been loaded.')

        # Create a dataframe to write to the csv
        data = {
            'serial': [self.serial]*len(self.coefficients),
            'name': list(self.coefficients.keys()),
            'value': list(self.coefficients.values())
        }
        df = pd.DataFrame.from_dict(data)

        # Now merge the coefficients dataframe with the notes
        notes = pd.DataFrame.from_dict({
            'name':list(self.notes.keys()),
            'notes':list(self.notes.values())
        })
        df = df.merge(notes, how='outer', on='name')

        # Add in the source file. A single .loc assignment is used because
        # chained indexing (df['notes'].iloc[0] = ...) may write to a copy.
        if self.source:
            first = df.index[0]
            df.loc[first, 'notes'] = df.loc[first, 'notes'] + ' ' + self.source

        # Sort the data by the coefficient name
        df = df.sort_values(by='name')

        # Generate the csv names
        csv_name = self.uid + '__' + max(self.date) + '.csv'

        # Write the dataframe to a csv file
        check = input(f"Write {csv_name} to {outpath}? [y/n]: ")
        if check.lower().strip() == 'y':
            df.to_csv(os.path.join(outpath, csv_name), index=False)

**=======================================================================================================================**
# Source Loading of Calibration Coefficients
With a SPKIR Calibration object created, we can now begin parsing the different calibration sources for each SPKIR. We will then compare all of the calibration values from each of the sources, checking for any discrepancies between them.

Below, I plan on going through each of the SPKIR UIDs, and parse the data into csvs. For source files which may contain multiple calibrations or calibration sources, I plan on extracting each of the calibrations to a temporary folder using the following structure:

    <local working directory>/<temp>/<source>/data/<calibration file>
    
The separate calibrations will be saved using the standard UFrame naming convention with the following directory structure:

    <local working directory>/<temp>/<source>/<calibration csv>
    
The csvs themselves will also be copied to the temporary folder. This allows for the program to be looking into the same temp directory for every SPKIR check.

In [25]:
import shutil

In [643]:
# Select the single instrument to review in this pass; index 20 is the last entry of the
# sorted uids list shown above. Change the index to review a different instrument.
uid = uids[20]
uid

'CGINS-SPKIRB-00302'

Make the local temp directory. If it already exists, purge it and recreate it:

In [644]:
temp_directory = '/'.join((os.getcwd(),'temp'))
# Purge the results of any previous run ...
if os.path.exists(temp_directory):
    shutil.rmtree(temp_directory)
# ... and always (re)create the directory, including on the very first run
# when there was nothing to purge
ensure_dir(temp_directory)

Copy the existing csvs from asset management to the temp directory:

In [645]:
# Copy each existing asset management csv for the selected uid into <temp>/csv,
# keeping its original file name
for path in csv_paths[uid]:
    savedir = '/'.join((temp_directory,'csv'))
    ensure_dir(savedir)
    savepath = '/'.join((savedir, path.split('/')[-1]))
    shutil.copyfile(path, savepath)

**=======================================================================================================================**
Load the calibration coefficients from the vendor calibration source files. Start by extracting or copying them to the source data folder in the temporary directory.

Extract the calibration zip files to the local temp directory:

In [647]:
for path in cal_paths[uid]:
    # Name the extraction folder after the archive without its extension.
    # str.strip('.zip') removes any of the characters '.', 'z', 'i', 'p' from
    # BOTH ends of the name, so the extension is split off explicitly instead.
    exdir = os.path.splitext(path.split('/')[-1])[0]
    expath = '/'.join((temp_directory,'cal','data',exdir))
    with ZipFile(path) as zfile:
        # Only the .cal members of the archive are extracted
        files = [name for name in zfile.namelist() if name.lower().endswith('.cal')]
        for file in files:
            ensure_dir(expath)
            zfile.extract(file,path=expath)

Write the vendor calibration files to csvs following the UFrame convention:

In [648]:
# The parsed calibrations are written to <temp>/cal
savedir = '/'.join((temp_directory,'cal'))
ensure_dir(savedir)
# Now parse the calibration coefficients from every file under <temp>/cal/data
for dirpath, dirnames, filenames in os.walk('/'.join((temp_directory,'cal','data'))):
    for file in filenames:
        filepath = os.path.join(dirpath, file)
        # With the filepath for the given calibration retrieved, start a fresh instance of the
        # SPKIRCalibration object for the selected uid and parse the coefficients
        spkir = SPKIRCalibration(uid)
        spkir.load_cal(filepath)
        # write_csv asks for confirmation on the console before writing each file
        spkir.write_csv(savedir)

Write CGINS-SPKIRB-00302__20150729.csv to /home/andrew/Documents/OOI-CGSN/QAQC_Sandbox/Metadata_Review/temp/cal? [y/n]: y
Write CGINS-SPKIRB-00302__20170911.csv to /home/andrew/Documents/OOI-CGSN/QAQC_Sandbox/Metadata_Review/temp/cal? [y/n]: y


**=======================================================================================================================**
# Calibration Coefficient Comparison
We have now successfully parsed the calibration files from all the possible sources: the vendor calibration files, the pre-deployment files, and the post-deployment files. Furthermore, we have saved csvs in the UFrame format for all of these calibrations. Now, we want to load those csvs into pandas dataframes, which allow for easy element-by-element comparison of calibration coefficients.

In [650]:
def get_file_date(x):
    """
    Extract the calibration date from a UFrame-style csv name, i.e. the
    text between the '__' separator and the next '.'.

    Args:
        x - file name or path such as 'CGINS-SPKIRB-00302__20150729.csv';
            it is converted to a string first
    Raises:
        ValueError - if the name has no '__', or no '.' after it
    Returns:
        the date portion of the name as a string
    """
    name = str(x)
    start = name.index('__') + 2
    # Search for the dot only after the separator, so that dots earlier in
    # the path (e.g. './temp/...') cannot produce an empty result
    end = name.index('.', start)
    return name[start:end]

In [689]:
# Now we want to compare dataframe
csv_files = [file for file in sorted(os.listdir('temp/csv')) if 'data' not in file]
csv_files = pd.DataFrame(csv_files, columns=['csv'])
csv_files['cal date'] = csv_files['csv'].apply(lambda x: get_file_date(x))
csv_files.set_index('cal date', inplace=True)

In [690]:
# Now we want to compare dataframe
cal_files = [file for file in sorted(os.listdir('temp/cal')) if 'data' not in file]
cal_files = pd.DataFrame(cal_files, columns=['cal'])
cal_files['cal date'] = cal_files['cal'].apply(lambda x: get_file_date(x))
cal_files.set_index('cal date', inplace=True)

In [691]:
# Line the two file lists up by calibration date; a date present in only one source
# shows the string '-999' in the other column
df_files = csv_files.join(cal_files,how='outer').fillna(value='-999')
df_files

Unnamed: 0_level_0,csv,cal
cal date,Unnamed: 1_level_1,Unnamed: 2_level_1
20150729,CGINS-SPKIRB-00302__20150729.csv,CGINS-SPKIRB-00302__20150729.csv
20170911,CGINS-SPKIRB-00302__20170911.csv,CGINS-SPKIRB-00302__20170911.csv


In [661]:
# Rename above CSV file names
sn = '00302'
d1 = '20170913'
d2 = '20170911'

In [662]:
# Rename the copied csv in temp/csv from date d1 to date d2
# NOTE(review): the paths are relative to the current working directory, and the move
# fails if the source file is not there (e.g. when this cell is run a second time)
src = 'temp/csv/' + f'CGINS-SPKIRB-{sn}__{d1}.csv'
dst = 'temp/csv/' + f'CGINS-SPKIRB-{sn}__{d2}.csv'
shutil.move(src, dst)

'temp/csv/CGINS-SPKIRB-00302__20170911.csv'

Reload the csv files in order to perform the comparison:

In [664]:
# CSV files
csv_files = [file for file in sorted(os.listdir('temp/csv')) if 'data' not in file]
csv_files = pd.DataFrame(csv_files, columns=['csv'])
csv_files['cal date'] = csv_files['csv'].apply(lambda x: get_file_date(x))
csv_files.set_index('cal date', inplace=True)

In [665]:
# Calibration source files
cal_files = [file for file in sorted(os.listdir('temp/cal')) if 'data' not in file]
cal_files = pd.DataFrame(cal_files, columns=['cal'])
cal_files['cal date'] = cal_files['cal'].apply(lambda x: get_file_date(x))
cal_files.set_index('cal date', inplace=True)

In [666]:
# Line the two file lists up by calibration date again; '-999' marks a missing counterpart
df_files = csv_files.join(cal_files,how='outer').fillna(value='-999')

In [667]:
# Display the side-by-side file listing to confirm the dates now match
df_files

Unnamed: 0_level_0,csv,cal
cal date,Unnamed: 1_level_1,Unnamed: 2_level_1
20150729,CGINS-SPKIRB-00302__20150729.csv,CGINS-SPKIRB-00302__20150729.csv
20170911,CGINS-SPKIRB-00302__20170911.csv,CGINS-SPKIRB-00302__20170911.csv


**=======================================================================================================================**
Now, with the csv files renamed to match their associated calibration dates following the OOI UFrame format, we can load the info into pandas dataframes, which will allow for the direct comparison of calibration coefficients using built-in array comparison tools from numpy.

A complication is that, when loading a csv using pandas, it reads the csv as strings. This includes characters such as **[]**. Consequently, we need to reformat the arrays in the dataframe and convert to 64-bit floating point numbers. 

In [684]:
def reformat_arrays(array):
    """
    Convert the string form of a coefficient array, as read back from a
    csv, into floating point numbers.

    Args:
        array - a string such as "['1.368', '1.410']" or "[1.368, 1.41]"
    Returns:
        a list of floats; a single float when the string holds exactly one
        number; or, when any element is not a valid number, the list of
        stripped string elements
    """
    # First, need to strip extraneous characters from the array
    cleaned = array.replace("'","").replace('[','').replace(']','')
    # Next, split into elements and drop the surrounding white space
    elements = [num.strip() for num in cleaned.split(',')]
    # Now convert to floats. Only a failed conversion is tolerated; any other
    # error is allowed to surface instead of being silently swallowed.
    try:
        numbers = [float(num) for num in elements]
    except ValueError:
        return elements
    # A single number is returned as a scalar rather than a list
    if len(numbers) == 1:
        return numbers[0]
    return numbers

Load the calibration coefficients into pandas dataframes:

In [676]:
# Use fstring literals to allow on the fly file-renaming
dt = '20170911'
fname = f'CGINS-SPKIRB-{sn}__{dt}.csv'

In [677]:
# Coefficients currently in asset management, for the selected calibration date
CSV = pd.read_csv('temp/csv/'+fname)
CSV

Unnamed: 0,serial,name,value,notes
0,302,CC_immersion_factor,"[1.368, 1.41, 1.365, 1.354, 1.372, 1.322, 1.347]",
1,302,CC_offset,"[2147969425.9, 2147121996.9, 2147405792.6, 214...",
2,302,CC_scale,"[2.01373417518e-07, 1.9315808239e-07, 2.003087...",


In [678]:
# Coefficients parsed from the vendor calibration file, for the same calibration date
CAL = pd.read_csv('temp/cal/'+fname)
CAL

Unnamed: 0,serial,name,value,notes
0,302,CC_immersion_factor,"['1.368', '1.410', '1.365', '1.354', '1.372', ...",Source file: SPKIR_OCR-507_SN_302_Calibration...
1,302,CC_offset,"['2147969425.9', '2147121996.9', '2147405792.6...",
2,302,CC_scale,"['2.01373417518e-007', '1.9315808239e-007', '2...",


In [680]:
# Reformat the arrays
CSV['value'] = CSV['value'].apply(lambda x: reformat_arrays(x))
CAL['value'] = CAL['value'].apply(lambda x: reformat_arrays(x))

In [681]:
# Check that the calibration coefficients agree
np.equal(CSV,CAL)

Unnamed: 0,serial,name,value,notes
0,True,True,True,False
1,True,True,True,False
2,True,True,False,False


In [683]:
# Check the source file for the calibration coefficients
CAL['notes'].iloc[0]

' Source file: SPKIR_OCR-507_SN_302_Calibration_Files_2017-09-13 > DI7302B.cal'