# CTDMO Metadata Review

This notebook describes the process for reviewing the calibration coefficients for the CTDMO IM-37. The CTDMO contains 24 different calibration coefficients to check. Additionally, possible calibration sources include vendor documents as well as the QCT check-in. 

A complication is that the vendor documents are principally available only as PDFs that are copies of images. This requires the use of Optical Character Recognition (OCR) in order to parse the PDFs. Unfortunately, OCR frequently misinterprets certain character combinations, since it utilizes Levenshtein distance to do character matching. 

Furthermore, using OCR to read PDFs requires significant preprocessing of the PDFs to create individual PDFs with uniform metadata and encoding. Without this preprocessing, the OCR will not generate uniformly spaced characters, making parsing significantly more difficult, if not impossible.

In [1]:
# Import likely important packages, etc.
import sys, os, csv, re
from wcmatch import fnmatch
import datetime
import time
import xml.etree.ElementTree as et
from zipfile import ZipFile
import numpy as np
import pandas as pd
import xarray as xr
import shutil

In [2]:
from utils import *

**=========================================================================================================================**
Define some useful functions for the metadata review (in future will move to a utilities file):

In [3]:
def whoi_asset_tracking(spreadsheet, sheet_name, instrument_class='All', whoi=True, series=None):
    """
    Reads one sheet of the WHOI asset tracking workbook and narrows it down
    to the requested instruments.

    Args:
        spreadsheet - directory path and name of the excel spreadsheet with
            the WHOI asset tracking information.
        sheet_name - name of the sheet in the spreadsheet to load
        instrument_class - the type (i.e. CTDBP, CTDMO, PCO2W, etc). Defaults
            to 'All', which keeps every instrument class
        whoi - when True (the default), rows whose 'Deployment History'
            equals 'EA' are dropped
        series - a specified series of the instrument to keep. Defaults to
            None, which keeps every series
    Returns:
        sensors - a pandas dataframe holding the rows of the sheet which
            passed all of the requested filters
    """

    # The second row of the sheet (header=1) holds the column names
    sensors = pd.read_excel(spreadsheet, sheet_name=sheet_name, header=1)

    # Select a specific class of instruments
    if instrument_class != 'All':
        sensors = sensors[sensors['Instrument\nClass'] == instrument_class]

    # Return only the whoi instruments?
    if whoi:
        sensors = sensors[sensors['Deployment History'] != 'EA']

    # Select a specific series of the instrument?
    if series is not None:
        sensors = sensors[sensors['Series'] == series]

    return sensors

In [4]:
def load_asset_management(instrument, filepath):
    """
    Finds the calibration csv files in a local copy of asset management
    which belong to each instrument UID.

    Args:
        instrument - a pandas dataframe with the asset tracking information
            for a specific instrument. Must have a 'UID' column. The
            dataframe is not modified.
        filepath - the directory path pointing to where the csv files are
            stored.
    Raises:
        TypeError - if the instrument input is not a pandas dataframe
    Returns:
        csv_dict - a dictionary with keys of the UIDs from the instrument dataframe
            which correspond to lists of the file names in filepath that
            contain that UID. UIDs without any matching file are left out.

    """

    # Check that the input is a pandas DataFrame
    if not isinstance(instrument, pd.DataFrame):
        raise TypeError(
            'instrument must be a pandas DataFrame, got '
            f'{type(instrument).__name__}')

    uids = sorted(set(instrument['UID']))

    # The directory listing does not depend on the UID, so read it only once
    available = os.listdir(filepath)

    csv_dict = {}
    for uid in uids:
        # Get all the csvs from asset management for a particular UID
        csv_files = [file for file in available
                     if fnmatch.fnmatch(file, '*'+uid+'*')]

        # Only keep UIDs which have at least one file
        if csv_files:
            csv_dict[uid] = csv_files

    return csv_dict

In [5]:
def get_serial_nums(df, uids):
    """
    Returns the serial numbers of all the instrument uids.

    Args:
        df - dataframe with the asset management information. Must have
            'UID' and 'Supplier\\nSerial Number' columns. The dataframe is
            not modified.
        uids - list of the uids for the instruments
    Returns:
        serial_nums - a dictionary of uids (key) matched to their
            respective serial numbers. For uids containing 'CTD' the value
            is a string: the part of the first matching serial number
            between its first and second hyphen (all that follows the
            hyphen when there is only one). For all other uids the value
            is the list of every matching serial number.
    Raises:
        IndexError - if a uid containing 'CTD' has no matching row, or its
            serial number has no hyphen

    """
    serial_nums = {}

    for uid in uids:
        # Build the row mask locally rather than writing a scratch column
        # into the caller's dataframe
        matches = df['UID'].apply(lambda x: uid in x).astype(bool)
        serial_num = list(df.loc[matches, 'Supplier\nSerial Number'])
        if 'CTD' in uid:
            serial_num = serial_num[0].split('-')[1]
        serial_nums[uid] = serial_num

    return serial_nums

In [6]:
def get_qct_files(df, qct_directory):
    """
    Function which gets the QCT document identifiers listed for each
    instrument uid in the asset tracking dataframe.

    Args:
        df - dataframe with the asset tracking information. Must have
            'UID' and 'QCT Testing' columns. The dataframe is not modified.
        qct_directory - path to the directory containing the QCT files.
            Currently unused; kept so existing callers keep working.
    Returns:
        qct_dict - a dictionary of instrument uids with the list of QCT
            document identifiers, taken from the newline-separated
            'QCT Testing' entry of the first row matching the uid. A uid
            whose entry is not a string (e.g. a blank cell) maps to an
            empty list.
    """

    qct_dict = {}
    # Sorted so the dictionary is built in a reproducible order
    for uid in sorted(set(df['UID'])):
        # Build the row mask locally rather than writing a scratch column
        # into the caller's dataframe
        matches = df['UID'].apply(lambda x: uid in x).astype(bool)
        entry = df.loc[matches, 'QCT Testing'].iloc[0]
        # Only a string can be split; anything else means nothing is listed
        qct_dict[uid] = entry.split('\n') if isinstance(entry, str) else []
    return qct_dict

In [7]:
def get_calibration_files(serial_nums, dirpath):
    """
    Function which gets all the calibration files associated with the
    instrument serial numbers.

    Args:
        serial_nums - a dictionary of instrument uids (key) matched to
            their serial number. A value may be a single serial number or
            a list, in which case only the first entry is used.
        dirpath - path to the directory containing the calibration files
    Returns:
        calibration_files - a dictionary of instrument uids with associated
            calibration files: the names of the files in dirpath which
            contain 'calibration_file' (case-insensitive) and the serial
            number. A uid with an empty list of serial numbers gets an
            empty list of files.
    """
    # Search the directory which was passed in (not a notebook global), and
    # list it only once since the listing does not depend on the uid
    candidates = [file for file in os.listdir(dirpath)
                  if 'calibration_file' in file.lower()]

    calibration_files = {}
    for uid, sn in serial_nums.items():
        if isinstance(sn, list):
            sn = sn[0] if sn else None
        if sn is None:
            files = []
        else:
            # Serial numbers may not be strings, so cast before the
            # substring test
            sn = str(sn)
            files = [file for file in candidates if sn in file]
        calibration_files[uid] = files

    return calibration_files

In [8]:
def ensure_dir(filepath):
    """
    Makes sure the directory you want to save a file into is there.

    Args:
        filepath - the directory path to check. When nothing exists at
            this path the directory is created, along with any missing
            parent directories.
    """
    # Nothing to do when the path is already present
    if os.path.exists(filepath):
        return
    os.makedirs(filepath)

In [9]:
# Load all of the calibration csv files for each UID
def load_csv_info(csv_dict,filepath):
    """
    Loads the calibration coefficient information contained in asset management

    Args:
        csv_dict - a dictionary which associates an instrument UID to the
            calibration csv files in asset management. The file names must
            look like <UID>__<YYYYMMDD>.csv, and each csv must have 'name'
            and 'value' columns.
        filepath - the path to the directory containing the calibration csv
            files, with or without a trailing separator
    Returns:
        csv_cals - a dictionary which associates an instrument UID to a pandas
            dataframe which contains the calibration coefficients. The dataframes
            are indexed by the date of calibration, with one column per
            coefficient name. A UID with no files maps to an empty dataframe.
    """

    csv_cals = {}
    for uid, files in csv_dict.items():
        # Load each calibration csv and tag its rows with the calibration
        # date taken from the file name
        frames = []
        for file in files:
            # os.path.join so filepath does not need a trailing separator
            data = pd.read_csv(os.path.join(filepath, file))
            date = file.split('__')[1].split('.')[0]
            data['CAL DATE'] = pd.to_datetime(date)
            frames.append(data)

        if not frames:
            csv_cals[uid] = pd.DataFrame()
            continue

        # pd.concat replaces DataFrame.append, which newer pandas removed
        cals = pd.concat(frames)

        # Pivot so each row is a calibration date and each column a coefficient
        csv_cals[uid] = cals.pivot(index='CAL DATE', columns='name', values='value')

    return csv_cals

In [10]:
def splitDataFrameList(df, target_column):
    """
    Expands a column of iterables so that every element gets its own row.

    Args:
        df = dataframe to split
        target_column = the column containing the values to split
    Returns:
        new_df - a dataframe with each entry for the target column
            separated, with each element moved into a new row. The
            values in the other columns are duplicated across the
            newly divided rows.
    """
    expanded = []

    def _expand(record):
        # One output row per element, copying the rest of the record as-is
        for element in record[target_column]:
            expanded.append({**record.to_dict(), target_column: element})

    df.apply(_expand, axis=1)
    return pd.DataFrame(expanded)

In [11]:
def generate_file_path(dirpath, filename, ext=('.cap', '.txt', '.log'), exclude=('_V', '_Data_Workshop')):
    """
    Function which searches for the location of the given file and returns
    the full path to the file.

    Args:
        dirpath - parent directory path under which to search
        filename - the name of the file to search for. If it already has
            an extension, that extension is searched for instead of ext.
        ext - file endings to search for
        exclude - optional list which allows for excluding certain
            directories from the search
    Returns:
        fpath - the path (dirpath joined with the location found beneath
            it) of the first file whose name starts with filename and ends
            with one of the file endings, or None if there is no such file.
    """
    # Check if the input file name has an extension already
    # If it does, parse it for input into the search algo. splitext only
    # splits on the last dot, so names with several dots stay intact.
    stem, extension = os.path.splitext(filename)
    if extension:
        filename = stem
        ext = [extension]

    # One pattern per allowed ending; anything may sit between the name
    # and the ending
    patterns = [filename+'*'+x for x in ext]

    for root, dirs, files in os.walk(dirpath):
        # Prune the excluded directories in place so os.walk skips them
        dirs[:] = [d for d in dirs if d not in exclude]
        for fname in files:
            if any(fnmatch.fnmatch(fname, pattern) for pattern in patterns):
                return os.path.join(root, fname)

    return None

**====================================================================================================================**
Define the directories where the QCT document files are stored as well as where the vendor documents are stored, where asset tracking is located, and where the calibration csvs are located.

In [12]:
# NOTE: these are absolute paths on the author's machine; edit them before
# running the notebook elsewhere.
# Directory where the CTDMO QCT document files are stored
qct_directory = '/media/andrew/OS/Users/areed/Documents/Project_Files/Records/Instrument_Records/CTDMO/CTDMO_Results'
# Directory where the CTDMO vendor calibration documents are stored
cal_directory = '/media/andrew/OS/Users/areed/Documents/Project_Files/Records/Instrument_Records/CTDMO/CTDMO_Cal'
# Directory where the asset management calibration csvs are located
asset_management_directory = '/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/CTDMOG'

In [13]:
# Path to the WHOI asset tracking workbook (absolute path on the author's machine)
excel_spreadsheet = '/media/andrew/OS/Users/areed/Documents/Project_Files/Documentation/System/System Notebook/WHOI_Asset_Tracking.xlsx'
# Name of the sheet in the workbook to load
sheet_name = 'Sensors'

In [15]:
CTDMO = whoi_asset_tracking(excel_spreadsheet,sheet_name,instrument_class='CTDMO',whoi=True,series='G')
CTDMO.head(10)

Unnamed: 0,Instrument Class,Series,Supplier Serial Number,WHOI #,OOI #,UID,Model,CGSN PN,Firmware Version,Supplier,...,QCT Testing,PreDeployment,Post Deployment,Refurbishment/ Repair,DO Number,Date Received,Deployment History,Current Deployment,Instrument Location on Current Deployment,Notes
95,CTDMO,G,37-10214,115166,A00103,CGINS-CTDMOG-10214,37IM,1336-00001-00007,3.1,SeaBird,...,3305-00101-00001\n3305-00101-00082\n3305-00101...,,,3305-00900-00060\n3305-00900-00257,WH-SC11-01-CTD-1004,12/12/2012,GP02HYPM-00002\nGP03FLMA-00004,,,
96,CTDMO,G,37-10215,115167\n118298,A00104,CGINS-CTDMOG-10215,37IM,1336-00001-00007,3.1,SeaBird,...,3305-00101-00002\n3305-00101-00404\n3305-00101...,,,3305-00900-00146\n3305-00900-00402,WH-SC11-01-CTD-1004,12/12/2012,GP03FLMA-00001\nGP03FLMA-00003\nGP03FLMB-00005,,,
97,CTDMO,G,37-10216,115168,A00105,CGINS-CTDMOG-10216,37IM,1336-00001-00007,3.1,SeaBird,...,3305-00101-00003\n3305-00101-00255\n3305-00101...,,,3305-00900-00060\n3305-00900-00257,WH-SC11-01-CTD-1004,12/12/2012,GP03FLMB-00001\nGP03FLMB-00004,,,
98,CTDMO,G,37-10217,115169\n118299,A00106,CGINS-CTDMOG-10217,37IM,1336-00001-00007,3.1,SeaBird,...,3305-00101-00004\n3305-00101-00399\n3305-00101...,,,3305-00900-00146\n3305-00900-00402,WH-SC11-01-CTD-1004,12/12/2012,GP03FLMB-00001\nGP03FLMA-00003\nGP03FLMB-00005,,,
99,CTDMO,G,37-10218,115170,A00107,CGINS-CTDMOG-10218,37IM,1336-00001-00007,3.1,SeaBird,...,3305-00101-00005\n3305-00101-00083\n3305-00101...,,,3305-00900-00060\n3305-00900-00257,WH-SC11-01-CTD-1004,12/12/2012,GP03FLMA-00002\nGP03FLMB-00004,,,
100,CTDMO,G,37-10219,115171\n118138,A00108,CGINS-CTDMOG-10219,37IM,1336-00001-00007,3.1,SeaBird,...,3305-00101-00006\n3305-00101-00436\n3305-00101...,,,3305-00900-00152\n3305-00900-00392,WH-SC11-01-CTD-1004,12/12/2012,GP03FLMA-00001\nGP03FLMA-00003\nGI02HYPM-00004,,,
101,CTDMO,G,37-10220,115172\n118300,A00109,CGINS-CTDMOG-10220,37IM,1336-00001-00007,3.1,SeaBird,...,3305-00101-00007\n3305-00101-00403\n3305-00101...,,,3305-00900-00146\n3305-00900-00402,WH-SC11-01-CTD-1004,12/12/2012,GP03FLMA-00001\nGP03FLMA-00003\nGP03FLMB-00005,,,
102,CTDMO,G,37-10221,115173\n118301,A00110,CGINS-CTDMOG-10221,37IM,1336-00001-00007,3.1,SeaBird,...,3305-00101-00008\n3305-00101-00426\n3305-00101...,,,3305-00900-00146\n3305-00900-00402,WH-SC11-01-CTD-1004,12/12/2012,GP03FLMB-00001\nGP03FLMA-00003\nGP03FLMB-00005,,,
103,CTDMO,G,37-10222,115174,A00111,CGINS-CTDMOG-10222,37IM,1336-00001-00007,3.1,SeaBird,...,3305-00101-00009\n3305-00101-00257\n3305-00101...,,,3305-00900-00060\n3305-00900-00257,WH-SC11-01-CTD-1004,12/12/2012,GP03FLMB-00001\nGP03FLMB-00004,,,
104,CTDMO,G,37-10223,115175,A00112,CGINS-CTDMOG-10223,37IM,1336-00001-00007,3.1,SeaBird,...,3305-00101-00010\n3305-00101-00084\n3305-00101...,,,3305-00900-00060\n3305-00900-00257,WH-SC11-01-CTD-1004,12/12/2012,GP03FLMB-00002\nGP03FLMB-00004,,,


**======================================================================================================================**

First, get all the unique CTDMO Instrument UIDs:

In [16]:
uids = sorted(list(set(CTDMO['UID'])))

Identify the QCT Testing documents associated with each individual instrument (the UID):

In [17]:
qct_dict = get_qct_files(CTDMO, qct_directory)
qct_dict

{'CGINS-CTDMOG-12410': ['3305-00101-00407', '3305-00101-00628'],
 'CGINS-CTDMOG-13601': ['3305-00101-00204', '3305-00101-00531'],
 'CGINS-CTDMOG-13336': ['3305-00101-00178', '3305-00101-00291'],
 'CGINS-CTDMOG-12378': ['3305-00101-00429', '3305-00101-00709'],
 'CGINS-CTDMOG-12412': ['3305-00101-00391', '3305-00101-00629'],
 'CGINS-CTDMOG-10216': ['3305-00101-00003',
  '3305-00101-00255',
  '3305-00101-00578'],
 'CGINS-CTDMOG-13604': ['3305-00101-00206', '3305-00101-00540'],
 'CGINS-CTDMOG-12330': ['3305-00101-00372', '3305-00101-00634'],
 'CGINS-CTDMOG-13421': ['3305-00101-00147',
  '3305-00101-00473',
  '3305-00101-00726'],
 'CGINS-CTDMOG-13430': ['3305-00101-00154',
  '3305-00101-00480',
  '3305-00101-00674'],
 'CGINS-CTDMOG-12609': ['3305-00101-00396', '3305-00101-00643'],
 'CGINS-CTDMOG-13420': ['3305-00101-00146',
  '3305-00101-00472',
  '3305-00101-00729'],
 'CGINS-CTDMOG-13603': ['3305-00101-00205', '3305-00101-00541'],
 'CGINS-CTDMOG-11647': ['3305-00101-00068',
  '3305-00101-0

Identify the calibration csvs stored in asset management which correspond to a particular instrument:

In [18]:
csv_dict = load_asset_management(CTDMO, asset_management_directory)
csv_dict

{'CGINS-CTDMOG-10214': ['CGINS-CTDMOG-10214__20170829.csv',
  'CGINS-CTDMOG-10214__20121113.csv',
  'CGINS-CTDMOG-10214__20140214.csv',
  'CGINS-CTDMOG-10214__20151110.csv'],
 'CGINS-CTDMOG-10215': ['CGINS-CTDMOG-10215__20160915.csv',
  'CGINS-CTDMOG-10215__20121113.csv',
  'CGINS-CTDMOG-10215__20181009.csv'],
 'CGINS-CTDMOG-10216': ['CGINS-CTDMOG-10216__20170903.csv',
  'CGINS-CTDMOG-10216__20121113.csv',
  'CGINS-CTDMOG-10216__20151113.csv'],
 'CGINS-CTDMOG-10217': ['CGINS-CTDMOG-10217__20160831.csv',
  'CGINS-CTDMOG-10217__20121114.csv',
  'CGINS-CTDMOG-10217__20181009.csv'],
 'CGINS-CTDMOG-10218': ['CGINS-CTDMOG-10218__20151108.csv',
  'CGINS-CTDMOG-10218__20170901.csv',
  'CGINS-CTDMOG-10218__20121115.csv',
  'CGINS-CTDMOG-10218__20140214.csv'],
 'CGINS-CTDMOG-10219': ['CGINS-CTDMOG-10219__20160921.csv',
  'CGINS-CTDMOG-10219__20180818.csv',
  'CGINS-CTDMOG-10219__20121114.csv'],
 'CGINS-CTDMOG-10220': ['CGINS-CTDMOG-10220__20121114.csv',
  'CGINS-CTDMOG-10220__20160827.csv',
  'C

Get the serial numbers for each CTDMO, and use those serial numbers to search for and return all of the relevant vendor documents for a particular instrument:

In [19]:
serial_nums = get_serial_nums(CTDMO, uids)

In [20]:
cal_dict = get_calibration_files(serial_nums, cal_directory)
cal_dict

{'CGINS-CTDMOG-10214': ['CTDMO-G_SBE_37IM_SN_37-10214_Calibration_Files_2012-11-13.pdf',
  'CTDMO-G_SBE_37IM_SN_37-10214_Calibration_Files_2014-02-14.pdf',
  'CTDMO-G_SBE_37IM_SN_37-10214_Calibration_Files_2016-01-13.pdf',
  'CTDMO-G_SBE_37IM_SN_37-10214_Calibration_Files_2017-08-29.zip'],
 'CGINS-CTDMOG-10215': ['CTDMO-G_SBE_37IM_SN_37-10215_Calibration_Files_2012-11-13.pdf',
  'CTDMO-G_SBE_37IM_SN_37-10215_Calibration_Files_2016-10-03.zip',
  'CTDMO-G_SBE_37IM_SN_37-10215_Calibration_Files_2018-10-09.zip'],
 'CGINS-CTDMOG-10216': ['CTDMO-G_SBE_37IM_SN_37-10216_Calibration_Files_2012-11-13.pdf',
  'CTDMO-G_SBE_37IM_SN_37-10216_Calibration_Files_2016-01-13.pdf',
  'CTDMO-G_SBE_37IM_SN_37-10216_Calibration_Files_2017-09-03.zip'],
 'CGINS-CTDMOG-10217': ['CTDMO-G_SBE_37IM_SN_37-10217_Calibration_Files_2012-11-14.pdf',
  'CTDMO-G_SBE_37IM_SN_37-10217_Calibration_Files_2016-10-03.zip',
  'CTDMO-G_SBE_37IM_SN_37-10217_Calibration_Files_2018-10-09.zip'],
 'CGINS-CTDMOG-10218': ['CTDMO-G_SBE_

**========================================================================================================================**
Print all of the CTDMO CSV files in order to retrieve all of the relevant files that need to be checked:

In [21]:
# List every asset management csv next to the serial number of its instrument
for uid in sorted(csv_dict):
    sn = serial_nums[uid]
    for f in sorted(csv_dict[uid]):
        print('  '.join(('CTDMO-G', '37-' + sn, f)))

CTDMO-G  37-10214  CGINS-CTDMOG-10214__20121113.csv
CTDMO-G  37-10214  CGINS-CTDMOG-10214__20140214.csv
CTDMO-G  37-10214  CGINS-CTDMOG-10214__20151110.csv
CTDMO-G  37-10214  CGINS-CTDMOG-10214__20170829.csv
CTDMO-G  37-10215  CGINS-CTDMOG-10215__20121113.csv
CTDMO-G  37-10215  CGINS-CTDMOG-10215__20160915.csv
CTDMO-G  37-10215  CGINS-CTDMOG-10215__20181009.csv
CTDMO-G  37-10216  CGINS-CTDMOG-10216__20121113.csv
CTDMO-G  37-10216  CGINS-CTDMOG-10216__20151113.csv
CTDMO-G  37-10216  CGINS-CTDMOG-10216__20170903.csv
CTDMO-G  37-10217  CGINS-CTDMOG-10217__20121114.csv
CTDMO-G  37-10217  CGINS-CTDMOG-10217__20160831.csv
CTDMO-G  37-10217  CGINS-CTDMOG-10217__20181009.csv
CTDMO-G  37-10218  CGINS-CTDMOG-10218__20121115.csv
CTDMO-G  37-10218  CGINS-CTDMOG-10218__20140214.csv
CTDMO-G  37-10218  CGINS-CTDMOG-10218__20151108.csv
CTDMO-G  37-10218  CGINS-CTDMOG-10218__20170901.csv
CTDMO-G  37-10219  CGINS-CTDMOG-10219__20121114.csv
CTDMO-G  37-10219  CGINS-CTDMOG-10219__20160921.csv
CTDMO-G  37-

**========================================================================================================================**
With the individual files identified for the CTDMO Vendor documents, QCTs, and CSVs, we next get the full directory path to the files. This is necessary to load them:

CSV file paths:

In [22]:
# Map each UID to the full paths of its asset management csvs
csv_paths = {}
for uid in sorted(csv_dict):
    csv_paths[uid] = [
        generate_file_path(asset_management_directory, file, ext=['.csv','.ext'])
        for file in csv_dict[uid]
    ]

In [23]:
csv_paths

{'CGINS-CTDMOG-10214': ['/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/CTDMOG/CGINS-CTDMOG-10214__20170829.csv',
  '/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/CTDMOG/CGINS-CTDMOG-10214__20121113.csv',
  '/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/CTDMOG/CGINS-CTDMOG-10214__20140214.csv',
  '/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/CTDMOG/CGINS-CTDMOG-10214__20151110.csv'],
 'CGINS-CTDMOG-10215': ['/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/CTDMOG/CGINS-CTDMOG-10215__20160915.csv',
  '/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/CTDMOG/CGINS-CTDMOG-10215__20121113.csv',
  '/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/CTDMOG/CGINS-CTDMOG-10215__20181009.csv'],
 'CGINS-CTDMOG-10216': ['/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibratio

CAL file paths:

In [24]:
# Map each UID to the full paths of its vendor calibration files
cal_paths = {}
for uid in sorted(cal_dict):
    cal_paths[uid] = [
        generate_file_path(cal_directory, file, ext=['.zip','.cap', '.txt', '.log'])
        for file in cal_dict[uid]
    ]

In [25]:
cal_paths

{'CGINS-CTDMOG-10214': ['/media/andrew/OS/Users/areed/Documents/Project_Files/Records/Instrument_Records/CTDMO/CTDMO_Cal/CTDMO-G_SBE_37IM_SN_37-10214_Calibration_Files_2012-11-13.pdf',
  '/media/andrew/OS/Users/areed/Documents/Project_Files/Records/Instrument_Records/CTDMO/CTDMO_Cal/CTDMO-G_SBE_37IM_SN_37-10214_Calibration_Files_2014-02-14.pdf',
  '/media/andrew/OS/Users/areed/Documents/Project_Files/Records/Instrument_Records/CTDMO/CTDMO_Cal/CTDMO-G_SBE_37IM_SN_37-10214_Calibration_Files_2016-01-13.pdf',
  '/media/andrew/OS/Users/areed/Documents/Project_Files/Records/Instrument_Records/CTDMO/CTDMO_Cal/CTDMO-G_SBE_37IM_SN_37-10214_Calibration_Files_2017-08-29.zip'],
 'CGINS-CTDMOG-10215': ['/media/andrew/OS/Users/areed/Documents/Project_Files/Records/Instrument_Records/CTDMO/CTDMO_Cal/CTDMO-G_SBE_37IM_SN_37-10215_Calibration_Files_2012-11-13.pdf',
  '/media/andrew/OS/Users/areed/Documents/Project_Files/Records/Instrument_Records/CTDMO/CTDMO_Cal/CTDMO-G_SBE_37IM_SN_37-10215_Calibration_

QCT file paths:

In [26]:
# Map each UID to the full paths of its QCT files (None where no file is found)
qct_paths = {}
for uid in sorted(qct_dict):
    qct_paths[uid] = [
        generate_file_path(qct_directory, file)
        for file in qct_dict[uid]
    ]

In [27]:
qct_paths

{'CGINS-CTDMOG-10214': ['/media/andrew/OS/Users/areed/Documents/Project_Files/Records/Instrument_Records/CTDMO/CTDMO_Results/3305-00101-00001-A.cap',
  '/media/andrew/OS/Users/areed/Documents/Project_Files/Records/Instrument_Records/CTDMO/CTDMO_Results/3305-00101-00082-A.txt',
  '/media/andrew/OS/Users/areed/Documents/Project_Files/Records/Instrument_Records/CTDMO/CTDMO_Results/3305-00101-00254-A.cap',
  '/media/andrew/OS/Users/areed/Documents/Project_Files/Records/Instrument_Records/CTDMO/CTDMO_Results/3305-00101-00569-A.log'],
 'CGINS-CTDMOG-10215': ['/media/andrew/OS/Users/areed/Documents/Project_Files/Records/Instrument_Records/CTDMO/CTDMO_Results/3305-00101-00002-A.cap',
  '/media/andrew/OS/Users/areed/Documents/Project_Files/Records/Instrument_Records/CTDMO/CTDMO_Results/3305-00101-00404-A.txt',
  '/media/andrew/OS/Users/areed/Documents/Project_Files/Records/Instrument_Records/CTDMO/CTDMO_Results/3305-00101-00745-A.txt'],
 'CGINS-CTDMOG-10216': ['/media/andrew/OS/Users/areed/Docu

**========================================================================================================================**
# Processing and Parsing the Calibration Coefficients
With the associated vendor documents (cal files), QCT checkins (qct files), and calibration csvs (csv files), I want to be able to compare the following:
* **(1)** That the calibration date matches between the different documents
* **(2)** The file name agrees with the CTDMO UID and the calibration date
* **(3)** The calibration coefficients all agree between the different reference documents and calibration csvs
* **(4)** Identify when a calibration coefficient is incorrect, where to find it, and how to correct it

Below, I plan on going through each of the CTDMO UIDs, and parse the data into csvs. For source files which may contain multiple calibrations or calibration sources, I plan on extracting each of the calibrations to a temporary folder using the following structure:

    <local working directory>/<temp>/<source>/data/<calibration file>
    
The separate calibrations will be saved using the standard UFrame naming convention with the following directory structure:

    <local working directory>/<temp>/<source>/<calibration csv>
    
The csvs themselves will also be copied to the temporary folder. This allows for the program to be looking into the same temp directory for every CTDMO check.

In [None]:
import shutil

In [None]:
# Pick the single instrument UID to review by its position in the sorted
# UID list.
# NOTE(review): 163 is a hard-coded index - change it to review a different
# instrument; it raises IndexError if there are fewer than 164 UIDs.
uid = sorted(uids)[163]
uid

In [None]:
# Start every review from an empty temp directory under the current
# working directory
temp_directory = '/'.join((os.getcwd(),'temp'))
if os.path.exists(temp_directory):
    shutil.rmtree(temp_directory)
# Always (re)create the directory, including on a first run when there was
# nothing to remove
ensure_dir(temp_directory)

**=======================================================================================================================**
Copy the existing CTDMO asset management csvs to the local temp directory:

In [None]:
# Copy each asset management csv for this UID into <temp>/csv, keeping its
# file name
savedir = '/'.join((temp_directory,'csv'))
for filepath in csv_paths[uid]:
    ensure_dir(savedir)
    savepath = '/'.join((savedir, filepath.rsplit('/', 1)[-1]))
    shutil.copyfile(filepath, savepath)

========================================================================================================================
### Parse and process the vendor documents
The next step is to read and parse the vendor documents. This is a more difficult challenge, since for CTDMOs the vendor documents are retained mostly as pdf files. While the pdf files are parseable, there is an added complication in that the forms have changed over time, with sometimes the T/S/P calibration pdfs combined into a single file, whereas other times they are separated into individual files. Furthermore, the files are often zipped into a single folder. So, I have the following possible vendor documents:
* **(1)** A .cal file - this is the easiest to read and parse, in a similar format to the CTDBP .cal files
* **(2)** A combined pdf - this is the most difficult format. Need to separate out the different pages which each separately contain either the temperature calibration info, the conductivity calibration info, or the pressure calibration info.
* **(3)** Separate pdfs - this is a simpler pdf reading schematic, where I know a priori which particular "page" will contain relevant calibration info. 

There are a couple of different pdf readers that I can use:
1. PyPDF2
2. PDFMiner
3. Textract

In [None]:
import PyPDF2
import textract

PyPDF2 does not work to extract text from the CTDMO combined pdf file document. Neither does the straightforward PDFMiner application. We will have to use OCR and textract to parse the pdf forms.

When parsing the pdf file, it appears that the built-in method of pdf2text does the best job at parsing the forms, particularly the temperature and conductivity coefficients. The pressure calibration coefficients are not as well parsed, due to the positioning of the image.

This means that I'm going to split and use two different methods for getting the calibration coefficients depending on what the calibration is for, i.e. T/S/P. For T and S, I'll use the built-in method for extracting text. For the pressure, I'll use the tesseract OCR approach.

========================================================================================================================
### Preprocessing the Vendor Files
Automating the parsing of the CTDMO calibration coefficients from pdf files into csv files that can be read by Python requires a bit of preprocessing. In particular, the following steps are taken to prepare the files for parsing:
* **(1)** Copy or extract the vendor calibration files from the Vault location to a local temp directory
* **(2)** Iterate over the available pdfs and split multipage pdfs into single page pdfs and append _page_ to the file
* **(3)** Once the pdfs have been split, they are ready to be parsed by the CTDMO object parsers.

In [None]:
# Now, write a function to copy over the file
cal_paths[uid]

Copy the vendor pdf files to a local temporary directory:

In [None]:
# Copy (or unzip) each vendor calibration file for this UID into its own
# folder under <temp>/data, named after the file without its extension
for filepath in cal_paths[uid]:
    # generate_file_path returns None when a file could not be located
    if filepath is None:
        continue

    folder, *ignore = filepath.split('/')[-1].split('.')
    savedir = '/'.join((temp_directory,'data',folder))
    ensure_dir(savedir)

    if filepath.endswith('.zip'):
        # Zipped calibrations are extracted member by member into the folder
        with ZipFile(filepath,'r') as zfile:
            for file in zfile.namelist():
                zfile.extract(file, path=savedir)
    else:
        shutil.copy(filepath, savedir)

In [None]:
# Flatten every calibration folder under <temp>/data: move the contents of
# any sub-directory (e.g. one unpacked from a zip) up into the calibration
# folder itself, then remove the emptied sub-directory. Every folder is
# processed, not just the last one handled by the copy step.
data_directory = '/'.join((temp_directory,'data'))
if os.path.isdir(data_directory):
    for folder in os.listdir(data_directory):
        folder_path = '/'.join((data_directory,folder))
        if not os.path.isdir(folder_path):
            continue
        for file in os.listdir(folder_path):
            nested = '/'.join((folder_path,file))
            if os.path.isdir(nested):
                for subfile in os.listdir(nested):
                    src = '/'.join((nested,subfile))
                    dst = '/'.join((folder_path,subfile))
                    shutil.move(src,dst)
                shutil.rmtree(nested)

In [None]:
# Split every vendor pdf into single-page pdfs named <original>_page_<i>.pdf.
# Folders which already hold a .cal file are skipped. A folder in which any
# file cannot be read as a pdf is reported and removed afterwards.
data_directory = '/'.join((os.getcwd(),'temp','data'))
folders = os.listdir(data_directory)
# Collect every folder which fails, so that all of them are removed
failed_folders = []
for folder in folders:
    filepath = '/'.join((data_directory,folder))
    contents = os.listdir(filepath)

    if any(file.endswith('.cal') for file in contents):
        continue

    files = [file for file in contents if 'SERVICE REPORT' not in file]

    try:
        for file in files:
            inputpath = '/'.join((filepath,file))
            # The second positional argument of PdfFileReader is `strict`;
            # the truthy value passed before is kept as an explicit True
            inputpdf = PyPDF2.PdfFileReader(inputpath, strict=True)

            for i in range(inputpdf.numPages):
                output = PyPDF2.PdfFileWriter()
                output.addPage(inputpdf.getPage(i))
                # splitext only strips the final extension, so dots elsewhere
                # in the path do not truncate the output name
                filename = '_'.join((os.path.splitext(inputpath)[0], 'page', str(i)))
                with open(filename+'.pdf', "wb") as outputStream:
                    output.write(outputStream)
    except Exception as err:
        failed_folders.append(filepath)
        print(f'Cannot reformat {filepath}: {err}')

for filepath in failed_folders:
    shutil.rmtree(filepath)

In [None]:
os.listdir(temp_directory+'/data')

The next step is to iterate over the vendor calibration files and extract the calibration coefficients from the files. This is done by starting an instance of the CTDMO calibration object and checking whether any of the calibration data is stored as a .cal file. If there is no .cal file, we loop over the other files looking for _page_ files, which indicate that the pdf file has been prepped.

In [None]:
# Parse the vendor documents in each folder under <temp>/data into a
# calibration object and write the result as a csv into <temp>/cal
datadir = os.path.abspath('/'.join((os.getcwd(),'temp','data')))
savedir = '/'.join((os.getcwd(),'temp','cal'))
ensure_dir(savedir)
for folder in os.listdir(datadir):
    # Okay, now start generating calibration csvs
    ctdmo = CTDMOCalibration(uid)
    files = os.listdir('/'.join((datadir,folder)))
    cal_files = [file for file in files if file.endswith('.cal')]
    if cal_files:
        # A .cal file takes precedence over any pdfs in the same folder
        for file in cal_files:
            ctdmo.mo_parse_cal('/'.join((datadir,folder,file)))
    else:
        for file in files:
            # Only the single-page pdfs made by the splitting step are parsed
            if '_page_' in file:
                try:
                    ctdmo.mo_parse_pdf('/'.join((datadir,folder,file)))
                except Exception as err:
                    print(f'Parsing failed for {file}: {err}')

    # Best effort: report a folder which cannot be written and carry on
    try:
        ctdmo.write_csv(savedir)
    except Exception as err:
        print(f'Could not write calibration csv for {folder}: {err}')

Check that the calibration object properly loaded all of the calibration coefficients, serial number, calibration date, etc., and wrote the appropriate csv file.

In [None]:
os.listdir(temp_directory+'/cal')

**=======================================================================================================================**
Next, we need to parse the QCT files and check that they have been successfully saved to a csv file. There should be 24 coefficients. Similarly, check the instrument serial number, the calibration date (may be more than one b/c separate calibration dates for T, S, and P sensors), and the type (for CTDMOs should be 37).

In [None]:
# Parse each QCT file for this UID and write it as a csv into <temp>/qct
savedir = '/'.join((temp_directory,'qct'))
ensure_dir(savedir)
for filepath in qct_paths[uid]:
    # A None entry means the QCT file could not be located on disk
    if filepath is None:
        continue
    try:
        ctdmo = CTDMOCalibration(uid)
        ctdmo.mo_parse_qct(filepath)
        ctdmo.write_csv(savedir)
    except Exception as err:
        # Best effort: report the file and the reason, then carry on
        print(f'Failed to parse {filepath}: {err}')

In [None]:
qct_paths[uid]

In [None]:
os.listdir('/'.join((temp_directory,'qct')))

**========================================================================================================================**
### Compare results
Now, with QCT files parsed into csvs which follow the UFrame format, I can load both the QCT and the calibration csvs into pandas dataframes, which will allow element by element comparison in relatively few lines of code.

In [28]:
def get_file_date(x):
    """
    Pull the calibration date out of a calibration csv filename.

    The date is taken to be the text between the first double underscore
    and the first period of the name (i.e. UID__YYYYMMDD.csv).

    Args:
        x - the filename; it is converted to a string before parsing
    Returns:
        the substring between the first '__' and the first '.'
    Raises:
        ValueError - if the name contains no '__' or no '.'
    """
    name = str(x)
    start = name.index('__') + len('__')
    stop = name.index('.')
    return name[start:stop]

Load the asset-management calibration csvs (from temp/csv):

In [29]:
# List the csv files in temp/csv and index them by the calibration date
# embedded in each filename, so the sources can later be joined on date.
csv_names = sorted(os.listdir('temp/csv'))
csv_files = pd.DataFrame(csv_names, columns=['csv'])
csv_files['cal date'] = csv_files['csv'].apply(get_file_date)
csv_files = csv_files.set_index('cal date')
csv_files

Unnamed: 0_level_0,csv
cal date,Unnamed: 1_level_1
20150625,CGINS-CTDMOG-13645__20150625.csv
20170412,CGINS-CTDMOG-13645__20170412.csv


Load the QCT csvs:

In [30]:
# List the csv files in temp/qct and index them by the calibration date
# embedded in each filename, so the sources can later be joined on date.
qct_names = sorted(os.listdir('temp/qct'))
qct_files = pd.DataFrame(qct_names, columns=['qct'])
qct_files['cal date'] = qct_files['qct'].apply(get_file_date)
qct_files = qct_files.set_index('cal date')
qct_files

Unnamed: 0_level_0,qct
cal date,Unnamed: 1_level_1
20150625,CGINS-CTDMOG-13645__20150625.csv
20170412,CGINS-CTDMOG-13645__20170412.csv


Load the csvs parsed from the vendor calibration documents (from temp/cal):

In [31]:
# List the csv files in temp/cal and index them by the calibration date
# embedded in each filename, so the sources can later be joined on date.
cal_names = sorted(os.listdir('temp/cal'))
cal_files = pd.DataFrame(cal_names, columns=['cal'])
cal_files['cal date'] = cal_files['cal'].apply(get_file_date)
cal_files = cal_files.set_index('cal date')
cal_files

Unnamed: 0_level_0,cal
cal date,Unnamed: 1_level_1
20150625,CGINS-CTDMOG-13645__20150625.csv
20170412,CGINS-CTDMOG-13645__20170412.csv


Combine the dataframes into one in order to know which csv files to compare and check calibration dates.

In [32]:
df_files = csv_files.join(qct_files,how='outer').join(cal_files,how='outer').fillna(value='-999')
df_files

Unnamed: 0_level_0,csv,qct,cal
cal date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20150625,CGINS-CTDMOG-13645__20150625.csv,CGINS-CTDMOG-13645__20150625.csv,CGINS-CTDMOG-13645__20150625.csv
20170412,CGINS-CTDMOG-13645__20170412.csv,CGINS-CTDMOG-13645__20170412.csv,CGINS-CTDMOG-13645__20170412.csv


If the filename is wrong, the calibration coefficient checker will not manage to compare the results. Consequently, we'll rename the local (temp) copy of the misnamed file so that its name carries the correct calibration date, and then run the calibration coefficient checker.

In [None]:
d1 = str(20151217)
d2 = str(20150625)

In [None]:
src = f'temp/csv/{uid}__{d1}.csv'
dst = f'temp/csv/{uid}__{d2}.csv'

In [None]:
shutil.move(src,dst)

In [None]:
os.listdir('temp/csv')

Reload the data so that all files are uniformly named:

In [33]:
# Re-read temp/csv after the rename so the listing reflects the new names.
csv_names = sorted(os.listdir('temp/csv'))
csv_files = pd.DataFrame(csv_names, columns=['csv'])
csv_files['cal date'] = csv_files['csv'].apply(get_file_date)
csv_files = csv_files.set_index('cal date')
csv_files

Unnamed: 0_level_0,csv
cal date,Unnamed: 1_level_1
20150625,CGINS-CTDMOG-13645__20150625.csv
20170412,CGINS-CTDMOG-13645__20170412.csv


In [34]:
df_files = csv_files.join(qct_files,how='outer').join(cal_files,how='outer').fillna(value='-999')
df_files

Unnamed: 0_level_0,csv,qct,cal
cal date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20150625,CGINS-CTDMOG-13645__20150625.csv,CGINS-CTDMOG-13645__20150625.csv,CGINS-CTDMOG-13645__20150625.csv
20170412,CGINS-CTDMOG-13645__20170412.csv,CGINS-CTDMOG-13645__20170412.csv,CGINS-CTDMOG-13645__20170412.csv


In [35]:
caldates = df_files.index
for i in caldates:
    print(i)

20150625
20170412


In [36]:
for cpath in sorted(cal_paths[uid]):
    print(cpath.split('/')[-1])

CTDMO-G_SBE_37IM_SN_37-13645_Calibration_Files_2015-07-01.pdf
CTDMO-G_SBE_37IM_SN_37-13645_Calibration_Files_2017-04-12.zip


In [37]:
for qpath in qct_paths[uid]:
    if qpath is not None:
        print(qpath.split('/')[-1].split('.')[0])

3305-00101-00227-A
3305-00101-00527-A


With uniformly named csv files, we can now directly compare different calibration coefficient sources for the CTDMO.

This table tells us that, for the csv CGINS-CTDMOG-11596__20150608.csv, I am missing a QCT document and vendor doc which could verify the calibration coefficients. Next, for the files I can compare, I want to go through and check each calibration coefficient.

**========================================================================================================================**
Okay, I want to check the following in the comparison between the CSV files contained in Asset Management, the QCT checkins, and the vendor docs:
1. Do the calibration coefficients match exactly?
2. Do the calibration coefficients match to within 0.001%?

In [None]:
def check_exact_coeffs(coeffs_dict):
    """
    Check whether the coefficient values from every source agree exactly.

    Each source is compared with the next one in the dictionary (the last
    wraps around to the first), and the pairwise results are combined with
    a logical AND. Pairs whose 'value' columns differ in length cannot be
    compared element by element and are left out.

    Args:
        coeffs_dict - dictionary mapping a source name (e.g. 'csv', 'qct',
            'cal') to an object with a 'value' column of coefficients
    Returns:
        mask - boolean array-like, True where all compared sources have
            exactly equal values
    Raises:
        ValueError - if no sources were given, or if no pair of sources
            has the same number of coefficients
    """
    sources = list(coeffs_dict)
    if not sources:
        raise ValueError('No calibration sources were supplied to compare.')

    # Part 1: coeff by coeff comparison between each source and the next
    # (cyclically, so a single source is simply compared with itself)
    comparison = {}
    for current, following in zip(sources, sources[1:] + sources[:1]):
        left = coeffs_dict[current]['value']
        right = coeffs_dict[following]['value']
        if len(left) == len(right):
            comparison[(current, following)] = np.equal(left, right)

    if not comparison:
        raise ValueError('No two calibration sources have the same number '
                         'of coefficients, so nothing can be compared.')

    # Part 2: now do a logical_and comparison between the results from part 1
    results = iter(comparison.values())
    mask = next(results)
    for result in results:
        mask = np.logical_and(mask, result)

    return mask

In [None]:
def check_relative_coeffs(coeffs_dict):
    """
    Check whether the coefficient values from every source agree to within
    a relative tolerance of 1e-5 (0.001%).

    Each source is compared with the next one in the dictionary (the last
    wraps around to the first), and the pairwise results are combined with
    a logical AND. Pairs whose 'value' columns differ in length cannot be
    compared element by element and are left out.

    Args:
        coeffs_dict - dictionary mapping a source name (e.g. 'csv', 'qct',
            'cal') to an object with a 'value' column of coefficients
    Returns:
        mask - boolean array-like, True where all compared sources agree
            to within the relative tolerance
    Raises:
        ValueError - if no sources were given, or if no pair of sources
            has the same number of coefficients
    """
    sources = list(coeffs_dict)
    if not sources:
        raise ValueError('No calibration sources were supplied to compare.')

    # Part 1: coeff by coeff comparison between each source and the next
    # (cyclically, so a single source is simply compared with itself)
    comparison = {}
    for current, following in zip(sources, sources[1:] + sources[:1]):
        left = coeffs_dict[current]['value']
        right = coeffs_dict[following]['value']
        if len(left) == len(right):
            # atol=0 makes this a purely relative test. With numpy's default
            # atol of 1e-8, coefficients of magnitude 1e-7 and smaller
            # would be reported as matching no matter how different they are.
            comparison[(current, following)] = np.isclose(
                left, right, rtol=1e-5, atol=0)

    if not comparison:
        raise ValueError('No two calibration sources have the same number '
                         'of coefficients, so nothing can be compared.')

    # Part 2: now do a logical_and comparison between the results from part 1
    results = iter(comparison.values())
    mask = next(results)
    for result in results:
        mask = np.logical_and(mask, result)

    return mask

In [None]:
# For every calibration date, collect the coefficients that do NOT match
# exactly across the available sources, keyed by the csv filename.
exact_match = {}
for cal_date in df_files.index:
    # Step 1: read every source csv that exists for this calibration date
    coeffs_dict = {}
    for source, fname in df_files.loc[cal_date].items():
        if fname == '-999':
            # '-999' is the fill value marking a source with no file
            continue
        load_directory = '/'.join((os.getcwd(), 'temp', source, fname))
        df_coeffs = pd.read_csv(load_directory)
        # Show which serial number(s) each file claims to describe
        for serial in set(df_coeffs['serial']):
            print(f'{source}-{fname}: {serial}')
        # Index and sort by coefficient name so rows line up across sources
        df_coeffs = df_coeffs.set_index(keys='name').sort_index()
        coeffs_dict[source] = df_coeffs

    # Step 2: compare the coefficient values between the sources
    mask = check_exact_coeffs(coeffs_dict)

    # Step 3: keep the rows of the csv whose coefficients disagree
    fname = df_files.loc[cal_date]['csv']
    incorrect = 'No csv file.' if fname == '-999' else coeffs_dict['csv'][mask == False]
    exact_match[fname] = incorrect

In [None]:
# For every calibration date, collect the coefficients that do NOT match
# to within the relative tolerance across the available sources, keyed by
# the csv filename.
relative_match = {}
for cal_date in df_files.index:
    # Step 1: read every source csv that exists for this calibration date
    coeffs_dict = {}
    for source, fname in df_files.loc[cal_date].items():
        if fname == '-999':
            # '-999' is the fill value marking a source with no file
            continue
        load_directory = '/'.join((os.getcwd(), 'temp', source, fname))
        df_coeffs = pd.read_csv(load_directory)
        # Show which serial number(s) each file claims to describe
        for serial in set(df_coeffs['serial']):
            print(f'{source}-{fname}: {serial}')
        # Index and sort by coefficient name so rows line up across sources
        df_coeffs = df_coeffs.set_index(keys='name').sort_index()
        coeffs_dict[source] = df_coeffs

    # Step 2: compare the coefficient values between the sources
    mask = check_relative_coeffs(coeffs_dict)

    # Step 3: keep the rows of the csv whose coefficients disagree
    fname = df_files.loc[cal_date]['csv']
    incorrect = 'No csv file.' if fname == '-999' else coeffs_dict['csv'][mask == False]
    relative_match[fname] = incorrect

In [None]:
# Print, per csv file, the names of the coefficients that failed the exact
# comparison. The '-999' key (no csv file) holds a message, not a dataframe.
for key in sorted(exact_match):
    if key == '-999':
        continue
    mismatched = exact_match[key].index.values
    print(', '.join(mismatched))

In [None]:
# Print, per csv file, the names of the coefficients that failed the relative
# comparison. The '-999' key (no csv file) holds a message, not a dataframe.
for key in sorted(relative_match):
    if key == '-999':
        continue
    mismatched = relative_match[key].index.values
    print(', '.join(mismatched))

**========================================================================================================================**
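Now we need to check that the calibration coefficients for each CTDMO csv have the same number of significant digits as are reported on the vendor PDFs. For the CTDMO, the vendor reports coefficients in scientific notation with six digits after the decimal point (i.e. seven significant figures), which is what the `{:.6e}` format below reproduces.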

In [38]:
csv_paths

{'CGINS-CTDMOG-10214': ['/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/CTDMOG/CGINS-CTDMOG-10214__20170829.csv',
  '/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/CTDMOG/CGINS-CTDMOG-10214__20121113.csv',
  '/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/CTDMOG/CGINS-CTDMOG-10214__20140214.csv',
  '/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/CTDMOG/CGINS-CTDMOG-10214__20151110.csv'],
 'CGINS-CTDMOG-10215': ['/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/CTDMOG/CGINS-CTDMOG-10215__20160915.csv',
  '/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/CTDMOG/CGINS-CTDMOG-10215__20121113.csv',
  '/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibration/CTDMOG/CGINS-CTDMOG-10215__20181009.csv'],
 'CGINS-CTDMOG-10216': ['/home/andrew/Documents/OOI-CGSN/ooi-integration/asset-management/calibratio

In [40]:
uid = uids[0]
uid

'CGINS-CTDMOG-10214'

In [44]:
CSV = pd.read_csv(csv_paths[uid][0])
CSV

Unnamed: 0,serial,name,value,notes
0,37-10214,CC_a0,-0.0001422718,
1,37-10214,CC_a1,0.0003111163,
2,37-10214,CC_a2,-4.611213e-06,
3,37-10214,CC_a3,2.064604e-07,
4,37-10214,CC_cpcor,-9.57e-08,
5,37-10214,CC_ctcor,3.25e-06,
6,37-10214,CC_g,-0.9670224,
7,37-10214,CC_h,0.1469677,
8,37-10214,CC_i,-0.0004765937,
9,37-10214,CC_j,5.526974e-05,


In [47]:
for val in CSV['value']:
    print("{:.6e}".format(val))

-1.422718e-04
3.111163e-04
-4.611213e-06
2.064604e-07
-9.570000e-08
3.250000e-06
-9.670224e-01
1.469677e-01
-4.765937e-04
5.526974e-05
1.450000e+03
-6.298774e+00
4.792953e-03
-1.283930e-11
5.230406e+05
-5.088561e+01
-2.945562e-01
2.539588e+01
1.175000e-03
0.000000e+00
-6.821422e+01
5.216195e-02
-5.385986e+00
6.158500e-07


In [41]:
print("{:.2e}".format(0.00253))

2.53e-03


In [42]:
import math

In [None]:
def to_precision(x,p):
    """
    Returns a string representation of x formatted with a precision of p,
    following the toPrecision method from javascript. This implementation
    is based on example code from www.randlet.com.
    
    Args:
        x - number to format to a specified precision (anything accepted
            by float(), e.g. a float, an int or a numeric string)
        p - the specified precision (number of significant digits) for
            the number x (converted with int() if not already an int)
    Returns:
        A string holding x rounded to p significant digits. Scientific
        notation (e.g. '2.5e-3' for x=0.00253, p=2) is used when the
        base-10 exponent is less than -2 or at least p; otherwise the
        number is written in positional notation. Zero is returned as
        '0.' followed by p-1 zeros.
    """
    
    # Coerce anything that is not exactly a float (strings, ints, numpy
    # scalars, ...) to a float
    if type(x) is not float:
        x = float(x)
        
    # Next, check if p is an int and if not, convert to int
    if type(p) is not int:
        p = int(p)
    
        
    # Zero has no exponent (log10 is undefined), so handle it separately
    if x == 0.:
        return "0." + "0"*(p-1)
    
    out = []
    
    # Record the sign and continue with the magnitude only
    if x < 0:
        out.append("-")
        x = -x
        
    # First guess at the exponent e and at n, the integer made of the
    # p leading digits of x. int() truncates toward zero, so for x < 1 the
    # guess for e can be one too large.
    e = int(math.log10(x))
    tens = math.pow(10, e - p + 1)
    n = math.floor(x / tens)
    
    # n has fewer than p digits: the exponent guess was too large, redo it
    if n < math.pow(10, p - 1):
        e = e - 1
        tens = math.pow(10, e - p + 1)
        n = math.floor(x / tens)
        
    # Round to nearest: move up when n+1 is at least as close to x as n
    # (ties are rounded up in magnitude)
    if abs((n + 1.) * tens - x) <= abs(n * tens - x):
        n = n + 1
        
    # Rounding carried into an extra digit (e.g. 999 -> 1000): renormalize
    if n >= math.pow(10, p):
        n = n / 10.
        e = e + 1
        
    # The p significant digits as a string
    m = "%.*g" % (p, n)
    
    if e < -2 or e >= p:
        # Scientific notation: d.ddd followed by 'e' and the exponent; a
        # '+' is written only for positive exponents
        out.append(m[0])
        if p > 1:
            out.append(".")
            out.extend(m[1:p])
        out.append('e')
        if e > 0:
            out.append("+")
        out.append(str(e))
    elif e == (p - 1):
        # Exactly p integer digits: no decimal point needed
        out.append(m)
    elif e >= 0:
        # Decimal point falls inside the digits: split after e+1 digits
        out.append(m[:e+1])
        if (e + 1) < len(m):
            out.append(".")
            out.extend(m[e+1:])
    else:
        # e is -1 or -2: write "0.", pad with -(e+1) zeros, then the digits
        out.append("0.")
        out.extend(["0"]*-(e+1))
        out.append(m)
        
    return "".join(out)

In [None]:
import datetime
import re
import os
from wcmatch import fnmatch
import pandas as pd
import numpy as np
import string
from zipfile import ZipFile
import json
import PyPDF2
import textract

class CTDMOCalibration():
    # Class that stores calibration values for CTDs.

    def __init__(self, uid):
        """
        Create an empty calibration object for the instrument with the
        given uid.

        Args:
            uid - the instrument unique identifier; it is validated by the
                uid setter, which also fills in the serial number
        Raises:
            Exception - from the uid setter, if uid does not have the
                expected format
        """
        # serial is initialized first; assigning uid below overwrites it
        # with the serial number taken from the uid
        self.serial = ''
        self.uid = uid
        # The ctd_type setter derives the CTD model code from the uid text
        self.ctd_type = uid
        # Calibration coefficients, keyed by their 'CC_' name
        self.coefficients = {}
        # Calibration dates; the parsers add entries keyed by a running count
        self.date = {}

        # Name mapping for the MO-type CTDs (when reading from pdfs).
        # Keys are lower-case names as they appear in the source documents;
        # 'pcor' and 'tcor' map to the same names as 'cpcor' and 'ctcor'.
        self.mo_coefficient_name_map = {
            'ptcb1': 'CC_ptcb1',
            'pa2': 'CC_pa2',
            'a3': 'CC_a3',
            'pa0': 'CC_pa0',
            'wbotc': 'CC_wbotc',
            'ptcb0': 'CC_ptcb0',
            'g': 'CC_g',
            'ptempa1': 'CC_ptempa1',
            'ptcb2': 'CC_ptcb2',
            'a0': 'CC_a0',
            'h': 'CC_h',
            'ptca0': 'CC_ptca0',
            'a2': 'CC_a2',
            'cpcor': 'CC_cpcor',
            'pcor':'CC_cpcor',
            'i': 'CC_i',
            'ptempa0': 'CC_ptempa0',
            'prange': 'CC_p_range',
            'ctcor': 'CC_ctcor',
            'tcor':'CC_ctcor',
            'a1': 'CC_a1',
            'j': 'CC_j',
            'ptempa2': 'CC_ptempa2',
            'pa1': 'CC_pa1',
            'ptca1': 'CC_ptca1',
            'ptca2': 'CC_ptca2',
        }

    @property
    def uid(self):
        """The instrument uid as stored by the uid setter."""
        return self._uid

    @uid.setter
    def uid(self, d):
        r = re.compile('.{5}-.{6}-.{5}')
        if r.match(d) is not None:
            self.serial = d.split('-')[2]
            self._uid = d
        else:
            raise Exception(f"The instrument uid {d} is not a valid uid. Please check.")

    @property
    def ctd_type(self):
        """The CTD model code ('37', '16' or '') set by the ctd_type setter."""
        return self._ctd_type

    @ctd_type.setter
    def ctd_type(self, d):
        if 'MO' in d:
            self._ctd_type = '37'
        elif 'BP' in d:
            self._ctd_type = '16'
        else:
            self._ctd_type = ''

            
    def mo_parse_pdf(self, filepath):
        """
        This function extracts the text from a given pdf file.
        Depending on if the text concerns calibration for 
        temperature/conductivity or pressure, it calls a further
        function to parse out the individual calibration coeffs.
        Pages which contain none of the calibration headings are
        ignored.
    
        Args:
            filepath - the full directory path to the pdf file
                which is to be extracted and parsed.
        Calls:
            mo_parse_p(filepath)
            mo_parse_ts(text)
        Returns:
            None - the calibration coefficients and dates are parsed
                into this CTDMO calibration object
        """
    
        text = textract.process(filepath, encoding='utf-8')
        text = text.decode('utf-8')
    
        if 'PRESSURE CALIBRATION DATA' in text:
            # The pressure page is re-extracted from the file by mo_parse_p
            self.mo_parse_p(filepath)
    
        # Each heading needs its own 'in text' test: a bare non-empty string
        # on the left of 'or' is always true and would send every page here.
        elif ('TEMPERATURE CALIBRATION DATA' in text
              or 'CONDUCTIVITY CALIBRATION DATA' in text):
            self.mo_parse_ts(text)
    

    def mo_parse_ts(self, text):
        """
        This function parses text from a pdf and loads the appropriate calibration
        coefficients for the temperature and conductivity sensors into the CTDMO 
        calibration object.
    
        Args:
            text - extracted text from a pdf page
        Returns:
            self - a CTDMO calibration object with either temperature or conductivity
                calibration values filled in the calibration coefficients dictionary
        Raises:
            Exception - if the serial number in the pdf text does not match the
                serial number parsed from the UID
        """
    
        keys = self.mo_coefficient_name_map.keys()
        for line in text.splitlines():
    
            if 'CALIBRATION DATE' in line:
                *ignore, cal_date = line.split(':')
                cal_date = pd.to_datetime(cal_date).strftime('%Y%m%d')
                self.date.update({len(self.date): cal_date})
        
            elif 'SERIAL NUMBER' in line:
                *ignore, serial_num = line.split(':')
                serial_num = serial_num.strip()
                if serial_num != self.serial:
                    raise Exception(f'Instrument serial number {serial_num} does not match UID serial num {self.serial}')
           
            elif '=' in line:
                key, *ignore, value = line.split()
                name = self.mo_coefficient_name_map.get(key.strip().lower())
                if name is not None:
                    self.coefficients.update({name: value.strip()})
            else:
                continue
            
            
    def mo_parse_p(self,filepath):
        """
        Function to parse the pressure calibration information from a pdf. To parse
        the pressure cal info requires re-extracting the text from the pdf file using
        tesseract-ocr rather than the basic pdf2text converter.
    
        Args:
            filepath - full directory path to the pdf file containing the pressure
                calibration info. This is the file which will be re-extracted.
        Returns
            None - the calibration date is appended to self.date and the pressure
                calibration values are stored in self.coefficients
        Raises:
            ValueError - if a 'CALIBRATION DATE' line has no 'DATE:' token or a
                'psia S/N' line has no 'psia' token
        """
    
        # Now, can reprocess using tesseract-ocr rather than pdftotext
        ptext = textract.process(filepath, method='tesseract', encoding='utf-8')
        # Replace em-dashes (as UTF-8 bytes) with plain minus signs
        ptext = ptext.replace(b'\xe2\x80\x94',b'-')
        ptext = ptext.decode('utf-8')
        name_map = self.mo_coefficient_name_map
        
        for line in ptext.splitlines():
            # Get the calibration date:
            if 'CALIBRATION DATE' in line:
                items = line.split()
                ind = items.index('DATE:')
                cal_date = items[ind+1]
                cal_date = pd.to_datetime(cal_date).strftime('%Y%m%d')
                self.date.update({len(self.date):cal_date})
            
            # The pressure range is the token just before 'psia'
            if 'psia S/N' in line:
                items = line.split()
                ind = items.index('psia')
                prange = items[ind-1]
                name = name_map.get('prange')
                self.coefficients.update({name: prange})
    
            # Loop through each line looking for the lines which contain
            # calibration coefficients
            if '=' in line:
                # Tesseract-ocr misreads '0' as O, and 1 as IL
                line = line.replace('O','0').replace('IL','1').replace('=','').replace(',.','.').replace(',','.')
                line = line.replace('L','1').replace('@','0').replace('l','1').replace('--','-')
                if '11' in line and 'PA2' not in line:
                    line = line.replace('11','1')
                items = line.split()
                for n, k in enumerate(items):
                    name = name_map.get(k.lower())
                    if name is None:
                        continue
                    # Only accept the following token if it exists and is
                    # a number; OCR noise is skipped rather than stored.
                    try:
                        float(items[n+1])
                    except (IndexError, ValueError):
                        continue
                    self.coefficients.update({name: items[n+1]})

        # Default CC_ptcb2 to zero only when it was not parsed. The check
        # must look at the parsed coefficients: the name map is keyed by
        # lower-case names without the 'CC_' prefix, so testing it would
        # always succeed and overwrite a parsed value.
        if 'CC_ptcb2' not in self.coefficients:
            self.coefficients.update({'CC_ptcb2': '0.000000e+000'})


    def mo_parse_cal(self, filepath):
        """
        Function to parse the .cal file for the CTDMO when a .cal file
        is available.
        """
    
        if not filepath.endswith('.cal'):
            raise Exception(f'Not a .cal filetype.')
    
        with open(filepath) as file:
            data = file.read()
        
        for line in data.splitlines():
            key, value = line.split('=')
            key = key.strip()
            value = value.strip()
        
            if 'SERIALNO' in key:
                sn = value
                if self.serial != sn:
                    raise Exception(f'File serial number {sn} does not match UID {self.uid}')
                
            elif 'CALDATE' in key:
                cal_date = pd.to_datetime(value).strftime('%Y%m%d')
                self.date.update({len(self.date): cal_date})
            
            elif 'INSTRUMENT_TYPE' in key:
                ctd_type = value[-2:]
                if self.ctd_type != ctd_type:
                    raise Exception(f'CTD type {ctd_type} does not match uid {self.uid}.')
                
            else:
                if key.startswith('T'):
                    key = key.replace('T','')
                if key.startswith('C') and len(key)==2:
                    key = key.replace('C','')
                name = self.mo_coefficient_name_map.get(key.lower())
                if name is not None:
                    self.coefficients.update({name: value})
                    
        # Now we need to add in the range of the sensor
        name = self.mo_coefficient_name_map.get('prange')
        self.coefficients.update({name: '1450'})

                    
    def mo_parse_qct(self, filepath):
        """
        This function reads and parses the QCT file into
        the CTDMO calibration object.
    
        Args:
            filepath - full directory path and filename of
                the QCT file
        Returns:
        
        """
        
        with open(filepath,errors='ignore') as file:
            data = file.read()

        data = data.replace('<',' ').replace('>',' ')
        keys = self.mo_coefficient_name_map.keys()

        for line in data.splitlines():
            items = line.split()
    
            # If the line is empty, go to next line
            if len(items) == 0:
                continue
    
            # Check the serial number from the instrument
            elif 'SERIAL NO' in line:
                ind = items.index('NO.')
                sn = items[ind+1]
                if sn != self.serial:
                    raise Exception(f'Serial number {sn} in QCT document does not match uid serial number {self.serial}')
        
            # Check if the line contains the calibration date
            elif 'CalDate' in line:
                cal_date = pd.to_datetime(items[1]).strftime('%Y%m%d')
                self.date.update({len(self.date): cal_date})
        
            # Get the coefficient names and values
            elif items[0].lower() in keys:
                name = self.mo_coefficient_name_map[items[0].lower()]
                self.coefficients.update({name: items[1]})
        
            else:
                pass
    
    
    def write_csv(self, outpath):
        """
        This function writes the correctly named csv file for the ctd to the
        specified directory.

        Args:
            outpath - directory path of where to write the csv file
        Raises:
            ValueError - raised if the CTD object's coefficient dictionary
                has not been populated
        Returns:
            self.to_csv - a csv of the calibration coefficients which is
                written to the specified directory from the outpath.
        """

        # Run a check that the coefficients have actually been loaded
        if len(self.coefficients) == 0:
            raise ValueError('No calibration coefficients have been loaded.')

        # Create a dataframe to write to the csv
        data = {'serial': [self.ctd_type + '-' + self.serial]*len(self.coefficients),
                'name': list(self.coefficients.keys()),
                'value': list(self.coefficients.values()),
                'notes': ['']*len(self.coefficients)
                }
        df = pd.DataFrame().from_dict(data)

        # Generate the csv name
        cal_date = max(self.date.values())
        csv_name = self.uid + '__' + cal_date + '.csv'

        # Write the dataframe to a csv file
        # check = input(f"Write {csv_name} to {outpath}? [y/n]: ")
        check = 'y'
        if check.lower().strip() == 'y':
            df.to_csv(outpath+'/'+csv_name, index=False)