In [2]:
import numpy as np 
import pandas as pd 
import os 
import glob 
import altair as alt
import imp
import cremerlab.hplc
imp.reload(cremerlab.hplc)
from io import StringIO


ModuleNotFoundError: No module named 'cremerlab'

In [15]:
# Read in the files. `glob` returns matches in arbitrary (filesystem-dependent)
# order, so sort them to make "the first file" reproducible across machines.
files = sorted(glob.glob('../exploratory_data/*.txt'))
if not files:
    raise FileNotFoundError("No files matching '../exploratory_data/*.txt' were found.")

# Read the first file into a single string (newlines preserved) for parsing.
with open(files[0], 'r') as f:
    raw_file = f.read()

# Split to get the file information

In [34]:
def scrape_metadata(file, delimiter=','):
    """
    Scrapes the sample information from the output of a Shimadzu HPLC ASCII 
    file and returns a dictionary of the metadata entries. 

    Parameters
    ----------
    file : str 
        The contents of the file as a string with newline characters (`\\n`) present. 
    delimiter: str 
        The delimiter used in the file. If tab-delimited, use `\\t`.

    Returns
    -------
    metadata_dict : dictionary
        A dictionary of the metadata listed under the 'Sample Information', excluding 
        ISTD Amounts. Keys are the first field of each line and values are the 
        second field, both as strings.

    Notes
    -----
    This function assumes that the file contains metadata fields `[Sample Information]`
    followed by `[Original Files]`. If either `[Sample Information]` or `[Original Files]`
    fields are missing or have other names, a ValueError exception is thrown. 

    Raises
    ------
    TypeError
        Raised if `file` or `delimiter` is not of type `str`.
    ValueError:
        Raised if `[Sample Information]`, `[Original Files]`, `\\n`, or delimiter
        is not present in file.
    """
    # Make sure the arguments provided are strings and the file has the necessary fields.
    if not isinstance(file, str):
        raise TypeError(f'Argument `file` must be a string. type {type(file)} was provided.')
    if not isinstance(delimiter, str):
        raise TypeError(f'Argument `delimiter` must be a string. type {type(delimiter)} was provided.')
    if '\n' not in file:
        raise ValueError('Newline characters (`\\n`) are not in file, but must be for proper parsing.')
    if ('[Sample Information]' not in file) or ('[Original Files]' not in file):
        raise ValueError('`[Sample Information]` or `[Original Files]` field is missing.')
    if delimiter not in file:
        # `!r` keeps whitespace delimiters such as a tab visible in the message.
        raise ValueError(f'Delimiter {delimiter!r} not present in the file.')

    # Get the text between the two section headers.
    metadata = file.split('[Sample Information]\n')[1].split('[Original Files]\n')[0]

    # Set up the dictionary and loop through lines
    metadata_dict = {}
    for line in metadata.split('\n'):

        # Split by the delimiter.
        entry = line.split(delimiter)

        # Add the entry to the dictionary if it has at least a key and a value
        # and is not an ISTD Amount line.
        if (len(entry) > 1) and ('ISTD' not in entry[0]):
            metadata_dict[entry[0]] = entry[1]
    return metadata_dict

In [35]:
# Parse the sample-information metadata out of the raw file contents and display it.
metadata_dict = scrape_metadata(raw_file)
metadata_dict

{'Operator Name': 'System Administrator',
 'Acquired': '3/18/2021 6:17:01 PM',
 'Sample Type': '1:Standard',
 'Level': '1',
 'Sample Name': '2021-03-18_MgSO4',
 'Sample ID': '001',
 'Sample Amount': '1',
 'Dilution Factor': '1',
 'Vial#': '22',
 'Injection Volume': '20'}

In [None]:
def scrape_chromatogram(file, detector='B', delimiter=',', metadata=True):
    """
    Scrapes the chromatogram for a given detector from a Shimadzu HPLC
    ASCII output.

    Parameters
    ----------
    file : str 
        The contents of the file as a string with newline characters (`\\n`) present. 
    detector: str, 'A' or 'B'
        The name of the detector in the file. Default is `B`. Note that only 
        'A' or 'B' (case-insensitive) is an acceptable input and not the complete 
        detector name such as 'LC Chromatogram(Detector B-Ch1)'.
    delimiter: str 
        The delimiter used in the file. If tab-delimited, use `\\t`.
    metadata : bool
        If `True`, a dictionary with the metadata about the detector is returned. 
        Default is True. 

    Returns
    -------
    chrom : pandas DataFrame
        A tidy pandas DataFrame with two columns -- `time_minutes` and `intensity_mV`
    metadata_dict : dictionary
        A dictionary of the metadata associated with the desired detector channel.
        If `metadata` is not `True`, this is not returned and only `chrom` is. 

    Notes
    -----
    This function assumes that the detector name follows the convention 
    `[LC Chromatogram(Detector A/B-ChX)]` where `A/B` is the detector label 
    and `X` is the channel number. If several channels exist for the detector, 
    the first one in the file is used.

    The section is assumed to consist of six metadata rows, one column-header 
    row, and then the two-column data table, as in the exploratory file 
    inspected in this notebook -- TODO confirm for other export settings. 
    The section ends at the next line starting with `[` or at the end of the file.

    Raises
    ------
    TypeError
        Raised if file, detector, or delimiter is not of type `str`.
    ValueError:
        Raised if `[LC Chromatogram`, `\\n`, or delimiter is not present in file.
        Also raised if `detector.upper()` is not `A` or `B`, if the requested 
        detector section is absent, or if the section holds no data table.
    """
    # Do type checks.
    if not isinstance(file, str):
        raise TypeError(f'Type of `file` must be `str`. Type {type(file)} was provided.')
    if not isinstance(detector, str):
        raise TypeError(f'Type of `detector` must be `str`. Type {type(detector)} was provided.')
    if not isinstance(delimiter, str):
        raise TypeError(f'Type of `delimiter` must be `str`. Type {type(delimiter)} was provided.')

    # Do value checks.
    detector = detector.upper()
    if detector not in ('A', 'B'):
        raise ValueError(f'Detector must be `A` or `B`. {detector!r} was provided.')
    if '\n' not in file:
        raise ValueError('Newline characters (`\\n`) are not in file, but must be for proper parsing.')
    if '[LC Chromatogram' not in file:
        raise ValueError('`[LC Chromatogram` field is missing.')
    if delimiter not in file:
        raise ValueError(f'Delimiter {delimiter!r} not present in the file.')
    section_tag = f'[LC Chromatogram(Detector {detector}-Ch'
    if section_tag not in file:
        raise ValueError(f'No chromatogram for detector {detector} found in file.')

    # Take everything after the section-header line; index 0 of the split is
    # the remainder of the header line itself (e.g. `1)]`), so skip it.
    following_lines = file.split(section_tag, 1)[1].split('\n')[1:]

    # Collect the non-blank lines up to the next `[...]` section header.
    section = []
    for line in following_lines:
        line = line.rstrip('\r')  # tolerate Windows line endings
        if line.startswith('['):
            break
        if line.strip():
            section.append(line)

    # Number of `key<delimiter>value` rows preceding the column-header row.
    n_metadata_rows = 6
    if len(section) <= n_metadata_rows:
        raise ValueError(f'Chromatogram section for detector {detector} contains no data table.')

    # Parse the table; its first row is the column header, which is replaced
    # with tidy names.
    # NOTE(review): the `Intensity Multiplier` metadata value is not applied to
    # the intensities here -- confirm whether it should be.
    chrom = pd.read_csv(StringIO('\n'.join(section[n_metadata_rows:])), sep=delimiter)
    chrom.columns = ['time_minutes', 'intensity_mV']

    if not metadata:
        return chrom

    # Assemble the detector metadata from the leading key/value rows.
    metadata_dict = {}
    for line in section[:n_metadata_rows]:
        entry = line.split(delimiter)
        if len(entry) > 1:
            metadata_dict[entry[0]] = entry[1]
    return chrom, metadata_dict

In [25]:
# Prototype: isolate the lines between `[Sample Information]` and the first
# occurrence of `ISTD`, then map the first comma-separated field of each line
# to the second.
metadata = raw_file.split('[Sample Information]')[1].split('ISTD')[0].split('\n')
metadata_dict = {
    fields[0]: fields[1]
    for fields in (row.split(',') for row in metadata)
    if len(fields) > 1  # skip blank lines and lines without a comma
}
metadata_dict

{'Operator Name': 'System Administrator',
 'Acquired': '3/18/2021 6:17:01 PM',
 'Sample Type': '1:Standard',
 'Level': '1',
 'Sample Name': '2021-03-18_MgSO4',
 'Sample ID': '001'}

In [41]:
# Prototype: pull the chromatogram section for one detector out of the raw file.
detector = 'B'
delimiter = ','

# Keep everything after the detector's section-header line; the `[1:-2]` slice
# drops the remainder of the header line and the last two lines of the split.
section_tag = f'[LC Chromatogram(Detector {detector.upper()}-Ch'
chrom = '\n'.join(raw_file.split(section_tag)[1].split('\n')[1:-2])

# The first six rows are section metadata (displayed below), so skip them when
# parsing the table. Pass `delimiter` through so that changing it above
# actually affects the parse.
out = pd.read_csv(StringIO(chrom), sep=delimiter, skiprows=6)
chrom.split('\n')[:6]

['Interval(msec),500',
 '# of Points,5401',
 'Start Time(min),0.000',
 'End Time(min),45.000',
 'Intensity Units,mV',
 'Intensity Multiplier,0.001']

In [23]:
# Display the raw lines of the sample-information section from the prototype split.
metadata

['',
 'Operator Name,System Administrator',
 'Acquired,3/18/2021 6:17:01 PM',
 'Sample Type,1:Standard',
 'Level,1',
 'Sample Name,2021-03-18_MgSO4',
 'Sample ID,001',
 '']