# Proof of concept validation notebook for LEPR upload

In [1]:
import pandas as pd
import numpy as np
import xlrd
import logging

In [2]:
# Send every validation message to a log file. filemode='w' truncates the file
# each time this configuration takes effect, and the format tags each record
# with its level and the name of the function that logged it.
# NOTE: logging.basicConfig() does nothing if the root logger already has
# handlers, so re-running this cell in a live session will not reconfigure it.
log_filename = 'validation.log'
logging.basicConfig(filename=log_filename,
                    filemode='w',
                    format="---> %(levelname)s_(%(funcName)s):: %(message)s.")

def print_log_file(log_filename=log_filename):
    """Print the full contents of a log file to standard output.

    Parameters
    ----------
    log_filename : str
        Path of the file to print. Defaults to the module-level
        ``log_filename`` as it was when this function was defined.
    """
    with open(log_filename, 'r') as log_file:
        contents = log_file.read()
    print(contents)

In [3]:
# sheet_name=None makes pandas read every worksheet and return a dict that
# maps each sheet name to its DataFrame.
upload_data = pd.read_excel('../data/upload_validation.xlsx', sheet_name=None)

In [4]:
def extract_chem_dat(upload_data, *, sheet_name='6 Run Products',
                     header_row_num=4, chem_dat_col_index=13,
                     run_name_col_index=0):
    """Split the run-products sheet into chemistry values and column metadata.

    The slicing below reads the raw sheet (zero-based row positions) as:
    row 0 holds the chemistry column names, rows 1 and 2 hold the method id
    and the unit of each column, and every row after ``header_row_num`` is
    one experimental run.

    Parameters
    ----------
    upload_data : mapping of str to pandas.DataFrame
        Worksheets keyed by sheet name, e.g. the result of
        ``pd.read_excel(..., sheet_name=None)``.
    sheet_name : str, keyword-only
        Key of the worksheet holding the run products.
    header_row_num : int, keyword-only
        Position of the last header row; run rows start right after it.
    chem_dat_col_index : int, keyword-only
        Position of the first chemistry column.
    run_name_col_index : int, keyword-only
        Position of the column holding the run names.

    Returns
    -------
    chem_dat : pandas.DataFrame
        One row per run (indexed by run name), one column per chemistry
        column.
    chem_dat_info : pandas.DataFrame
        Two rows labelled 'method_id' and 'unit', same columns as
        ``chem_dat``.

    Raises
    ------
    KeyError
        If ``sheet_name`` is not present in ``upload_data``.
    """
    run_products = upload_data[sheet_name]

    first_run_row = header_row_num + 1
    run_names = run_products.iloc[first_run_row:, run_name_col_index]

    # Work on copies: the relabelling below must never reach back into the
    # caller's sheet or trigger chained-assignment warnings.
    dat = run_products.iloc[:, chem_dat_col_index:].copy()
    dat.columns = dat.iloc[0]
    # Drop the row that has just been promoted to column names.
    dat = dat.iloc[1:]

    chem_dat_info = dat.iloc[:2].copy()
    chem_dat_info.index = ['method_id', 'unit']

    # One row was dropped above, so raw row header_row_num + 1 now sits at
    # position header_row_num.
    chem_dat = dat.iloc[header_row_num:].copy()
    chem_dat.index = run_names

    return chem_dat, chem_dat_info

In [5]:
# Split the upload into per-run chemistry values and the method/unit metadata,
# then display the metadata table as the cell output.
chem_dat, chem_dat_info = extract_chem_dat(upload_data)
chem_dat_info

Unnamed: 0,SiO2,SiO2_err,TiO2,FeO*,FeOtot,Al2O3,MgO
method_id,1,1,1,1,2,1.0,
unit,wt%,wt%,wt%,wt%,wt%,,wt%


In [6]:
def validate_chem_error_columns(chem_dat_info):
    """Log an error for each measurement column without an '_err' partner.

    A column whose name does not end in '_err' is treated as a measurement;
    for each of them a column named '<name>_err' must also be present.

    Parameters
    ----------
    chem_dat_info : pandas.DataFrame
        Table whose column labels are the chemistry column names.

    Returns None; findings are reported through ``logging.error`` only.
    """
    all_columns = chem_dat_info.columns
    measured = [name for name in all_columns if not name.endswith('_err')]
    unpaired = [name for name in measured if f'{name}_err' not in all_columns]

    # Log from this frame (not from inside a comprehension) so the record's
    # funcName stays the name of this validator.
    for name in unpaired:
        logging.error(f"'{name}_err' missing from chemistry data columns")
            

def validate_chem_units(chem_dat_info):
    """Log a critical message for every chemistry column that has no unit.

    A unit counts as missing when it is NaN/None/NA (per ``pd.isna``) or a
    string that is empty or only whitespace.

    Parameters
    ----------
    chem_dat_info : pandas.DataFrame
        Metadata table with a row labelled 'unit' and one column per
        chemistry column.

    Returns None; findings are reported through ``logging.critical`` only.

    Raises
    ------
    KeyError
        If ``chem_dat_info`` has no row labelled 'unit'.
    """
    for col, unit in chem_dat_info.loc['unit'].items():
        is_blank_text = isinstance(unit, str) and not unit.strip()
        # pd.isna rather than ``is np.nan``: an identity test only matches the
        # numpy singleton and misses other NaN objects, None and pd.NA.
        if is_blank_text or pd.isna(unit):
            logging.critical(f"'{col}' does not provide any units")
            
def validate_chem_method(chem_dat_info):
    """Log a critical message for every chemistry column with no method id.

    A method id counts as missing when it is NaN/None/NA (per ``pd.isna``)
    or a string that is empty or only whitespace.

    Parameters
    ----------
    chem_dat_info : pandas.DataFrame
        Metadata table with a row labelled 'method_id' and one column per
        chemistry column.

    Returns None; findings are reported through ``logging.critical`` only.

    Raises
    ------
    KeyError
        If ``chem_dat_info`` has no row labelled 'method_id'.
    """
    for col, method_id in chem_dat_info.loc['method_id'].items():
        is_blank_text = isinstance(method_id, str) and not method_id.strip()
        # pd.isna rather than ``is np.nan``: an identity test only matches the
        # numpy singleton and misses other NaN objects, None and pd.NA.
        if is_blank_text or pd.isna(method_id):
            logging.critical(f"'{col}' does not provide any method id")

In [7]:
# Run the column-level checks. Each one reports its findings through the
# logging module instead of returning them.
validate_chem_error_columns(chem_dat_info)
validate_chem_units(chem_dat_info)
validate_chem_method(chem_dat_info)

In [8]:
# Preview the first rows of the per-run chemistry values.
chem_dat.head()

Unnamed: 0_level_0,SiO2,SiO2_err,TiO2,FeO*,FeOtot,Al2O3,MgO
Analytical Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Run_1,47.5,(1),0.61,,,,
Run_2,49.2,0.1,0.42,,,,
Run_3,48.1,,0,,,,
Run_4,46.0,,<0.7,,,,
Run_5,49.0,,nd,,,,


In [9]:
# Scratch selection of the second chemistry column, picked by position.
# The name ichem_dat is rebound by the loop that applies the per-value checks.
ichem_dat = chem_dat.iloc[:,1]

# TODO(review): unfinished stub, presumably meant to wrap the per-value checks
# defined below -- confirm or delete.
# def validate_numeric_chem_data(ichem_dat):

        
 
def chem_not_detected_not_valid(val, chem, run_id):
    """Flag the entry 'nd', which is not accepted as a "not detected" marker.

    Parameters
    ----------
    val : object
        Cell value to inspect.
    chem : str
        Name of the chemistry column the value belongs to (used in the log).
    run_id : str
        Identifier of the experimental run (used in the log).

    Returns
    -------
    bool
        True (after logging an error) when ``val`` equals 'nd', else False.
    """
    if val != 'nd':
        return False

    logging.error(f"'{val}', the '{chem}' value for exp_run '{run_id}', is not valid. If not detected use vocabulary 'bdl'")
    return True

def chem_not_measured_not_valid(val, chem, run_id):
    """Flag the entry '-', which is not accepted as a "not measured" marker.

    Parameters
    ----------
    val : object
        Cell value to inspect.
    chem : str
        Name of the chemistry column the value belongs to (used in the log).
    run_id : str
        Identifier of the experimental run (used in the log).

    Returns
    -------
    bool
        True (after logging an error) when ``val`` equals '-', else False.
    """
    if val != '-':
        return False

    logging.error(f"'{val}', the '{chem}' value for exp_run '{run_id}', is not valid. If not measured leave entry blank")
    return True

def chem_measurement_limit_not_valid(val, chem, run_id):
    """Flag text entries written as a limit, i.e. starting with '<' or '>'.

    Parameters
    ----------
    val : object
        Cell value to inspect. Non-string values are never flagged here.
    chem : str
        Name of the chemistry column the value belongs to (used in the log).
    run_id : str
        Identifier of the experimental run (used in the log).

    Returns
    -------
    bool
        True (after logging an error) when ``val`` is a string whose first
        non-blank character is '<' or '>', else False.
    """
    # isinstance instead of an exact type comparison so that str subclasses
    # (e.g. numpy.str_) are inspected too rather than silently passed.
    if not isinstance(val, str):
        return False

    # Ignore leading whitespace so an entry like ' <0.7' is still recognised.
    if not val.lstrip().startswith(('<', '>')):
        return False

    # TODO: the message still carries a placeholder for the name of the field
    # that should record the limit; fill it in once the field is decided.
    logging.error(f"'{val}', the '{chem}' value for exp_run '{run_id}', is not valid. Instead give just the value and indicate limit using field '????, Ask roger'")
    return True
        
def numeric_chem_data_not_valid(val, chem, run_id):
    """Flag any text entry as an invalid number.

    Every string is rejected, including text that merely looks numeric.

    Parameters
    ----------
    val : object
        Cell value to inspect.
    chem : str
        Name of the chemistry column the value belongs to (used in the log).
    run_id : str
        Identifier of the experimental run (used in the log).

    Returns
    -------
    bool
        True (after logging an error) when ``val`` is a string, else False.
    """
    # isinstance instead of an exact type comparison so that str subclasses
    # (e.g. numpy.str_) are rejected as well.
    if not isinstance(val, str):
        return False

    logging.error(f"'{val}', the '{chem}' value for exp_run '{run_id}', is not a valid number")
    return True

In [10]:
# Apply the per-value checks to every (chemistry column, run) entry. The
# checks are tried in this order and the first one that reports a problem
# ends the checking of that value, so each bad entry is logged once.
value_checks = (
    chem_not_detected_not_valid,
    chem_not_measured_not_valid,
    chem_measurement_limit_not_valid,
    numeric_chem_data_not_valid,
)

for ichem_col, ichem_dat in chem_dat.T.iterrows():
    chem = ichem_dat.name
    for run_id, val in ichem_dat.items():
        for value_check in value_checks:
            if value_check(val, chem, run_id):
                break

In [11]:
# Show everything the validators wrote to the log file.
print_log_file()

---> ERROR_(validate_chem_error_columns):: 'TiO2_err' missing from chemistry data columns.
---> ERROR_(validate_chem_error_columns):: 'FeO*_err' missing from chemistry data columns.
---> ERROR_(validate_chem_error_columns):: 'FeOtot_err' missing from chemistry data columns.
---> ERROR_(validate_chem_error_columns):: 'Al2O3_err' missing from chemistry data columns.
---> ERROR_(validate_chem_error_columns):: 'MgO_err' missing from chemistry data columns.
---> CRITICAL_(validate_chem_units):: 'Al2O3' does not provide any units.
---> CRITICAL_(validate_chem_method):: 'MgO' does not provide any method id.
---> ERROR_(numeric_chem_data_not_valid):: '(1)', the 'SiO2_err' value for exp_run 'Run_1', is not a valid number.
---> ERROR_(chem_measurement_limit_not_valid):: '<0.7', the 'TiO2' value for exp_run 'Run_4', is not valid. Instead give just the value and indicate limit using field '????, Ask roger'.
---> ERROR_(chem_not_detected_not_valid):: 'nd', the 'TiO2' value for exp_run 'Run_5', is n