<!-- Auto table of contents -->
<h1 class='tocIgnore'>AutoDS : Validation tests</h1>
<p>(for the <b>autods</b> module, a python interface to MCDS.exe, http://distancesampling.org/)</p>
<div style="overflow-y: auto">
  <h2 class='tocIgnore'>Table of contents</h2>
  <div id="toc"></div>
</div>

In [None]:
%%javascript
$.getScript('ipython_notebook_toc.js')

In [None]:
%matplotlib inline

In [None]:
import sys
import os
import pathlib as pl
import importlib as implib

import re

from collections import OrderedDict as odict, namedtuple as ntuple

import math
import numpy as np
import pandas as pd

from IPython.display import HTML

import matplotlib.pyplot as plt

import plotly as ply
import plotly.graph_objs as plygo

In [None]:
sys.path.insert(0, '..')

In [None]:
# Configure logging.
import logging

# Setup given logger.
def setupLogger(logr, level=logging.ERROR, handlers=(sys.stdout,), fileMode='w',
                format='%(asctime)s %(name)s %(levelname)s\t%(message)s'):
    """Reset the handlers of a logger, install new ones, and return the logger.

    Parameters:
    :param logr: the logging.Logger to configure
    :param level: logging level to set on the logger
    :param handlers: iterable of log targets ; a str is taken as the path of a log file,
                     anything else as a stream to write to
    :param fileMode: opening mode for the file targets
    :param format: record format string, passed to logging.Formatter

    :return: logr, once configured
    """

    # Cleanup any default handler (jupyter does some logging initialisation itself ...)
    # (iterate on a copy, as removeHandler modifies logr.handlers)
    for oldHandler in list(logr.handlers):
        logr.removeHandler(oldHandler)

    formatter = logging.Formatter(format)

    # Set level
    logr.setLevel(level)

    # Setup new handlers
    for hdlr in handlers:
        if isinstance(hdlr, str):
            handler = logging.FileHandler(hdlr, mode=fileMode)
        else:
            handler = logging.StreamHandler(stream=hdlr)
        handler.setFormatter(formatter)
        logr.addHandler(handler)

    def handlerId(hdlr):
        if isinstance(hdlr, str):
            return 'File({})'.format(hdlr)
        # Not every stream has a 'name' attribute (ex: io.StringIO) : fall back to its type name.
        return 'Stream({})'.format(getattr(hdlr, 'name', type(hdlr).__name__))

    logr.info('Logging to {}.'.format(', '.join(handlerId(hdlr) for hdlr in handlers)))

    return logr

# Loggers for the autods module and for this notebook ('valtst') : both log to stdout and to the same file.
# The file is emptied once here, then each logger opens it in append mode : with the default fileMode='w',
# the second setupLogger call would truncate the file that the first logger is already writing to.
logFileName = 'tmp/valtst.log'
with open(logFileName, 'w'):
    pass

setupLogger(logging.getLogger('autods'), level=logging.INFO, handlers=[sys.stdout, logFileName], fileMode='a')

# Local (NB) logger
logger = setupLogger(logging.getLogger('valtst'), level=logging.DEBUG, handlers=[sys.stdout, logFileName], fileMode='a')

In [None]:
import autods as ads 

In [None]:
# Activate Warnings as Exceptions
# (disabled here : change the condition below to True to get an exception raised for any warning)
if False:
    
    import warnings

    warnings.filterwarnings(action='error')

    # pd.read_excel
    # (back to the default warning behaviour for these modules - presumably the ones
    #  that issue warnings when reading Excel files ; to be confirmed)
    warnings.filterwarnings(action='default', module='etree')
    warnings.filterwarnings(action='default', module='xlrd')
    warnings.filterwarnings(action='default', module='defusedxml')

# I. Run analyses with real life field data (1/2 : the long way)

With MCDSAnalysis class.

(for comparison to manually issued analyses with Distance 7.3)

## 1. Load analyses set specifications

In [None]:
# Read the reference results table from the 'refout' folder
refFileName = 'ACDC2019-Papyrus-ALAARV-TURMER-resultats-distance-73.xlsx'
dfRefRes = (
    pd.read_excel(pl.Path('refout') / refFileName)
    .reset_index()  # Row number = analysis number (needed later for the original order of cases)
    .rename(columns={'index': 'AnlysNum', 'Name': 'Model'})
)

dfRefRes.head()

## 2. Build test cases

In [None]:
# Build the test case definitions from the refout results file (don't cheat : only input columns :-)
modelIdCols = ['Model']
modelParamCols = ['LTrunc', 'RTrunc', 'FitDistCuts', 'DiscrDistCuts']
sampleIdCols = ['Species', 'Periods', 'Prec.', 'Duration']
caseIdCols = ['AnlysNum'] + sampleIdCols + modelIdCols
dfAnlysCases = dfRefRes[caseIdCols + modelParamCols].copy()


def _keyFnOfModel(model):
    # Key function code, from the start of the model name.
    if model.startswith('Unif'):
        return 'UNIFORM'
    if model.startswith('Half'):
        return 'HNORMAL'
    return 'HAZARD'


def _adjSerOfModel(model):
    # Adjustment series code, from the rest of the model name.
    if model.find(' Cos') > 0:
        return 'COSINE'
    if model.find(' SimPoly') > 0:
        return 'POLY'
    return 'HERMITE'


def _inFileNameOfCase(sRow):
    # Name of the input data file for the sample of the given test case.
    if 'A+B' in sRow.Periods:
        periods = 'AB'
    elif 'A' in sRow.Periods:
        periods = 'A'
    else:
        periods = 'B'
    duration = sRow.Duration.split(' ')[0]
    precision = sRow['Prec.'].split(' ')[0]
    return 'ACDC2019-Papyrus-{}-{}-{}mn-{}dec-dist.txt'.format(sRow.Species, periods, duration, precision)


dfAnlysCases['KeyFn'] = dfAnlysCases.Model.apply(_keyFnOfModel)
dfAnlysCases['AdjSer'] = dfAnlysCases.Model.apply(_adjSerOfModel)
dfAnlysCases['InFileName'] = dfAnlysCases.apply(_inFileNameOfCase, axis='columns')
dfAnlysCases

In [None]:
#def nan2None(v):
#    return None if pd.isnull(v) else v
def distCutsFromSpecs(v):
    """Convert a distance cuts specification, as read from the specs table, for use by the analyses.

    Parameters:
    :param v: null value (None, NaN), integer number, or string of comma-separated numbers

    :return: None for a null value, the integer number unchanged (an integral float is converted to int),
             or the list of floats parsed from the comma-separated string
    """
    import numbers  # Local import : only needed here.

    if pd.isnull(v):
        return None

    # Any integer type, not only the builtin int (ex: numpy integers) : passed through.
    if isinstance(v, numbers.Integral):
        return v

    # Integers read as floats (ex: from a numeric column that also holds NaNs).
    if isinstance(v, float) and v.is_integer():
        return int(v)

    return [float(x) for x in v.split(',')]

## 3. Prepare analyses

In [None]:
decimalFields = ['Point transect*Survey effort', 'Observation*Radial distance']

In [None]:
# Analysis engine (sequential)
mcds = ads.MCDSEngine(workDir=pl.Path('tmp', 'mcds-out'),
                      executor=None, # Non-parallel: ~7.5s elapsed on a Lenovo P52 (6-core i7-8850H with PCI-e SSD)
                      distanceUnit='Meter', areaUnit='Hectare',
                      surveyType='Point', distanceType='Radial')

In [None]:
# Frozen analysis parameters (a choice here)
KEstimCriterion = 'AIC'
KCVInterval = 95

In [None]:
# Results object construction
sampCols = [('sample', col, 'Value') for col in sampleIdCols]
miSampCols = pd.MultiIndex.from_tuples(sampCols)

custCols = [('sample', 'AnlysNum', 'Value')] + sampCols + [('model', 'Model', 'Value')]
miCustCols = pd.MultiIndex.from_tuples(custCols)

dfCustColTrans = \
    pd.DataFrame(index=miCustCols,
                 data=dict(en=caseIdCols, 
                           fr=['NumAnlys', 'Espèce', 'Périodes', 'Préc.', 'Durée', 'Modèle']))

results = ads.MCDSResultsSet(miCustomCols=miCustCols, miSampleCols=miSampCols, dfCustomColTrans=dfCustColTrans)

In [None]:
computed = False

## 3a. Or : Really run analyses

In [None]:
# Shorten test cases and reference results lists, to go faster
# Warning: If you don't retain entire samples, later comparison will fail on Delta AIC values.
#selCaseInds = [0, 5, 7, 22, 31] # Some random cases, with incomplete samples.
#selCaseInds = dfAnlysCases[dfAnlysCases.Sample.isin([3, 4])].index # A shorter selection, with complete samples.
# NOTE(review): the selection just above needs a 'Sample' column, which is not among the columns
#               added to dfAnlysCases in this notebook ; check before re-enabling it.
selCaseInds = range(len(dfAnlysCases)) # All of them.

nOrigAnlysCases = len(dfAnlysCases)
# Same label-based selection on both tables, to keep test cases and reference results in sync.
dfAnlysCases = dfAnlysCases.loc[selCaseInds]
dfRefRes = dfRefRes.loc[selCaseInds]

print('Retained {} out of {}.'.format(len(selCaseInds), nOrigAnlysCases))

In [None]:
# Run the test cases one after the other, and collect their results in 'results'.
tsStart = pd.Timestamp.now()
print('Started at', tsStart)

# Run all analyses
lastInFileName = None
for _, sCase in dfAnlysCases.iterrows():
    
    nCase = sCase.AnlysNum
    # Analysis name = variable part of the input file name + model name (lower-cased, punctuation replaced by '-')
    name = sCase.InFileName[len('ACDC2019-Papyrus')+1:-len('-dist.txt')]
    name += '-' + sCase.Model.lower().translate(str.maketrans({c:'-' for c in ' ,.:;()/'}))
    print('#{:3d}'.format(nCase+1), name, sCase.KeyFn, sCase.AdjSer)
    
    # Create data set if not already done.
    # (the data set is reused as long as consecutive cases share the same input file)
    if lastInFileName != sCase.InFileName:
        sds = ads.SampleDataSet(pl.Path('refin', sCase.InFileName), decimalFields=decimalFields)
        lastInFileName = sCase.InFileName
        
    # Run analysis and get results
    anlys = ads.MCDSAnalysis(engine=mcds, sampleDataSet=sds, name=name, logData=True,
                             estimKeyFn=sCase.KeyFn, estimAdjustFn=sCase.AdjSer,
                             estimCriterion=KEstimCriterion, cvInterval=KCVInterval,
                             minDist=sCase.LTrunc, maxDist=sCase.RTrunc,
                             fitDistCuts=distCutsFromSpecs(sCase.FitDistCuts),
                             discrDistCuts=distCutsFromSpecs(sCase.DiscrDistCuts))

    anlys.run()

    sResult = anlys.getResults()

    # Save results
    # (the custom header = the first len(caseIdCols) values of the case : relies on the case Id columns
    #  being the first ones of dfAnlysCases, in the same order as in caseIdCols)
    sHead = pd.Series(data=[sCase[col] for col in sCase.index[:len(caseIdCols)]], index=miCustCols)

    results.append(sResult, sCustomHead=sHead)
    
# shutdown analysis engine
mcds.shutdown()

# Done.
tsEnd = pd.Timestamp.now()
print('Finished at', tsEnd, ': duration', str(tsEnd - tsStart).replace('0 days ', ''))

# Results are now available in memory : the reload cell below will skip loading them from file.
computed = True

In [None]:
# Save results in case need for not recomputing them
resFileName = os.path.join(mcds.workDir, 'autods-validation-results.xlsx')

results.toExcel(resFileName, sheetName='AutoDSVal')

In [None]:
# Also save results with the English translation of the columns
resFileName = os.path.join(mcds.workDir, 'autods-validation-results-en.xlsx')

results.toExcel(resFileName, sheetName='Auto', lang='en')

In [None]:
# Check translation
dfActTrRes = results.dfTransData('fr')

dfActTrRes.head().T.iloc[:30] #.at['TroncGche', 0]

## 3b. Or : Load analyses from a previous run

(already run and saved above)

In [None]:
# Load the results saved by a previous run, but only if the analyses were not just run above.
if not computed:
    
    resFileName = os.path.join(mcds.workDir, 'autods-validation-results.xlsx')
    print('Loading results from {} ...'.format(resFileName))

    results.fromExcel(resFileName, sheetName='AutoDSVal')
    
    # shutdown analysis engine
    # (no analysis was run here, so the engine, only used above for its work folder, is no longer needed)
    mcds.shutdown()

else:
    
    print('Just computed, not reloading ...')
    
print('... {} analyses to compare'.format(len(results)))

# II. Compare actual results to reference

(reference = manually run analyses with Distance software)

## 1. Extract actual results to compare

In [None]:
# Analysis results
dfActRes = results.dfData

dfActRes.head().T[:30]

In [None]:
# Select the columns of the automated results, and map them to the available reference columns, for comparison.
dCompCols = \
{
    ('sample', 'AnlysNum', 'Value'):  'AnlysNum',
    ('sample', 'Species', 'Value'):   'Species',
    ('sample', 'Periods', 'Value'):   'Periods',
    ('sample', 'Prec.', 'Value'):     'Prec.',
    ('sample', 'Duration', 'Value'):  'Duration',
    
    ('model',  'Model', 'Value'):         'Model',
    ('parameters', 'left truncation distance', 'Value'):           'LTrunc',
    ('parameters', 'right truncation distance', 'Value'):          'RTrunc',
    ('parameters', 'model fitting distance cut points', 'Value'):  'FitDistCuts',
    ('parameters', 'distance discretisation cut points', 'Value'): 'DiscrDistCuts',
    
    ('run output', 'run status', 'Value') : 'Status',
    #('run output', 'run time', 'Value') : 'Run', # Only for unintests ref. generation just below
    
    ('detection probability', 'total number of parameters (m)', 'Value'): '# params',
    ('encounter rate', 'number of observations (n)', 'Value'): '# obs',
    
    ('detection probability', 'Delta AIC', 'Value'): 'Delta AIC',
    ('detection probability', 'AIC value', 'Value'): 'AIC',
    ('detection probability', 'chi-square test probability determined', 'Value')               : 'GOF Chi-p',
    ('detection probability', 'Kolmogorov-Smirnov test probability', 'Value')                  : 'GOF K-S p',
    ('detection probability', 'Cramér-von Mises (uniform weighting) test probability', 'Value'): 'GOF CvM (unif) p',
    ('detection probability', 'Cramér-von Mises (cosine weighting) test probability', 'Value') : 'GOF CvM (cos) p',
    
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Value'): 'ESW/EDR',
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Lcl')  : 'ESW/EDR LCL',
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Ucl')  : 'ESW/EDR UCL',
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Cv')   : 'ESW/EDR CV',
    
    ('density/abundance', 'density of animals', 'Value'): 'D',
    ('density/abundance', 'density of animals', 'Lcl')  : 'D LCL',
    ('density/abundance', 'density of animals', 'Ucl')  : 'D UCL',
    ('density/abundance', 'density of animals', 'Cv')   : 'D CV',
    
    ('detection probability', 'probability of detection (Pw)', 'Value'): 'P',
    ('detection probability', 'probability of detection (Pw)', 'Lcl')  : 'P LCL',
    ('detection probability', 'probability of detection (Pw)', 'Ucl')  : 'P UCL',
    ('detection probability', 'probability of detection (Pw)', 'Cv')   : 'P CV',
    ('detection probability', 'probability of detection (Pw)', 'Df')   : 'P DF',
}
len(dCompCols)

In [None]:
# Warning: Unused columns (full of NaNs) have been automatically removed
# (see last line of ResultsSet.dfData getter)
dCompCols = { k: v for k, v in dCompCols.items() if k in dfActRes.columns }
len(dCompCols)

In [None]:
# So we need to cleanup modelParamCols too
modelParamCols = [id_ for id_ in modelParamCols if id_ in dCompCols.values()]
len(modelParamCols)

In [None]:
# Safe stringification of model params
# * needed for use in indexes (hashability)
# * needed to cope with to_excel/read_excel inconsistent None management
def modelParam2Str(par):
    """Convert a model parameter value to a string usable in an index.

    Parameters:
    :param par: list of numbers, null value (None, NaN), string, or any other scalar value

    :return: * for a list, or a string holding a comma-separated list (with or without brackets) :
               the string form of the list of its items, converted to floats,
             * for a null value : 'None',
             * for any other string : the string itself,
             * for anything else : str(par).
    """
    if isinstance(par, list):
        return str([float(v) for v in par])

    if pd.isnull(par):
        return 'None'

    # Assumed already somewhat stringified list
    if isinstance(par, str) and ',' in par:
        return str([float(v) for v in par.strip('[]').split(',')])

    # Any other value, strings without any comma included (they used to end up in an UnboundLocalError).
    return str(par)

In [None]:
# Select results columns and rename them as reference, for easier comparison
dfActRes4c = dfActRes[list(dCompCols.keys())].copy()
dfActRes4c.columns = [dCompCols[col] for col in dCompCols]
dfActRes4c[modelParamCols] = dfActRes4c[modelParamCols].applymap(modelParam2Str) # Hashable mandatory for indexing
dfActRes4c.set_index(caseIdCols + modelParamCols, inplace=True)

dfActRes4c

In [None]:
# Select useful reference columns for comparison
dfRefRes4c = dfRefRes.copy()
dfRefRes4c[modelParamCols] = dfRefRes4c[modelParamCols].applymap(modelParam2Str) # Hashable mandatory for indexing
dfRefRes4c.set_index(caseIdCols + modelParamCols, inplace=True)
dfRefRes4c = dfRefRes4c.reindex(columns=dfActRes4c.columns)

dfRefRes4c

## 2. Automatic diagnosis

In [None]:
# First checks : equality of test case lists (index) and of column names (columns)
assert sorted(dfActRes4c.index)   == sorted(dfRefRes4c.index)
assert sorted(dfActRes4c.columns) == sorted(dfRefRes4c.columns)

In [None]:
# Actual / reference closeness measure :
#   round(-log10(abs(actual - reference) / max(abs(actual), abs(reference))), 1)
# = the order of magnitude that separates the absolute difference from the greater absolute value of the two.
# The greater it is, the lower the relative difference
#    Ex: 3 = 10**3 ratio between the greater absolute value of the two and their absolute difference,
#        +inf = NO difference at all,
#        0 = bad, one of the two is 0, and the other not ; or only one is NaN ; or at least one is infinite,
#        NaN = both are NaN.
# See unitary test below.
def closeness(sRefAct):

    ref, act = sRefAct.to_list()

    # Only one NaN among the two => all different
    refIsNan, actIsNan = np.isnan(ref), np.isnan(act)
    if refIsNan != actIsNan:
        return 0

    # At least one infinite value => all different
    if np.isinf(ref) or np.isinf(act):
        return 0

    # Normal case : absolute difference, made relative to the greater absolute value when it makes sense
    gap = abs(ref - act)
    if gap != 0 and not np.isnan(gap):
        gap /= max(abs(ref), abs(act))

    if gap == 0:
        return np.inf

    return round(-np.log10(gap), 1)

In [None]:
# Actual / reference comparison : compute closeness indicator
# (same index and columns as the reference table, each cell being replaced by the closeness
#  of the reference value to the actual one)
dfRelDif = dfRefRes4c.copy()
for col in dfRelDif.columns:
    # Temporary column for the actual values, put side by side with the reference ones.
    dfRelDif['act'] = dfActRes4c[col]
    dfRelDif[col] = dfRelDif[[col, 'act']].apply(closeness, axis='columns')
    dfRelDif.drop(columns='act', inplace=True)
    
dfRelDif

In [None]:
# Diagnosis : we only keep lines and columns with some relevant differences.
dfBadRelDif = dfRelDif.copy()
len(dfBadRelDif)

In [None]:
# 1. Drop rows : same Status and everything else NaN (case of status = 0/3/4 : run error, or no run at all)
valCols = [col for col in dfRelDif.columns if col != 'Status']
dfBadRelDif.drop(dfBadRelDif[(dfBadRelDif.Status.abs() == np.inf) & dfBadRelDif[valCols].isnull().all(axis='columns')].index,
                 axis='index', inplace=True)
assert len(dfBadRelDif) == 29, len(dfBadRelDif)
len(dfBadRelDif)

In [None]:
# 2. Drop rows : Status and all other columns at inf (strict equality)
#    NB. Some very small differences are observed, depending on whether results have just been computed
#        or have been loaded from a previously saved Excel file (closeness value above 15)
dfBadRelDif.drop(dfBadRelDif[dfBadRelDif.apply(np.isinf, axis='columns').all(axis='columns')].index,
                 axis='index', inplace=True)
assert (computed and len(dfBadRelDif) == 26) or (not computed and len(dfBadRelDif) == 19), len(dfBadRelDif)
len(dfBadRelDif)

In [None]:
# 3. Drop rows : Status and all other columns greater than or equal to 15 (almost strict equality)
dfBadRelDif.drop(dfBadRelDif[(dfBadRelDif >= 15).all(axis='columns')].index, axis='index', inplace=True)
assert len(dfBadRelDif) == 9, len(dfBadRelDif)
len(dfBadRelDif)

In [None]:
# 4. Drop rows : same Status and all other columns greater than or equal to 4 (near equality)
dfBadRelDif.drop(dfBadRelDif[(dfBadRelDif >= 4).all(axis='columns')].index, axis='index', inplace=True)
assert len(dfBadRelDif) == 4, len(dfBadRelDif)
len(dfBadRelDif)

In [None]:
# 5. Drop rows : same Status and all other columns greater than or equal to 4 (near equality),
#    except for the GOF KS and CvM columns, which are NaN (not computed) when distances are discretised.
if 'DiscrDistCuts' in dfBadRelDif.index.names:
    # Columns to check : all of them, but the GOF ones other than the Chi2 one.
    discrCols = [col for col in dfRelDif.columns if not col.startswith('GOF') or col.find('Chi') > 0]
    # NOTE(review): the 'DiscrDistCuts' index level has been stringified through modelParam2Str, so the
    #               comparison to -1 below looks always true ; the expected count in the assert was
    #               obtained with this very code : check both before changing anything.
    df2Drop = (dfBadRelDif.index.get_level_values('DiscrDistCuts') != -1) & (dfBadRelDif[discrCols] >= 4).all(axis='columns')
    dfBadRelDif.drop(dfBadRelDif[df2Drop].index, axis='index', inplace=True)
assert len(dfBadRelDif) == 2, len(dfBadRelDif)
len(dfBadRelDif)

In [None]:
# The verdict (see Excel file refFileName, sheet "DiffAuto", for explanations about the 2 Act/Ref differences)
dfBadRelDif.T

In [None]:
dfRefRes4c.loc[dfBadRelDif.index]

In [None]:
dfActRes4c.loc[dfBadRelDif.index]

In [None]:
nFails = len(dfBadRelDif.index)
if nFails > 0:
    print('Warning: {} test case(s) failed ;'.format(nFails))
    print(' ... see sheet "DiffAuto" of {} for possible explanations.'.format(refFileName))
else:
    print('All test cases succeeded !')

## 3. Save results to disk.

In [None]:
resCompFileName = os.path.join(mcds.workDir, 'autods-validation-rescomp.xlsx')

with pd.ExcelWriter(resCompFileName) as xlsxWriter:

    dfRefRes.to_excel(xlsxWriter, sheet_name='RefResults', index=True)
    dfActRes4c.reset_index().to_excel(xlsxWriter, sheet_name='ActResults', index=False)
    dfRelDif.reset_index().to_excel(xlsxWriter, sheet_name='Diff2Ref', index=False)
    dfBadRelDif.reset_index().to_excel(xlsxWriter, sheet_name='BadDiff2Ref', index=False)
    dfRefRes4c.loc[dfBadRelDif.index].reset_index().to_excel(xlsxWriter, sheet_name='RefResWithDiff', index=False)
    dfActRes4c.loc[dfBadRelDif.index].reset_index().to_excel(xlsxWriter, sheet_name='ActResWithDiff', index=False)
    dfActRes.to_excel(xlsxWriter, sheet_name='RawActResults', index=True)

In [None]:
dfActRes.head()

# III. Parallel run of same analyses

## 1. Prepare analyses

(same test cases and input data as previously, for easy comparison)

In [None]:
# Analysis executor : 6, 8, None threads => min elapsed = ~2s on a Lenovo P52 (6-core i7-8850H with PCI-e SSD)
parallelExecutor = ads.Executor(parallel=True, threads=6)

# Analysis engine
mcds = ads.MCDSEngine(workDir=pl.Path('tmp', 'mcds-out'), executor=parallelExecutor, 
                      distanceUnit='Meter', areaUnit='Hectare',
                      surveyType='Point', distanceType='Radial')

In [None]:
# Results object construction
parResults = ads.MCDSResultsSet(miCustomCols=miCustCols, miSampleCols=miSampCols, dfCustomColTrans=dfCustColTrans)

In [None]:
computed = False

## 2a. Or : Really run analyses

In [None]:
# Shorten test cases and reference results lists, to go faster
# Warning: If you don't retain entire samples, later comparison will fail on Delta AIC values.
#selCaseInds = [0, 5, 7, 22, 31] # Some random cases, with incomplete samples.
#selCaseInds = dfAnlysCases[dfAnlysCases.Sample.isin([3, 4])].index # A shorter selection, with complete samples.
# NOTE(review): the selection just above needs a 'Sample' column, which is not among the columns
#               added to dfAnlysCases in this notebook ; check before re-enabling it.
selCaseInds = range(len(dfAnlysCases)) # All of them.

nOrigAnlysCases = len(dfAnlysCases)
# Same label-based selection on both tables, to keep test cases and reference results in sync.
dfAnlysCases = dfAnlysCases.loc[selCaseInds]
dfRefRes = dfRefRes.loc[selCaseInds]

print('Retained {} out of {}.'.format(len(selCaseInds), nOrigAnlysCases))

In [None]:
# Start all the test cases through the parallel executor, then collect their results
# in 'parResults' as they get completed.
tsStart = pd.Timestamp.now()
print('Started at', tsStart)

# Start running all analyses
lastInFileName = None
analyses = dict()
for _, sCase in dfAnlysCases.iterrows():
    
    nCase = sCase.AnlysNum
    # Analysis name = variable part of the input file name + model name (lower-cased, punctuation replaced by '-')
    name = sCase.InFileName[len('ACDC2019-Papyrus')+1:-len('-dist.txt')]
    name += '-' + sCase.Model.lower().translate(str.maketrans({c:'-' for c in ' ,.:;()/'}))
    print('#{:3d}'.format(nCase+1), name, sCase.KeyFn, sCase.AdjSer)
    
    # Create data set if not already done.
    # (the data set is reused as long as consecutive cases share the same input file)
    if lastInFileName != sCase.InFileName:
        sds = ads.SampleDataSet(pl.Path('refin', sCase.InFileName), decimalFields=decimalFields)
        lastInFileName = sCase.InFileName
        
    # Start running analysis in parallel (don't wait for it's finished, go on)
    # (the results header is attached to the analysis as custom data, to get it back once the analysis is done)
    sResHead = pd.Series(data=[sCase[col] for col in sCase.index[:len(caseIdCols)]], index=miCustCols)

    anlys = ads.MCDSAnalysis(engine=mcds, sampleDataSet=sds, name=name, customData=sResHead, logData=True,
                             estimKeyFn=sCase.KeyFn, estimAdjustFn=sCase.AdjSer,
                             estimCriterion=KEstimCriterion, cvInterval=KCVInterval,
                             minDist=sCase.LTrunc, maxDist=sCase.RTrunc,
                             #minDist=nan2None(sCase.LTrunc), maxDist=nan2None(sCase.RTrunc),
                             fitDistCuts=distCutsFromSpecs(sCase.FitDistCuts), # TODO: do this when building dfAnlysCases
                             discrDistCuts=distCutsFromSpecs(sCase.DiscrDistCuts))
    anlysFut = anlys.run()
    
    # Store analysis object and associated "future" for later use (should be running soon or later).
    analyses[anlysFut] = anlys
    
print('All analyses started ; now waiting for their end, and results ...')

# For each analysis as it gets completed (first completed => first yielded)
for anlysFut in parallelExecutor.asCompleted(analyses):

    # Retrieve analysis object from its associated future object
    anlys = analyses[anlysFut]
    
    # Get analysis results
    sResult = anlys.getResults()

    # Save results with header
    parResults.append(sResult, sCustomHead=anlys.customData)
    
# shutdown analysis engine
mcds.shutdown()

# Done.
tsEnd = pd.Timestamp.now()
print('Finished at', tsEnd, ': duration', str(tsEnd - tsStart).replace('0 days ', ''))

# Results are now available in memory : the reload cell below will skip loading them from file.
computed = True

In [None]:
# Save results in case need for not recomputing them
resFileName = os.path.join(mcds.workDir, 'autods-validation-parallel-results.xlsx')

parResults.toExcel(resFileName, sheetName='AutoDSVal')

## 2b. Or : Load analyses from a previous run

(already run and saved above)

In [None]:
# Load the results saved by a previous parallel run, but only if the analyses were not just run above.
if not computed:
    
    resFileName = os.path.join(mcds.workDir, 'autods-validation-parallel-results.xlsx')
    print('Loading results from {} ...'.format(resFileName))

    parResults.fromExcel(resFileName, sheetName='AutoDSVal')
    
    # shutdown analysis engine
    # (no analysis was run here, so the engine, only used above for its work folder, is no longer needed)
    mcds.shutdown()

else:
    
    print('Just computed, not reloading ...')
    
print('... {} analyses to compare'.format(len(parResults)))

## 3. Compare parallel results to sequential ones

In [None]:
# Prepare sequential results for comparison
dfSeqCmpRes = results.dfTransData('en')

dfSeqCmpRes.fillna(-9999, inplace=True) # Get rid of the Nan pb (because NaN != NaN :-)

dfSeqCmpRes.drop(columns=['RunTime', 'RunFolder'], inplace=True) # Run date-time and folder can never be the same

In [None]:
# Prepare parallel results for comparison
dfParCmpRes = parResults.dfTransData('en')

dfParCmpRes.sort_values(by='AnlysNum', inplace=True) # Back to original test case order = sequential run order

dfParCmpRes.reset_index(inplace=True, drop=True) # Enforce same index as a consequence

dfParCmpRes.fillna(-9999, inplace=True) # And get rid of the Nan pb (because NaN != Nan :-)

dfParCmpRes.drop(columns=['RunTime', 'RunFolder'], inplace=True) # Run date-time and folder can never be the same

In [None]:
assert (dfSeqCmpRes == dfParCmpRes).all().all(), 'Oh, oh, something went differently when run parallely ...'

In [None]:
results = parResults

# IV. Excel and HTML reports

In [None]:
# Select the columns for the synthesis tables of the report
synthCols = \
[
    ('sample', 'AnlysNum', 'Value'),
    ('sample', 'Species', 'Value'),
    ('sample', 'Periods', 'Value'),
    ('sample', 'Prec.', 'Value'),
    ('sample', 'Duration', 'Value'),
    
    ('model', 'Model', 'Value'),
    ('parameters', 'left truncation distance', 'Value'),
    ('parameters', 'right truncation distance', 'Value'),
    ('parameters', 'model fitting distance cut points', 'Value'),
    ('parameters', 'distance discretisation cut points', 'Value'),
    
    ('run output', 'run status', 'Value'),
    
    ('encounter rate', 'number of observations (n)', 'Value'),
    
    ('detection probability', 'Delta AIC', 'Value'),
    ('detection probability', 'AIC value', 'Value'),
    ('detection probability', 'chi-square test probability determined', 'Value'),
    ('detection probability', 'Kolmogorov-Smirnov test probability', 'Value'),
    ('density/abundance', 'density of animals', 'Cv'),
    
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Value'),
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Lcl'),
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Ucl'),
    
    ('density/abundance', 'density of animals', 'Value'),
    ('density/abundance', 'density of animals', 'Lcl'),
    ('density/abundance', 'density of animals', 'Ucl'),
    
    ('detection probability', 'probability of detection (Pw)', 'Value'),
    ('detection probability', 'probability of detection (Pw)', 'Lcl'),
    ('detection probability', 'probability of detection (Pw)', 'Ucl'),
    ('detection probability', 'probability of detection (Pw)', 'Df'),

    ('run output', 'run folder', 'Value'),
]

In [None]:
report = ads.MCDSResultsFullReport(resultsSet=results, synthCols=synthCols, title='Validation du module autods',
                                   subTitle='Rapport d\'analyse global', anlysSubTitle='Rapport détaillé',
                                   description='Qu\'ajouter de plus ?', keywords='autods, validation',
                                   lang='fr', attachedDir='.', tgtFolder=mcds.workDir, tgtPrefix='autods-validation-report')

In [None]:
htmlRep = report.toHtml()

HTML(f'Rapport HTML : <a href="{htmlRep}" target="blank">{htmlRep}</a>')

In [None]:
xlsxRep = report.toExcel()

HTML(f'Rapport Excel : <a href="{xlsxRep}" target="blank">{xlsxRep}</a>')

# V. Run and report pre-analyses

(to help users set up the full analysis plan : first run a few simple analyses, and show the PDF and a few results)

## 1. Determine samples from input data

* in real life, we'd simply load the field-collected data, and deduce individual "samples" from it ;
* but here, for testing, it's easier to deduce the samples from the manual analysis specification file.

In [None]:
# Create sample table from refout results table
refFileName = 'ACDC2019-Papyrus-ALAARV-TURMER-resultats-distance-73.xlsx'

sampleIdCols = ['Species', 'Periods', 'Prec.', 'Duration']

# Only the sample Id columns are read : 1 row per reference analysis, so possibly many per sample.
dfSamples = pd.read_excel(pl.Path('refout', refFileName), usecols=sampleIdCols)

# Enforce the order of sampleIdCols : 'usecols' only selects columns, it does not order them,
# and the rows of this table are later relabelled by position (with miCustCols, built from sampleIdCols).
dfSamples = dfSamples[sampleIdCols]

# Keep only 1 row per sample.
dfSamples.drop_duplicates(inplace=True)
dfSamples.reset_index(drop=True, inplace=True)

dfSamples.reset_index(inplace=True) # Generate sample # (later need for original sample order)
dfSamples.rename(columns=dict(index='SampleNum'), inplace=True)

sampleIdCols = ['SampleNum'] + sampleIdCols

dfSamples

## 2. Prepare analyses

In [None]:
decimalFields = ['Point transect*Survey effort', 'Observation*Radial distance']

In [None]:
# Analysis engine
mcds = ads.MCDSEngine(workDir=pl.Path('tmp', 'mcds-pout'), # Non-parallel executor here: MCDSPreAnalysis does the job !
                      distanceUnit='Meter', areaUnit='Hectare',
                      surveyType='Point', distanceType='Radial')

In [None]:
# Results object construction
custCols = [('sample', col, 'Value') for col in sampleIdCols]
miCustCols = pd.MultiIndex.from_tuples(custCols)
dfCustColTrans = \
    pd.DataFrame(index=miCustCols,
                 data=dict(en=sampleIdCols, 
                           fr=['NumEchant', 'Espèce', 'Périodes', 'Préc.', 'Durée']))

results = ads.MCDSResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans)

In [None]:
computed = False

## 3a. Or : Really run pre-analyses

In [None]:
# Frozen pre-analysis parameters
KPreEstimCrit = 'AIC'
KPreCVInterval = 95

# Model strategy : 1 model per key function, in this order, all with the same adjustment series,
# estimation criterion and CV interval.
KPreEstimModStrat = [
    {'keyFn': keyFn, 'adjSr': 'COSINE', 'estCrit': KPreEstimCrit, 'cvInt': KPreCVInterval}
    for keyFn in ('HNORMAL', 'HAZARD', 'UNIFORM', 'NEXPON')
]

In [None]:
# Run the pre-analyses, 1 per sample, one after the other, and collect their results in 'results'.
tsStart = pd.Timestamp.now()
print('Started at', tsStart)
print()

# Run all analyses
lastInFileName = None
for _, sSamp in dfSamples.iterrows():
    
    nSamp = sSamp.SampleNum
    # Sample Id, as used in the input file names.
    sampId = '{}-{}-{}mn-{}dec' \
             .format(sSamp.Species,
                     'AB' if 'A+B' in sSamp.Periods else 'A' if 'A' in sSamp.Periods else 'B',
                     sSamp.Duration.split(' ')[0], sSamp['Prec.'].split(' ')[0])
    print('#{:3d}'.format(nSamp+1), sampId)
    
    # Create data set if not already done.
    inFileName = 'ACDC2019-Papyrus-{}-dist.txt'.format(sampId)
    if lastInFileName != inFileName:
        sds = ads.SampleDataSet(pl.Path('refin', inFileName), decimalFields=decimalFields)
        lastInFileName = inFileName
        
    # Run analysis
    # (on the 'sds' data set created just above : it was 'ds' before, a name defined nowhere in this notebook)
    preAnlys = ads.MCDSPreAnalysis(engine=mcds, sampleDataSet=sds, name=sampId, executor=None, # Not parallel run for now.
                                   logData=False, modelStrategy=KPreEstimModStrat)
    preAnlys.run()
    
    # Get results (wait for it's finished)
    sResult = preAnlys.getResults()

    # Save results
    # (the sample row, relabelled by position with the custom columns, is the results header)
    sResHead = sSamp.copy()
    sResHead.index = miCustCols
    results.append(sResult, sCustomHead=sResHead)
    
# shutdown analysis engine
mcds.shutdown()

# Done.
tsEnd = pd.Timestamp.now()
print('Finished at', tsEnd, ': duration', str(tsEnd - tsStart).replace('0 days ', ''))

# Results are now available in memory : the reload cell below will skip loading them from file.
computed = True

In [None]:
# Look at results
results.dfTransData('fr')[['NumEchant', 'Espèce', 'Périodes', 'Préc.', 'Durée', 'Fn Clé',
                           'Sér Ajust', 'CodEx', 'NObs', 'AIC', 'Chi2 P', 'KS P', 
                           'Densité', 'CoefVar Densité', 'Min Densité', 'Max Densité']]

In [None]:
# Save results in case need for not recomputing them
resFileName = os.path.join(mcds.workDir, 'autods-validation-preresults.xlsx')

results.toExcel(resFileName, sheetName='AutoDSVal')

## 3b. Or : Load analyses from a previous run

(already run and saved above)

In [None]:
if not computed:
    
    resFileName = os.path.join(mcds.workDir, 'autods-validation-preresults.xlsx')
    print('Loading pre-results from {} ...'.format(resFileName))

    results.fromExcel(resFileName, sheetName='AutoDSVal')
    
else:
    
    print('Just computed, not reloading ...')
    
print('... {} pre-analyses loaded'.format(len(results)))

In [None]:
# Look at results
results.dfTransData('fr')[['NumEchant', 'Espèce', 'Périodes', 'Préc.', 'Durée', 'Fn Clé',
                           'Sér Ajust', 'CodEx', 'NObs', 'AIC', 'Chi2 P', 'KS P', 
                           'Densité', 'CoefVar Densité', 'Min Densité', 'Max Densité']]

## 4. Generate HTML and Excel reports

In [None]:
# Select the columns for the synthesis tables of the report
synthCols = \
[
    ('sample', 'SampleNum', 'Value'),
    ('sample', 'Species', 'Value'),
    ('sample', 'Periods', 'Value'),
    ('sample', 'Prec.', 'Value'),
    ('sample', 'Duration', 'Value'),
    
    ('parameters', 'estimator key function', 'Value'),
    ('parameters', 'estimator adjustment series', 'Value'),
    ('parameters', 'CV interval', 'Value'),
    ('parameters', 'left truncation distance', 'Value'),
    ('parameters', 'right truncation distance', 'Value'),
    ('parameters', 'model fitting distance cut points', 'Value'),
    ('parameters', 'distance discretisation cut points', 'Value'),
    
    ('run output', 'run status', 'Value'),
    
    ('encounter rate', 'number of observations (n)', 'Value'),
    
    ('detection probability', 'AIC value', 'Value'),
    ('detection probability', 'chi-square test probability determined', 'Value'),
    ('detection probability', 'Kolmogorov-Smirnov test probability', 'Value'),
    ('density/abundance', 'density of animals', 'Cv'),
    
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Value'),
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Lcl'),
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Ucl'),
    
    ('density/abundance', 'density of animals', 'Value'),
    ('density/abundance', 'density of animals', 'Lcl'),
    ('density/abundance', 'density of animals', 'Ucl'),
    
    ('detection probability', 'probability of detection (Pw)', 'Value'),
    ('detection probability', 'probability of detection (Pw)', 'Lcl'),
    ('detection probability', 'probability of detection (Pw)', 'Ucl'),
    ('detection probability', 'probability of detection (Pw)', 'Df'),

    ('run output', 'run folder', 'Value'),
]

In [None]:
# Select analysis results columns for the 3 textual columns of the synthesis pre-report
# (3-level column labels; passed as sampleCols, paramCols and resultCols when building the report).

# 1. Columns describing the sample
sampleCols = \
[
    ('sample', 'SampleNum', 'Value'),
    ('sample', 'Species', 'Value'),
    ('sample', 'Periods', 'Value'),
    ('sample', 'Prec.', 'Value'),
    ('sample', 'Duration', 'Value')
]

# 2. Columns describing the analysis parameters
paramCols = \
[
    ('parameters', 'estimator key function', 'Value'),
    ('parameters', 'estimator adjustment series', 'Value'),
    ('parameters', 'CV interval', 'Value')
]
    
# 3. Columns giving the main results
resultCols = \
[
    ('run output', 'run status', 'Value'),
    
    ('encounter rate', 'number of observations (n)', 'Value'),
    
    ('detection probability', 'AIC value', 'Value'),
    ('detection probability', 'chi-square test probability determined', 'Value'),
    ('detection probability', 'Kolmogorov-Smirnov test probability', 'Value'),
    ('detection probability', 'probability of detection (Pw)', 'Value'),
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Value'),

    ('density/abundance', 'density of animals', 'Cv'),
    ('density/abundance', 'density of animals', 'Value'),
    ('density/abundance', 'density of animals', 'Lcl'),
    ('density/abundance', 'density of animals', 'Ucl'),
]

In [None]:
# Build the pre-report object from the sequential results set: French titles and language,
# column selections defined in the cells above, target folder = the engine work folder.
report = ads.MCDSResultsPreReport(resultsSet=results,
                                  title='Validation du module autods', subTitle='Rapport de pré-analyse',
                                  anlysSubTitle='Détail des pré-analyses', description='Qu\'ajouter de plus ?',
                                  keywords='autods, validation', plotsHeight=384, lang='fr',
                                  sampleCols=sampleCols, paramCols=paramCols, resultCols=resultCols, anlysSynthCols=synthCols,
                                  attachedDir='.', tgtFolder=mcds.workDir, tgtPrefix='autods-validation-prereport')

In [None]:
# Generate the HTML pre-report; the value returned by toHtml() is used below as the link
# target and text (presumably the path of the generated file -- to be confirmed).
htmlRep = report.toHtml()

# Show a clickable link to it. target="_blank" (with the leading underscore) is the HTML keyword
# for "open in a new tab / window"; a bare "blank" only names a browsing context,
# that every link using the same name would then re-use.
HTML(f'Pré-rapport HTML : <a href="{htmlRep}" target="_blank">{htmlRep}</a>')

In [None]:
# Generate the Excel report; the value returned by toExcel() is used below as the link
# target and text (presumably the path of the generated file -- to be confirmed).
xlsxRep = report.toExcel()

# Show a clickable link to it. target="_blank" (with the leading underscore) is the HTML keyword
# for "open in a new tab / window"; a bare "blank" only names a browsing context,
# that every link using the same name would then re-use.
HTML(f'Rapport Excel : <a href="{xlsxRep}" target="_blank">{xlsxRep}</a>')

# VI. Parallel run of same pre-analyses

And compare the results to those of the sequential run.

## 1. Prepare analyses

In [None]:
# Analysis engine, with its own work folder for this parallel run.
# No executor is specified here: the parallel one is given to each MCDSPreAnalysis instead
# (see the executor= argument in the run cell below).
mcds = ads.MCDSEngine(workDir=pl.Path('tmp', 'mcds-ppout'),
                      distanceUnit='Meter', areaUnit='Hectare',
                      surveyType='Point', distanceType='Radial')

In [None]:
# Results object construction
parResults = ads.MCDSResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans)

In [None]:
computed = False

## 2A. Or : Really run pre-analyses

In [None]:
# Pre-analysis executor (kind of overkill here, with only 5 pre-analyses ... but still works twice as rapidly !).
parallelExecutor = ads.Executor(parallel=True, threads=6)

In [None]:
tsStart = pd.Timestamp.now()
print('Started at', tsStart)
print()

# Start one pre-analysis per sample, and remember it through its "future" object.
lastInFileName = None
preAnalyses = dict()
for _, sSamp in dfSamples.iterrows():
    
    # Build the sample identifier from the sample description columns.
    nSamp = sSamp.SampleNum
    sampId = '{}-{}-{}mn-{}dec' \
             .format(sSamp.Species,
                     'AB' if 'A+B' in sSamp.Periods else 'A' if 'A' in sSamp.Periods else 'B',
                     sSamp.Duration.split(' ')[0], sSamp['Prec.'].split(' ')[0])
    print('#{:3d}'.format(nSamp+1), sampId)
    
    # Create the sample data set, unless the input file is the same as for the previous sample.
    inFileName = 'ACDC2019-Papyrus-{}-dist.txt'.format(sampId)
    if lastInFileName != inFileName:
        sds = ads.SampleDataSet(pl.Path('refin', inFileName), decimalFields=decimalFields)
        lastInFileName = inFileName
        
    # Start running the pre-analysis (without waiting for it to finish),
    # with the sample description, re-indexed by the custom columns, as its custom data.
    sResHead = sSamp.copy()
    sResHead.index = miCustCols
    
    preAnlys = ads.MCDSPreAnalysis(engine=mcds, sampleDataSet=sds, name=sampId,
                                   customData=sResHead, executor=parallelExecutor,
                                   logData=False, modelStrategy=KPreEstimModStrat)
    preAnlysFut = preAnlys.run()
    
    # Store the pre-analysis object and its associated "future" for later use
    # (it should be running sooner or later).
    preAnalyses[preAnlysFut] = preAnlys
    
print('All pre-analyses started ; now waiting for their end, and results ...')

# For each pre-analysis as it gets completed (first completed => first yielded)
for preAnlysFut in parallelExecutor.asCompleted(preAnalyses):

    # Retrieve the pre-analysis object from its associated future object
    preAnlys = preAnalyses[preAnlysFut]
    
    # Get the pre-analysis results
    sResult = preAnlys.getResults()

    # Save the results, with the custom data of the pre-analysis as their header
    parResults.append(sResult, sCustomHead=preAnlys.customData)
    
# Shutdown the executor
parallelExecutor.shutdown()

# Shutdown the analysis engine
mcds.shutdown()

# Done.
tsEnd = pd.Timestamp.now()
print('Finished at', tsEnd, ': duration', str(tsEnd - tsStart).replace('0 days ', ''))

# Remember that the results were computed in this session (no need to reload them).
computed = True

In [None]:
# Look at the main columns of the parallel run results
# (the table returned by dfTransData('fr') is indexed here with French column names).
parResults.dfTransData('fr')[['NumEchant', 'Espèce', 'Périodes', 'Préc.', 'Durée', 'Fn Clé',
                              'Sér Ajust', 'CodEx', 'NObs', 'AIC', 'Chi2 P', 'KS P', 
                              'Densité', 'CoefVar Densité', 'Min Densité', 'Max Densité']]

In [None]:
# Save the parallel run results to an Excel workbook in the engine work folder,
# so that they can be reloaded later instead of being recomputed.
resFileName = os.path.join(mcds.workDir, 'autods-validation-preresults-par.xlsx')

parResults.toExcel(resFileName, sheetName='AutoDSVal')

## 2B. Or : Load pre-analyses from a previous run

(already run and saved above)

In [None]:
# Reload the saved parallel pre-results, unless they have just been computed in this session.
if computed:

    print('Just computed, not reloading ...')

else:

    resFileName = os.path.join(mcds.workDir, 'autods-validation-preresults-par.xlsx')
    print('Loading pre-results from {} ...'.format(resFileName))

    parResults.fromExcel(resFileName, sheetName='AutoDSVal')

# In both cases, tell how many pre-analyses the results set now holds.
print('... {} pre-analyses loaded'.format(len(parResults)))

In [None]:
# Look at the main columns of the (computed or reloaded) parallel run results
# (the table returned by dfTransData('fr') is indexed here with French column names).
parResults.dfTransData('fr')[['NumEchant', 'Espèce', 'Périodes', 'Préc.', 'Durée', 'Fn Clé',
                              'Sér Ajust', 'CodEx', 'NObs', 'AIC', 'Chi2 P', 'KS P', 
                              'Densité', 'CoefVar Densité', 'Min Densité', 'Max Densité']]

## 3. Compare parallel results to sequential ones

In [None]:
# Prepare sequential results for comparison
dfSeqCmpRes = results.dfTransData('en')

dfSeqCmpRes.fillna(-9999, inplace=True) # Replace NaNs by a sentinel value (as NaN != NaN, they would always compare as different)

dfSeqCmpRes.drop(columns=['RunTime', 'RunFolder'], inplace=True) # Run date-time and folder can never be the same

In [None]:
# Prepare parallel results for comparison
dfParCmpRes = parResults.dfTransData('en')

dfParCmpRes.sort_values(by='SampleNum', inplace=True) # Back to original test case order = sequential run order

dfParCmpRes.reset_index(inplace=True, drop=True) # Enforce same index as a consequence

dfParCmpRes.fillna(-9999, inplace=True) # Replace NaNs by a sentinel value (as NaN != NaN, they would always compare as different)

dfParCmpRes.drop(columns=['RunTime', 'RunFolder'], inplace=True) # Run date-time and folder can never be the same

In [None]:
# Element-wise comparison of the 2 prepared tables: every single cell must be equal.
assert (dfSeqCmpRes == dfParCmpRes).all().all(), 'Oh, oh, something went differently when run parallely ...'

# VII. Run analyses with real life field data (2/2 : the short and fast way)

Thanks to the MCDSAnalyser class.

In [None]:
# Column names and settings shared by the cells of this section.

# Columns and fields given to ads.MCDSAnalyser below (through the same-named arguments)
transectPlaceCols = ['Point']
passIdCol = 'Passage'
effortCol = 'Effort'

sampleDecFields = [effortCol, 'Distance']

# Sample number column, and columns whose distinct value combinations define the samples
sampleCol = 'Echant'
sampleSelCols = ['Espèce', passIdCol, 'Adulte', 'Durée']

# Analysis number and analysis abbreviation columns
varIndCol = 'IndAnlys'
anlysAbbrevCol = 'AbrevAnlys'

# Survey area description, given to ads.MCDSAnalyser below (dSurveyArea argument)
dSurveyArea = dict(Zone='ACDC', Surface='2400')

## 1. Individuals data set

In [None]:
# Load the 'DonnéesIndiv' sheet of the input workbook as a table, and show its number of rows.
dfObsIndiv = ads.DataSet('refin/ACDC2019-Naturalist-ExtraitObsIndiv.ods', sheet='DonnéesIndiv').dfData
len(dfObsIndiv)

In [None]:
dfObsIndiv.head()

In [None]:
{ col: dfObsIndiv[col].unique() for col in ['Observateur', 'Point', 'Passage', 'Adulte', 'Durée', 'Espèce'] }

## 2. Actual transects

(can't deduce them from data, some points are missing because of data selection)

In [None]:
# Load the 'Inventaires' sheet of the same input workbook as a table, and show its number of rows.
dfTransects = ads.DataSet('refin/ACDC2019-Naturalist-ExtraitObsIndiv.ods', sheet='Inventaires').dfData
len(dfTransects)

In [None]:
dfTransects

## 3. Analyses specs

In [None]:
def analysisAbbrev(sAnlys):
    """Build the short identifier of an analysis from its specification row.

    The identifier joins, with '-' :
    * the species abbreviation (first 4 letters, title-cased, of the first 2 words of 'Espèce'),
    * the 'Passage' and 'Adulte' values without their '+' signs, then the 'Durée' value,
    * the first 3 letters, lower-cased, of 'FonctionClé' and of 'SérieAjust',
    * 'g<n>', 'd<n>' and 't<n>' for the 'TrGche', 'TrDrte' and 'NbTrches' values,
      each one only when not null (<n> being the value converted to int).

    Parameters:
    :param sAnlys: analysis specification, giving access to the fields above
                   by key and, for 'Passage' and 'Adulte', by attribute (e.g. a pd.Series)

    :return: the abbreviation string
    """

    # Sample part
    speciesWords = sAnlys['Espèce'].split(' ')[:2]
    abbrevParts = [str(part) for part in (''.join(word[:4].title() for word in speciesWords),
                                           sAnlys.Passage.replace('+', ''),
                                           sAnlys.Adulte.replace('+', ''),
                                           sAnlys['Durée'])]

    # Model part
    abbrevParts.append(sAnlys['FonctionClé'][:3].lower())
    abbrevParts.append(sAnlys['SérieAjust'][:3].lower())

    # Optional parameters part : left and right truncation distances, then number of cut points
    for colName, partFormat in (('TrGche', 'g{}'), ('TrDrte', 'd{}'), ('NbTrches', 't{}')):
        value = sAnlys[colName]
        if pd.notnull(value):
            abbrevParts.append(partFormat.format(int(value)))

    return '-'.join(abbrevParts)

# Load the explicit analysis specs from the specs workbook, with the analysis abbreviation
# as a computed column (see analysisAbbrev above), then show the number of analyses.
dfAnlysSpecs = ads.DSAnalyser.explicitVariantSpecs('refin/ACDC2019-Naturalist-ExtraitSpecsAnalyses.xlsx', 
                                                   varIndCol=varIndCol,
                                                   #convertCols={ 'Durée': int }, # float 'cause of Excel
                                                   computedCols={ anlysAbbrevCol: analysisAbbrev })

len(dfAnlysSpecs)

In [None]:
# Add the sample number to the analysis specs, as the second column: one number per distinct
# combination of the sample selection columns, in order of first appearance (sort=False).
# The column is named through sampleCol (rather than a repeated 'Echant' literal), so that
# it stays in sync with the sample column declared to the analyser as a results head column.
dfAnlysSpecs.insert(1, column=sampleCol, value=dfAnlysSpecs.groupby(sampleSelCols, sort=False).ngroup())

In [None]:
# For faster debugging : reduce work.
#dfAnlysSpecs = dfAnlysSpecs[(dfAnlysSpecs.Passage == 'a+b') & (dfAnlysSpecs.Adulte == 'm') \
#                            & (dfAnlysSpecs['Durée'] == '10mn') \
#                            & ((dfAnlysSpecs.TrGche.isnull()) | (dfAnlysSpecs.TrGche < 20)) \
#                            & ((dfAnlysSpecs.TrDrte.isnull()) | (dfAnlysSpecs.TrDrte <= 500))]
#len(dfAnlysSpecs)

In [None]:
dfAnlysSpecs

In [None]:
computed = False

In [None]:
workDir = 'tmp/mcds-anlr'

## 4A. Or : Really run analyses

### a. MCDS Analyser object

In [None]:
# Analyser object, built from the individuals data set, the transects table and the survey area,
# with the column names defined at the beginning of this section; resultsHeadCols lists
# the columns to put before, as, and after the sample columns in the results head.
anlysr = ads.MCDSAnalyser(dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea, abbrevCol=anlysAbbrevCol,
                          resultsHeadCols=dict(before=[varIndCol, sampleCol], sample=sampleSelCols,
                                               after=[anlysAbbrevCol]),
                          transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                          sampleDecFields=sampleDecFields,
                          workDir=workDir)

### b. Run analyses

In [None]:
# Columns of the analysis specs table (dfAnlysSpecs, given to anlysr.run below),
# or constant values, giving the analysis parameters.
KEstimCrit = 'AIC'
KCVInterval = 95

dAnlysParamsSpecs = dict(estimKeyFn='FonctionClé', estimAdjustFn='SérieAjust',
                         estimCriterion=KEstimCrit, cvInterval=KCVInterval,
                         minDist='TrGche', maxDist='TrDrte', fitDistCuts='NbTrches')

In [None]:
%%time

# Run the analyses specified in dfAnlysSpecs, with the parameter specs above (threads=6),
# and remember that the results were computed in this session (no need to reload them).
results = anlysr.run(dfAnlysSpecs, dAnlysParamsSpecs, threads=6)

computed = True

In [None]:
anlysr.shutdown()

In [None]:
results.dfTransData('fr')

In [None]:
results.dfData

### c. Save results for later reload or examination

In [None]:
results.toExcel(pl.Path(workDir) / 'valtests-mcds-anlyser-results.xlsx')

In [None]:
#results.toExcel(pl.Path(workDir) / 'valtests-mcds-anlyser-results-fr.xlsx', lang='fr')

## 4B. Or : Load analyses from a previous run

(already run and saved above)

In [None]:
# Rebuild an empty results set and load the saved results into it,
# unless the results have just been computed in this session.
if not computed:
    
    # Results object
    # a. Sample columns (for sample data set extractions + absence sightings generation)
    sampleCols = ['Echant'] + sampleSelCols

    # b. Sample multi-index columns (for deltaAIC computation)
    _sampMCols = [('sample', col, 'Value') for col in sampleCols]
    miSampCols = pd.MultiIndex.from_tuples(_sampMCols)

    # c. Full custom multi-index columns to prepend to raw analysis results
    _beforeCols = [varIndCol]
    _custMCols = [('sample', col, 'Value') for col in _beforeCols]

    _custMCols += _sampMCols

    _afterCols = [anlysAbbrevCol]
    _custMCols += [('more', col, 'Value') for col in _afterCols]

    custCols = _beforeCols + sampleCols + _afterCols
    miCustCols = pd.MultiIndex.from_tuples(_custMCols)

    # d. Translation for it (French names = the custom column names themselves)
    dfCustColTrans = \
        pd.DataFrame(index=miCustCols,
                     data=dict(en=['AnlysNum', 'Sample', 'Species', 'Passing', 'Adult', 'Duration', 'AnlysAbrev'], 
                               fr=custCols))

    # e. And finally, the result object
    results = ads.MCDSResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans, miSampleCols=miSampCols)

    # Loading
    resFileName = pl.Path(workDir) / 'valtests-mcds-anlyser-results.xlsx'
    print('Loading results from {} ...'.format(resFileName))

    results.fromExcel(resFileName)
    
else:
    
    print('Just computed, not reloading ...')
    
# In both cases, tell how many analyses the results set holds.
print('... {} analyses to compare'.format(len(results)))

## 5. Compare results to reference

(reference generated with the same kind of "long" code as in III above, but on another data set)

In [None]:
# Load reference
# 1. Clone results _without_ data.
rsRef = results.copy(withData=False)

# 2. Load it with reference data
rsRef.fromOpenDoc('refout/ACDC2019-Naturalist-ExtraitResultats.ods')

rsRef.dfData

In [None]:
# Compare.
# a. Columns identifying an analysis in both results sets: custom head columns + model parameters.
indexCols = results.miCustomCols.to_list() + [('parameters', 'estimator key function', 'Value'),
                                              ('parameters', 'estimator adjustment series', 'Value'),
                                              ('parameters', 'left truncation distance', 'Value'),
                                              ('parameters', 'right truncation distance', 'Value'),
                                              ('parameters', 'model fitting distance cut points', 'Value')]

# b. Columns left out of the comparison: the index columns above, and the ones listed here.
#    This list is built once, outside of the comprehension below, rather than for every column.
_notComparedCols = indexCols + [('parameters', 'estimator selection criterion', 'Value'),
                                ('parameters', 'CV interval', 'Value'),
                                ('run output', 'run time', 'Value'),
                                ('run output', 'run folder', 'Value'),
                                ('detection probability', 'key function type', 'Value'),
                                ('detection probability', 'adjustment series type', 'Value')]

# c. Columns to compare: all the other ones.
subsetCols = [col for col in results.dfData.columns.to_list() if col not in _notComparedCols]

dfDiff = rsRef.compare(results, indexCols=indexCols, subsetCols=subsetCols, dropCloser=15, dropNans=True)

# No difference expected; if any, say how many rows of differences were found (see dfDiff for details).
assert dfDiff.empty, '{} row(s) of differences with the reference results'.format(len(dfDiff))

print('Yessssss !')

# Development

# Sandbox