<!-- Auto table of contents -->
<h1 class='tocIgnore'>Validation tests</h1>

**pyaudisam**: Automation of Distance Sampling analyses with [Distance software](http://distancesampling.org/)

Copyright (C) 2021 Jean-Philippe Meuret

This program is free software: you can redistribute it and/or modify it under the terms
of the GNU General Public License as published by the Free Software Foundation,
either version 3 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program.
If not, see https://www.gnu.org/licenses/.

<div style="overflow-y: auto">
  <h2 class='tocIgnore'>Table of contents</h2>
  <div id="toc"></div>
</div>

In [None]:
%%javascript
// Load and run the local 'ipython_notebook_toc.js' script (presumably the one filling the "toc" <div> above).
$.getScript('ipython_notebook_toc.js')

In [None]:
%matplotlib inline

In [None]:
import sys
import os
import shutil
import pathlib as pl
import importlib as implib

import re

from collections import OrderedDict as odict, namedtuple as ntuple

import math
import numpy as np
import pandas as pd

from IPython.display import HTML, Markdown

In [None]:
# Put the parent folder first in the module search path
# (presumably so that the local pyaudisam source tree gets imported, rather than an installed one).
sys.path.insert(0, '..')

In [None]:
import pyaudisam as ads

# Display the runtime information exposed by the package.
ads.runtime

In [None]:
# Create the temporary directory if not yet done
# (relative to the current working folder ; all the outputs of this notebook go below it).
tmpDir = pl.Path('tmp')
tmpDir.mkdir(exist_ok=True)

In [None]:
# Logging configuration: messages go to the notebook output (stdout) and to a log file in the temporary folder,
# with one specific level for each of the listed loggers.
ads.log.configure(handlers=[sys.stdout, tmpDir / 'valtst.log'], reset=True,
                  loggers=[dict(name='matplotlib', level=ads.WARNING),
                           dict(name='ads', level=ads.INFO),
                           #dict(name='ads.dat', level=ads.INFO2),
                           #dict(name='ads.eng', level=ads.INFO2),
                           dict(name='ads.anr', level=ads.INFO2),
                           dict(name='ads.onr', level=ads.INFO2),
                           dict(name='ads.rep', level=ads.INFO1),
                           dict(name='valtst', level=ads.DEBUG)])

# Logger used by the cells of this notebook.
logger = ads.logger('valtst')

In [None]:
# Activate Warnings as Exceptions
# (disabled by default: replace False by True below to switch it on, for debugging purposes).
if False:
    
    import warnings

    # Turn every warning into an exception ...
    warnings.filterwarnings(action='error')

    # ... except for those coming from the modules below (pd.read_excel),
    # which get the 'default' action back.
    warnings.filterwarnings(action='default', module='etree')
    warnings.filterwarnings(action='default', module='xlrd')
    warnings.filterwarnings(action='default', module='defusedxml')

In [None]:
def backup(fpn, to='.', tsFmt='.%y%m%d'):
    """Copy a file to a target folder, with a formatted timestamp inserted in its name.

    Parameters:
    :param fpn: path-name of the file to backup (str or path object)
    :param to: target folder ; '.' (default) means the folder of the source file itself
    :param tsFmt: strftime format of the timestamp (current time) inserted between
                  the stem and the suffix of the source file name

    :return: path-name of the backup copy (as a pathlib path)
    """
    srcPath = pl.Path(fpn)

    # Target name = source stem + timestamp + source suffix.
    stamp = pd.Timestamp.now().strftime(tsFmt)
    tgtDir = srcPath.parent if to == '.' else pl.Path(to)
    tgtPath = tgtDir / f'{srcPath.stem}{stamp}{srcPath.suffix}'

    print('Backingup to', tgtPath.as_posix())
    shutil.copy(srcPath, tgtPath)

    return tgtPath

Jump to :
* [II. Run and report pre-analyses / 0. Data Description](#II.-Run-and-report-pre-analyses)
* [III. Run analyses with same real life field data / 0. Data Description](#III.-Run-analyses-with-same-real-life-field-data)
* [IV. Run truncation opt-analyses with same real life field data](#IV.-Run-truncation-opt-analyses-with-same-real-life-field-data)

# I. Generate input files for manual analyses in Distance interactive software

* through an Excel input field data file,
* and a reference output file set, proved as OK by using it in Distance software ;
* automated comparison to reference is achieved at the end.

In [None]:
# Test cases: same input workbook and decimal fields for each, only the reference output file
# and the "with extra fields" flag change from one case to the other.
dfDistCases = pd.DataFrame([dict(inFileName='ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.xlsx',
                                 decimalFields=['EFFORT', 'DISTANCE', 'NOMBRE'],
                                 refOutFileName=refOutFileName, withExtraFields=withExtraFields)
                            for refOutFileName, withExtraFields
                            in [('ACDC2019-Papyrus-ALAARV-saisie-5-cols.txt', False),
                                ('ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.txt', True)]])
dfDistCases

In [None]:
# MCDS engine working in a dedicated sub-folder of the temporary folder.
eng = ads.MCDSEngine(workDir=tmpDir / 'mcds-out')

# Target sub-folder for the Distance import files to generate.
pl.Path(eng.workDir, 'distance-in').mkdir(exist_ok=True)

In [None]:
# For each test case: generate the Distance import file, and compare it to its reference file.
fails = 0
for ind, sCase in dfDistCases.iterrows():

    print('#', ind, ':', sCase.inFileName)

    # Data set to export, from the input workbook of the case.
    sds = ads.SampleDataSet(source=pl.Path('refin', sCase.inFileName),
                            decimalFields=sCase.decimalFields)

    # Distance import file, generated in the 'distance-in' sub-folder of the engine work folder,
    # under the same name as the reference file.
    ofn = pl.Path(eng.workDir, 'distance-in', sCase.refOutFileName)
    ofn = eng.buildDistanceDataFile(sds, tgtFilePathName=ofn, withExtraFields=sCase.withExtraFields)

    # Whole-content text comparison of the generated file to the reference one.
    rfn = pl.Path('refout', sCase.refOutFileName)
    with open(ofn, 'r') as fOut, open(rfn, 'r') as fRef:
        conform = fOut.read() == fRef.read()

    if not conform:
        print('Error: Generated file differs from reference', rfn)
        fails += 1
    else:
        print('Success : Conform to reference.')

    print()

# Final verdict.
if fails == 0:
    print('All test cases succeeded !')
else:
    print('Error: {} test case(s) failed.'.format(fails))

In [None]:
# Done with the engine: shut it down.
eng.shutdown()

# II. Run and report pre-analyses

Thanks to MCDSPreAnalyser and MCDSPreReport.

Short code, fast (parallel) run.

Note: 2 modes here, with explicit or implicit sample specification (manual switch).

Note: The exact same results (implicit mode) and reports can be produced through the command line :
```
$ cd ..
$ python -m pyaudisam -p tests/valtests-ds-params.py -w tests/tmp/mcds-preanlr -n --preanalyses --prereports excel,html -u
```

## 0. Data description

In [None]:
# Short string for sample "identification"
def sampleAbbrev(sSample):
    """Build the abbreviation of a sample from its 'Espèce', 'Passage', 'Adulte' and 'Durée' fields.

    Parameters:
    :param sSample: sample description (pandas Series, or any object giving access to the 4 fields
                    the same way) ; 'Espèce', 'Passage' and 'Adulte' must be strings

    :return: '<species>-<pass>-<adult>-<duration>' string, where <species> joins the title-cased
             first 4 letters of the first 2 words of the species name, and where <pass> and <adult>
             are the field values with any '+' removed
    """
    genusAndSpecies = sSample['Espèce'].split(' ')[:2]
    species = ''.join(word[:4].title() for word in genusAndSpecies)

    passage = sSample.Passage.replace('+', '')
    adult = sSample.Adulte.replace('+', '')

    return f"{species}-{passage}-{adult}-{sSample['Durée']}"

In [None]:
# Names of the field data columns describing the survey: transect places, pass and effort.
transectPlaceCols, passIdCol, effortCol = ['Point'], 'Passage', 'Effort'

# Sample-related columns: decimal ones, sample number, and the ones selecting a sample.
sampleDecCols = [effortCol] + ['Distance']
sampleNumCol = 'NumEchant'
sampleSelCols = ['Espèce'] + [passIdCol] + ['Adulte', 'Durée']

# Abbreviation columns (sample and species).
sampleAbbrevCol, speciesAbbrevCol = 'AbrevEchant', 'AbrevEsp'

# Survey area description.
dSurveyArea = {'Zone': 'ACDC', 'Surface': '2400'}

Jump to [III. Run analyses with same real life field data / 0. Data Description](#III.-Run-analyses-with-same-real-life-field-data)

## 1. Individuals data set

In [None]:
# Load the individuals data from the 'DonnéesIndiv' sheet of the reference input workbook.
dfObsIndiv = ads.DataSet('refin/ACDC2019-Naturalist-ExtraitObsIndiv.ods', sheet='DonnéesIndiv').dfData
dfObsIndiv

In [None]:
# Quick look at the data: distinct values found in each of the main descriptive columns.
{ col: dfObsIndiv[col].unique() for col in ['Observateur', 'Point', 'Passage', 'Adulte', 'Durée', 'Espèce'] }

## 2. Actual transects

(can't deduce them from data, some points are missing because of data selection)

In [None]:
# Load the transects from the 'Inventaires' sheet of the same reference input workbook.
dfTransects = ads.DataSet('refin/ACDC2019-Naturalist-ExtraitObsIndiv.ods', sheet='Inventaires').dfData
dfTransects

## 3. Samples to pre-analyse

In [None]:
# Implicit variants
varEspeces = ['Sylvia atricapilla', 'Turdus merula', 'Luscinia megarhynchos'] # 1 species variant ... per species <8-]

varPassages = ['b', 'a+b'] # Pass b, or a+b => 2 variants
varAdultes = ['m'] # Males only => 1 single variant
varDurees = ['5mn', '10mn'] # First 5 mn, or the whole 10 mn => 2 variants

In [None]:
# Explicitation of variants or not
# a. Implicit specs
dImplSampleSpecs = { 'Espèce': varEspeces, 'Passage': varPassages, 'Adulte': varAdultes, 'Durée':   varDurees }

specsAreExplicit = False  # Manually switch for testing explicit mode !
if specsAreExplicit:
    
    # b. Explicit combinations
    dfExplSampleSpecs = ads.Analyser.explicitVariantSpecs(dict(_impl=dImplSampleSpecs))
    #dfExplSampleSpecs = ads.Analyser.explicitPartialVariantSpecs(dImplSampleSpecs) # Just the same, but less generic.

    # c. Add sample order columns (useful for reports, as pre-analyses are run in parallel !).
    #dfExplSampleSpecs.reset_index(drop=False, inplace=True)
    #dfExplSampleSpecs.rename(columns=dict(index=sampleNumCol), inplace=True)

    # d. Add sample abbreviation column (mainly for analysis traces)
    #dfExplSampleSpecs[sampleAbbrevCol] = dfExplSampleSpecs.apply(sampleAbbrev, axis='columns')

    # e. Add neutral and pass-through column (from sample specs to results):
    #    first 4 letters of each word of the species name, joined together.
    dfExplSampleSpecs[speciesAbbrevCol] = dfExplSampleSpecs['Espèce'].apply(lambda s: ''.join(m[:4] for m in s.split()))
    
    print(dfExplSampleSpecs)
    
else:
    
    # b. Keep the specs implicit : the run will make them explicit automatically
    #    (note: dfExplSampleSpecs is not defined in this mode, and implSampleSpecs is not in the other one).
    implSampleSpecs = dict(_impl=dImplSampleSpecs)
    
    print(implSampleSpecs)    

In [None]:
# Flag: True once the pre-analyses have actually been run in this session (see 4A),
# which makes 4B skip reloading the results from file.
computed = False

In [None]:
# Work folder of the pre-analyses (the same one as in the command lines given in this section).
workDir = tmpDir / 'mcds-preanlr'

## 4A. Or : Really run pre-analyses

Note: The exact same results (implicit mode) can be produced through the command line :
```
$ cd ..
$ python -m pyaudisam -p tests/valtests-ds-params.py -w tests/tmp/mcds-preanlr -n --preanalyses -u
```

### a. MCDSPreAnalyser object

In [None]:
# Pre-analyser on the individuals data and transects loaded above: point survey with radial distances,
# no clustering, distances in meters and areas in hectares.
# The species abbreviation column only gets into the results head columns in explicit specs mode,
# the only mode where it is added to the sample specs.
preAnlysr = \
    ads.MCDSPreAnalyser(dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea,
                        transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                        sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols, sampleSpecCustCols=[speciesAbbrevCol],
                        abbrevCol=sampleAbbrevCol, abbrevBuilder=sampleAbbrev, sampleIndCol=sampleNumCol,
                        distanceUnit='Meter', areaUnit='Hectare',
                        surveyType='Point', distanceType='Radial', clustering=False,
                        resultsHeadCols=dict(before=[sampleNumCol], sample=sampleSelCols,
                                             after=([speciesAbbrevCol] if specsAreExplicit else []) + [sampleAbbrevCol]),
                        workDir=workDir, logProgressEvery=5)

In [None]:
# Non-regression check: expected number of items in the pre-analyser specs.
assert len(preAnlysr.specs) == 17

preAnlysr.specs

### b. Check pre-analyses specs

In [None]:
# Make the sample specs explicit (from the explicit or implicit ones, according to the selected mode),
# drop duplicates and check them.
dfExplSampleSpecs, userParamSpecCols, intParamSpecCols, unmUserParamSpecCols, verdict, reasons = \
    preAnlysr.explicitParamSpecs(dfExplParamSpecs=dfExplSampleSpecs if specsAreExplicit else None,
                                 implParamSpecs=implSampleSpecs if not specsAreExplicit else None,
                                 dropDupes=True, check=True)

print(verdict, reasons, len(dfExplSampleSpecs), userParamSpecCols, intParamSpecCols, unmUserParamSpecCols)

# 12 samples expected = 3 species x 2 passes x 1 adult category x 2 durations.
assert len(dfExplSampleSpecs) == 12
assert userParamSpecCols == [] # No analysis params here (auto. generated by PreAnalyser)
assert intParamSpecCols == [] # Idem
assert unmUserParamSpecCols == []
assert verdict
assert not reasons

### (option) c. Generate input files for manual analyses with Distance GUI

(not needed for pre-analyses: here only for example)

TODO: Make this a real validation and non-regression test with comparison of output to reference, as a replacement of I. above, which is more a unit test (to be moved to unintests notebook).

Note: The exact same results can be produced through the command line:
```
$ cd ..
$ python -m pyaudisam -p tests/valtests-ds-params.py -w tests/tmp/mcds-preanlr -n --distexport -u
```

In [None]:
# Export the input data of the samples in 'Distance' format
# (from the explicit or implicit sample specs, according to the selected mode).
preAnlysr.exportDSInputData(dfExplSampleSpecs=dfExplSampleSpecs if specsAreExplicit else None,
                            implSampleSpecs=implSampleSpecs if not specsAreExplicit else None,
                            format='Distance')

### d. Run pre-analyses

In [None]:
# Model fall-down strategy: the list of (key function, adjustment series) combinations,
# all the key functions for the 1st adjustment series first, then for the 2nd one, and so on,
# always with the same estimator selection criterion and CV interval.
modelStrategy = [{'keyFn': keyFn, 'adjSr': adjSeries, 'estCrit': 'AIC', 'cvInt': 95}
                 for adjSeries in ('COSINE', 'POLY', 'HERMITE')
                 for keyFn in ('HNORMAL', 'HAZARD', 'UNIFORM', 'NEXPON')]

# Note: For real bird study analyses, you'll probably avoid NEXPON key function (model with no shoulder : g'(0) << 1).
#       And also HERMITE adjustment series (overkill fitting).

In [None]:
%%time

# Run the pre-analyses (on the explicit or implicit sample specs, according to the selected mode),
# with the model strategy defined above and 6 threads.
preResults = preAnlysr.run(dfExplSampleSpecs if specsAreExplicit else None,
                           implSampleSpecs=implSampleSpecs if not specsAreExplicit else None, 
                           dModelStrategy=modelStrategy, threads=6)

# Done with the pre-analyser.
preAnlysr.shutdown()

# Remember that results were really computed in this session (no need to reload them in 4B).
computed = True

preResults.specs

Performance figures on a Ruindows 10 laptop with PCI-e SSD, "optimal performances" power scheme, Python 3.8 :
* 4-HT-core i5-8350U:
  * 2021 (precise date ?): 50s to ~1mn10s elapsed for 12 samples, 6-12 threads
* 6-core i7-10750H (HT off):
  * 2022-01-17: 40s elapsed for 12 samples, 6-12 threads (N=4)

In [None]:
# In explicit specs mode, the pass-through species abbreviation column must show up in the results.
assert not specsAreExplicit or speciesAbbrevCol in preResults.dfTransData('fr').columns

In [None]:
# Results table as given by dfTransData for the 'fr' language code (presumably translated column names).
preResults.dfTransData('fr')

In [None]:
# Results table as given by dfData.
preResults.dfData

### e. Save results for later reload or examination

In [None]:
# Save the pre-analysis results to an Excel workbook in the work folder ...
preResFileName = workDir / 'valtests-preanalyses-results.xlsx'

preResults.toExcel(preResFileName)

# ... and keep a time-stamped copy of it aside (return value ignored).
_ = backup(preResFileName)

In [None]:
#preResults.toExcel(workDir / 'valtests-preanalyses-results-fr.xlsx', lang='fr')

## 4B. Or : Load pre-analyses from a previous run

(already run and saved above)

In [None]:
# Reload the pre-analysis results from file, unless they have just been computed in this session.
if not computed:
    
    # An analyser object knows how to build an empty results object ...
    # (same parameters as in 4A, except that no work folder and no progress logging period are given)
    preAnlysr = \
        ads.MCDSPreAnalyser(dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea,
                            transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                            sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols, sampleSpecCustCols=[speciesAbbrevCol],
                            abbrevCol=sampleAbbrevCol, abbrevBuilder=sampleAbbrev, sampleIndCol=sampleNumCol,
                            distanceUnit='Meter', areaUnit='Hectare',
                            surveyType='Point', distanceType='Radial', clustering=False,
                            resultsHeadCols=dict(before=[sampleNumCol], sample=sampleSelCols,
                                                 after=([speciesAbbrevCol] if specsAreExplicit else []) + [sampleAbbrevCol]))
    
    preResults = preAnlysr.setupResults()
    
    # Load results from file
    preResFileName = workDir / 'valtests-preanalyses-results.xlsx'
    print('Loading results from {} ...'.format(preResFileName))

    preResults.fromExcel(preResFileName)
    
else:
    
    print('Just computed, not reloading ...')
    
print('... {} analyses to compare'.format(len(preResults)))

## 5. Compare command-line to notebook pre-analysis results

In [None]:
# 1. Generate notebook results (through 4. right above)
# => preResults

# 2. Generate command-line results (through an external console with relevant python env activated)
# $ cd .. && python -m pyaudisam -p tests/valtests-ds-params.py -w tests/tmp/mcds-preanlr -n --preanalyses -u
# => file(preResFileName)

# 3. Load command line results, into an empty clone of the notebook results
#    (from the PRE-analyses results file, preResFileName: resFileName is the analyses results file
#     of section III, not even defined here when running the notebook from top to bottom).
clPreResults = preResults.copy(withData=False)
clPreResults.fromExcel(preResFileName)

# 4. Check that 2 was really run ...
#    (the latest start time of the command-line run must be over 1s later than the notebook one)
assert (clPreResults._dfData[ads.MCDSAnalysisResultsSet.CLRunStartTime].max() \
        - preResults._dfData[ads.MCDSAnalysisResultsSet.CLRunStartTime].max()).total_seconds() > 1, \
       'Please run above given command line first: you are actually comparing preResults to itself !'

# 5. Compare, sample by sample, ignoring the columns that necessarily differ from one run to another
assert preResults.dfTransData('en').drop(columns=['StartTime', 'ElapsedTime', 'RunFolder']).set_index('NumEchant') \
        .compare(clPreResults.dfTransData('en').drop(columns=['StartTime', 'ElapsedTime', 'RunFolder']).set_index('NumEchant')) \
        .empty

logger.info('Success !')

## 6. Compare results to reference

(reference generated with same kind of "long" code like in III above, but on another data set)

In [None]:
# Load reference
# 1. Clone results _without_ data (same structure as the results to check, but empty).
rsRef = preResults.copy(withData=False)

# 2. Load it with reference data (prevent re-postComputation as this ref. file is old, with now missing computed cols)
rsRef.fromOpenDoc('refout/ACDC2019-Naturalist-ExtraitPreResultats.ods', postComputed=True)  

rsRef.dfData

In [None]:
# Compare (ignore sample and analysis indexes, no use here).
# Index columns for the comparison: the custom columns of the '(sample)' kind,
# followed by the key function and adjustment series parameter columns.
indexPreCols = [col for col in preResults.miCustomCols.to_list() if '(sample)' in col[0]] \
                + [('parameters', 'estimator key function', 'Value'),
                   ('parameters', 'estimator adjustment series', 'Value')]

In [None]:
# Columns to compare: the result columns that are also present in the reference, except for
# the index columns, the custom columns other than the '(sample)' ones, and the explicitly listed ones
# (among which the run start time, elapsed time and folder).
# Note: '+' binds tighter than 'not in', so the 3 lists are concatenated before the membership test.
subsetPreCols = [col for col in preResults.dfData.columns.to_list() \
                 if col in rsRef.columns
                    and col not in indexPreCols + [col for col in preResults.miCustomCols.to_list()
                                                   if '(sample)' not in col[0]]
                                   + [('parameters', 'estimator selection criterion', 'Value'),
                                      ('parameters', 'CV interval', 'Value'),
                                      ('run output', 'start time', 'Value'),
                                      ('run output', 'elapsed time', 'Value'),
                                      ('run output', 'run folder', 'Value'),
                                      ('detection probability', 'key function type', 'Value'),
                                      ('detection probability', 'adjustment series type', 'Value'),
                                      ('detection probability', 'Delta AIC', 'Value'),
                                      ('density/abundance', 'density of animals', 'Delta Cv')]]

In [None]:
# Compare the reference to the results on the selected columns
# (NOTE(review): dropCloser=13 presumably drops the values closer than 10**-13, to be confirmed):
# no difference should remain.
dfDiff = rsRef.compare(preResults, indexCols=indexPreCols, subsetCols=subsetPreCols, dropCloser=13, dropNans=True)

assert dfDiff.empty, 'Oh oh ... some differences !'

print('Yessssss !')

In [None]:
# To be perfectly honest ... there may be some 10**-14/-16 glitches (due to worksheet I/O ?)
# Same comparison as above, but without any dropCloser argument this time ;
# only the rows with no infinite value at all are then kept for display.
dfComp = rsRef.compare(preResults, indexCols=indexPreCols, subsetCols=subsetPreCols, dropNans=True)
dfComp = dfComp[(dfComp != np.inf).all(axis='columns')]
dfComp

## 7. Generate HTML and Excel pre-analyses reports

Note: This can be also achieved through command-line:
```
$ cd .. 
$ python -m pyaudisam -p tests/valtests-ds-params.py -w tests/tmp/mcds-preanlr -n --prereports excel,html -u
```

In [None]:
# Short alias for the results object, to keep the report column lists compact.
R = preResults

In [None]:
# Super-synthesis sub-report : Selected analysis results columns for the 3 textual columns of the table
# 1. Sample description columns
samplePreRepCols = [
    ('header (head)', 'NumEchant', 'Value'),
    ('header (sample)', 'Espèce', 'Value'),
    ('header (sample)', 'Passage', 'Value'),
    ('header (sample)', 'Adulte', 'Value'),
    ('header (sample)', 'Durée', 'Value'),
    R.CLNTotObs, R.CLMinObsDist, R.CLMaxObsDist
]

# 2. Analysis parameter columns
paramPreRepCols = [
    R.CLParEstKeyFn, R.CLParEstAdjSer
    #R.CLParEstSelCrit, R.CLParEstCVInt
]
    
# 3. Analysis result columns
resultPreRepCols = [
    R.CLRunStatus,
    R.CLNObs, R.CLEffort,
    R.CLAic, R.CLChi2, R.CLKS, R.CLDCv,
    
    R.CLCmbQuaBal1, R.CLCmbQuaBal2, R.CLCmbQuaBal3,
    
    R.CLPDetec,
    R.CLEswEdr,
    R.CLDensity, R.CLDensityMin, R.CLDensityMax,
    R.CLNumber, R.CLNumberMin, R.CLNumberMax
]

In [None]:
# Synthesis sub-report : Selected analysis results columns for the synthesis table
# (sample description first, then analysis parameters, and finally results).
synthPreRepCols = [
    ('header (head)', 'NumEchant', 'Value'),
    ('header (sample)', 'Espèce', 'Value'),
    ('header (sample)', 'Passage', 'Value'),
    ('header (sample)', 'Adulte', 'Value'),
    ('header (sample)', 'Durée', 'Value'),
    R.CLParEstKeyFn,
    R.CLParEstAdjSer,
    #R.CLParEstSelCrit,
    #R.CLParEstCVInt,
    #R.CLParTruncLeft,
    #R.CLParTruncRight,
    #R.CLParModFitDistCuts,
 
    R.CLNTotObs, R.CLNObs, R.CLNTotPars, R.CLEffort, R.CLDeltaAic,
    R.CLChi2, R.CLKS, R.CLCvMUw, R.CLCvMCw, R.CLDCv,

    R.CLSightRate,
    R.CLCmbQuaBal1, R.CLCmbQuaBal2, R.CLCmbQuaBal3,
    R.CLCmbQuaChi2, R.CLCmbQuaKS, R.CLCmbQuaDCv,

    R.CLPDetec, R.CLPDetecMin, R.CLPDetecMax,
    R.CLDensity, R.CLDensityMin, R.CLDensityMax,
    R.CLNumber, R.CLNumberMin, R.CLNumberMax
]

In [None]:
# Sorting columns for all the sub-reports: by sample number, in ascending order.
sortPreRepCols = [('header (head)', 'NumEchant', 'Value')]
sortPreRepAscend = True

In [None]:
# Pre-report object for the pre-analysis results, in english, using the column selections
# and sorting options defined above ; target files go to the work folder, under the given name prefix.
preReport = ads.MCDSResultsPreReport(resultsSet=preResults,
                                     title='PyAuDiSam Validation: Pre-analyses', subTitle='Pre-analysis results report',
                                     anlysSubTitle='Pre-analysis results details',
                                     description='Easy and parallel run through MCDSPreAnalyser',
                                     keywords='pyaudisam, validation, pre-analysis',
                                     lang='en', superSynthPlotsHeight=288,
                                     #plotImgSize=(640, 400), plotLineWidth=1, plotDotWidth=4,
                                     #plotFontSizes=dict(title=11, axes=10, ticks=9, legend=10),
                                     sampleCols=samplePreRepCols, paramCols=paramPreRepCols,
                                     resultCols=resultPreRepCols, synthCols=synthPreRepCols,
                                     sortCols=sortPreRepCols, sortAscend=sortPreRepAscend,
                                     tgtFolder=workDir, tgtPrefix='valtests-preanalyses-report')

In [None]:
# Generate the Excel pre-report.
xlsxPreRep = preReport.toExcel()

# Keep a time-stamped copy of it (needed in section 8 for comparison to the command-line report).
xlsxBkpPreRep = backup(xlsxPreRep)

# Show a link to the report.
HTML(f'Excel pre-report: <a href="{xlsxPreRep}" target="blank">{xlsxPreRep}</a>')

In [None]:
# Open the Excel pre-report with the application associated to it.
# os.startfile is only available on Windows: on other systems, simply tell where the report is,
# rather than fail on an AttributeError.
if hasattr(os, 'startfile'):
    os.startfile(xlsxPreRep)
else:
    print('Excel pre-report (to be opened manually):', xlsxPreRep)

In [None]:
%%time

# Generate the HTML pre-report.
htmlPreRep = preReport.toHtml() #generators=5)

# Keep a time-stamped copy of it.
backup(htmlPreRep)

# Print the URI of the report (absolute path).
print('Pre-report: ' + pl.Path(htmlPreRep).resolve().as_uri())

## 8. Compare command-line and notebook Excel pre-analyses reports

In [None]:
# 1. Generate notebook report (through 7. right above)
# => xlsxBkpPreRep

# 2. Generate command-line report (through an external console with relevant python env activated)
# $ cd .. && python -m pyaudisam -p tests/valtests-ds-params.py -w tests/tmp/mcds-preanlr -n --prereports excel -u
# => xlsxPreRep

# 3. Load the 2 reports (all the sheets of each workbook, as a dict of data frames)
ddfNbPreReport = pd.read_excel(xlsxBkpPreRep, sheet_name=None, index_col=0)  # Notebook (backup) one
ddfClPreReport = pd.read_excel(xlsxPreRep, sheet_name=None, index_col=0)  # Command-line one

# 4. Check that 2 was really run ...
#    (the latest start time in the command-line report must be over 1s later than the notebook one)
assert (ddfClPreReport['Details']['StartTime'].max() - ddfNbPreReport['Details']['StartTime'].max()).total_seconds() > 1, \
       'Please run above given command line first: you are actually comparing notebook report to itself !'

# 5. Compare Synthesis and Details sheets
#    (sample by sample, ignoring the columns that necessarily differ from one run to another)
assert ddfNbPreReport['Synthesis'].drop(columns=['RunFolder']).set_index('NumEchant') \
        .compare(ddfClPreReport['Synthesis'].drop(columns=['RunFolder']).set_index('NumEchant')) \
        .empty
assert ddfNbPreReport['Details'].drop(columns=['StartTime', 'ElapsedTime', 'RunFolder']).set_index('NumEchant') \
        .compare(ddfClPreReport['Details'].drop(columns=['StartTime', 'ElapsedTime', 'RunFolder']).set_index('NumEchant')) \
        .empty

logger.info('Success !')

# III. Run analyses with same real life field data

Thanks to MCDSAnalyser class.

Short code, fast (parallel) run.

Note: The exact same results and reports can be also produced through command line:
```
$ cd .. 
$ python -m pyaudisam -p tests/valtests-ds-params.py -w tests/tmp/mcds-anlr -n --analyses --reports excel,html -u
```

## 0. Data description

Run first [II. Run and report pre-analyses / 0. Data Description](#II.-Run-and-report-pre-analyses)

In [None]:
# Short string for analysis "identification"
def analysisAbbrev(sAnlys):
    """Build the abbreviation of an analysis from its sample, model and optional parameter fields.

    Parameters:
    :param sAnlys: analysis description (pandas Series), with the sample fields needed by sampleAbbrev,
                   plus 'FonctionClé' and 'SérieAjust', and possibly some of the optional parameter
                   fields (under one of their supported names, see the code)

    :return: '-'-joined string of: the sample abbreviation, the lower-cased first 3 letters
             of 'FonctionClé' and of 'SérieAjust', and then, for each optional parameter actually present
             and not null, its 1-letter prefix followed by the lower-cased 1st letter of its value
             if it is a string, or otherwise by its value converted to an integer
    """
    presentCols = sAnlys.index

    # Sample abbreviation, then model abbreviation.
    parts = [sampleAbbrev(sAnlys), sAnlys['FonctionClé'][:3].lower(), sAnlys['SérieAjust'][:3].lower()]

    # Optional parameters: 1-letter prefix, and supported column names, by decreasing priority
    # (only the first one found in the index is considered).
    optParams = [('l', ('TrGche', 'TroncGche')),
                 ('r', ('TrDrte', 'TroncDrte')),
                 ('m', ('NbTrches', 'NbTrModel', 'NbTrchMod')),
                 ('d', ('NbTrDiscr',))]
    for prefix, candidates in optParams:

        colName = next((cand for cand in candidates if cand in presentCols), None)
        if colName is None or pd.isnull(sAnlys[colName]):
            continue  # Parameter not specified for this analysis

        value = sAnlys[colName]
        suffix = value[0].lower() if isinstance(value, str) else int(value)
        parts.append(f'{prefix}{suffix}')

    return '-'.join(parts)

In [None]:
# Names of the field data columns describing the survey: transect places, pass and effort.
transectPlaceCols, passIdCol, effortCol = ['Point'], 'Passage', 'Effort'

# Sample-related columns: decimal ones, sample number, and the ones selecting a sample.
sampleDecCols = [effortCol] + ['Distance']
sampleNumCol = 'NumEchant'
sampleSelCols = ['Espèce'] + [passIdCol] + ['Adulte', 'Durée']

# Analysis-related columns: analysis number and abbreviation.
varIndCol, anlysAbbrevCol = 'NumAnlys', 'AbrevAnlys'

# Survey area description.
dSurveyArea = {'Zone': 'ACDC', 'Surface': '2400'}

Jump to [IV. Run truncation opt-analyses with same real life field data](#IV.-Run-truncation-opt-analyses-with-same-real-life-field-data)

## 1. Individuals data set

In [None]:
# Load the individuals data from the 'DonnéesIndiv' sheet of the reference input workbook.
dfObsIndiv = ads.DataSet('refin/ACDC2019-Naturalist-ExtraitObsIndiv.ods', sheet='DonnéesIndiv').dfData

In [None]:
# First rows of the individuals data.
dfObsIndiv.head()

In [None]:
# Quick look at the data: distinct values found in each of the main descriptive columns.
{ col: dfObsIndiv[col].unique() for col in ['Observateur', 'Point', 'Passage', 'Adulte', 'Durée', 'Espèce'] }

## 2. Actual transects

(can't deduce them from data, some points are missing because of data selection)

In [None]:
# Load the transects from the 'Inventaires' sheet of the same reference input workbook,
# and show how many rows it holds.
dfTransects = ads.DataSet('refin/ACDC2019-Naturalist-ExtraitObsIndiv.ods', sheet='Inventaires').dfData
len(dfTransects)

In [None]:
# The transects table.
dfTransects

## 3. Analyses specs

In [None]:
# Build the explicit analysis specs from the specs workbook, keeping only the listed items
# (presumably sheet names, with an _impl / _expl suffix for implicit / explicit specs),
# with an analysis number column and a computed analysis abbreviation column added.
dfAnlysSpecs = ads.Analyser.explicitVariantSpecs('refin/ACDC2019-Naturalist-ExtraitSpecsAnalyses.xlsx', 
                                                 keep=['Echant1_impl', 'Echant2_impl', 'Modl_impl',
                                                       'Params1_expl', 'Params2_expl'],
                                                 varIndCol=varIndCol,
                                                 #convertCols={ 'Durée': int }, # float 'cause of Excel
                                                 computedCols={ anlysAbbrevCol: analysisAbbrev })

len(dfAnlysSpecs)

In [None]:
# For faster debugging : reduce work.
#dfAnlysSpecs = dfAnlysSpecs[(dfAnlysSpecs.Passage == 'a+b') & (dfAnlysSpecs.Adulte == 'm') \
#                            & (dfAnlysSpecs['Durée'] == '10mn') \
#                            & ((dfAnlysSpecs.TrGche.isnull()) | (dfAnlysSpecs.TrGche < 20)) \
#                            & ((dfAnlysSpecs.TrDrte.isnull()) | (dfAnlysSpecs.TrDrte <= 500))]
#len(dfAnlysSpecs)

In [None]:
# The explicit analysis specs table.
dfAnlysSpecs

In [None]:
# Recall analysis set without truncation params
dfAnlysSpecs[['Espèce', 'Passage', 'Adulte', 'Durée', 'FonctionClé', 'SérieAjust']].drop_duplicates().reset_index(drop=True)

In [None]:
# Flag: True once the analyses have actually been run in this session (see 4A),
# which makes 4B skip reloading the results from file.
computed = False

In [None]:
# Work folder of the analyses (the same one as in the command lines given in this section).
workDir = tmpDir / 'mcds-anlr'

## 4A. Or : Really run analyses

### a. MCDS Analyser object

In [None]:
# Analyser on the individuals data and transects loaded above: point survey with radial distances,
# no clustering, distances in meters and areas in hectares ;
# default estimator selection criterion and CV interval: AIC and 95.
anlysr = ads.MCDSAnalyser(dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea,
                          transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                          sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols,
                          abbrevCol=anlysAbbrevCol, anlysIndCol=varIndCol, sampleIndCol=sampleNumCol,
                          distanceUnit='Meter', areaUnit='Hectare',
                          surveyType='Point', distanceType='Radial', clustering=False,
                          resultsHeadCols=dict(before=[varIndCol, sampleNumCol], sample=sampleSelCols,
                                               after=[anlysAbbrevCol]),
                          workDir=workDir, logProgressEvery=5,
                          defEstimCriterion='AIC', defCVInterval=95)

### b. Check analysis explicit specs

In [None]:
# Check the explicit analysis specs (duplicates dropped), and get the names of the parameter columns:
# the user ones, the matching internal ones, and the user ones that could not be matched.
dfAnlysSpecs, userParamSpecCols, intParamSpecCols, unmUserParamSpecCols, verdict, reasons = \
    anlysr.explicitParamSpecs(dfExplParamSpecs=dfAnlysSpecs, dropDupes=True, check=True)

# Non-regression checks on the number of analyses and on the parameter columns.
assert len(dfAnlysSpecs) == 48
assert userParamSpecCols == ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod']
assert intParamSpecCols == ['EstimKeyFn', 'EstimAdjustFn', 'MinDist', 'MaxDist', 'FitDistCuts']
assert unmUserParamSpecCols == []
assert verdict
assert not reasons

In [None]:
# First rows of the checked analysis specs.
dfAnlysSpecs.head()

### c. Run analyses

In [None]:
%%time

# Run the analyses on the checked explicit specs, with 6 threads.
results = anlysr.run(dfAnlysSpecs, threads=6)

# Done with the analyser.
anlysr.shutdown()

# Remember that results were really computed in this session (no need to reload them in 4B).
computed = True

Performance figures on a Ruindows 10 laptop with PCI-e SSD, "optimal performances" power scheme:
* 6-HT-core i7-8850H (python 3.7?):
  * 2019 or 2020 before 06: min=5, max=11s elapsed for 64 analyses, 6 threads ?
* 4-HT-core i5-8350U (python 3.8):
  * 2021-01: min=5.3, max=5.7s elapsed for 48 analyses, 6 threads ?
  * 2021-10-02: min=4.2s, max=5.7s (n=3) elapsed for 48 analyses, 6 threads ?
* 6-core i7-10750H, HT disabled (python 3.8):
  * 2022-01-01: mean=3.4s (n=4) elapsed for 48 analyses, 6 threads

In [None]:
# Results table as given by dfTransData for the 'fr' language code (presumably translated column names).
results.dfTransData('fr')

In [None]:
# Results table as given by dfData.
results.dfData

### d. Save results for later reload or examination

In [None]:
# Save the analysis results to an Excel workbook in the work folder ...
resFileName = workDir / 'valtests-analyses-results.xlsx'

results.toExcel(resFileName)

# ... and keep a time-stamped copy of it aside
# (the file to backup is the one just saved: 'fpn' is only the name of the 1st parameter
#  of backup, and does not exist as a variable at this level).
backup(resFileName)

In [None]:
#results.toExcel(workDir / 'valtests-analyses-results-fr.xlsx', lang='fr')

## 4B. Or : Load analyses from a previous run

(already run and saved above)

In [None]:
# Make sure the 'computed' flag exists, even when jumping here without running 4A first
# (same guard as in IV / 3C).
if 'computed' not in dir():
    computed = False

if not computed:

    # An analyser object knows how to build an empty results object ...
    anlysr = ads.MCDSAnalyser(dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea,
                              resultsHeadCols=dict(before=[varIndCol, sampleNumCol], sample=sampleSelCols,
                                                   after=[anlysAbbrevCol]),
                              transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                              sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols,
                              abbrevCol=anlysAbbrevCol, anlysIndCol=varIndCol, sampleIndCol=sampleNumCol,
                              distanceUnit='Meter', areaUnit='Hectare',
                              surveyType='Point', distanceType='Radial', clustering=False)

    results = anlysr.setupResults()

    # Load results from the file saved in 4A.d.
    resFileName = workDir / 'valtests-analyses-results.xlsx'
    print(f'Loading results from {resFileName} ...')

    results.fromExcel(resFileName)

else:

    # Results are already in memory: nothing to reload.
    print('Just computed, not reloading ...')

print(f'... {len(results)} analyses to compare')

## 5. Compare command-line to notebook analysis results

In [None]:
# 1. Generate notebook results (through 4. right above)
# => results

# 2. Generate command-line results (through an external console with relevant python env activated)
# $ cd .. && python -m pyaudisam -p tests/valtests-ds-params.py -w tests/tmp/mcds-anlr -n --analyses -u
# => file(resFileName)

# 3. Load command line results into an empty clone of the notebook results set
clResults = results.copy(withData=False)
clResults.fromExcel(resFileName)

# 4. Check that 2 was really run: the latest command-line run start time must be more than 1s
#    after the notebook one (same strict threshold as in the other "compare" cells of this notebook).
_runStartTimeCol = ads.MCDSAnalysisResultsSet.CLRunStartTime
assert (clResults._dfData[_runStartTimeCol].max()
        - results._dfData[_runStartTimeCol].max()).total_seconds() > 1, \
       'Please run above given command line first: you are actually comparing results to itself !'

# 5. Compare, after dropping the columns that depend on the run itself, and indexing by analysis number
_runDependentCols = ['StartTime', 'ElapsedTime', 'RunFolder']
_dfNbRes = results.dfTransData('en').drop(columns=_runDependentCols).set_index('NumAnlys')
_dfClRes = clResults.dfTransData('en').drop(columns=_runDependentCols).set_index('NumAnlys')
assert _dfNbRes.compare(_dfClRes).empty

logger.info('Success !')

## 6. Compare results to reference

(reference generated with the same kind of "long" code as in III above, but on another data set)

In [None]:
# Load reference
# 1. Clone results _without_ data, to get an empty results set of the same kind.
rsRef = results.copy(withData=False)

# 2. Load it with reference data (prevent re-postComputation as this ref. file is old, with now missing computed cols)
rsRef.fromFile('refout/ACDC2019-Naturalist-ExtraitResultats.ods', postComputed=True)

# 3. Display the loaded reference table.
rsRef.dfData

In [None]:
# Compare (ignore sample and analysis indexes, no use here).
_customCols = results.miCustomCols.to_list()

# Rows are matched on the sample header columns and on the analysis parameters.
indexCols = [col for col in _customCols if '(sample)' in col[0]]
indexCols += [('parameters', name, 'Value')
              for name in ['estimator key function', 'estimator adjustment series',
                           'left truncation distance', 'right truncation distance',
                           'model fitting distance cut points']]

# Ignore also string params (comparison not implemented) and computed values.
_notComparedCols = indexCols \
                   + [col for col in _customCols if '(sample)' not in col[0]] \
                   + [('parameters', 'estimator selection criterion', 'Value'),
                      ('parameters', 'CV interval', 'Value'),
                      ('run output', 'start time', 'Value'),
                      ('run output', 'elapsed time', 'Value'),
                      ('run output', 'run folder', 'Value'),
                      ('detection probability', 'key function type', 'Value'),
                      ('detection probability', 'adjustment series type', 'Value'),
                      ('detection probability', 'Delta AIC', 'Value'),
                      ('density/abundance', 'density of animals', 'Delta Cv')]

# Compared columns: those present on both sides and not ignored.
subsetCols = [col for col in results.dfData.columns.to_list()
              if col in rsRef.columns and col not in _notComparedCols]

dfDiff = rsRef.compare(results, indexCols=indexCols, subsetCols=subsetCols, dropCloser=12, dropNans=True)

assert dfDiff.empty, 'No, no, no : not the same ...'

print('Yessssss !')

In [None]:
# To be perfectly honest ... there may be some 10**-12 to 10**-15 glitches (due to worksheet I/O ?):
# same comparison as above, but with dropCloser=14 instead of 12, to display them if any.
rsRef.compare(results, indexCols=indexCols, subsetCols=subsetCols, dropCloser=14, dropNans=True)

## 7. Generate HTML and Excel analyses reports

In [None]:
# Short alias for the results set, used below to access its CLxxx column label attributes.
R = results

In [None]:
# Super-synthesis sub-report : Selected analysis results columns for the 3 textual columns of the table

# a. Sample description columns.
sampleRepCols = [('header (head)', 'NumEchant', 'Value')] \
                + [('header (sample)', selCol, 'Value') for selCol in ['Espèce', 'Passage', 'Adulte', 'Durée']] \
                + [R.CLNTotObs, R.CLMinObsDist, R.CLMaxObsDist]

# b. Analysis parameters columns (R.CLParEstSelCrit and R.CLParEstCVInt deliberately left out).
paramRepCols = [R.CLParEstKeyFn,
                R.CLParEstAdjSer,
                R.CLParTruncLeft,
                R.CLParTruncRight,
                R.CLParModFitDistCuts]

# c. Analysis results columns.
resultRepCols = [('header (head)', 'NumAnlys', 'Value'), R.CLRunStatus] \
                + [R.CLNObs, R.CLEffort, R.CLAic, R.CLChi2, R.CLKS, R.CLDCv] \
                + [R.CLCmbQuaBal1, R.CLCmbQuaBal2, R.CLCmbQuaBal3] \
                + [R.CLPDetec, R.CLEswEdr] \
                + [R.CLDensity, R.CLDensityMin, R.CLDensityMax] \
                + [R.CLNumber, R.CLNumberMin, R.CLNumberMax]

In [None]:
# Synthesis sub-report: Selected analysis results columns for the table
synthRepCols = (
    # Sample and analysis identification
    [('header (head)', 'NumEchant', 'Value')]
    + [('header (sample)', selCol, 'Value') for selCol in ['Espèce', 'Passage', 'Adulte', 'Durée']]
    + [('header (head)', 'NumAnlys', 'Value')]

    # Analysis parameters (R.CLParEstSelCrit and R.CLParEstCVInt deliberately left out)
    + [R.CLParEstKeyFn, R.CLParEstAdjSer,
       R.CLParTruncLeft, R.CLParTruncRight, R.CLParModFitDistCuts]

    # Analysis results
    + [R.CLNTotObs, R.CLNObs, R.CLNTotPars, R.CLEffort, R.CLDeltaAic,
       R.CLChi2, R.CLKS, R.CLCvMUw, R.CLCvMCw, R.CLDCv,
       R.CLPDetec, R.CLPDetecMin, R.CLPDetecMax,
       R.CLDensity, R.CLDensityMin, R.CLDensityMax]

    # Sighting rate and combined quality indicators
    + [R.CLSightRate,
       R.CLCmbQuaBal1, R.CLCmbQuaBal2, R.CLCmbQuaBal3,
       R.CLCmbQuaChi2, R.CLCmbQuaKS, R.CLCmbQuaDCv]

    # Per-group orders (R.CLGrpOrdClTrChi2 deliberately left out)
    + [R.CLGrpOrdSmTrAic,
       R.CLGrpOrdClTrChi2KSDCv,
       R.CLGrpOrdClTrDCv,
       R.CLGrpOrdClTrQuaBal1, R.CLGrpOrdClTrQuaBal2, R.CLGrpOrdClTrQuaBal3,
       R.CLGrpOrdClTrQuaChi2, R.CLGrpOrdClTrQuaKS, R.CLGrpOrdClTrQuaDCv]

    # Global orders
    + [R.CLGblOrdChi2KSDCv,
       R.CLGblOrdQuaBal1, R.CLGblOrdQuaBal2, R.CLGblOrdQuaBal3,
       R.CLGblOrdQuaChi2, R.CLGblOrdQuaKS, R.CLGblOrdQuaDCv,
       R.CLGblOrdDAicChi2KSDCv]
)

In [None]:
# Sorting columns for all the sub-reports
sortRepCols = [('header (head)', 'NumEchant', 'Value'),
               R.CLParTruncLeft, R.CLParTruncRight,
               R.CLDeltaAic,
               R.CLCmbQuaBal3]

# Ascending order for every sorting column, except for the last one (descending).
sortRepAscend = [True for _ in sortRepCols[:-1]] + [False]

In [None]:
# Full report (Excel + HTML) on the analysis results, to be generated into the work folder.
report = ads.MCDSResultsFullReport(
    resultsSet=results,
    # Columns selection and sorting for the sub-reports
    sampleCols=sampleRepCols,
    paramCols=paramRepCols,
    resultCols=resultRepCols,
    synthCols=synthRepCols,
    sortCols=sortRepCols,
    sortAscend=sortRepAscend,
    # Titles and other descriptive texts
    title='PyAuDiSam Validation: Analyses',
    subTitle='Global analyses report',
    anlysSubTitle='Detailed report',
    description='Easy and parallel run through MCDSAnalyser',
    keywords='pyaudisam, validation, analysis',
    pySources=['valtests.ipynb'],
    # Language and plot settings
    lang='en',
    superSynthPlotsHeight=288,
    #plotImgSize=(640, 400), plotLineWidth=1, plotDotWidth=4,
    #plotFontSizes=dict(title=11, axes=10, ticks=9, legend=10),
    # Target folder and file name prefix
    tgtFolder=workDir,
    tgtPrefix='valtests-analyses-report',
)

In [None]:
# Generate the Excel report, and keep the returned value (used below as the report file path).
xlsxAnlrRep = report.toExcel()

# Back it up (`backup` is a helper defined elsewhere in this notebook):
# the backup is used in chapter 8 as the "notebook" report, to compare to the command-line one.
xlsxAnlrBkpRep = backup(xlsxAnlrRep)

# Display a clickable link to the generated report.
HTML(f'Excel report: <a href="{xlsxAnlrRep}" target="blank">{xlsxAnlrRep}</a>')

In [None]:
# Open the Excel report with its associated application.
# NOTE: os.startfile is only available on Windows.
os.startfile(xlsxAnlrRep)

In [None]:
# Generate the HTML report, and keep the returned value (used below as the report file path).
htmlAnlrRep = report.toHtml()  # Auto-number of parallel generators 

# Back it up (`backup` is a helper defined elsewhere in this notebook).
backup(htmlAnlrRep)

# Print the report location as a file URI.
print('Report: ' + pl.Path(htmlAnlrRep).resolve().as_uri())

## 8. Compare command-line and notebook Excel analyses reports

In [None]:
# 1. Generate notebook report (through 7. right above)
# => xlsxAnlrBkpRep

# 2. Generate command-line report (through an external console with relevant python env activated)
# $ cd .. && python -m pyaudisam -p tests/valtests-ds-params.py -w tests/tmp/mcds-anlr -n --reports excel -u
# => xlsxAnlrRep

# 3. Load the 2 reports (all sheets of each workbook, as a dict of data frames)
_readAllSheets = dict(sheet_name=None, index_col=0)
ddfNbReport = pd.read_excel(xlsxAnlrBkpRep, **_readAllSheets)  # Notebook (backup) one
ddfClReport = pd.read_excel(xlsxAnlrRep, **_readAllSheets)  # Command-line one

# 4. Check that 2 was really run ...
_clLastStart = ddfClReport['Details']['StartTime'].max()
_nbLastStart = ddfNbReport['Details']['StartTime'].max()
assert (_clLastStart - _nbLastStart).total_seconds() > 1, \
       'Please run above given command line first: you are actually comparing notebook report to itself !'

# 5. Compare Synthesis and Details sheets, once the run-dependent columns are dropped
for _sheet, _runCols in [('Synthesis', ['RunFolder']),
                         ('Details', ['StartTime', 'ElapsedTime', 'RunFolder'])]:
    _dfNb = ddfNbReport[_sheet].drop(columns=_runCols).set_index('NumAnlys')
    _dfCl = ddfClReport[_sheet].drop(columns=_runCols).set_index('NumAnlys')
    assert _dfNb.compare(_dfCl).empty

logger.info('Success !')

# IV. Run truncation opt-analyses with same real life field data

i.e. analyses with:
* ready-to-go (const values) analysis parameters,
* sometimes, some distance truncation parameters auto-computed:
    * through some kind of optimisation process around MCDS.exe,
    * from easily specified optimisation parameters.

Thanks to MCDSTruncationOptanalyser class.

Note: The exact same results and reports can also be produced through the command line:
```
$ cd .. 
$ python -m pyaudisam -p tests/valtests-ds-params.py -w tests/tmp/mcds-optanlr -n --optanalyses --optreports excel,html:mqua-r92 -u
```

## 0. Data description and optanalysis parameters

Run first
* [II. Run pre-analyses / 0. Data Description](#II.-Run-pre-analyses)
* [III. Run analyses with same real life field data / 0. Data Description](#III.-Run-analyses-with-same-real-life-field-data)

In [None]:
# Source / Results data

# Columns identifying the transects / points, the passes and the effort.
passIdCol, effortCol = 'Passage', 'Effort'
transectPlaceCols = ['Point']

# Distance column, and decimal columns of the samples.
sampleDistCol = 'Distance'
sampleDecCols = [effortCol] + [sampleDistCol]

# Sample numbering, selection and abbreviation columns.
sampleNumCol = 'NumEchant'
sampleSelCols = ['Espèce'] + [passIdCol] + ['Adulte', 'Durée']
sampleAbbrevCol = 'AbrevEchant'

# optIndCol = 'IndOptim'
# optAbbrevCol = 'AbrevOptim'

# Survey area description.
dSurveyArea = {'Zone': 'ACDC', 'Surface': '2400'}

In [None]:
# General DS analysis parameters

# Analysis index, abbreviation and parameter columns.
varIndCol, anlysAbbrevCol = 'NumAnlys', 'AbrevAnlys'
anlysParamCols = ['FonctionClé', 'SérieAjust'] + ['TrGche', 'TrDrte'] + ['NbTrchMod']

# Units and survey description.
distanceUnit, areaUnit = 'Meter', 'Hectare'
surveyType, distanceType = 'Point', 'Radial'
clustering = False

In [None]:
# Default optimisation parameters.

# Default estimator settings.
defEstimKeyFn = 'HNORMAL'
defEstimAdjustFn = 'COSINE'
defEstimCriterion = 'AIC'
defCVInterval = 95

# Default distance truncations and cut points: none.
# (the original `defMaxDist = None, ` had a trailing comma, which made it the tuple (None,), not None)
defMinDist = None
defMaxDist = None
defFitDistCuts = None
defDiscrDistCuts = None

# Default optimised expression, and whether it should be minimised.
defExpr2Optimise, defMinimiseExpr = 'chi2', False

# Default outliers settings, and factor intervals for the distance cuts.
defOutliersMethod, defOutliersQuantCutPct = 'tucquant', 7
defFitDistCutsFctr = ads.Interval(min=0.6, max=1.4)
defDiscrDistCutsFctr = ads.Interval(min=0.5, max=1.2)

# Default submission settings.
defSubmitTimes, defSubmitOnlyBest = 1, None
dDefSubmitOtherParams = {}

# Default optimisation core settings.
defCoreEngine, defCoreAlgorithm = 'zoopt', 'racos'
defCoreMaxIters, defCoreMaxRetries = 100, 0
defCoreTermExprValue = None

In [None]:
# Results post-computation parameters

# Tolerance for the truncation intervals.
truncIntrvEpsilon = 1e-6

# Truncation interval specs, for left and right truncation distances:
# for each one, minDist and maxLen are given the same value.
ldTruncIntrvSpecs = [dict(col=side, minDist=dist, maxLen=dist)
                     for side, dist in [('left', 5.0), ('right', 25.0)]]

In [None]:
# The analyses to run (with optimisation specs inside when needed)
optanlysSpecFile = 'refin/ACDC2019-Naturalist-ExtraitSpecsOptanalyses.xlsx'
#optanlysSpecFile = '../donnees/acdc/ACDC2019-Naturalist-ExtraitSpecsOptanalyses-reduit.ods'

In [None]:
# def optimAbbrev(sAnlys):
#     
#     # Sample abbreviation
#     spcAbbrev = ''.join(word[:4].title() for word in sAnlys['Espèce'].split(' ')[:2])
#     sampAbbrev = [str(x) for x in [spcAbbrev, sAnlys.Passage.replace('+', ''),
#                                    sAnlys.Adulte.replace('+', ''), sAnlys['Durée']]]
# 
#     # Model + Parameters abbreviation
#     modParAbbrev = [sAnlys['FonctionClé'][:3].lower(), sAnlys['SérieAjust'][:3].lower()]
#     
#     return '-'.join(sampAbbrev + modParAbbrev)

## 1. Individuals data set

In [None]:
# The individualised observations and transects data file
indivObsFile = 'refin/ACDC2019-Naturalist-ExtraitObsIndiv.ods'

In [None]:
# Load the 'DonnéesIndiv' sheet of the individualised data file, and show the number of loaded rows.
dfObsIndiv = ads.DataSet(indivObsFile, sheet='DonnéesIndiv').dfData
len(dfObsIndiv)

In [None]:
# Quick look at the first rows of the individualised observations.
dfObsIndiv.head()

In [None]:
# Display the distinct values found in each of the main descriptive columns of the observations.
{ col: dfObsIndiv[col].unique() for col in ['Observateur', 'Point', 'Passage', 'Adulte', 'Durée', 'Espèce'] }

## 2. Actual transects

(can't deduce them from data, some points are missing because of data selection)

In [None]:
# Load the 'Inventaires' sheet of the same data file as the transects table, and show its number of rows.
dfTransects = ads.DataSet(indivObsFile, sheet='Inventaires').dfData
len(dfTransects)

In [None]:
# Display the transects table.
dfTransects

In [None]:
# Work folder for the opt-analyses, under the temporary folder
# (same 'mcds-optanlr' folder name as in the -w option of the command lines given in this chapter).
workDir = tmpDir / 'mcds-optanlr'

Jump to [3C. Or : Load opt-analyses results from a previous run](#3C.-Or-%3A-Load-opt-analyses-results-from-a-previous-run)

## 3A. Or : Really run opt-analyses

Note: The exact same results can also be produced through the command line:
```
$ cd .. 
$ python -m pyaudisam -p tests/valtests-ds-params.py -w tests/tmp/mcds-optanlr -n --optanalyses -u
```

### a. MCDS Opt-Analyser object

In [None]:
# Build the truncation opt-analyser from the data and parameters defined in the cells above.
optanlr = ads.MCDSTruncationOptanalyser(
    # Input data
    dfObsIndiv,
    dfTransects=dfTransects,
    dSurveyArea=dSurveyArea,
    # Data columns
    transectPlaceCols=transectPlaceCols,
    passIdCol=passIdCol,
    effortCol=effortCol,
    sampleSelCols=sampleSelCols,
    sampleDecCols=sampleDecCols,
    sampleDistCol=sampleDistCol,
    # Analysis abbreviation (`analysisAbbrev` is defined elsewhere in this notebook) and index columns
    abbrevCol=anlysAbbrevCol,
    abbrevBuilder=analysisAbbrev,
    anlysIndCol=varIndCol,
    sampleIndCol=sampleNumCol,
    # Units and survey description
    distanceUnit=distanceUnit,
    areaUnit=areaUnit,
    surveyType=surveyType,
    distanceType=distanceType,
    clustering=clustering,
    # Results header columns and post-computation parameters
    resultsHeadCols=dict(before=[varIndCol, sampleNumCol],
                         sample=sampleSelCols,
                         after=anlysParamCols + [anlysAbbrevCol]),
    ldTruncIntrvSpecs=ldTruncIntrvSpecs,
    truncIntrvEpsilon=truncIntrvEpsilon,
    # Run settings
    workDir=workDir,
    runMethod='subprocess.run',
    runTimeOut=120,
    #runMethod='os.system', runTimeOut=None,  # Uncomment to test os.system run method.
    logAnlysProgressEvery=5,
    logOptimProgressEvery=3,
    backupOptimEvery=5,
    # Default analysis and optimisation parameters
    defEstimKeyFn=defEstimKeyFn,
    defEstimAdjustFn=defEstimAdjustFn,
    defEstimCriterion=defEstimCriterion,
    defCVInterval=defCVInterval,
    defExpr2Optimise=defExpr2Optimise,
    defMinimiseExpr=defMinimiseExpr,
    defOutliersMethod=defOutliersMethod,
    defOutliersQuantCutPct=defOutliersQuantCutPct,
    defFitDistCutsFctr=defFitDistCutsFctr,
    defDiscrDistCutsFctr=defDiscrDistCutsFctr,
    defSubmitTimes=defSubmitTimes,
    defSubmitOnlyBest=defSubmitOnlyBest,
    dDefSubmitOtherParams=dDefSubmitOtherParams,
    dDefOptimCoreParams=dict(core=defCoreEngine,
                             maxIters=defCoreMaxIters,
                             termExprValue=defCoreTermExprValue,
                             algorithm=defCoreAlgorithm,
                             maxRetries=defCoreMaxRetries),
)

In [None]:
# The opt-analyser is expected to expose exactly 27 specs.
assert len(optanlr.specs) == 27

### b. Check opt-analyses specs

In [None]:
# Make the opt-analysis specs explicit from the implicit specs file (dropDupes and check options enabled),
# and collect the column lists and the check verdict returned along with the explicit specs table.
dfAnlysSpecs, userParamSpecCols, intParamSpecCols, unmUserParamSpecCols, verdict, reasons = \
    optanlr.explicitParamSpecs(implParamSpecs=optanlysSpecFile, dropDupes=True, check=True)

In [None]:
# Expected number of opt-analyses and parameter column names for this validation data set.
assert len(dfAnlysSpecs) == 60
assert userParamSpecCols == ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod', 'MultiOpt']
assert intParamSpecCols == ['EstimKeyFn', 'EstimAdjustFn', 'MinDist', 'MaxDist', 'FitDistCuts', 'SubmitParams']
assert unmUserParamSpecCols == []
# The check must have succeeded, with no reason to report.
assert verdict
assert not reasons

In [None]:
# Display the explicit opt-analysis specs.
dfAnlysSpecs

In [None]:
# Print the number of explicit specs and, when the check failed, the reasons and the column lists.
print(len(dfAnlysSpecs))
if not verdict:
    print(reasons)
    print(userParamSpecCols, intParamSpecCols, unmUserParamSpecCols)

### c. Run opt-analyses

In [None]:
# Print the opt-analyser specs (as 'key=value' pairs), and the number of optimisations to run
# (these are the figures reported in the performance notes below).
print('* OptAnalyser specs:', ', '.join(f'{k}={v}' for k, v in optanlr.specs.items()))
print('* OptAnalyses specs:', len(dfAnlysSpecs), 'optimisations from', optanlysSpecFile)

In [None]:
%%time

# Run the opt-analyses described in the implicit specs file (threads=24), and keep the results set.
optResults = optanlr.run(implParamSpecs=optanlysSpecFile, threads=24)
# optResults = optanlr.run(dfExplParamSpecs=dfAnlysSpecs.loc[51:52], threads=1)  # A small sample, for a quicker check

# Shut the opt-analyser down once the run is over.
optanlr.shutdown()

# Flag read by "3C. Or : Load opt-analyses results from a previous run" to skip reloading results from file.
computed = True

# Display the specs of the results set.
optResults.specs

Performance figures on a 4-HT-core i5-8350U Ruindows 10 laptop with PCI-e SSD, "optimal performance" power scheme, 12 threads, Python 3.8:
* 2021-01-05
  * OptAnalyserspecs: Zone=ACDC, Surface=2400, distanceUnit=Meter, areaUnit=Hectare, surveyType=Point, distanceType=Radial, clustering=False, defEstimKeyFn=HNORMAL, defEstimAdjustFn=COSINE, defEstimCriterion=AIC, defCVInterval=95, defMinDist=None, defMaxDist=None, defFitDistCuts=None, defDiscrDistCuts=None, defExpr2Optimise=chi2, defMinimiseExpr=False, dDefOptimCoreParams={'core': 'zoopt', 'maxIters': 100, 'termExprValue': None, 'algorithm': 'racos', 'maxRetries': 0}, defSubmitTimes=1, defSubmitOnlyBest=None, dDefSubmitOtherParams={}, defOutliersMethod=tucquant, defOutliersQuantCutPct=7, defFitDistCutsFctr=[0.6, 1.4], defDiscrDistCutsFctr=[0.5, 1.2]
  * OptAnalyses specs: 60 optimisations, from refin/ACDC2019-Naturalist-ExtraitSpecsOptanalyses.xlsx => 70 results,
  * runMethod: subprocess.run => 4mn40, 4mn52, 4mn38, 4mn23, 4mn40, 5mn00, 4mn41, 4mn35, 4mn47 (mean 4mn42)
  * runMethod: os.system      => 4mn35, 4mn24, 4mn20, 4mn30 (mean 4mn27)

* 2021-08-22, 2021-10-02
  * same OptAnalyserspecs, OptAnalyses specs
  * runMethod: subprocess.run => 4mn35 (n >= 2)
  
* 2021-10-06
  * same OptAnalyserspecs, OptAnalyses specs
  * runMethod: subprocess.run => 4mn08 (n = 1)
* 2021-11-19 After adding quality indicators computation in analysis results post-processing
  * same OptAnalyserspecs, OptAnalyses specs
  * runMethod: subprocess.run => 6mn21 (n = 1)

Performance figures on a 6-core (HT disabled) i7-10850H Ruindows 10 laptop with PCI-e SSD, "optimal performance" power scheme, Python 3.8:
* 2021-11-28 After optimizing quality indicators computation in analysis results post-processing
  * same OptAnalyserspecs, OptAnalyses specs as on 2021-01-05
  * 12 threads, runMethod: subprocess.run => 4mn12 (n = 1)
  * 18 threads, runMethod: subprocess.run => 3mn20 (n = 1)
  * 24 threads, runMethod: subprocess.run => 3mn30 (n = 1)
* 2022-01-01,02 (no change)
  * 24 threads, runMethod: subprocess.run => 3mn16 to 3mn28 (n = 2)
* 2022-01-17 (no change)
  * 24 threads, runMethod: subprocess.run => 3mn03 (n = 1)

In [None]:
# Check that the truncation optimisation flag column is present in the translated results table.
assert ads.MCDSTruncationOptanalyser.OptimTruncFlagCol in optResults.dfTransData('fr').columns
# Note: This also runs post-computations ...

In [None]:
# Display the opt-analysis results table as returned by dfTransData for the 'fr' language code.
optResults.dfTransData('fr')

In [None]:
# Display the opt-analysis results data table (dfData attribute of the results set).
optResults.dfData

In [None]:
#optResults.dfTransData('fr').to_excel('tmp/res-tst.xlsx')

In [None]:
# Uncomment to dump the raw results table to an Excel file, for examination.
#optResults._dfData.to_excel('tmp/rawres-tst.xlsx')

# Display the raw results table (private _dfData attribute, accessed directly here).
optResults._dfData

### d. Save results for later reload or examination

In [None]:
# Save the opt-analysis results to an Excel workbook in the work folder.
optResFileName = workDir / 'valtests-optanalyses-results.xlsx'

optResults.toExcel(optResFileName)

# Back up the file just written (`backup` is a helper defined elsewhere in this notebook);
# the returned value is not needed here.
_ = backup(optResFileName)

In [None]:
# Also save the results to an OpenDocument workbook, with the 'fr' language code.
optResults.toOpenDoc(workDir / 'valtests-optanalyses-results-fr.ods', lang='fr')

In [None]:
#results.fromExcel(workDir / 'valtests-optanalyses-results.xlsx', specs=False)

## 3B. Or : Restart opt-analyses from recovery files

(already run above)

### a. MCDS Opt-Analyser object

In [None]:
# Warning: Must be a real clone of the 3A optanalyser (about data, not technical run stuff),
# otherwise recovery might not work.
optanlr = ads.MCDSTruncationOptanalyser(
    # Input data
    dfObsIndiv,
    dfTransects=dfTransects,
    dSurveyArea=dSurveyArea,
    # Data columns
    transectPlaceCols=transectPlaceCols,
    passIdCol=passIdCol,
    effortCol=effortCol,
    sampleSelCols=sampleSelCols,
    sampleDecCols=sampleDecCols,
    sampleDistCol=sampleDistCol,
    # Analysis abbreviation (`analysisAbbrev` is defined elsewhere in this notebook) and index columns
    abbrevCol=anlysAbbrevCol,
    abbrevBuilder=analysisAbbrev,
    anlysIndCol=varIndCol,
    sampleIndCol=sampleNumCol,
    # Units and survey description
    distanceUnit=distanceUnit,
    areaUnit=areaUnit,
    surveyType=surveyType,
    distanceType=distanceType,
    clustering=clustering,
    # Results header columns and post-computation parameters
    resultsHeadCols=dict(before=[varIndCol, sampleNumCol],
                         sample=sampleSelCols,
                         after=anlysParamCols + [anlysAbbrevCol]),
    ldTruncIntrvSpecs=ldTruncIntrvSpecs,
    truncIntrvEpsilon=truncIntrvEpsilon,
    # Run settings (no run method, time-out or optimisation backup settings given here, unlike in 3A)
    workDir=workDir,
    logAnlysProgressEvery=5,
    logOptimProgressEvery=3,
    # Default analysis and optimisation parameters
    defEstimKeyFn=defEstimKeyFn,
    defEstimAdjustFn=defEstimAdjustFn,
    defEstimCriterion=defEstimCriterion,
    defCVInterval=defCVInterval,
    defExpr2Optimise=defExpr2Optimise,
    defMinimiseExpr=defMinimiseExpr,
    defOutliersMethod=defOutliersMethod,
    defOutliersQuantCutPct=defOutliersQuantCutPct,
    defFitDistCutsFctr=defFitDistCutsFctr,
    defDiscrDistCutsFctr=defDiscrDistCutsFctr,
    defSubmitTimes=defSubmitTimes,
    defSubmitOnlyBest=defSubmitOnlyBest,
    dDefSubmitOtherParams=dDefSubmitOtherParams,
    dDefOptimCoreParams=dict(core=defCoreEngine,
                             maxIters=defCoreMaxIters,
                             termExprValue=defCoreTermExprValue,
                             algorithm=defCoreAlgorithm,
                             maxRetries=defCoreMaxRetries),
)

In [None]:
# The recovery opt-analyser is expected to expose exactly 27 specs, like the 3A one.
assert len(optanlr.specs) == 27

### b. Check opt-analyses specs

In [None]:
# Display the explicit opt-analysis specs (as computed in 3A.b; not recomputed here).
dfAnlysSpecs

### c. Run opt-analyses

In [None]:
%%time

# Run the opt-analyses again with recoverOptims=True (threads=12), and keep the results in a separate set.
optResults2 = optanlr.run(implParamSpecs=optanlysSpecFile, recoverOptims=True, threads=12)

# A small sample, for a quicker check
#results2 = optanlr.run(dfExplParamSpecs=dfAnlysSpecs.loc[51:52], recoverOptims=True, threads=1)

# Shut the opt-analyser down once the run is over.
optanlr.shutdown()

# Flag read by "3C. Or : Load opt-analyses results from a previous run" to skip reloading results from file.
computed = True

# Display the specs of the results set.
optResults2.specs

In [None]:
# Display the recovered-run results table as returned by dfTransData for the 'fr' language code.
optResults2.dfTransData('fr')

In [None]:
# Display the recovered-run results data table (dfData attribute of the results set).
optResults2.dfData

### d. Save results for later reload or examination

In [None]:
# Save the recovered-run results to an Excel workbook in the work folder.
# toExcel is called on the results set itself, as in 3A.d, and not on its `specs` attribute
# (which is used as a plain mapping elsewhere in this notebook); file name prefix typo also fixed.
optResults2.toExcel(workDir / 'valtests-optanalyses-results2.xlsx')

## 3C. Or : Load opt-analyses results from a previous run

(already run and saved above)

In [None]:
# Make sure the 'computed' flag exists, even when jumping here without running 3A or 3B first.
if 'computed' not in dir():
    computed = False

if computed:

    # Results are already in memory: nothing to reload.
    print('Just computed, not reloading ...')

else:

    # An opt-analyser object knows how to build an empty results object ...
    optanlr = ads.MCDSTruncationOptanalyser(
        dfObsIndiv,
        dfTransects=dfTransects,
        dSurveyArea=dSurveyArea,
        transectPlaceCols=transectPlaceCols,
        passIdCol=passIdCol,
        effortCol=effortCol,
        sampleSelCols=sampleSelCols,
        sampleDecCols=sampleDecCols,
        sampleDistCol=sampleDistCol,
        abbrevCol=anlysAbbrevCol,
        abbrevBuilder=analysisAbbrev,
        anlysIndCol=varIndCol,
        sampleIndCol=sampleNumCol,
        distanceUnit=distanceUnit,
        areaUnit=areaUnit,
        surveyType=surveyType,
        distanceType=distanceType,
        clustering=clustering,
        resultsHeadCols=dict(before=[varIndCol, sampleNumCol],
                             sample=sampleSelCols,
                             after=anlysParamCols + [anlysAbbrevCol]),
        ldTruncIntrvSpecs=ldTruncIntrvSpecs,
        truncIntrvEpsilon=truncIntrvEpsilon,
    )

    optResults = optanlr.setupResults()

    # ... to be loaded from the file saved in 3A.d.
    optResFileName = workDir / 'valtests-optanalyses-results.xlsx'
    print('Loading results from {} ...'.format(optResFileName))

    optResults.fromExcel(optResFileName)

print('... {} analyses to compare'.format(len(optResults)))

Jump to [5. Generate HTML and Excel opt-analyses reports](#5.-Generate-HTML-and-Excel-opt-analyses-reports)

## 3D. Generate reference for non-regression tests

To be used in unintests.ipynb / 14. MCDSTruncOptAnalysisResultsSet / y. Non regression

Warning: Probably needs to be fully reworked

In [None]:
## 1. Clone results _without_ data.
#results3 = optResults.copy(withData=True)
#
## 2. Remove analyses with non-unique 'NumAnlys' (because of multiple optimisation tries)
##    (to make comparison easier, sorry)
#numAnlysCols = ('header (head)', 'NumAnlys', 'Value')
#numEchantCol = ('header (head)', 'NumEchant', 'Value')
#
#sb = results3.dfData[[numAnlysCols, numEchantCol]].groupby([numAnlysCols]).transform(len)[numEchantCol] > 1
#results3.dropRows(sb)
#
#results3.toExcel(workDir / 'valtests-optanalyses-results.ref.xlsx')

In [None]:
# Display the path of the opt-analysis results file.
optResFileName

## 4. Compare command-line to notebook opt-analysis results

In [None]:
# a. Generate notebook results (through 3A. right above)
# => optResults

# b. Generate command-line results (through an external console with relevant python env activated)
# $ cd .. && python -m pyaudisam -p tests/valtests-ds-params.py -w tests/tmp/mcds-optanlr -n --optanalyses -u
# => file(optResFileName)

# c. Load command line results into an empty clone of the notebook results set
clOptResults = optResults.copy(withData=False)
clOptResults.fromExcel(optResFileName)

# d. Check that b was really run ...
assert (clOptResults._dfData[ads.MCDSAnalysisResultsSet.CLRunStartTime].max() \
        - optResults._dfData[ads.MCDSAnalysisResultsSet.CLRunStartTime].max()).total_seconds() > 1, \
       'Please run above given command line first: you are actually comparing optResults to itself !'

# e. Compare (using chapter 5. below, stating that "reference" is command-line results, and "actual" is optResults)
#    i. Select "reference" unoptimised analysis results
rsUnoptRef = clOptResults.copy()
optTruncFlagCol = ads.MCDSTruncationOptanalyser.OptimTruncFlagCol
rsUnoptRef.dropRows(rsUnoptRef.dfData[('header (tail)', optTruncFlagCol, 'Value')] == 1)
unoptAnlysAbbrevs = rsUnoptRef.dfData[('header (tail)', anlysAbbrevCol, 'Value')].tolist()

#       Columns to exclude from the comparison in 5.c: the 'Order' ones (some pre/post Excel IO issues ...).
#       Note: taken from rsUnoptRef, as rsUnoptRes is only defined later (in 5.b).
excludeUnoptCols = [col for col in rsUnoptRef.columns.to_list() if col[2] == 'Order']

len(unoptAnlysAbbrevs)

#    ii. Select "reference" with-optimisation analysis results (i.e. with truncation params computed through optimisation)
rsOptRef = clOptResults.copy()
rsOptRef.dropRows(rsOptRef.dfData[('header (tail)', optTruncFlagCol, 'Value')] != 1)

#    iii. Compare using chapter 5. below, taking care of skipping 5.a. and 5.d. (just replaced by i. and ii. above :-)
print('\nNow, run chapter 5. below, skipping 5.a. and 5.d., and see what\'s happening ...')

## 5. Compare opt-analyses results to reference

(reference analysis results generated with same kind of "long" code like in [valarchives.ipynb / I. Run analyses with real life field data (1/2 : long code, long run)](./valarchives.ipynb#I.-Run-analyses-with-real-life-field-data-(1%2F2-%3A-long-code%2C-long-run)), but on another data set)

Note: As of now, filter and sort post-computed columns are not checked here; only DS analysis results are.

### a. Load reference unoptimised analyses results from file

In [None]:
# Load unoptimised reference
# 1. Clone optResults _without_ data.
rsUnoptRef = optResults.copy(withData=False)

# 2. Load it with reference data (prevent re-postComputation as this ref. file is old, with now missing computed cols)
rsUnoptRef.fromOpenDoc('refout/ACDC2019-Naturalist-ExtraitResultats.ods', postComputed=True)

# 3. Abbreviations of the reference analyses: used in b. below to split the actual results in 2 sets.
unoptAnlysAbbrevs = list(rsUnoptRef.dfData[('header (tail)', anlysAbbrevCol, 'Value')])

# 4. No extra column to exclude from the comparison in c. below.
excludeUnoptCols = []

# 5. Display the number of reference analyses.
len(unoptAnlysAbbrevs)

### b. Separate actual optanalysis results in 2 sets : optimised, and unoptimised

In [None]:
# Unoptimised results: work on a copy of the actual results ...
rsUnoptRes = optResults.copy()
#rsUnoptRes = optResults2.copy() # For recovered run

# ... and drop from it the rows whose analysis abbreviation is not one of the reference unoptimised ones.
rsUnoptRes.dropRows(~rsUnoptRes.dfData[('header (tail)', anlysAbbrevCol, 'Value')].isin(unoptAnlysAbbrevs))

#rsUnoptRes.dfTransData('fr').to_excel('tmp/res.xlsx')

In [None]:
# Optimised results: work on a copy of the actual results ...
rsOptRes = optResults.copy()

# ... and drop from it the rows whose analysis abbreviation is one of the reference unoptimised ones.
rsOptRes.dropRows(rsOptRes.dfData[('header (tail)', anlysAbbrevCol, 'Value')].isin(unoptAnlysAbbrevs))

In [None]:
# Display the sizes of the reference and actual results sets.
dict(unoptRef=len(rsUnoptRef), unoptRes=len(rsUnoptRes), optRes=len(rsOptRes), allRes=len(optResults))

In [None]:
# Display the current comparison index columns.
# NOTE(review): at this point, these are the ones from a previous comparison cell, if any was run - confirm intent.
indexCols

In [None]:
# Display the current compared columns.
# NOTE(review): at this point, these are the ones from a previous comparison cell, if any was run - confirm intent.
subsetCols

### c. Compare "unoptimised" analyses results to reference ones

In [None]:
# Compare (ignore sample and analysis indexes, no use here).
# a. Rows are matched on the sample header columns, the analysis parameters and the analysis abbreviation.
indexCols = [col for col in rsUnoptRes.miCustomCols.to_list() if '(sample)' in col[0]] \
            + [('parameters', 'estimator key function', 'Value'),
               ('parameters', 'estimator adjustment series', 'Value'),
               ('parameters', 'left truncation distance', 'Value'),
               ('parameters', 'right truncation distance', 'Value'),
               ('parameters', 'model fitting distance cut points', 'Value'),
               ('header (tail)', 'AbrevAnlys', 'Value')]

# b. Compared columns: those present on both sides, and not among the index columns, the excluded ones
#    (excludeUnoptCols, set in 4.e.i or 5.a), the non-sample custom columns, and the columns listed below.
#    Note: the original also tested `col not in excludeFromComp`, a name that is defined nowhere
#    in this part of the notebook, and that is not used in the similar comparison of chapter III.6.
subsetCols = [col for col in rsUnoptRes.columns.to_list() \
              if col in rsUnoptRef.columns
                 and col not in (indexCols + excludeUnoptCols
                                 + [col for col in rsUnoptRes.miCustomCols.to_list() if '(sample)' not in col[0]]
                                 + [('parameters', 'estimator selection criterion', 'Value'),
                                    ('parameters', 'CV interval', 'Value'),
                                    ('run output', 'start time', 'Value'),
                                    ('run output', 'elapsed time', 'Value'),
                                    ('run output', 'run folder', 'Value'),
                                    ('detection probability', 'Delta AIC', 'Value'),
                                    ('detection probability', 'key function type', 'Value'),
                                    ('detection probability', 'adjustment series type', 'Value')])]

dfDiff = rsUnoptRef.compare(rsUnoptRes, indexCols=indexCols, subsetCols=subsetCols, dropCloser=14, dropNans=True)

assert dfDiff.empty, 'No, no, no : not the same ...'

print('Yessssss !')

In [None]:
#dfDiff.to_excel('tmp/_.xlsx')

In [None]:
# To be perfectly honest, there may be some 10^-15 differences (when some results were loaded from Excel
# and some others were not) ... or not: same comparison call, but with dropCloser=15 rather than 14.
rsUnoptRef.compare(rsUnoptRes, indexCols=indexCols, subsetCols=subsetCols, dropCloser=15, dropNans=True)

### d. Load reference "with optimisation" analyses results from file

**Warning** : reference =
* analyses results computed through [valarchives.ipynb / VII. Truncation optimisation (short code and fast run) / 6A. Or : Really run analyses](./valarchives.ipynb#VII.-Truncation-optimisation-(short-code-and-fast-run))
* with [IV. 0. Optanalyser parameters](#0.-Optanalyser-parameters) exactly the same
* using variant 'main' for [3. Samples and analyses to optimise]

In [None]:
# Load the optimised reference (analysis results with truncation parameters computed through optimisation).

# Step 1: clone the actual results set, but _without_ its data.
rsOptRef = optResults.copy(withData=False)

# Step 2: fill the clone with the reference data from file.
#   The OptimTrunc column is given a default (NaN) in case the file lacks it: the source file may have been
#   built with an MCDSAnalysisResultsSet rather than an MCDSTruncOptanalysisResultsSet (the actual class
#   of the results), in which case postCompute would otherwise fail => dDefMissingCols argument.
rsOptRef.fromExcel('tmp/mcds-anaftopt/valarc-mcds-analyser-afteropt-main-results.xlsx',
                   dDefMissingCols={('header (tail)', 'OptimTrunc', 'Value'): np.nan})

### e. Compare "with optimisation" analysis results to "reference" ones

In [None]:
# Sort key: analysis abbreviation first, then left truncation distance, right truncation distance
# and model fitting cut points, so that the rows of each analysis optimisation parameter specs
# come out in the same order in the actual and reference results.
miSortCols = [('header (tail)', 'AbrevAnlys', 'Value')] \
             + [('parameters', parName, 'Value')
                for parName in ('left truncation distance', 'right truncation distance',
                                'model fitting distance cut points')]

# Actual results first, then reference ones.
for rsToSort in (rsOptRes, rsOptRef):
    rsToSort.sortRows(by=miSortCols)

In [None]:
# Get both results sets as data frames with a simple columns index ('fr' translation),
# and give each a "sorted analyses" index column: 0, 1, 2 ... following the current row order.
miAnlysNumCol = 'NumAnlys'

dfOptRes = rsOptRes.dfTransData('fr')
dfOptRes[miAnlysNumCol] = list(range(len(dfOptRes)))

dfOptRef = rsOptRef.dfTransData('fr')
dfOptRef[miAnlysNumCol] = list(range(len(dfOptRef)))

In [None]:
# Check that row order is "compatible" between reference and actual results:
# both must list the same analysis abbreviations, in the same order
# (the comparison below matches rows on the sorted analyses index + this abbreviation).
miAnlysAbrevCol = 'AbrevAnlys'

assert dfOptRes[miAnlysAbrevCol].to_list() == dfOptRef[miAnlysAbrevCol].to_list()

In [None]:
# Save to disk for visual checks / comparison
#dfOptRes.to_excel('tmp/opt-res-fr.xlsx')
#dfOptRef.to_excel('tmp/opt-ref-fr.xlsx')

In [None]:
# Compare a small and simple subset of analyses results columns ...
# Rows are matched on the sorted analyses index and the analysis abbreviation.
indexCols = [miAnlysNumCol, miAnlysAbrevCol]
subsetCols = ['AIC', 'PDetec', 'EDR/ESW', 'Densité']

dfDiff = ads.DataSet.compareDataFrames(dfOptRes, dfOptRef, indexCols=indexCols, subsetCols=subsetCols, dropNans=True)

# Display the comparison result.
dfDiff

In [None]:
# Some diff. stats: per-column min, max and mean of the comparison result.
# Infinite values are replaced by 16 before averaging, so that the mean stays finite
# (NOTE(review): 16 presumably stands for "identical at double precision" - to be confirmed).
dfDiffStats = pd.DataFrame(data=[dfDiff.min(), dfDiff.max(), dfDiff.replace(np.inf, 16).mean()],
                           index=['min', 'max', 'mean'])
dfDiffStats

In [None]:
# Worst (smallest) column-wise mean of the comparison stats.
minMeanDiff = dfDiffStats.loc['mean'].min()

# Not too bad if less than 10% mean difference (100 / 10**1 = 10%) !
assert minMeanDiff >= 1.0

# And actually at most P % difference : let's compute P ...
p = 100 / 10**minMeanDiff

assert p < 10, f'Oh oh ... {p=} >= 10 %'

print(f'Good: {p=:.2f} < 10 %')

In [None]:
# Save to disk after "merging" ref and actual results, again for visual checks.

# Tag each row with its origin, as a first column
# (warning: in-place insertion, fails if this cell is run twice on the same data frames).
dfOptRef.insert(0, 'x', 'ref')
dfOptRes.insert(0, 'x', 'res')

# Stack reference rows, then actual ones: pd.concat replaces DataFrame.append,
# which was deprecated in pandas 1.4 and removed in pandas 2.0 (same result, same sort=False).
dfOptComp = pd.concat([dfOptRef, dfOptRes], sort=False)

# Put the 'ref' and 'res' rows of each analysis next to each other.
dfOptComp.sort_values(by=['NumAnlys', 'x'], inplace=True)

dfOptComp.to_excel('tmp/opt-comp.xlsx')

### f. Some history of computations difference stats with various 'maxIters' values

In [None]:
# Keep stats for history ... copy/paste the printed results into the markdown cell below:
# a title line with the maxIters value in use and the resulting "max delta" percentage,
# then the stats table in markdown format ('N=?' is to be filled in by hand).
print('**maxIters={} (N=?): max delta = {:.2f} %**'.format(defCoreMaxIters, 100 / 10**dfDiffStats.loc['mean'].min()))
print()
print(dfDiffStats.to_markdown())

In [None]:
%%html
<!-- Float every table to the left. -->
<style>
table {float:left}
</style>

**maxIter=100 (core i7 10850H, N=1, last on 2021-11-28) : max delta = 7.15 %**

|      |     AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|--------:|---------:|----------:|----------:|
| min  | 1.2     |  0.3     |   0.5     |   0.3     |
| max  | 4.9     |  3.4     |   3.7     |   3.4     |
| mean | 1.70455 |  1.14545 |   1.51364 |   1.22727 |

**maxIter=100 (core i5 8365U, N=8, last on 2021-11-20) : max delta = 3.6 %, 3.7 %, 6.44 %, 5.01 %, 5.12 %, 3.55 %, 5.28 %**

|      |     AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|--------:|---------:|----------:|----------:|
| min  | 1.1     |  0.2     |   0.4     |   0.2     |
| max  | 6.3     |  4.6     |   4.9     |   4.6     |
| mean | 2.06818 |  1.29091 |   1.85455 |   1.54545 |
    
|      |       AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|----------:|---------:|----------:|----------:|
| min  |   1       |  0.2     |       0.6 |   0.4     |
| max  | inf       |  4.9     |     inf   | inf       |
| mean |   2.82273 |  1.44545 |       2.7 |   2.40455 |

|      |       AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|----------:|---------:|----------:|----------:|
| min  |   1.1     |  0.3     |   0.5     |   0.3     |
| max  | inf       |  4.5     |   5.4     |   4.5     |
| mean |   2.63182 |  1.49091 |   2.03182 |   1.43182 |

|      |     AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|--------:|---------:|----------:|----------:|
| min  | 1       |  0.2     |   0.5     |   0.3     |
| max  | 5.7     |  4.3     |   4.6     |   4.3     |
| mean | 1.92273 |  1.19091 |   1.80455 |   1.50455 |

|      |     AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|--------:|---------:|----------:|----------:|
| min  | 1       |      0.2 |   0.8     |   0.6     |
| max  | 4.6     |      3.4 |   3.7     |   3.4     |
| mean | 1.80909 |      1.3 |   1.73636 |   1.53182 |

|      |       AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|----------:|---------:|----------:|----------:|
| min  |   1.2     |     0.2  |   0.9     |   0.6     |
| max  | inf       |     4.1  | inf       | inf       |
| mean |   2.64091 |     1.45 |   2.55455 |   2.28182 |

|      |     AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|--------:|---------:|----------:|----------:|
| min  | 1.1     |  0.4     |       0.8 |   0.6     |
| max  | 4.1     |  2.7     |       3   |   2.4     |
| mean | 1.76364 |  1.27727 |       1.7 |   1.38182 |

**maxIter=120 (N=3) : max delta = 6.1 %, 1.6 %, 1.7 %**

|Exec1 |       AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|----------:|---------:|----------:|----------:|
| min  |   1.1     |  0.2     |   0.9     |   0.6     |
| max  | inf       |  5.1     | inf       |   6.5     |
| mean |   2.37273 |  1.21364 |   2.15909 |   1.47273 |

|Exec2 |       AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|----------:|---------:|----------:|----------:|
| min  |   1.1     |  0.2     |   0.8     |   0.6     |
| max  | inf       |  5       | inf       | inf       |
| mean |   3.15455 |  1.79545 |   2.82273 |   2.47273 |

|Exec3 |     AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|--------:|---------:|----------:|----------:|
| min  | 1.1     |  0.3     |   0.6     |   0.4     |
| max  | 6.6     |  4.9     |   5.2     |   4.9     |
| mean | 2.57727 |  1.76818 |   2.21364 |   1.92273 |

**maxIter=250 (N=3) : max delta = 0.83 %, 3.4 %, 0.53 %**

|Exec1 |       AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|----------:|---------:|----------:|----------:|
| min  |   1.1     |  0.4     |   0.8     |   0.6     |
| max  | inf       |  5.9     | inf       | inf       |
| mean |   4.39545 |  2.08182 |   2.95455 |   2.68636 |

|Exec2 |     AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|--------:|---------:|----------:|----------:|
| min  | 1       |  0.4     |   0.5     |      0.3  |
| max  | 6.7     |  5.4     |   5.7     |      5.5  |
| mean | 2.18636 |  1.46818 |   1.82273 |      1.55 |

|Exec3 |       AIC |    PDetec |   EDR/ESW |   Densité |
|:-----|----------:|----------:|----------:|----------:|
| min  |   1       |   0.3     |   0.9     |   0.6     |
| max  | inf       | inf       | inf       | inf       |
| mean |   3.76818 |   2.27727 |   3.50909 |   3.24091 |

**maxIters=400 (N=4): max delta = 2.6 %, 2.9 %, 1.9 %, 1.8 %**

|Exec1 |       AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|----------:|---------:|----------:|----------:|
| min  |   1.1     |  0.1     |   0.5     |   0.3     |
| max  | inf       |  6.7     | inf       |   6.4     |
| mean |   3.03182 |  1.57727 |   2.65455 |   1.89091 |

|Exec2 |       AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|----------:|---------:|----------:|----------:|
| min  |   1.1     |  0.3     |   0.9     |   0.6     |
| max  | inf       |  4.3     |   5       |   4.7     |
| mean |   2.79091 |  1.54091 |   2.08182 |   1.80909 |

|Exec3 |       AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|----------:|---------:|----------:|----------:|
| min  |   1.1     |  0.2     |   0.5     |   0.3     |
| max  | inf       |  6.7     | inf       |  15.9     |
| mean |   3.40455 |  1.71818 |   2.46364 |   2.24545 |

|Exec4 |       AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|----------:|---------:|----------:|----------:|
| min  |   1.1     |  0.2     |      0.8  |   0.6     |
| max  |   6.7     |  4.9     |      5.2  |   4.9     |
| mean |   2.66818 |  1.74091 |      2.45 |   2.18636 |

## 6. Generate HTML and Excel opt-analyses reports

Note: The exact same reports can also be produced through the command line:
```
$ cd .. 
$ python -m pyaudisam -p tests/valtests-ds-params.py -w tests/tmp/mcds-optanlr -n --optreports excel,html:mqua-r92 -u
```

In [None]:
# Short alias for the results set: the column lists below reference its column label attributes (R.CL...).
R = optResults

In [None]:
# Super-synthesis sub-report : selected analysis results columns for the 3 textual columns of the table.

# 1st column: sample identification, then total number and min / max distance of observations.
sampleCols = [('header (head)', 'NumEchant', 'Value')] \
             + [('header (sample)', sampName, 'Value') for sampName in ('Espèce', 'Passage', 'Adulte', 'Durée')] \
             + [R.CLNTotObs, R.CLMinObsDist, R.CLMaxObsDist]

# 2nd column: analysis identification and parameters.
paramCols = [('header (head)', 'NumAnlys', 'Value'),
             R.CLParEstKeyFn,
             R.CLParEstAdjSer,
             R.CLParTruncLeft,
             R.CLParTruncRight,
             R.CLParModFitDistCuts]

# 3rd column: run status and analysis results.
resultCols = [R.CLRunStatus]
resultCols += [R.CLNObs, R.CLEffort, R.CLSightRate, R.CLNAdjPars]
resultCols += [R.CLAic, R.CLChi2, R.CLKS, R.CLDCv]
resultCols += [R.CLCmbQuaBal1, R.CLCmbQuaBal2, R.CLCmbQuaBal3]
resultCols += [R.CLEswEdr, R.CLPDetec]
resultCols += [R.CLDensity, R.CLDensityMin, R.CLDensityMax]
resultCols += [R.CLNumber, R.CLNumberMin, R.CLNumberMax]

In [None]:
# Synthesis sub-report : selected analysis results columns for the table, built group after group.

# Sample identification.
synthCols = [('header (head)', 'NumEchant', 'Value')]
synthCols += [('header (sample)', sampName, 'Value') for sampName in ('Espèce', 'Passage', 'Adulte', 'Durée')]

# Analysis identification and parameters (R.CLParEstSelCrit and R.CLParEstCVInt deliberately left out).
synthCols += [('header (head)', 'NumAnlys', 'Value'),
              R.CLParEstKeyFn, R.CLParEstAdjSer,
              R.CLParTruncLeft, R.CLParTruncRight, R.CLParModFitDistCuts]

# Numbers of observations and parameters, effort, and fit / quality indicators.
synthCols += [R.CLNTotObs, R.CLNObs, R.CLNTotPars, R.CLEffort,
              R.CLDeltaAic, R.CLChi2, R.CLKS, R.CLCvMUw, R.CLCvMCw, R.CLDCv,
              R.CLSightRate,
              R.CLCmbQuaBal1, R.CLCmbQuaBal2, R.CLCmbQuaBal3,
              R.CLCmbQuaChi2, R.CLCmbQuaKS, R.CLCmbQuaDCv]

# Estimates, each with its min and max.
synthCols += [R.CLPDetec, R.CLPDetecMin, R.CLPDetecMax,
              R.CLDensity, R.CLDensityMin, R.CLDensityMax,
              R.CLNumber, R.CLNumberMin, R.CLNumberMax]

# Group-level orders (R.CLGrpOrdClTrChi2 deliberately left out).
synthCols += [R.CLGrpOrdSmTrAic,
              R.CLGrpOrdClTrChi2KSDCv,
              R.CLGrpOrdClTrDCv,
              R.CLGrpOrdClTrQuaBal1, R.CLGrpOrdClTrQuaBal2, R.CLGrpOrdClTrQuaBal3, R.CLGrpOrdClTrQuaChi2,
              R.CLGrpOrdClTrQuaKS, R.CLGrpOrdClTrQuaDCv]

# Global orders.
synthCols += [R.CLGblOrdChi2KSDCv, R.CLGblOrdQuaBal1, R.CLGblOrdQuaBal2, R.CLGblOrdQuaBal3,
              R.CLGblOrdQuaChi2, R.CLGblOrdQuaKS, R.CLGblOrdQuaDCv,
              R.CLGblOrdDAicChi2KSDCv]

In [None]:
# Filter and sort sub-reports : schemes to apply.

# Quality indicator used for the final ordering, and its "ascending" flag.
whichFinalQua = R.CLCmbQuaBal3
ascFinalQua = False

# Order columns given to the "ExCAicMulQua" schemes as whichBestQua.
whichBestQua = [R.CLGrpOrdClTrChi2KSDCv, R.CLGrpOrdClTrDCv, whichFinalQua,
                R.CLGrpOrdClTrQuaChi2, R.CLGrpOrdClTrQuaKS, R.CLGrpOrdClTrQuaDCv]

# De-duplication parameters: columns to consider, and per-column rounding.
dupSubset = [R.CLNObs, R.CLEffort,
             R.CLDeltaAic, R.CLChi2, R.CLKS, R.CLCvMUw, R.CLCvMCw, R.CLDCv,
             R.CLPDetec, R.CLPDetecMin, R.CLPDetecMax,
             R.CLDensity, R.CLDensityMin, R.CLDensityMax]
dDupRounds = {R.CLDeltaAic: 1,
              R.CLChi2: 2, R.CLKS: 2, R.CLCvMUw: 2, R.CLCvMCw: 2, R.CLDCv: 2,
              R.CLPDetec: 3, R.CLPDetecMin: 3, R.CLPDetecMax: 3,
              R.CLDensity: 2, R.CLDensityMin: 2, R.CLDensityMax: 2}

# 1st scheme: "ExecCode" method (the only one with preselNum=4).
filSorSchemes = [dict(method=ads.MCDSTruncOptanalysisResultsSet.filterSortOnExecCode,
                      deduplicate=dict(dupSubset=dupSubset, dDupRounds=dDupRounds),
                      filterSort=dict(whichFinalQua=whichFinalQua, ascFinalQua=ascFinalQua),
                      preselCols=[R.CLCmbQuaBal1, R.CLCmbQuaBal2, R.CLCmbQuaBal3],
                      preselAscs=False, preselThrhs=0.2, preselNum=4)]

# Next 4 schemes: "ExCAicMulQua" method, identical except for the 4 filterSort parameters
# listed in the table at the end.
filSorSchemes += [dict(method=ads.MCDSTruncOptanalysisResultsSet.filterSortOnExCAicMulQua,
                       deduplicate=dict(dupSubset=dupSubset, dDupRounds=dDupRounds),
                       filterSort=dict(sightRate=sightRate, nBestAIC=nBestAIC, nBestQua=nBestQua,
                                       whichBestQua=whichBestQua, nFinalRes=nFinalRes,
                                       whichFinalQua=whichFinalQua, ascFinalQua=ascFinalQua),
                       preselCols=[R.CLCmbQuaBal1, R.CLCmbQuaBal2, R.CLCmbQuaBal3],
                       preselAscs=False, preselThrhs=0.2, preselNum=3)
                  for sightRate, nBestAIC, nBestQua, nFinalRes in [(90, 4, 2, 15),
                                                                   (92, 3, 2, 12),
                                                                   (94, 2, 1, 10),
                                                                   (96, 2, 1, 8)]]

In [None]:
# Super-synthesis, synthesis and detail tables, HTML or Excel : sort parameters.
# Sort by sample number (ascending), then by the final quality indicator (descending).
sortCols = [('header (head)', 'NumEchant', 'Value'), whichFinalQua]
sortAscend = [True, False]

In [None]:
# Build the filter & sort report object for the opt-analyses results, from the column lists,
# sort parameters and filter & sort schemes defined in the previous cells.
# NOTE(review): '{fsId}' in the description is a plain-string placeholder (no f-string here),
#   presumably substituted by the report itself - to be confirmed.
# NOTE(review): 'filtering et sorting' looks like a leftover French 'et' (= 'and') ; left as is,
#   as the command-line generated report is supposed to be the exact same one.
afsReport = ads.MCDSResultsFilterSortReport(resultsSet=optResults,
                                            title="PyAuDiSam Validation : Analyses with optimised truncations",
                                            subTitle="Auto-selection of best results",
                                            description='Automated filtering et sorting : method "{fsId}" ; after '
                                                        'easy and parallel run through MCDSTruncationOptAnalyser',
                                            anlysSubTitle='Analyses details',
                                            lang='en', keywords='pyaudisam, validation, optimisation',
                                            superSynthPlotsHeight=280, plotImgSize=(512, 280),
                                            sampleCols=sampleCols, paramCols=paramCols,
                                            resultCols=resultCols, synthCols=synthCols,
                                            sortCols=sortCols, sortAscend=sortAscend,
                                            filSorSchemes=filSorSchemes,
                                            tgtFolder=workDir,
                                            tgtPrefix='valtests-optanalyses-report')

In [None]:
# Manual helper: point xlsxBkpAFSRep to a specific (dated) backup file name, then display both report paths.
# Warning: needs xlsxBkpAFSRep and xlsxAFSRep to exist already, i.e. the "XLSX report" cell to have been run.
# NOTE(review): the new name carries no '.xlsx' extension - to be confirmed.
xlsxBkpAFSRep = xlsxBkpAFSRep.with_name('valtests-optanalyses-report.220102')
xlsxBkpAFSRep, xlsxAFSRep

In [None]:
# XLSX report: generate it and keep its path.
xlsxAFSRep = afsReport.toExcel()

# Keep the result of backup() for the comparison to the command-line generated report, later in this notebook.
xlsxBkpAFSRep = backup(xlsxAFSRep)

# Display a clickable link to the generated report.
HTML(f'Auto-filtered Excel report: <a href="{xlsxAFSRep}" target="blank">{xlsxAFSRep}</a>')

In [None]:
# Open the Excel report with its associated desktop application.
# os.startfile only exists on Windows: on other platforms, simply tell so rather than fail.
if hasattr(os, 'startfile'):
    os.startfile(xlsxAFSRep)
else:
    print(f'Not opening {xlsxAFSRep}: os.startfile is only available on Windows')

In [None]:
# Select scheme to apply for HTML report: the first "ExCAicMulQua" one with sightRate=92.
# The method test must come first: the "ExecCode" scheme has no 'sightRate' key in its filterSort dict,
# and is only protected from a KeyError by the short-circuiting `and`.
# Note: next() raises StopIteration if no scheme matches.
scheme = next(sch for sch in filSorSchemes
              if sch['method'] is ads.MCDSTruncOptanalysisResultsSet.filterSortOnExCAicMulQua
                 and sch['filterSort']['sightRate'] == 92)
# Show the ids of the known schemes, and the id of the selected one.
print(optResults.filSorIdMgr.dFilSorSchemes.keys(), '\n=>', optResults.filSorSchemeId(scheme))

In [None]:
# HTML report, for the selected filter and sort scheme only.
htmlAFSRep = afsReport.toHtml(filSorScheme=scheme, rebuild=False)

backup(htmlAFSRep)

# Print the id of the applied scheme and the report location, as a file URI.
afsId = optResults.filSorSchemeId(scheme)
print(f'Auto-filtered HTML report({afsId} scheme): ' + pl.Path(htmlAFSRep).resolve().as_uri())

## 7. Compare command-line and notebook Excel opt-analyses reports

(cd .. && python -m pyaudisam -p tests/valtests-ds-params.py -w tests/tmp/mcds-optanlr -n --optreports excel -u)

In [None]:
# Display the latest analysis start time found in each report (notebook one, command-line one).
# Warning: needs ddfNbOptReport and ddfClOptReport, that are only loaded in the next cell.
ddfNbOptReport['Details']['StartTime'].max(), ddfClOptReport['Details']['StartTime'].max()

In [None]:
# 1. Generate notebook report (through 6. right above)
# => xlsxBkpAFSRep

# 2. Generate command-line report (through an external console with relevant python env activated)
# $ cd .. && python -m pyaudisam -p tests/valtests-ds-params.py -w tests/tmp/mcds-optanlr -n --optreports excel -u
# => xlsxAFSRep

# 3. Load the 2 reports (all sheets of each workbook, as a dict of data frames keyed by sheet name)
ddfNbOptReport = pd.read_excel(xlsxBkpAFSRep, sheet_name=None, index_col=0)  # Notebook (backup) one
ddfClOptReport = pd.read_excel(xlsxAFSRep, sheet_name=None, index_col=0)  # Command-line one

# 4. Check that 2 was really run ... i.e. that the latest analysis start time of the command-line report
#    is more than 1 second after the one of the notebook report.
assert (ddfClOptReport['Details']['StartTime'].max() - ddfNbOptReport['Details']['StartTime'].max()).total_seconds() > 1, \
       'Please run above given command line first: you are actually comparing notebook report to itself !'

# 5. Compare Synthesis and Details sheets (disabled: see error message at the end)
# assert ddfNbOptReport['Synthesis'].drop(columns=['RunFolder']).set_index('NumAnlys') \
#         .compare(ddfClOptReport['Synthesis'].drop(columns=['RunFolder']).set_index('NumAnlys')) \
#         .empty
# assert ddfNbOptReport['Details'].drop(columns=['StartTime', 'ElapsedTime', 'RunFolder']).set_index('NumAnlys') \
#         .compare(ddfClOptReport['Details'].drop(columns=['StartTime', 'ElapsedTime', 'RunFolder']).set_index('NumAnlys')) \
#         .empty

# 6. Compare auto-filtered sheets (disabled: see error message at the end)
# NOTE(review): if ever re-enabled, startswith('') matches every sheet name, and the loop body
#   indexes the reports with afsMeths (the whole list) instead of afsMeth.
#afsMeths = [sn for sn in ddfNbOptReport if sn.startswith('')]
#assert set(afsMeths) == set([sn for sn in ddfClOptReport if sn.startswith('')])
#for afsMeth in afsMeths:
#    assert ddfNbOptReport[afsMeths].drop(columns=['StartTime', 'ElapsedTime', 'RunFolder']).set_index('NumAnlys') \
#            .compare(ddfClOptReport[afsMeths].drop(columns=['StartTime', 'ElapsedTime', 'RunFolder']).set_index('NumAnlys')) \
#            .empty
#    
#logger.info('Success !')

# Comparisons 5 and 6 are disabled, for the reason logged here.
logger.error('This can\'t work, optimisations rarely give twice the same results ...')

## 8. Simple checks on opt-analyses reports

(accurate non regression tests are not simple to achieve ... here we keep it simple, but not very accurate)

In [None]:
#xlsxAFSRep = 'tmp/mcds-optanlr/valtests-optanalyses-report.xlsx'

In [None]:
# Load all the sheets of the last generated Excel report (dict of data frames, keyed by sheet name).
ddfAfsXlsxReport = pd.read_excel(xlsxAFSRep, sheet_name=None)

# Number of rows of each sheet with the 'AFSM-' name prefix, keyed by the sheet name without this prefix.
snPrfx = 'AFSM-'
{sn[len(snPrfx):]: len(dfSheet) for sn, dfSheet in ddfAfsXlsxReport.items() if sn.startswith(snPrfx)}

In [None]:
# Display the names of all the sheets loaded from the report.
ddfAfsXlsxReport.keys()

In [None]:
# Check expected approximate number of results for each filter and sort method.
tolerance = 3  # +/-
KExpectedNbOfResults = {'ExCode': 70, 'ExAicMQua-r900m6q3d15': 51, 'ExAicMQua-r920m6q3d12': 48,
                        'ExAicMQua-r940m6q3d10': 43, 'ExAicMQua-r960m6q3d8': 34}  # OK on 2021-11-05 & 06

# Every expected sub-report must be there, as a sheet named after the prefix + the method id.
# (the keys of KExpectedNbOfResults carry no prefix: filtering them on the prefix, as done before,
#  left nothing to check, and the assertion could never fail)
assert all(snPrfx + fsId in ddfAfsXlsxReport for fsId in KExpectedNbOfResults), 'Missing subreports'

# Every sub-report sheet must hold the expected number of rows, give or take the tolerance.
assert all(KExpectedNbOfResults[sn[len(snPrfx):]] - tolerance <= len(ddfAfsXlsxReport[sn])
                                                              <= KExpectedNbOfResults[sn[len(snPrfx):]] + tolerance
           for sn in ddfAfsXlsxReport if sn.startswith(snPrfx)), 'Too much difference for some sub-reports'

In [None]:
# Check some results: work on the 'Details' sheet of the last generated report
# (no copy made: the column added below goes into the data frame held by ddfAfsXlsxReport).
dfAfsXlsxRes = ddfAfsXlsxReport['Details']

dfAfsXlsxRes['UniqueId'] = range(len(dfAfsXlsxRes))  # Unique row Id, useful for some comparisons

# Display the number of rows.
len(dfAfsXlsxRes)

In [None]:
# Display the sample and analysis numbers with their left and right truncation group columns.
dfAfsXlsxRes[['NumEchant', 'NumAnlys', 'Group Left Trunc', 'Group Right Trunc']]

In [None]:
# Load all the sheets of the reference Excel report (dict of data frames, keyed by sheet name).
# NOTE(review): this file name reads 'optanalysis', whereas the report generated by this notebook
#   uses the 'valtests-optanalyses-report' prefix - to be confirmed.
xlsxAFSRefRep = 'tmp/valtests-optanalysis-report.220102.xlsx'
ddfAfsXlsxRefReport = pd.read_excel(xlsxAFSRefRep, sheet_name=None)

# Number of rows of each sheet with the 'AFSM-' name prefix, keyed by the sheet name without this prefix.
snPrfx = 'AFSM-'
{sn[len(snPrfx):]: len(dfSheet) for sn, dfSheet in ddfAfsXlsxRefReport.items() if sn.startswith(snPrfx)}

In [None]:
# Reference report: check the approximate number of results for each filter and sort method.
tolerance = 3  # +/-
KExpectedNbOfResults = {'ExCode': 70,
                        'ExAicMQua-r900m6q3d15': 51,
                        'ExAicMQua-r920m6q3d12': 48,
                        'ExAicMQua-r940m6q3d10': 43,
                        'ExAicMQua-r960m6q3d8': 34}  # OK on 2021-11-05 & 06

# Each prefixed sheet must hold the expected number of rows, give or take the tolerance.
assert all(abs(len(dfSubRep) - KExpectedNbOfResults[sn[len(snPrfx):]]) <= tolerance
           for sn, dfSubRep in ddfAfsXlsxRefReport.items() if sn.startswith(snPrfx))

In [None]:
# Check some results: work on the 'Details' sheet of the reference report
# (no copy made: the column added below goes into the data frame held by ddfAfsXlsxRefReport).
dfAfsXlsxRefRes = ddfAfsXlsxRefReport['Details']

dfAfsXlsxRefRes['UniqueId'] = range(len(dfAfsXlsxRefRes))  # Unique row Id, useful for some comparisons

# Display the number of rows.
len(dfAfsXlsxRefRes)

In [None]:
# Truncation groups: sample and analysis numbers and truncation group columns
# must be exactly the same in the reference and actual 'Details' sheets.
truncGrpCols = ['NumEchant', 'NumAnlys', 'Group Left Trunc', 'Group Right Trunc']

dfComp = dfAfsXlsxRefRes[truncGrpCols].compare(dfAfsXlsxRes[truncGrpCols])

assert dfComp.empty  # Don't even think of it ...

In [None]:
# Quality indicators: all the columns of the reference 'Details' sheet whose name starts with 'Qual '.
quaCols = [col for col in dfAfsXlsxRefRes.columns if col.startswith('Qual ')]

# Compare reference to actual values, with rows matched on the unique row Id added to both sheets.
dfRelDiff = \
    ads.DataSet.compareDataFrames(dfAfsXlsxRefRes, dfAfsXlsxRes, indexCols=['UniqueId'], subsetCols=quaCols, dropCloser=14)

assert dfRelDiff.empty  # Don't even think of it ...

# Sandbox