<!-- Auto table of contents -->
<h1 class='tocIgnore'>Development and unit tests</h1>

**pyaudisam**: Automation of Distance Sampling analyses with [Distance software](http://distancesampling.org/)

Copyright (C) 2021 Jean-Philippe Meuret

This program is free software: you can redistribute it and/or modify it under the terms
of the GNU General Public License as published by the Free Software Foundation,
either version 3 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program.
If not, see https://www.gnu.org/licenses/.

<div style="overflow-y: auto">
  <h2 class='tocIgnore'>Table of contents</h2>
  <div id="toc"></div>
</div>

In [None]:
%%javascript
$.getScript('ipython_notebook_toc.js')

In [None]:
%matplotlib inline

In [None]:
import sys
import copy
import os
import pathlib as pl

import re

import concurrent.futures as cofu

import math
import numpy as np
import pandas as pd

from tqdm import tqdm

from IPython.display import HTML

In [None]:
# Activate Warnings as Exception
#import warnings
#warnings.filterwarnings('error')

# Commons

In [None]:
# Short identification string for a sample.
def sampleAbbrev(sSample):
    """Build a short identification string for a sample.

    :param sSample: sample description, giving access to 'Espèce' and 'Durée' by key,
                    and to Passage and Adulte as attributes (e.g. a pandas Series)
    :return: '<species>-<passage>-<adult>-<duration>', where <species> is made of the 4 first
             letters (capitalised) of each of the 2 first words of 'Espèce',
             and where '+' characters are removed from Passage and Adulte
    """
    speciesWords = sSample['Espèce'].split(' ')[:2]
    speciesPart = ''.join(word[:4].title() for word in speciesWords)

    passagePart = sSample.Passage.replace('+', '')
    adultPart = sSample.Adulte.replace('+', '')

    return f"{speciesPart}-{passagePart}-{adultPart}-{sSample['Durée']}"

# Short identification string for an analysis.
def analysisAbbrev(sAnlys):
    """Build a short identification string for an analysis.

    :param sAnlys: analysis description (indexed like a pandas Series), holding the sample
                   description fields, 'FonctionClé', 'SérieAjust', and optionally
                   the truncation / cut points parameters
    :return: the sample abbreviation, followed by the 3 first letters (lower case) of the key function
             and of the adjustment series, then by one item per non-null truncation parameter
             ('l', 'r', 'm', 'd' prefix + first letter if a string, or integer value otherwise),
             all joined with '-'
    """
    def pickColumn(*candidates):
        # First candidate name present in the index, or else the last one.
        for candidate in candidates[:-1]:
            if candidate in sAnlys.index:
                return candidate
        return candidates[-1]

    # Sample, then model abbreviations
    parts = [sampleAbbrev(sAnlys)]
    parts.extend([sAnlys['FonctionClé'][:3].lower(), sAnlys['SérieAjust'][:3].lower()])

    # Truncation parameters abbreviations (several column names supported for some of them)
    truncCols = [('l', pickColumn('TrGche', 'TroncGche')),
                 ('r', pickColumn('TrDrte', 'TroncDrte')),
                 ('m', pickColumn('NbTrches', 'NbTrModel', 'NbTrchMod')),
                 ('d', pickColumn('NbTrDiscr'))]
    for prefix, colName in truncCols:
        if colName not in sAnlys.index:
            continue
        value = sAnlys[colName]
        if pd.isnull(value):
            continue
        suffix = value[0].lower() if isinstance(value, str) else int(value)
        parts.append(f'{prefix}{suffix}')

    return '-'.join(parts)

# I. Unit tests

## 0. MCDS.exe detection

In [None]:
sys.path.insert(0, '..')

In [None]:
import pyaudisam as ads

ads.runtime

In [None]:
# Create temporary directory if not yet done.
tmpDir = pl.Path('tmp')
tmpDir.mkdir(exist_ok=True)

In [None]:
# Logging configuration: to the standard output and to a log file in the temporary folder,
# with a specific level for each listed logger.
loggerLevels = [('matplotlib', ads.WARNING),
                ('ads', ads.INFO),
                ('ads.dat', ads.INFO),
                ('ads.eng', ads.INFO2),
                ('ads.opn', ads.INFO1),
                ('ads.opr', ads.INFO1),
                ('ads.onr', ads.DEBUG),
                ('ads.anr', ads.DEBUG1)]

ads.log.configure(handlers=[sys.stdout, tmpDir / 'unintst.log'], reset=True,
                  loggers=[dict(name=lgName, level=lgLevel) for lgName, lgLevel in loggerLevels])

# Logger for this notebook.
logger = ads.logger('unintst', level=ads.DEBUG)

[14. MCDS(Opt)AnalysisResultsSet](#14.-MCDS(Opt)AnalysisResultsSet)

## 1. DataSet class

In [None]:
# Finish preparing import data set
dfPapAlaArv = pd.read_excel('refin/ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.ods')

dfPapAlaArv.to_csv('tmp/ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.csv', sep='\t', index=False)
dfPapAlaArv.to_excel('tmp/ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.xls', index=False)  # Need for deprecated module xlwt !

In [None]:
# DataSet from multiple sources from various formats (same columns)
# => ctor, _csv2df, _fromDataFrame, _fromDataFile, _addComputedColumns, addColumns, renameColumns
sources = ['refin/ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.ods',   # Need for module odfpy
           'refin/ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.xlsx',   # Need for module openpyxl (or xlrd)
           'tmp/ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.xls',  # No need for module xlwt (openpyxl seems to just do it)
           'tmp/ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.csv', dfPapAlaArv]

def male2bool(s):
    """Tell whether the MALE field of the given row is 'oui' (case insensitive).

    :param s: data row giving access to the MALE field as an attribute (e.g. a pandas Series)
    :return: False if MALE is null or anything else than 'oui', True otherwise
    """
    male = s.MALE
    if pd.isnull(male):
        return False
    return male.lower() == 'oui'

ds = ads.DataSet(sources, importDecFields=['EFFORT', 'DISTANCE', 'NOMBRE'],
                 dRenameCols={'NOMBRE': 'INDIVIDUS'}, dComputeCols={'MALE': male2bool},
                 sheet='Sheet1', skipRows=None, separator='\t')

In [None]:
# => ctor, _csv2df, _fromDataFrame, _fromDataFile, __len__, columns, empty
assert not ds.empty

assert len(ds) == len(dfPapAlaArv) * len(sources)

assert sorted(ds.columns) == sorted(['ZONE', 'HA', 'POINT', 'ESPECE', 'DISTANCE', 'MALE', 'DATE',
                                     'OBSERVATEUR', 'PASSAGE', 'INDIVIDUS', 'EFFORT'])

dTypes = {'ZONE': 'object', 'HA': 'int', 'POINT': 'int', 'ESPECE': 'object',
          'DISTANCE': 'float', 'MALE': 'bool', 'DATE': 'object', 'OBSERVATEUR': 'object',
          'PASSAGE': 'object', 'INDIVIDUS': 'float', 'EFFORT': 'int'}
assert all(typ.name.startswith(dTypes[col]) for col, typ in ds.dfData.dtypes.items())

In [None]:
# => dfData
ds.dfData

In [None]:
# => dfSubData, __len__, columns
df = ds.dfSubData(columns=['POINT', 'ESPECE', 'DISTANCE', 'INDIVIDUS', 'EFFORT'])

assert len(df) == len(dfPapAlaArv) * len(sources)
assert df.columns.to_list() == ['POINT', 'ESPECE', 'DISTANCE', 'INDIVIDUS', 'EFFORT']

df

In [None]:
# => dfSubData, __len__, columns
df = ds.dfSubData(columns=['POINT', 'ESPECE', 'DISTANCE', 'INDIVIDUS', 'EFFORT'], index=range(1, 300, 3))

assert len(df) == 100
assert df.columns.to_list() == ['POINT', 'ESPECE', 'DISTANCE', 'INDIVIDUS', 'EFFORT']
assert df.index.to_list() == list(range(1, 300, 3))

df

In [None]:
# ctor, _csv2df, _fromDataFrame, _fromDataFile, dfData
assert ds.dfData.MALE.value_counts()[True] == ds.dfData.INDIVIDUS.sum() == dfPapAlaArv.NOMBRE.sum() * len(sources)

In [None]:
# => dropColumns, columns, __len__
ds.dropColumns(['ZONE', 'HA', 'OBSERVATEUR'])

assert len(ds) == len(dfPapAlaArv) * len(sources)
assert ds.columns.to_list() == ['POINT', 'ESPECE', 'DISTANCE', 'MALE', 'DATE', 'PASSAGE', 'INDIVIDUS', 'EFFORT']

In [None]:
# => dropRows, dfData, __len__
ds.dropRows(ds.dfData.DISTANCE.isnull())

assert len(ds) == len(dfPapAlaArv[dfPapAlaArv.DISTANCE.notnull()]) * len(sources)
assert ds.dfData.MALE.value_counts()[True] == ds.dfData.INDIVIDUS.sum() == dfPapAlaArv.NOMBRE.sum() * len(sources)

In [None]:
# => toExcel, toOpenDoc, toPickle, compareDataFrames
# Export a subset of the data set to each supported file format, reload the file with pandas,
# and check that the reloaded data is close enough to the source one.
closenessThreshold = 15  # => max relative delta = 1e-15
subsetCols = ['POINT', 'ESPECE', 'DISTANCE', 'INDIVIDUS', 'EFFORT']
filePathName = tmpDir / 'dataset-uni.ods'
dfRef = ds.dfSubData(columns=subsetCols).reset_index(drop=True)

for fpn in [filePathName, filePathName.with_suffix('.xlsx'), filePathName.with_suffix('.xls'),
            filePathName.with_suffix('.pickle'), filePathName.with_suffix('.pickle.xz')]:
    
    print(fpn.as_posix(), end=' : ')

    # Export, through the method matching the target file format
    # (for '.pickle.xz', the last suffix is '.xz').
    if fpn.suffix == '.ods':
        ds.toOpenDoc(fpn, sheetName='utest', subset=subsetCols, index=False)
    elif fpn.suffix in ['.xlsx', '.xls']:
        ds.toExcel(fpn, sheetName='utest', subset=subsetCols, index=False)
    elif fpn.suffix in ['.pickle', '.xz']:
        ds.toPickle(fpn, subset=subsetCols, index=False)
    assert fpn.is_file()

    # Reload
    if fpn.suffix in ['.ods', '.xlsx', '.xls']:
        df = pd.read_excel(fpn, sheet_name='utest')
    elif fpn.suffix in ['.pickle', '.xz']:
        df = pd.read_pickle(fpn)
        df.reset_index(drop=True, inplace=True)

    # Compare the numerical columns, row by row (the row number being used as the index column).
    assert ds.compareDataFrames(df.reset_index(), dfRef.reset_index(),
                                subsetCols=['POINT', 'DISTANCE', 'INDIVIDUS', 'EFFORT'],
                                indexCols=['index'], dropCloser=closenessThreshold, dropNans=True).empty

    # Note the trailing space in 'not ', for a readable 'not empty' in the message.
    print('1e-{} comparison OK (df.equals(dfRef) is {}, df.compare(dfRef) {}empty)'
          .format(closenessThreshold, df.equals(dfRef), '' if df.compare(dfRef).empty else 'not '))

In [None]:
# Base function for comparison (test from static hard-coded data, not from loaded DataSets)
# => _closeness
values = [np.nan, -np.inf,
          -1.0e12, -1.0e5, -1.0-1e-5, -1.0, -1.0+1e-5, -1.0e-8,
          0.0, 1.0e-8, 1.0, 1.0e5, 1.0e12, np.inf]
nValues = len(values)

# Closeness of every couple of values (row value, column value)
aClose = np.ndarray(shape=(nValues, nValues))
for r, rowVal in enumerate(values):
    for c, colVal in enumerate(values):
        try:
            aClose[r, c] = ds._closeness(pd.Series([rowVal, colVal]))
        except Exception as exc:
            print(exc, r, c, rowVal, colVal)
            raise

# Infinite closeness on the diagonal (except for nan and +/-inf)
assert all(np.isnan(val) or np.isinf(val) or np.isinf(aClose[i, i]) for i, val in enumerate(values)), \
       'Error: Inequality on the diagonal'

# No infinite closeness anywhere else
assert all(row == col or not np.isinf(aClose[row, col]) for row in range(nValues) for col in range(nValues)), \
       'Error: No equality should be found outside the diagonal'

# Good closeness only around -1
whereClose = [i for i, val in enumerate(values) if abs(val + 1) <= 1.0e-5]
assert all(aClose[row, col] > 4 for row in whereClose for col in whereClose), \
       'Error: Unexpectedly bad closeness around -1'

pd.DataFrame(data=aClose, index=values, columns=values)

In [None]:
# Comparison (from other files data sources, the same as for ResultsSet.compare below, but through DataSet)
# => compare, compareDataFrames, _toHashable, _closeness

# a. Load the Distance 7 reference and the values to compare, obtained through pyaudisam
compSource = 'refin/ACDC2019-Papyrus-ALAARV-TURMER-comp-dist-auto.ods'

dsDist = ads.DataSet(compSource, sheet='RefDist73', skipRows=[3], headerRows=[0, 1, 2], indexCols=0)

dsAuto = ads.DataSet(compSource, sheet='ActAuto', skipRows=[3], headerRows=[0, 1, 2], indexCols=0)

# b. Index columns for the comparison
indexCols = [('sample', col, 'Value') for col in ['AnlysNum', 'Species', 'Periods', 'Prec.', 'Duration']]
indexCols.append(('model', 'Model', 'Value'))
indexCols.extend(('parameters', col, 'Value')
                 for col in ['left truncation distance', 'right truncation distance',
                             'model fitting distance cut points', 'distance discretisation cut points'])

# c. Columns to compare (Delta Cv and Delta AIC are removed because they depend on the sets of run analyses,
#    which differ between the reference and the automated run ; the run time is also removed).
excludedCols = indexCols + [('run output', 'run time', 'Value'),
                            ('density/abundance', 'density of animals', 'Delta Cv'),
                            ('detection probability', 'Delta AIC', 'Value')]
subsetCols = [col for col in dsDist.dfData.columns.to_list() if col not in excludedCols]

# d. "Exact" comparison: no row passes it (mostly epsilons due to ODS I/O)
dfRelDiff = dsDist.compare(dsAuto, subsetCols=subsetCols, indexCols=indexCols)
assert len(dfRelDiff) == len(dsDist)

# e. Comparison with a 10**-16 tolerance: almost all rows pass it, except 3 (mostly epsilons due to ODS I/O).
dfRelDiff = dsDist.compare(dsAuto, subsetCols=subsetCols, indexCols=indexCols, dropCloser=16, dropNans=True)
assert len(dfRelDiff) == 3

# f. Comparison with a 10**-5 tolerance: only 2 rows still fail.
dfRelDiff = dsDist.compare(dsAuto, subsetCols=subsetCols, indexCols=indexCols, dropCloser=5, dropNans=True)
assert len(dfRelDiff) == 2

dfRelDiff

## 2. SampleDataSet class (and base DataSet)

Note: Self-contained: nothing needs to be run beforehand (except section 0)

In [None]:
# Excel source (path as simple string)
sds = ads.SampleDataSet(source='refin/ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.xlsx',
                        decimalFields=['EFFORT', 'DISTANCE', 'NOMBRE'])

assert sds.columns.to_list() == ['ZONE', 'HA', 'POINT', 'ESPECE', 'DISTANCE', 'MALE', 'DATE',
                                 'OBSERVATEUR', 'PASSAGE', 'NOMBRE', 'EFFORT']
assert len(sds) == 256
assert sds.dfData.NOMBRE.sum() == 217

sds.dfData.head()

In [None]:
# Libre / Open Office source (path as simple string)
sds = ads.SampleDataSet(source='refin/ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.ods',
                        decimalFields=['EFFORT', 'DISTANCE', 'NOMBRE'])

assert sds.columns.to_list() == ['ZONE', 'HA', 'POINT', 'ESPECE', 'DISTANCE', 'MALE', 'DATE',
                                 'OBSERVATEUR', 'PASSAGE', 'NOMBRE', 'EFFORT']
assert len(sds) == 256
assert sds.dfData.NOMBRE.sum() == 217

sds.dfData.head()

In [None]:
sds.columns

In [None]:
# CSV source with ',' as decimal point (path as pl.Path)
sds = ads.SampleDataSet(source=pl.Path('refin/ACDC2019-Papyrus-TURMER-AB-5mn-1dec-dist.txt'),
                        decimalFields=['Point transect*Survey effort', 'Observation*Radial distance'])

assert not any(sds.dfData[col].dropna().apply(lambda v: isinstance(v, str)).any() for col in sds.decimalFields), \
       'Error: Some strings found in declared decimal fields ... any decimal format issue ?'

assert sds.columns.to_list() == ['Region*Label', 'Region*Area', 'Point transect*Label',
                                 'Point transect*Survey effort', 'Observation*Radial distance']
assert len(sds) == 330
assert sds.dfData['Observation*Radial distance'].notnull().sum() == 324

sds.dfData.head()

In [None]:
# CSV source with '.' as decimal point
sds = ads.SampleDataSet(source=pl.Path('refin/ACDC2019-Papyrus-ALAARV-AB-10mn-1dotdec-dist.txt'),
                       decimalFields=['Point transect*Survey effort', 'Observation*Radial distance'])

assert not any(sds.dfData[col].dropna().apply(lambda v: isinstance(v, str)).any() for col in sds.decimalFields), \
       'Error: Some strings found in declared decimal fields ... any decimal format issue ?'

assert sds.columns.to_list() == ['Region*Label', 'Region*Area', 'Point transect*Label',
                                 'Point transect*Survey effort', 'Observation*Radial distance']
assert len(sds) == 256
assert sds.dfData['Observation*Radial distance'].notnull().sum() == 217

sds.dfData.head()

In [None]:
# DataFrame source.
dfData = pd.DataFrame(columns=['Date', 'TrucDec', 'Espece', 'Point', 'Effort', 'Distance'],
                      data=[('2019-05-13', 3.5, 'TURMER', 23, 2,   83),
                            ('2019-05-15', np.nan, 'TURMER', 23, 2,   27.355),
                            ('2019-05-13', 0, 'ALAARV', 29, 2,   56.85),
                            ('2019-04-03', 1.325, 'PRUMOD', 53, 1.3,  7.2),
                            ('2019-06-01', 2, 'PHICOL', 12, 1,  np.nan),
                            ('2019-06-19', np.nan, 'PHICOL', 17, 0.5, np.nan),
                           ])
dfData['Region'] = 'ACDC'
dfData['Surface'] = '2400'

sds = ads.SampleDataSet(source=dfData, decimalFields=['Effort', 'Distance', 'TrucDec'])

assert not any(sds.dfData[col].dropna().apply(lambda v: isinstance(v, str)).any() for col in sds.decimalFields), \
       'Error: Some strings found in declared decimal fields ... any decimal format issue ?'

assert sds.columns.equals(dfData.columns)
assert len(sds) == len(dfData)
assert sds.dfData.Distance.notnull().sum() == 4

sds.dfData

## 3. XXEngine classes

Note: Self-contained: nothing needs to be run beforehand (except section 0)

### a. Instance creation and loading of MCDS.exe output stat. specs

In [None]:
# The engine is expected to reject this work folder through an AssertionError
# (presumably because of the space in its name - TODO confirm) ; if it does not, the test must fail
# (raised from the 'else' clause, so that it is not caught by the 'except' one).
try:
    eng = ads.MCDSEngine(workDir='tmp/test out') # Simple string path
except AssertionError as exc:
    print('Good forbidden chars detection:', exc)
else:
    raise AssertionError('Error: Should have raised an AssertionError !')

In [None]:
# Same check with a pathlib path: the engine is expected to reject this work folder through
# an AssertionError ; if it does not, the test must fail
# (raised from the 'else' clause, so that it is not caught by the 'except' one).
try:
    eng = ads.MCDSEngine(workDir=tmpDir / 'test out') # pl.Path path
except AssertionError as exc:
    print('Good forbidden chars detection:', exc)
else:
    raise AssertionError('Error: Should have raised an AssertionError !')

In [None]:
# The old run method.
eng = ads.MCDSEngine(workDir=tmpDir / 'mcds-out', runMethod='os.system')

In [None]:
runDir = eng.setupRunFolder(runPrefix='uni') # Unit tests

### b. Generate input data file for MCDS

In [None]:
# A short dataset.
dfData = pd.DataFrame(columns=['Date', 'TrucDec', 'Espece', 'Point', 'Effort', 'Distance'],
                      data=[('2019-05-13', 3.5, 'TURMER', 23, 2,   83),
                            ('2019-05-15', np.nan, 'TURMER', 23, 2,   27.355),
                            ('2019-05-13', 0, 'ALAARV', 29, 2,   56.85),
                            ('2019-04-03', 1.325, 'PRUMOD', 53, 1.3,  7.2),
                            ('2019-06-01', 2, 'PHICOL', 12, 1,  np.nan),
                            ('2019-06-19', np.nan, 'PHICOL', 17, 0.5, np.nan),
                           ])
dfData['Region'] = 'ACDC'
dfData['Surface'] = '2400'

sds = ads.SampleDataSet(source=dfData, decimalFields=['Effort', 'Distance', 'TrucDec'])

sds.dfData

In [None]:
dataFileName = eng.buildDataFile(sampleDataSet=sds, runDir=runDir)

### c. Compute sample stats for MCDS

In [None]:
sSmpStats = eng.computeSampleStats(sds)
sSmpStats

In [None]:
assert all(sSmpStats.index == eng.MIStatSampCols)
assert all(sSmpStats.values == [4, 7.2, 83.0])

### d. Generate input command file for MCDS

In [None]:
cmdFileName = eng.buildCmdFile(estimKeyFn='HNORMAL', estimAdjustFn='COSINE', estimCriterion='AIC', cvInterval=95,
                               runDir=runDir)

### e. Low level analysis execution (_run)

In [None]:
# Debug mode
runStatus, startTime, elapsedTime = \
    eng._run(eng.ExeFilePathName, cmdFileName, forReal=False, method=eng.runMethod)

dict(runStatus=runStatus, startTime=startTime, elapsedTime=elapsedTime)

In [None]:
# Real mode
runStatus, startTime, engElapsedTime = \
    eng._run(eng.ExeFilePathName, cmdFileName, forReal=True, method=eng.runMethod)

In [None]:
# Real mode
runStatus, startTime, engElapsedTime = \
    eng._run(eng.ExeFilePathName, cmdFileName, forReal=True, method=eng.runMethod)

In [None]:
# Timeout
runStatus, startTime, engElapsedTime = \
    eng._run(eng.ExeFilePathName, cmdFileName, forReal=True, method='subprocess.run', timeOut=0.01)

In [None]:
eng = ads.MCDSEngine(workDir=tmpDir / 'mcds-out')

In [None]:
%%timeit -r 5 -n 10

# Performance measures : method='os.system', Windows 10, 4-core i5-8350U, PCI-e SSD, "optimal performances" power scheme
# 2021-01-06: 132 ms ± 1.47 ms per loop (mean ± std. dev. of 5 runs, 10 loops each)
# 2021-10-02: 134 ms ± 6.31 ms per loop (mean ± std. dev. of 5 runs, 10 loops each)
runStatus, startTime, engElapsedTime = \
    eng._run(eng.ExeFilePathName, cmdFileName, forReal=True, method='os.system')

In [None]:
%%timeit -r 5 -n 10

# Performance measures : method='subprocess.run', Windows 10, 4-core i5-8350U, PCI-e SSD, "optimal performances" power scheme
# 2021-01-06: 191 ms ± 3.75 ms per loop (mean ± std. dev. of 5 runs, 10 loops each) => os.system faster by 60-75ms
# 2021-10-02: 211 ms ± 19.7 ms per loop (mean ± std. dev. of 5 runs, 10 loops each) => os.system faster by ~80ms
runStatus, startTime, engElapsedTime = \
    eng._run(eng.ExeFilePathName, cmdFileName, forReal=True, method='subprocess.run')

**Conclusion**

Windows 10, 4-core i5-8350U, PCI-e SSD, "optimal performances" power scheme:
* 2021-01-06 : os.system systematically faster by 60-75ms
* 2021-10-02 : os.system systematically faster by ~80ms (128 ms ± 2.49 ms _vs_ 207 ms ± 13.5 ms)

### f. High level analysis execution  (via executor), debug mode

(generate cmd and data input files, but no call to executable)

In [None]:
# A real life (reduced) dataset
sds = ads.SampleDataSet(source=pl.Path('refin') / 'ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.xlsx',
                        decimalFields=['EFFORT', 'DISTANCE', 'NOMBRE'])

In [None]:
# Asynchronous model, even if no parallelism involved : submitAnalysis() returns a "future" object
# (see module concurrent)
futRun = eng.submitAnalysis(sds, realRun=False, runPrefix='int',
                            estimKeyFn='UNIFORM', estimAdjustFn='POLY',
                            estimCriterion='AIC', cvInterval=95)

In [None]:
# Get run output from future object
runCode, startTime, elapsedTime, runDir, sResults = futRun.result()

assert runCode == ads.MCDSEngine.RCNotRun, 'Should have NOT run (run code = 0)'

dict(runCode=runCode, runDir=runDir, startTime=startTime, elapsedTime=elapsedTime, sResults=sResults)

In [None]:
runCode, startTime, elapsedTime, runDir, sResults = \
    eng.submitAnalysis(sds, realRun=False, runPrefix='int',
                       estimKeyFn='UNIFORM', estimAdjustFn='POLY', estimCriterion='AIC', cvInterval=95).result()

assert runCode == ads.MCDSEngine.RCNotRun, 'Should have NOT run (run code = 0)'

dict(runCode=runCode, runDir=runDir, startTime=startTime, elapsedTime=elapsedTime, sResults=sResults)

### g. High level analysis execution  (via executor), real mode

In [None]:
# Messages of the run check assertions, for each run code / status that a check may expect.
KRunCheckErrMsg = {ads.MCDSEngine.RCOK: 'Oh, oh, should have run smoothly and successfully !',
                   ads.MCDSEngine.RCWarnings: 'Oh, oh, should have run smoothly (even if with warnings) !',
                   ads.MCDSEngine.RCErrors: 'Oh, oh, should have run with errors !',
                   ads.MCDSEngine.RCTimedOut: 'Oh, oh, should have timed-out !'}

def checkEngineAnalysisRun(sampleDataSet, estimKeyFn='UNIFORM', estimAdjustFn='POLY', estimCriterion='AIC', cvInterval=95,
                           minDist=None, maxDist=None, fitDistCuts=None, discrDistCuts=None,
                           runMethod='os.system', timeOut=None, expectRunCode=ads.MCDSEngine.RCOK):
    """Run for real an analysis through a new MCDS engine, and check the obtained run code.

    :param sampleDataSet: the sample data set to analyse
    :param estimKeyFn, estimAdjustFn, estimCriterion, cvInterval: model options, passed to submitAnalysis
    :param minDist, maxDist, fitDistCuts, discrDistCuts: distance truncation and cut points options,
                                                         passed to submitAnalysis
    :param runMethod: engine run method ('os.system' or 'subprocess.run' in this notebook)
    :param timeOut: time limit (s) for the analysis, None for no limit
    :param expectRunCode: the run code the analysis is expected to end with
    :return: (runCode, startTime, elapsedTime, runDir, sResults) as returned by the analysis future,
             or (eng.RCTimedOut, <submission time>, timeOut, None, None) if waiting for the result timed out
    :raise AssertionError: if the obtained run code is not the expected one
    """
    # Need for an async. executor for time limit checking with os.system run method.
    exor = None if runMethod != 'os.system' or timeOut is None else ads.Executor(threads=1)

    eng = None
    try:
        # Engine
        eng = ads.MCDSEngine(executor=exor, workDir=tmpDir / 'mcds-out',
                             runMethod=runMethod, timeOut=timeOut)

        # Run analysis (the submission time is the start time reported in case of cofu.TimeoutError)
        submitTime = pd.Timestamp.now()
        fut = eng.submitAnalysis(sampleDataSet, realRun=True, runPrefix='int',
                                 estimKeyFn=estimKeyFn, estimAdjustFn=estimAdjustFn,
                                 estimCriterion=estimCriterion, cvInterval=cvInterval,
                                 minDist=minDist, maxDist=maxDist,
                                 fitDistCuts=fitDistCuts, discrDistCuts=discrDistCuts)

        # Get results
        try:
            runCode, startTime, elapsedTime, runDir, sResults = fut.result(timeout=timeOut)
        except cofu.TimeoutError:
            logger.info('MCDS Analysis run timed-out after {}s'.format(timeOut))
            runCode, startTime, elapsedTime, runDir, sResults = \
                eng.RCTimedOut, submitTime, timeOut, None, None

        # Check status
        assert runCode == expectRunCode, \
               KRunCheckErrMsg.get(expectRunCode, 'Oh, oh, unexpected expected run code ;-)')

    finally:
        # Done: shutdown the engine and the executor, even when the run or the check failed.
        if eng is not None:
            eng.shutdown()
        if exor is not None:
            exor.shutdown()

    return runCode, startTime, elapsedTime, runDir, sResults

In [None]:
# No time limit
runCode, startTime, elapsedTime, runDir, sResults = \
    checkEngineAnalysisRun(sds, estimKeyFn='NEXPON', estimAdjustFn='COSINE', estimCriterion='AIC', cvInterval=95,
                           runMethod='os.system', timeOut=None, expectRunCode=ads.MCDSEngine.RCWarnings)

runCode, startTime, elapsedTime, runDir, sResults

In [None]:
# Some time limit, but too long to stop analysis.
runCode, startTime, elapsedTime, runDir, sResults = \
    checkEngineAnalysisRun(sds, estimKeyFn='HNORMAL', estimAdjustFn='COSINE', estimCriterion='AIC', cvInterval=95,
                           runMethod='os.system', timeOut=3, expectRunCode=ads.MCDSEngine.RCWarnings)

runCode, startTime, elapsedTime, runDir, sResults

In [None]:
# Too short time limit => analysis time-out (but MCDS goes on to its end : no kill done by executor)
runCode, startTime, elapsedTime, runDir, sResults = \
    checkEngineAnalysisRun(sds, estimKeyFn='UNIFORM', estimAdjustFn='POLY', estimCriterion='AIC', cvInterval=95,
                           runMethod='os.system', timeOut=0.1, expectRunCode=ads.MCDSEngine.RCTimedOut)

logger.info('Look: MCDS was not killed, it has gone to its end, whereas the analysis has timed-out')

runCode, startTime, elapsedTime, runDir, sResults

In [None]:
# No time limit
runCode, startTime, elapsedTime, runDir, sResults = \
    checkEngineAnalysisRun(sds, estimKeyFn='NEXPON', estimAdjustFn='COSINE', estimCriterion='AIC', cvInterval=95,
                           runMethod='subprocess.run', timeOut=None, expectRunCode=ads.MCDSEngine.RCWarnings)

runCode, startTime, elapsedTime, runDir, sResults

In [None]:
# Some time limit, but too long to stop analysis.
runCode, startTime, elapsedTime, runDir, sResults = \
    checkEngineAnalysisRun(sds, estimKeyFn='HNORMAL', estimAdjustFn='POLY', estimCriterion='AIC', cvInterval=95,
                           runMethod='subprocess.run', timeOut=3, expectRunCode=ads.MCDSEngine.RCErrors)

runCode, startTime, elapsedTime, runDir, sResults

In [None]:
# Too short time limit => analysis time-out (here, as stated by the log message below, MCDS is expected
# to have been killed on time-out ; NOTE(review): presumably by the 'subprocess.run' run method - confirm)
runCode, startTime, elapsedTime, runDir, sResults = \
    checkEngineAnalysisRun(sds, estimKeyFn='UNIFORM', estimAdjustFn='POLY', estimCriterion='AIC', cvInterval=95,
                           runMethod='subprocess.run', timeOut=0.1, expectRunCode=ads.MCDSEngine.RCTimedOut)

logger.info('Look: MCDS was actually killed on time-out')

runCode, startTime, elapsedTime, runDir, sResults

### h. Generate input data files for interactive Distance software

('point transect' mode only, for now)

In [None]:
tgtDir = pl.Path(eng.workDir, 'distance-in')
tgtDir.mkdir(exist_ok=True)

In [None]:
# Case 1: Point transect with radial distance, no extra fields, no clustering.
distDataFileName = \
    eng.buildDistanceDataFile(sds, tgtFilePathName=tgtDir / 'import-data-noextra.txt')

In [None]:
# Point transect with radial distance, with extra fields, no clustering.
distDataFileName = \
    eng.buildDistanceDataFile(sds, tgtFilePathName=tgtDir / 'import-data-withextra.txt',
                              withExtraFields=True)

In [None]:
eng.shutdown()

In [None]:
# Case 2: Point transect with radial distance, no extra fields, with clustering.
eng = ads.MCDSEngine(workDir=tmpDir / 'mcds-out', clustering=True)

# Add cluster data to the data set
dfData['Nombre'] = [1, 2, 1, 1, 2, 3]
sds = ads.SampleDataSet(source=dfData, decimalFields=['Effort', 'Distance', 'TrucDec'])

# Generate distance file
tgtDir = pl.Path(eng.workDir, 'distance-in')
tgtDir.mkdir(exist_ok=True)

distDataFileName = \
    eng.buildDistanceDataFile(sds, tgtFilePathName=tgtDir / 'import-data-clusters.txt')

In [None]:
eng.shutdown()

## 4. MCDSAnalysis class

Note: Self-contained: nothing needs to be run beforehand (except section 0)

In [None]:
def checkAnalysisRun(sampleDataSet, estimKeyFn='UNIFORM', estimAdjustFn='POLY', estimCriterion='AIC', cvInterval=95,
                     minDist=None, maxDist=None, fitDistCuts=None, discrDistCuts=None,
                     runMethod='os.system', timeOut=None, expectStatus=ads.MCDSEngine.RCOK):
    """Run an MCDSAnalysis through a new MCDS engine, and check the obtained run status.

    :param sampleDataSet: the sample data set to analyse
    :param estimKeyFn, estimAdjustFn, estimCriterion, cvInterval: model options, passed to the analysis
    :param minDist, maxDist, fitDistCuts, discrDistCuts: distance truncation and cut points options,
                                                         passed to the analysis
    :param runMethod: engine run method ('os.system' or 'subprocess.run' in this notebook)
    :param timeOut: time limit (s) for the analysis, None for no limit
    :param expectStatus: the run status the analysis is expected to end with
    :return: the analysis results, as returned by MCDSAnalysis.getResults()
    :raise AssertionError: if the obtained run status is not the expected one
    """
    # Need for a parallel executor for time limit checking with os.system run method.
    exor = None if runMethod != 'os.system' or timeOut is None else ads.Executor(threads=1)

    eng = None
    try:
        # Engine
        eng = ads.MCDSEngine(executor=exor, workDir=tmpDir / 'mcds-out',
                             runMethod=runMethod, timeOut=timeOut)

        # Analysis, on the given data set and with the given options
        # (not on some global data set, neither with hard-coded options).
        anlys = ads.MCDSAnalysis(engine=eng, sampleDataSet=sampleDataSet, name='anlys', logData=True,
                                 estimKeyFn=estimKeyFn, estimAdjustFn=estimAdjustFn,
                                 estimCriterion=estimCriterion, cvInterval=cvInterval,
                                 minDist=minDist, maxDist=maxDist,
                                 fitDistCuts=fitDistCuts, discrDistCuts=discrDistCuts)

        # Run
        anlys.submit()

        # Get result
        sResult = anlys.getResults()

        # Check status
        sts = sResult[('run output', 'run status', 'Value')]
        assert sts == expectStatus, KRunCheckErrMsg.get(expectStatus, 'Oh, oh, unexpected expected status ;-)')

    finally:
        # Done: shutdown the engine and the executor, even when the run or the check failed.
        if eng is not None:
            eng.shutdown()
        if exor is not None:
            exor.shutdown()

    return sResult

### a. Dataset to work with ...

In [None]:
# A real life (reduced) dataset
sds = ads.SampleDataSet(source=pl.Path('refin') / 'ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.xlsx',
                        decimalFields=['EFFORT', 'DISTANCE', 'NOMBRE'])

sds.dfData.head()

### b. Engine 'os.system' RunMethod and run time limit management

In [None]:
# No time limit
sResult = checkAnalysisRun(sds, estimKeyFn='UNIFORM', estimAdjustFn='POLY', estimCriterion='AIC', cvInterval=95,
                           minDist=None, maxDist=None, fitDistCuts=None, discrDistCuts=None,
                           runMethod='os.system', timeOut=None, expectStatus=ads.MCDSEngine.RCWarnings)

sResult

In [None]:
# Some time limit, but too long to stop analysis.
sResult = checkAnalysisRun(sds, estimKeyFn='UNIFORM', estimAdjustFn='POLY', estimCriterion='AIC', cvInterval=95,
                           minDist=None, maxDist=None, fitDistCuts=None, discrDistCuts=None,
                           runMethod='os.system', timeOut=5, expectStatus=ads.MCDSEngine.RCWarnings)

sResult

In [None]:
# Too short time limit => analysis time-out
sResult = checkAnalysisRun(sds, estimKeyFn='UNIFORM', estimAdjustFn='POLY', estimCriterion='AIC', cvInterval=95,
                           minDist=None, maxDist=None, fitDistCuts=None, discrDistCuts=None,
                           runMethod='os.system', timeOut=0.01, expectStatus=ads.MCDSEngine.RCTimedOut)

sResult

### c. Engine 'subprocess.run' RunMethod and run time limit management

In [None]:
# No time limit ('subprocess.run' run method, as announced by the section title)
sResult = checkAnalysisRun(sds, estimKeyFn='UNIFORM', estimAdjustFn='POLY', estimCriterion='AIC', cvInterval=95,
                           minDist=None, maxDist=None, fitDistCuts=None, discrDistCuts=None,
                           runMethod='subprocess.run', timeOut=None, expectStatus=ads.MCDSEngine.RCWarnings)

sResult

In [None]:
# Some time limit, but too long to stop analysis ('subprocess.run' run method, as announced by the section title).
sResult = checkAnalysisRun(sds, estimKeyFn='UNIFORM', estimAdjustFn='POLY', estimCriterion='AIC', cvInterval=95,
                           minDist=None, maxDist=None, fitDistCuts=None, discrDistCuts=None,
                           runMethod='subprocess.run', timeOut=5, expectStatus=ads.MCDSEngine.RCWarnings)

sResult

In [None]:
# Too short time limit => analysis time-out ('subprocess.run' run method, as announced by the section title)
sResult = checkAnalysisRun(sds, estimKeyFn='UNIFORM', estimAdjustFn='POLY', estimCriterion='AIC', cvInterval=95,
                           minDist=None, maxDist=None, fitDistCuts=None, discrDistCuts=None,
                           runMethod='subprocess.run', timeOut=0.01, expectStatus=ads.MCDSEngine.RCTimedOut)

sResult

### d. Performance tests

In [None]:
# RunMethod='subprocess.run'
eng = ads.MCDSEngine(workDir=tmpDir / 'mcds-out', runMethod='subprocess.run')

In [None]:
%%timeit -r 5 -n 10

# 2020-01-06: 347 ms ± 8.71 ms per loop (mean ± std. dev. of 5 runs, 10 loops each)
# 2021-10-02: 326 ms ± 2.71 ms per loop (mean ± std. dev. of 5 runs, 10 loops each)
runCode, startTime, elapsedTime, runDir, sRes = \
    eng.submitAnalysis(sds, realRun=True, runPrefix='int',
                       estimKeyFn='UNIFORM', estimAdjustFn='POLY', estimCriterion='AIC', cvInterval=95).result()

In [None]:
eng.shutdown()

In [None]:
# RunMethod='os.system'
eng = ads.MCDSEngine(workDir=tmpDir / 'mcds-out', runMethod='os.system')

In [None]:
%%timeit -r 5 -n 10

# 2020-01-06: 272 ms ± 7.57 ms per loop (mean ± std. dev. of 5 runs, 10 loops each)
# 2021-10-02: 268 ms ± 20.4 ms per loop (mean ± std. dev. of 5 runs, 10 loops each)
runCode, startTime, elapsedTime, runDir, sRes = \
    eng.submitAnalysis(sds, realRun=True, runPrefix='int',
                       estimKeyFn='UNIFORM', estimAdjustFn='POLY', estimCriterion='AIC', cvInterval=95).result()

In [None]:
eng.shutdown()

## 5. AnalysisResultsSet and ResultsSet classes

Note: Self-contained: nothing needs to be run beforehand (except section 0)

### a. ResultsSet class with specialised postComputeColumns

In [None]:
# A specialized results set for the tests = with extra. post-computed columns : Delta AIC
class TestAnalysisResultsSet(ads.analyser.AnalysisResultsSet):
    """Analysis results set used by the tests below.

    Adds one post-computed column to the base results set: Delta AIC, i.e. the AIC value
    of each row minus the smallest AIC value found in the group of rows it belongs to.
    """

    def __init__(self, miCustomCols=None, dfCustomColTrans=None,
                       dComputedCols=None, dfComputedColTrans=None):
        """Pass the column specifications on to the base class, with ads.MCDSAnalysis as first argument."""
        super().__init__(ads.MCDSAnalysis, miCustomCols, dfCustomColTrans, dComputedCols, dfComputedColTrans)

    def postComputeColumns(self):
        """Add the Delta AIC column to self._dfData.

        Rows are grouped by the species, periods, duration and precision columns;
        Delta AIC is the row's AIC value minus the minimum AIC value of its group.
        """
        # Source column, target column, and columns defining the groups.
        aicCol = ('detection probability', 'AIC value', 'Value')
        deltaAicCol = ('detection probability', 'Delta AIC', 'Value')
        groupCols = [('sample', 'species', 'Value'), ('sample', 'periods', 'Value'),
                     ('sample', 'duration', 'Value'), ('variant', 'precision', 'Value')]

        # Minimum AIC of each group, labelled right away with the target column name,
        # so that the join below creates the target column (holding the group minimum for now).
        dfGroupMinAic = self._dfData.groupby(groupCols)[[aicCol]].min()
        dfGroupMinAic.columns = pd.MultiIndex.from_tuples([deltaAicCol])

        # Bring the group minimum next to each row, then turn it into the actual delta.
        self._dfData = self._dfData.join(dfGroupMinAic, on=groupCols)
        self._dfData[deltaAicCol] = self._dfData[aicCol] - self._dfData[deltaAicCol]

# Results object construction
miCustCols = pd.MultiIndex.from_tuples([('id', 'index', 'Value'),
                                        ('sample', 'species', 'Value'),
                                        ('sample', 'periods', 'Value'),
                                        ('sample', 'duration', 'Value'),
                                        ('variant', 'precision', 'Value')])
dfCustColTrans = \
    pd.DataFrame(index=miCustCols,
                 data=dict(en=['index', 'species', 'periods', 'duration', 'precision'],
                           fr=['numéro', 'espèce', 'périodes', 'durée', 'précision']))

dCompCols = {('detection probability', 'Delta AIC', 'Value'): 
             len(ads.MCDSEngine.statSampCols()) + len(ads.MCDSAnalysis.MIRunColumns) + 11} # Right before AIC
dfCompColTrans = \
    pd.DataFrame(index=dCompCols.keys(),
                 data=dict(en=['Delta AIC'], fr=['Delta AIC']))

In [None]:
rs = TestAnalysisResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans,
                            dComputedCols=dCompCols, dfComputedColTrans=dfCompColTrans)

### b. Some getters

In [None]:
# empty
assert rs.empty

In [None]:
# len
assert len(rs) == 0

In [None]:
# index
assert len(rs.index) == 0
assert rs.index.to_list() == []

In [None]:
# columns
assert len(rs.columns) == 0

### c. Append result rows

In [None]:
# append
sHead = pd.Series(index=miCustCols, data=list(range(len(miCustCols))))

miResCols = ads.MCDSEngine.statSampCols().append(ads.MCDSAnalysis.MIRunColumns).append(ads.MCDSEngine.statModCols())

sResult = pd.Series(index=miResCols, data=list(range(len(miResCols)))) # Fictive data, never mind !
rs.append(sResult, sCustomHead=sHead)

sResult = pd.Series(index=miResCols, data=list(range(1, len(miResCols) + 1))) # Fictive data, never mind !
rs.append(sResult, sCustomHead=sHead)

sResult = pd.Series(index=miResCols, data=list(range(2, len(miResCols) + 2))) # Fictive data, never mind !
rs.append(sResult, sCustomHead=sHead)

### d. Some getters again

In [None]:
# dfRawData (no post-computed columns)
dfRaw = rs.dfRawData
dfRaw

In [None]:
# columns (Beware: rs.columns does trigger computation of ... computed columns !)
assert len(rs._dfData.columns) == len(dfRaw.columns) and len(dfRaw.columns) == 113
rawCols = rs._dfData.columns.to_list()
rawCols

In [None]:
# columns
assert len(rs.columns) == 114  # The proof here !
postCols = rs.columns.to_list()
postCols

In [None]:
# Check added == compute column
assert set(rs.columns.to_list()) - set(dfRaw.columns.to_list()) == { ('detection probability', 'Delta AIC', 'Value') }

In [None]:
# dfData (post-computations already done, never mind)
dfPost = rs.dfData
dfPost

In [None]:
# index
assert len(rs.index) == 3
assert rs.index.to_list() == [0, 1, 2]

### e. Getters: dfSubData

In [None]:
columns = [('id', 'index', 'Value'), ('sample', 'species', 'Value'),
           ('sample', 'periods', 'Value'), ('sample', 'duration', 'Value'),
           ('detection probability', 'Delta AIC', 'Value')]
index = [0, 2]

dfSub = rs.dfSubData(index=index, columns=columns)
assert len(dfSub) == 2
assert dfSub.index.to_list() == index
assert dfSub.columns.to_list() == columns
dfSub

### f. Getters: Translation

In [None]:
# dfTransData
dfTrans = rs.dfTransData('fr')
assert len(dfPost.columns) == len(dfTrans.columns)
dfTrans

In [None]:
dfTrans.columns.to_list()

In [None]:
dfTrSub = rs.dfTransData('en', index=index, columns=columns)
assert len(dfTrSub) == 2
assert dfTrSub.index.to_list() == index
assert dfTrSub.columns.to_list() == ['index', 'species', 'periods', 'duration', 'Delta AIC']
dfTrSub

### g. Specs management

In [None]:
rs.updateSpecs(d=dict(a=1, b=2), df=pd.DataFrame([dict(a=3, b=4), dict(a=7, b=9, v=90)]), reset=True)

In [None]:
rs.updateSpecs(l=[9, -9], s=pd.Series(dict(e=3, f=5), name='serie'))
rs.specs

In [None]:
# Overwriting an already existing spec without overwrite=True must be refused:
# an AssertionError is expected from updateSpecs here.
# The failure is raised from the `else` clause on purpose: raised inside the `try` body,
# it would be caught by the `except AssertionError` below, and this check could never fail.
try:
    rs.updateSpecs(l=[8, -8, 0])
except AssertionError:
    print('Good: Refused to overwrite existing spec if not explicitly authorised to')
else:
    raise AssertionError("Error: Should have refused to overwite already existing 'l'")

assert rs.specs['l'] == [9, -9]

rs.specs

In [None]:
rs.updateSpecs(**dict(l = [7, -7, 77]), overwrite=True)

print('Good: Accepted to overwrite existing spec if explicitly authorised to')

assert rs.specs['l'] == [7, -7, 77]

rs.specs

### h. Imports and exports

#### i. Exports (with specs)

(see imports tests below for exported content checks)

In [None]:
rs.toExcel('tmp/results-set-uni.xlsx', sheetName='utest')

In [None]:
rs.toExcel('tmp/results-set-uni.xls', sheetName='utest')

In [None]:
rs.toOpenDoc('tmp/results-set-uni.ods', sheetName='utest')

In [None]:
rs.toPickle('tmp/results-set-uni.pickle.xz')

In [None]:
rs.toPickle('tmp/results-set-uni.pickle')

#### ii. Imports with explicit format (with specs)

In [None]:
# A. XLSX Format
rs1 = TestAnalysisResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans,
                             dComputedCols=dCompCols, dfComputedColTrans=dfCompColTrans)

rs1.fromExcel('tmp/results-set-uni.xlsx', sheetName='utest')

rs1.dfData

In [None]:
# Data
assert rs1.dfData.equals(rs.dfData)  # == fails on NaNs in same places ...

In [None]:
# Specs
assert isinstance(rs1.specs['d'], dict) and rs1.specs['d'] == rs.specs['d']
assert isinstance(rs1.specs['df'], pd.DataFrame) and rs1.specs['df'].equals(rs.specs['df'])  # == fails on NaNs in same places
assert isinstance(rs1.specs['l'], list) and rs1.specs['l'] == rs.specs['l']
assert isinstance(rs1.specs['s'], pd.Series) and rs1.specs['s'].name == rs.specs['s'].name \
       and rs1.specs['s'].equals(rs.specs['s'])  #  == fails on NaNs in same places

rs1.specs

In [None]:
# B. XLS Format
rs2 = TestAnalysisResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans,
                             dComputedCols=dCompCols, dfComputedColTrans=dfCompColTrans)

rs2.fromExcel('tmp/results-set-uni.xls', sheetName='utest')

rs2.dfData

In [None]:
# Data
assert rs2.dfData.equals(rs.dfData)  # == fails on NaNs in same places ...

In [None]:
# Specs
assert isinstance(rs2.specs['d'], dict) and rs2.specs['d'] == rs.specs['d']
assert isinstance(rs2.specs['df'], pd.DataFrame) and rs2.specs['df'].equals(rs.specs['df'])  # == fails on NaNs in same places
assert isinstance(rs2.specs['l'], list) and rs2.specs['l'] == rs.specs['l']
assert isinstance(rs2.specs['s'], pd.Series) and rs2.specs['s'].name == rs.specs['s'].name \
       and rs2.specs['s'].equals(rs.specs['s'])  #  == fails on NaNs in same places

rs2.specs

In [None]:
# C. ODS Format
rs3 = TestAnalysisResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans,
                             dComputedCols=dCompCols, dfComputedColTrans=dfCompColTrans)

rs3.fromOpenDoc('tmp/results-set-uni.ods', sheetName='utest')

rs3.dfData

In [None]:
# Data
assert rs3.dfData.equals(rs.dfData)  # == fails on NaNs in same places ...

In [None]:
# Specs
assert isinstance(rs3.specs['d'], dict) and rs3.specs['d'] == rs.specs['d']
assert isinstance(rs3.specs['df'], pd.DataFrame) and rs3.specs['df'].equals(rs.specs['df'])  # == fails on NaNs in same places
assert isinstance(rs3.specs['l'], list) and rs3.specs['l'] == rs.specs['l']
assert isinstance(rs3.specs['s'], pd.Series) and rs3.specs['s'].name == rs.specs['s'].name \
       and rs3.specs['s'].equals(rs.specs['s'])  #  == fails on NaNs in same places

rs3.specs

In [None]:
# D. Compressed pickle format
rs4 = TestAnalysisResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans,
                             dComputedCols=dCompCols, dfComputedColTrans=dfCompColTrans)

rs4.fromPickle('tmp/results-set-uni.pickle.xz')

rs4.dfData

In [None]:
# Data
assert rs4.dfData.equals(rs.dfData)  # == fails on NaNs in same places ...

In [None]:
# Specs
assert isinstance(rs4.specs['d'], dict) and rs4.specs['d'] == rs.specs['d']
assert isinstance(rs4.specs['df'], pd.DataFrame) and rs4.specs['df'].equals(rs.specs['df'])  # == fails on NaNs in same places
assert isinstance(rs4.specs['l'], list) and rs4.specs['l'] == rs.specs['l']
assert isinstance(rs4.specs['s'], pd.Series) and rs4.specs['s'].name == rs.specs['s'].name \
       and rs4.specs['s'].equals(rs.specs['s'])  #  == fails on NaNs in same places

rs4.specs

In [None]:
# E. Uncompressed pickle format
rs5 = TestAnalysisResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans,
                             dComputedCols=dCompCols, dfComputedColTrans=dfCompColTrans)

rs5.fromPickle('tmp/results-set-uni.pickle')

rs5.dfData

In [None]:
# Data
assert rs5.dfData.equals(rs.dfData)  # == fails on NaNs in same places ...

In [None]:
# Specs
assert isinstance(rs5.specs['d'], dict) and rs5.specs['d'] == rs.specs['d']
assert isinstance(rs5.specs['df'], pd.DataFrame) and rs5.specs['df'].equals(rs.specs['df'])  # == fails on NaNs in same places
assert isinstance(rs5.specs['l'], list) and rs5.specs['l'] == rs.specs['l']
assert isinstance(rs5.specs['s'], pd.Series) and rs5.specs['s'].name == rs.specs['s'].name \
       and rs5.specs['s'].equals(rs.specs['s'])  #  == fails on NaNs in same places

rs5.specs

#### iii. Imports with auto-detected format (with specs)

In [None]:
# A. XLSX Format
rs1 = TestAnalysisResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans,
                             dComputedCols=dCompCols, dfComputedColTrans=dfCompColTrans)

rs1.fromFile('tmp/results-set-uni.xlsx', sheetName='utest')

rs1.dfData

In [None]:
# Data
assert rs1.dfData.equals(rs.dfData)  # == fails on NaNs in same places ...

In [None]:
# Specs
assert isinstance(rs1.specs['d'], dict) and rs1.specs['d'] == rs.specs['d']
assert isinstance(rs1.specs['df'], pd.DataFrame) and rs1.specs['df'].equals(rs.specs['df'])  # == fails on NaNs in same places
assert isinstance(rs1.specs['l'], list) and rs1.specs['l'] == rs.specs['l']
assert isinstance(rs1.specs['s'], pd.Series) and rs1.specs['s'].name == rs.specs['s'].name \
       and rs1.specs['s'].equals(rs.specs['s'])  #  == fails on NaNs in same places

rs1.specs

In [None]:
# B. XLS Format
rs2 = TestAnalysisResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans,
                             dComputedCols=dCompCols, dfComputedColTrans=dfCompColTrans)

rs2.fromFile('tmp/results-set-uni.xls', sheetName='utest')

rs2.dfData

In [None]:
# Data
assert rs2.dfData.equals(rs.dfData)  # == fails on NaNs in same places ...

In [None]:
# Specs
assert isinstance(rs2.specs['d'], dict) and rs2.specs['d'] == rs.specs['d']
assert isinstance(rs2.specs['df'], pd.DataFrame) and rs2.specs['df'].equals(rs.specs['df'])  # == fails on NaNs in same places
assert isinstance(rs2.specs['l'], list) and rs2.specs['l'] == rs.specs['l']
assert isinstance(rs2.specs['s'], pd.Series) and rs2.specs['s'].name == rs.specs['s'].name \
       and rs2.specs['s'].equals(rs.specs['s'])  #  == fails on NaNs in same places

rs2.specs

In [None]:
# C. ODS Format
rs3 = TestAnalysisResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans,
                             dComputedCols=dCompCols, dfComputedColTrans=dfCompColTrans)

rs3.fromFile('tmp/results-set-uni.ods', sheetName='utest')

rs3.dfData

In [None]:
# Data
assert rs3.dfData.equals(rs.dfData)  # == fails on NaNs in same places ...

In [None]:
# Specs
assert isinstance(rs3.specs['d'], dict) and rs3.specs['d'] == rs.specs['d']
assert isinstance(rs3.specs['df'], pd.DataFrame) and rs3.specs['df'].equals(rs.specs['df'])  # == fails on NaNs in same places
assert isinstance(rs3.specs['l'], list) and rs3.specs['l'] == rs.specs['l']
assert isinstance(rs3.specs['s'], pd.Series) and rs3.specs['s'].name == rs.specs['s'].name \
       and rs3.specs['s'].equals(rs.specs['s'])  #  == fails on NaNs in same places

rs3.specs

In [None]:
# D. Compressed pickle format
rs4 = TestAnalysisResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans,
                             dComputedCols=dCompCols, dfComputedColTrans=dfCompColTrans)

rs4.fromFile('tmp/results-set-uni.pickle.xz')

rs4.dfData

In [None]:
# Data
assert rs4.dfData.equals(rs.dfData)  # == fails on NaNs in same places ...

In [None]:
# Specs
assert isinstance(rs4.specs['d'], dict) and rs4.specs['d'] == rs.specs['d']
assert isinstance(rs4.specs['df'], pd.DataFrame) and rs4.specs['df'].equals(rs.specs['df'])  # == fails on NaNs in same places
assert isinstance(rs4.specs['l'], list) and rs4.specs['l'] == rs.specs['l']
assert isinstance(rs4.specs['s'], pd.Series) and rs4.specs['s'].name == rs.specs['s'].name \
       and rs4.specs['s'].equals(rs.specs['s'])  #  == fails on NaNs in same places

rs4.specs

In [None]:
# E. Uncompressed pickle format
rs5 = TestAnalysisResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans,
                             dComputedCols=dCompCols, dfComputedColTrans=dfCompColTrans)

rs5.fromFile('tmp/results-set-uni.pickle')

rs5.dfData

In [None]:
# Data
assert rs5.dfData.equals(rs.dfData)  # == fails on NaNs in same places ...

In [None]:
# Specs
assert isinstance(rs5.specs['d'], dict) and rs5.specs['d'] == rs.specs['d']
assert isinstance(rs5.specs['df'], pd.DataFrame) and rs5.specs['df'].equals(rs.specs['df'])  # == fails on NaNs in same places
assert isinstance(rs5.specs['l'], list) and rs5.specs['l'] == rs.specs['l']
assert isinstance(rs5.specs['s'], pd.Series) and rs5.specs['s'].name == rs.specs['s'].name \
       and rs5.specs['s'].equals(rs.specs['s'])  #  == fails on NaNs in same places

rs5.specs

#### iv. Imports with default values for missing columns

In [None]:
# TODO
# How ?
# For each file format,
# - read target file (written above) with pandas API (not ResultsSet one)
# - remove some columns
# - overwrite target file with pandas API
# - load target file with ResultsSet API, specifying default values for the missing columns
# - check that the result is OK

### i. Comparison

Note: Self-contained; nothing needs to be run beforehand (except 0)

In [None]:
# MCDSAnalysisResultsSet objects and loading from files.
modelIdCols = ['Model']
modelParamCols = ['LTrunc', 'RTrunc', 'FitDistCuts', 'DiscrDistCuts']
sampleIdCols = ['Species', 'Periods', 'Prec.', 'Duration']
caseIdCols = ['AnlysNum', 'SampNum'] + sampleIdCols + modelIdCols

sampCols = [('sample', col, 'Value') for col in sampleIdCols]
miSampCols = pd.MultiIndex.from_tuples(sampCols)

custCols = [('sample', 'AnlysNum', 'Value'), ('sample', 'SampNum', 'Value')] + sampCols + [('model', 'Model', 'Value')]
miCustCols = pd.MultiIndex.from_tuples(custCols)

dfCustColTrans = \
    pd.DataFrame(index=miCustCols,
                 data=dict(en=caseIdCols, 
                           fr=['NumAnlys', 'NumSamp', 'Espèce', 'Périodes', 'Préc.', 'Durée', 'Modèle']))

In [None]:
# Reference (obtained with Distance 7.3)
rsDist = ads.MCDSAnalysisResultsSet(miSampleCols=miSampCols, sampleIndCol=('sample', 'SampNum', 'Value'),
                                    miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans,
                                    distanceUnit='Meter', areaUnit='Hectare',
                                    surveyType='Point', distanceType='Radial', clustering=False)

rsDist.fromFile('refin/ACDC2019-Papyrus-ALAARV-TURMER-comp-dist-auto.ods', sheetName='RefDist73',
                postComputed=True) # Avoid recomputations, some columns are now missing, files are old actually !

In [None]:
# Result obtained through pyaudisam.
rsAuto = ads.MCDSAnalysisResultsSet(miSampleCols=miSampCols, sampleIndCol=('sample', 'SampNum', 'Value'),
                                    miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans,
                                    distanceUnit='Meter', areaUnit='Hectare',
                                    surveyType='Point', distanceType='Radial', clustering=False)

rsAuto.fromFile('refin/ACDC2019-Papyrus-ALAARV-TURMER-comp-dist-auto.ods', sheetName='ActAuto',
                postComputed=True) # Avoid recomputations, some columns are now missing, files are old actually !

In [None]:
# Index columns
indexCols = custCols + [('parameters', 'left truncation distance', 'Value'),
                        ('parameters', 'right truncation distance', 'Value'),
                        ('parameters', 'model fitting distance cut points', 'Value'),
                        ('parameters', 'distance discretisation cut points', 'Value')]

In [None]:
# Columns to compare (DeltaDCV and DeltaAIC are removed because they depend on the sets of analyses that were run,
#                     which differ between the reference and the auto run; a string column is removed too: comparison not implemented).
subsetCols = [col for col in rsDist.columns.to_list() \
              if col not in indexCols + [('run output', 'run time', 'Value'),
                                         ('density/abundance', 'density of animals', 'Delta Cv'),
                                         ('detection probability', 'Delta AIC', 'Value')]]
#subsetCols

In [None]:
dfRelDiff = rsDist.compare(rsAuto, subsetCols=subsetCols, indexCols=indexCols)
assert len(dfRelDiff.columns) == 21
assert len(dfRelDiff) == len(rsDist)
dfRelDiff

In [None]:
dfRelDiff = rsDist.compare(rsAuto, subsetCols=subsetCols, indexCols=indexCols, dropCloser=16, dropNans=False)
assert len(dfRelDiff.columns) == 21
assert len(dfRelDiff) == 8
dfRelDiff

In [None]:
dfRelDiff = rsDist.compare(rsAuto, subsetCols=subsetCols, indexCols=indexCols, dropCloser=16, dropNans=True)
assert len(dfRelDiff.columns) == 21
assert len(dfRelDiff) == 3
dfRelDiff

In [None]:
dfRelDiff = rsDist.compare(rsAuto, subsetCols=subsetCols, indexCols=indexCols, dropCloser=5, dropNans=True)
assert len(dfRelDiff.columns) == 21
assert len(dfRelDiff) == 2
dfRelDiff

In [None]:
# Drop also closer columns
dfRelDiff = rsDist.compare(rsAuto, subsetCols=subsetCols, indexCols=indexCols, dropCloser=5, dropNans=True, dropCloserCols=True)
assert len(dfRelDiff.columns) == 19
dfRelDiff

### j. Post-computations

Note: Self-contained; nothing needs to be run beforehand (except 0)

In [None]:
# MCDSAnalysisResultsSet object + loading from file
modelIdCols = ['Model']
modelParamCols = ['LTrunc', 'RTrunc', 'FitDistCuts', 'DiscrDistCuts']
sampleIdCols = ['Species', 'Periods', 'Prec.', 'Duration']
caseIdCols = ['AnlysNum', 'SampNum'] + sampleIdCols + modelIdCols

sampCols = [('sample', col, 'Value') for col in sampleIdCols]
miSampCols = pd.MultiIndex.from_tuples(sampCols)

custCols = [('sample', 'AnlysNum', 'Value'), ('sample', 'SampNum', 'Value')] + sampCols + [('model', 'Model', 'Value')]
miCustCols = pd.MultiIndex.from_tuples(custCols)

dfCustColTrans = \
    pd.DataFrame(index=miCustCols,
                 data=dict(en=caseIdCols, 
                           fr=['NumAnlys', 'NumSamp', 'Espèce', 'Périodes', 'Préc.', 'Durée', 'Modèle']))

rsAuto = ads.MCDSAnalysisResultsSet(miSampleCols=miSampCols, sampleIndCol=('sample', 'SampNum', 'Value'),
                                    miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans,
                                    distanceUnit='Meter', areaUnit='Hectare',
                                    surveyType='Point', distanceType='Radial', clustering=False)

rsAuto.fromFile('refin/ACDC2019-Papyrus-ALAARV-TURMER-comp-dist-auto.ods', sheetName='ActAuto')

In [None]:
# Trigger post-computations
rsAuto.dfData

In [None]:
# Load reference from file
rsAutoRef = ads.MCDSAnalysisResultsSet(miSampleCols=miSampCols, sampleIndCol=('sample', 'SampNum', 'Value'),
                                       miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans,
                                       distanceUnit='Meter', areaUnit='Hectare',
                                       surveyType='Point', distanceType='Radial', clustering=False)

rsAutoRef.fromFile('refout/ACDC2019-Papyrus-ALAARV-TURMER-resultats-postcomp.ods')

In [None]:
# Comparison of loaded results to reference
# a. Index columns
indexCols = custCols + [('parameters', 'left truncation distance', 'Value'),
                        ('parameters', 'right truncation distance', 'Value'),
                        ('parameters', 'model fitting distance cut points', 'Value'),
                        ('parameters', 'distance discretisation cut points', 'Value'),
                        ('parameters', 'estimator key function', 'Value'),
                        ('parameters', 'estimator adjustment series', 'Value'),
                        ('parameters', 'estimator selection criterion', 'Value')]

# b. Columns to compare: we ignore ...
# * DeltaDCV and DeltaAIC because they depend on the whole set of analyses actually done to get the results,
#   that is possibly different sets in the 2 cases.
# * other string-typed columns (comparison not implemented)
subsetCols = [col for col in rsAutoRef.columns.to_list() \
              if col not in indexCols + [('run output', 'run time', 'Value'), ('run output', 'run folder', 'Value'),
                                         ('density/abundance', 'density of animals', 'Delta Cv'),
                                         ('detection probability', 'Delta AIC', 'Value'),
                                         ('detection probability', 'key function type', 'Value'),
                                         ('detection probability', 'adjustment series type', 'Value')]]

# c. Comparison
dfRelDiff = rsAuto.compare(rsAutoRef, subsetCols=subsetCols, indexCols=indexCols, dropCloser=15)
assert len(dfRelDiff) == 0
dfRelDiff

## 6. Class FieldDataSet (and base DataSet)

Note: For real unit tests of DataSet, see the `visionat` module, which defines the same class (they have to be the same: check it!)

### a. Load data sample

In [None]:
dfObs = pd.read_csv('refin/ACDC2019-Naturalist-ExtraitObsBrutesAvecDist.txt', sep='\t', decimal=',')
dfObs.head()

In [None]:
countCols =  ['nMalAd10', 'nAutAd10', 'nMalAd5', 'nAutAd5']

In [None]:
sCounts = dfObs[countCols].sum()

In [None]:
len(dfObs), sCounts.to_dict()

In [None]:
assert len(dfObs) == 724
assert not any(sCounts - pd.Series({'nMalAd10': 613, 'nAutAd10': 192, 'nMalAd5': 326, 'nAutAd5': 102}))

### b. FieldDataSet._separateMultiCategoryCounts

In [None]:
%%time

dfObsMonoCat_ = ads.FieldDataSet._separateMultiCategoryCounts(dfObs, countCols)
len(dfObsMonoCat_), dfObsMonoCat_[countCols].sum()

In [None]:
s = dfObs[countCols].apply(lambda s: len(s[s > 0]), axis='columns')

print(len(s), s.value_counts().to_dict())

assert len(s) - len(s[s < 1]) + sum((i-1)*len(s[s == i]) for i in range(1, s.max()+1)) == len(dfObsMonoCat_)

In [None]:
assert len(dfObsMonoCat_) == 1125
assert not any(dfObsMonoCat_[countCols].sum() - sCounts)

In [None]:
dfObsMonoCat_.head()

### c. Categorise sightings

Needed for adding absence data below

(no more counts - by the way, all 0 or 1 - => only categories)

In [None]:
# Should not see any sightings with all null counts
assert dfObsMonoCat_[~dfObsMonoCat_[countCols].any(axis='columns')].empty

In [None]:
def count2AdultCat(sCounts):
    """Adult category of a sighting, from its count columns.

    :param sCounts: Series of counts, indexed by count column name
    :return: 'm' if the name of the first column with a positive count contains 'Mal', 'a' otherwise
    """
    firstCountedCol = sCounts[sCounts > 0].index[0]
    if 'Mal' in firstCountedCol:
        return 'm'
    return 'a'
dfObsMonoCat_['Adulte'] = dfObsMonoCat_[countCols].apply(count2AdultCat, axis='columns')

def count2DurationCat(sCounts):
    """Duration category of a sighting, from its count columns.

    :param sCounts: Series of counts, indexed by count column name
    :return: '5mn' if the name of the first column with a positive count contains '5', '10mn' otherwise
    """
    firstCountedCol = sCounts[sCounts > 0].index[0]
    if '5' in firstCountedCol:
        return '5mn'
    return '10mn'
dfObsMonoCat_['Durée'] = dfObsMonoCat_[countCols].apply(count2DurationCat, axis='columns')

dfObsMonoCat_.tail()

### d. FieldDataSet._individualiseMonoCategoryCounts

In [None]:
%%time

dfObsIndiv_ = ads.FieldDataSet._individualiseMonoCategoryCounts(dfObsMonoCat_, countCols)
len(dfObsIndiv_), dfObsIndiv_[countCols].sum()

In [None]:
assert len(dfObsIndiv_) == 1233
assert not any(dfObsIndiv_[countCols].sum() - sCounts)

In [None]:
dfObsIndiv_.head()

### e. FieldDataSet.monoCategorise

(combines a, b, c and d above in one function : the one to use actually !)

In [None]:
# First, load FieldDataSet from dfObs
fds = ads.FieldDataSet(source=dfObs, countCols=countCols,
                       addMonoCatCols={ 'Adulte': count2AdultCat, 'Durée': count2DurationCat })

In [None]:
dfObsMonoCat = fds.monoCategorise()

In [None]:
dfObsMonoCat.head()

In [None]:
assert (dfObsMonoCat == dfObsMonoCat_).all().all()

### f. FieldDataSet.individualise
(combines a, b, c and d above in one function : the one to use actually !)

In [None]:
dfObsIndiv = fds.individualise()

In [None]:
dfObsIndiv.head()

In [None]:
assert (dfObsIndiv == dfObsIndiv_).all().all()

In [None]:
# Second, try from source CSV file
fds = ads.FieldDataSet(source='refin/ACDC2019-Naturalist-ExtraitObsBrutesAvecDist.txt',
                       importDecFields=['distMem'], countCols=countCols,
                       addMonoCatCols={ 'Adulte': count2AdultCat, 'Durée': count2DurationCat })

In [None]:
dfObsIndiv = fds.individualise()

In [None]:
dfObsIndiv.head()

In [None]:
assert (dfObsIndiv == dfObsIndiv_).all().all()

### g. Drop now unneeded count columns

(only 0 or 1 inside + columns Adulte and Duree to explain what a 1 means)

In [None]:
# No more need for count cols then (only 0 or 1 inside + columns Adulte and Duree to explain what a 1 means)
dfObsIndiv.drop(columns=countCols, inplace=True)
dfObsIndiv.tail()

## 7. Class MonoCategoryDataSet (and base DataSet)

Note: For real unit tests of DataSet, see the `visionat` module, which defines the same class (they have to be the same: check it!)

Note: Run 4 above before.

### a. Extract transect info

(assuming that each transect x pass gave at least 1 sighting, otherwise the effort will be wrong)

In [None]:
transectPlaceCol = 'Point'
transectPlaceCols = [transectPlaceCol]
passIdCol = 'Passage'
effortCol = 'Effort'

In [None]:
dfTransPassEffort = ads.MonoCategoryDataSet._extractTransects(dfObsIndiv, transectPlaceCols=transectPlaceCols,
                                                              passIdCol=passIdCol,
                                                              effortCol=effortCol, effortConstVal=1)
dfTransPassEffort

In [None]:
assert len(dfTransPassEffort) == 41 \
       and len(dfTransPassEffort[dfTransPassEffort.Passage == 'a']) == 21 \
       and len(dfTransPassEffort[dfTransPassEffort.Passage == 'b']) == 20 

### b. Select sighting from 1 sample

In [None]:
# Define sample columns
sampleCols = ['Passage', 'Adulte', 'Durée']

In [None]:
dfObsIndiv.head()

In [None]:
# Select 1 sample
espece = 'Sylvia atricapilla'
passage = 'a'
adulte = 'm'
duree = '10mn'
#dfObsIndivSmpl = dfObsIndiv[(dfObsIndiv.Passage == passage) & (dfObsIndiv.Adulte == adulte) \
#                            & (dfObsIndiv.Duree == duree) & (dfObsIndiv.Espece == espece)]

dfObsIndivSmpl, dfTrPassEffSmpl = \
    ads.MonoCategoryDataSet._selectSampleSightings(dSample={ 'Passage': passage, 'Adulte': adulte,
                                                            'Durée': duree, 'Espèce': espece },
                                                  dfAllSights=dfObsIndiv, dfAllEffort=dfTransPassEffort,
                                                  transectPlaceCols=['Point'], passIdCol='Passage',
                                                  effortCol='Effort')

In [None]:
# Check the selected sightings: expected number of rows and of distinct transect places.
assert len(dfObsIndivSmpl) == 36 and dfObsIndivSmpl[transectPlaceCol].nunique() == 18
# Check the selected effort table: expected number of rows, each for a distinct transect place.
assert len(dfTrPassEffSmpl) == 21 and dfTrPassEffSmpl.reset_index()[transectPlaceCol].nunique() == len(dfTrPassEffSmpl)
assert len(dfTrPassEffSmpl[dfTrPassEffSmpl.Effort != 1]) == 0 # Only 1 pass, and on every point without exception

### c. Add absence sightings

In [None]:
dfObsIndivSmpl

In [None]:
%%time

dfObsIndivAbscSmpl = ads.MonoCategoryDataSet._addAbsenceSightings(dfObsIndivSmpl, sampleCols, dfTrPassEffSmpl)
len(dfObsIndivAbscSmpl)

In [None]:
# Check for no change in sample columns
assert list(dfObsIndivAbscSmpl.columns) == list(dfObsIndivSmpl.columns)

# Check for number of added rows
assert len(dfObsIndivAbscSmpl) == 39 # 36 sightings + 3 missing transects

# Check for final number of transects
assert dfObsIndivAbscSmpl[dfTrPassEffSmpl.index.name].nunique() == 21

# Check for no change in sample identification
assert list(dfObsIndivAbscSmpl['Espèce'].unique()) == [espece, None] # None for absence sightings !
assert list(dfObsIndivAbscSmpl.Passage.unique()) == [passage]
assert list(dfObsIndivAbscSmpl.Adulte.unique()) == [adulte]
assert list(dfObsIndivAbscSmpl['Durée'].unique()) == [duree]

In [None]:
sorted(dfObsIndiv['Espèce'].unique())

In [None]:
%%time

# Performance test: select the sightings of a sample, then add absence sightings to them,
# for every combination of the species, passes, adult categories and durations listed below.
# Beware: the loops rebind the notebook-level names espece, passage, adulte, duree,
# dfObsIndivSmpl and dfTrPassEffSmpl; after this cell, they hold the values of the last iteration.
print('Espèce      Passage  Adulte Durée NbDonnées')

for espece in ['Sylvia atricapilla', 'Alauda arvensis', 'Sylvia communis', 'Phylloscopus collybita']: 
    
    for passage in ['a', 'b', 'a+b']: 

        for adulte in ['m', 'a', 'm+a']:

            for duree in ['5mn', '10mn']:

                # NOTE(review): passages and adultes are only used by the commented-out selection
                # right below (apparently kept as an alternative to _selectSampleSightings).
                passages = passage.split('+')
                adultes = adulte.split('+')
                #dfObsIndivSmpl = dfObsIndiv[dfObsIndiv.Passage.isin(passages) & dfObsIndiv.Adulte.isin(adultes) \
                #                            & (dfObsIndiv.Duree == duree) & (dfObsIndiv.Espece == espece)]
                dfObsIndivSmpl, dfTrPassEffSmpl = \
                    ads.MonoCategoryDataSet._selectSampleSightings(dSample={ 'Passage': passage, 'Adulte': adulte,
                                                                            'Durée': duree, 'Espèce': espece },
                                                                  dfAllSights=dfObsIndiv,
                                                                  dfAllEffort=dfTransPassEffort,
                                                                  transectPlaceCols=['Point'], passIdCol='Passage', 
                                                                  effortCol='Effort')

                # Print the sample and its number of sightings, then the number of rows once absence
                # sightings are added; any error is printed instead, and the loops go on.
                try:
                    print(espece, passage, adulte, duree, ':', len(dfObsIndivSmpl), '=> ', end='')
                    dfObsIndivAbscSmpl_ = \
                        ads.MonoCategoryDataSet._addAbsenceSightings(dfObsIndivSmpl, sampleCols, dfTrPassEffSmpl)
                    print(len(dfObsIndivAbscSmpl_))
                except Exception as e:
                    print(e)
                    
# Reference timings, to compare with the %%time output of this cell.
print('Should give around 1s on a Core i7 8850H (6 HT cores, 2.6-4.3GHz, cache 9Mb) + NVME SSD')
print('Should give around 1s on a Core i5 8365U (4 HT cores, 1.6-4.1GHz, cache 6Mb) + NVME SSD')

### d. ads.MonoCategoryDataSet._addSurveyAreaInfo

In [None]:
dSurveyArea = dict(Zone='ACDC', Surface='2400')

dfObsIndivAbscSmpl = ads.MonoCategoryDataSet._addSurveyAreaInfo(dfObsIndivAbscSmpl, dSurveyArea=dSurveyArea)

In [None]:
dfObsIndivAbscSmpl.head()

### e. MonoCategoryDataSet.sampleDataSet

(combines a, b, c and d above in one function : the one to use actually, of course !)

In [None]:
mds = ads.MonoCategoryDataSet(dfObsIndiv, dSurveyArea=dSurveyArea, sampleDecFields=['Effort', 'distMem'],
                             transectPlaceCols=transectPlaceCols, passIdCol=passIdCol,
                             effortCol=effortCol, effortConstVal=1)

In [None]:
sds = mds.sampleDataSet(sSampleSpecs=pd.Series({ 'Passage': passage, 'Adulte': adulte, 
                                                 'Durée': duree, 'Espèce': espece }))

In [None]:
sds.dfData

## 8. Abstract class Analyser

Note: Run 4 above before.

### a. Generate implicit partial variant combination table

In [None]:
# Number of individuals per species, to see which species we are going to analyse
# (only rows with Adulte == 'm' are counted, hence the 'Mâles' column name below).
dfIndivCounts = dfObsIndiv.loc[dfObsIndiv.Adulte == 'm', ['Espèce', 'Adulte']].groupby('Espèce').count()

dfIndivCounts.rename(columns=dict(Adulte='Mâles'), inplace=True)
dfIndivCounts.sort_values(by='Mâles', ascending=False, inplace=True)

# Display only the species with at least 20 such rows.
dfIndivCounts[dfIndivCounts['Mâles'] >= 20]

In [None]:
# NOTE(review): despite its name, nMaxMal10 is used below as a minimum count (>=) for selecting species.
nMaxMal10 = 30
varEspeces = list(dfIndivCounts[dfIndivCounts['Mâles'] >= nMaxMal10].index) # 1 variant per species

varPassages = [''] # All passes together => only 1 variant
varAdultes = ['m', 'm+a'] # Males, and then males and other adults (=> 2 variants)
varDurees = ['5mn', '10mn'] # First 5 mn, or the whole 10 mn => 2 variants

# Build the implicit variant specs table from these per-column lists of variants.
dfImplSampSpecs = ads.Analyser.implicitPartialVariantSpecs({ 'Espèces':varEspeces, 'Passages': varPassages,
                                                               'Adultes': varAdultes, 'Durées': varDurees })
dfImplSampSpecs

### b. Explicit partial variant combination generation

In [None]:
dfExplSampSpecs = ads.Analyser.explicitPartialVariantSpecs(dfImplSampSpecs)
dfExplSampSpecs

### c. Direct explicitation of all variants

from user specs (implicit and explicit)

In [None]:
# User variant specs, given here as the path of a workbook.
userVariantSpecs = 'refin/ACDC2019-Naturalist-ExtraitSpecsAnalyses.xlsx'

if False: # Both methods MUST work, but this one needs more code :-)
    # Alternative input form: every sheet of the workbook pre-loaded (sheet_name=None).
    userVariantSpecs = pd.read_excel(userVariantSpecs, sheet_name=None)
    print('sheets:', ', '.join(userVariantSpecs.keys()))

userVariantSpecs

In [None]:
# Build the explicit variant table straight from the user specs, ignoring the 'Params3_expl' sheet,
# and adding a computed analysis abbreviation column.
dfFinalExplSpecs = ads.Analyser.explicitVariantSpecs(
    userVariantSpecs,
    ignore=['Params3_expl'],
    varIndCol='IndAnlys',
    #convertCols={ 'Durée': int }, # float 'cause of Excel
    computedCols={'AbrevAnlys': analysisAbbrev},
)

dfFinalExplSpecs

In [None]:
# Just for visual inspection: dump the final explicit specs to a workbook in the tmp folder.
dfFinalExplSpecs.to_excel('tmp/tools-unitests-final-expl-specs.xlsx', index=False)

In [None]:
# Computational checks: recompute the expected number of explicit variants
# directly from the user spec sheets, and compare it to the generated table.
if not isinstance(userVariantSpecs, dict):
    ddfUserVariantSpecs = pd.read_excel('refin/ACDC2019-Naturalist-ExtraitSpecsAnalyses.xlsx', sheet_name=None)
else:
    ddfUserVariantSpecs = userVariantSpecs


def _nImplicitVariants(dfImpl):
    """Product, over the columns of an implicit spec sheet, of their numbers of non-null values."""
    nVariants = 1
    for colName in dfImpl.columns:
        nVariants *= len(dfImpl[colName].dropna())
    return nVariants


# Numbers of combinations generated by each implicit sheet.
nEch1Vars = _nImplicitVariants(ddfUserVariantSpecs['Echant1_impl'])
nEch2Vars = _nImplicitVariants(ddfUserVariantSpecs['Echant2_impl'])
nModVars = _nImplicitVariants(ddfUserVariantSpecs['Modl_impl'])

# Explicit parameter rows: total number, and number of distinct sample variants they cover.
nEch1Pars = len(ddfUserVariantSpecs['Params1_expl'])
nEch1ParWithVars = len(
    ddfUserVariantSpecs['Params1_expl']
    .drop_duplicates(subset=ddfUserVariantSpecs['Echant1_impl'].columns)
)

nEch2Pars = len(ddfUserVariantSpecs['Params2_expl'])
nEch2ParWithVars = len(
    ddfUserVariantSpecs['Params2_expl']
    .drop_duplicates(subset=ddfUserVariantSpecs['Echant2_impl'].columns)
)

# Per sample family: explicit param. rows + sample variants without any explicit param. row ;
# the whole being multiplied by the number of model variants.
nExpdVars = nModVars * ((nEch1Pars + nEch1Vars - nEch1ParWithVars)
                        + (nEch2Pars + nEch2Vars - nEch2ParWithVars))
assert len(dfFinalExplSpecs) == nExpdVars

nModVars, nEch1Pars, nEch1Vars, nEch1ParWithVars, nEch2Pars, nEch2Vars, nEch2ParWithVars, nExpdVars

## 9. Abstract class DSAnalyser

Note: Run section 6 above first.

### a. userSpec2ParamNames

In [None]:
# Internal names of the analysis parameter specs.
IntSpecEstimKeyFn = 'EstimKeyFn'
IntSpecEstimAdjustFn = 'EstimAdjustFn'
IntSpecEstimCriterion = 'EstimCriterion'
IntSpecCVInterval = 'CvInterval'
IntSpecMinDist = 'MinDist' # Left truncation distance
IntSpecMaxDist = 'MaxDist' # Right truncation distance
IntSpecFitDistCuts = 'FitDistCuts'
IntSpecDiscrDistCuts = 'DiscrDistCuts'

# For each internal spec name, the regular expressions recognising the user column names.
# Raw strings are mandatory here: '\.' and '\-' are invalid escape sequences in plain
# string literals (DeprecationWarning, then SyntaxWarning from Python 3.12 on).
# The pattern values themselves are unchanged.
int2UserSpecREs = \
  { IntSpecEstimKeyFn:     [r'ke[a-z]*[\.\-_ ]*f', r'f[o]?n[a-z]*[\.\-_ ]*cl'],
    IntSpecEstimAdjustFn:  [r'ad[a-z]*[\.\-_ ]*s', r's[éa-z]*[\.\-_ ]*aj'],
    IntSpecEstimCriterion: [r'crit[èa-z]*[\.\-_ ]*'],
    # NOTE(review): 'in[o]?n...conf' looks like a copy-paste of 'f[o]?n' ;
    # 'int[a-z]*...conf' was probably intended - confirm before changing.
    IntSpecCVInterval:     [r'conf[a-z]*[\.\-_ ]*[a-z]*[\.\-_ ]*int',
                            r'in[o]?n[a-z]*[\.\-_ ]*conf'],
    IntSpecMinDist:        [r'min[a-z]*[\.\-_ ]*d', r'd[a-z]*[\.\-_ ]*min',
                            r'tr[a-z]*[\.\-_ ]*ga', r'tr[a-z]*[\.\-_ ]*gc', r'le[a-z]*[\.\-_ ]*tr'],
    # NOTE(review): the last pattern below duplicates the one of IntSpecMinDist ('le...tr') ;
    # presumably a right-truncation pattern was intended - confirm before changing.
    IntSpecMaxDist:        [r'max[a-z]*[\.\-_ ]*d', r'd[a-z]*[\.\-_ ]*max',
                            r'tr[a-z]*[\.\-_ ]*dr', r'tr[a-z]*[\.\-_ ]*dt', r'le[a-z]*[\.\-_ ]*tr'],
    IntSpecFitDistCuts:    [r'fit[a-z]*[\.\-_ ]*d', r'tr[a-z]*[\.\-_ ]*[a-z]*[\.\-_ ]*mod'],
    IntSpecDiscrDistCuts:  [r'dis[a-z]*[\.\-_ ]*d', r'tr[a-z]*[\.\-_ ]*[a-z]*[\.\-_ ]*dis']}



In [None]:
# Each user column name must be translated to the matching internal spec name, in the same order.
assert (
    ads.DSAnalyser.userSpec2ParamNames(
        ['key fn', 'série-aj', 'est.crit.', 'ConfInt', 'fit d', 'disc d', 'min dist', 'maxd'],
        int2UserSpecREs,
    )
    == [IntSpecEstimKeyFn, IntSpecEstimAdjustFn, IntSpecEstimCriterion, IntSpecCVInterval,
        IntSpecFitDistCuts, IntSpecDiscrDistCuts, IntSpecMinDist, IntSpecMaxDist]
)

### b. _explicitParamSpecs

In [None]:
# Column names passed to the _explicitParamSpecs calls in the cells below.
passIdCol = 'Passage'
effortCol = 'Effort'

sampleDistCol = 'distMem'
sampleDecCols=[effortCol, sampleDistCol]

# Columns selecting a sample, and name of the sample index column.
sampleSelCols = ['Espèce', passIdCol, 'Adulte', 'Durée']
sampleIndCol = 'IndSamp'

# Names of the analysis index and analysis abbreviation columns.
varIndCol = 'IndAnlys'
anlysAbbrevCol = 'AbrevAnlys'

In [None]:
# Through implicit combinations, from a file.
(dfExplParamSpecs, userParamSpecCols,
 intParamSpecCols, unmUserParamSpecCols) = ads.DSAnalyser._explicitParamSpecs(
    implParamSpecs='refin/ACDC2019-Naturalist-ExtraitSpecsAnalyses.xlsx',
    int2UserSpecREs=int2UserSpecREs,
    sampleSelCols=sampleSelCols, abbrevCol=anlysAbbrevCol,
    abbrevBuilder=analysisAbbrev, anlysIndCol=varIndCol,
    sampleIndCol=sampleIndCol, dropDupes=False,
)

In [None]:
# Check the number of explicit specs, the user and internal param. spec columns (same order),
# and that no user column was left unmatched.
print(len(dfExplParamSpecs), userParamSpecCols, intParamSpecCols, unmUserParamSpecCols)

assert len(dfExplParamSpecs) == 48
assert userParamSpecCols == ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod']
assert intParamSpecCols == ['EstimKeyFn', 'EstimAdjustFn', 'MinDist', 'MaxDist', 'FitDistCuts']
assert unmUserParamSpecCols == []

In [None]:
# Show the explicit param. specs obtained from the file.
dfExplParamSpecs

In [None]:
# Through explicit combinations, from a DataFrame, with duplicates to clean up,
# and neutral pass-through columns.
# a. Remove the columns generated by the previous _explicitParamSpecs call.
dfExplParamSpecs.drop(columns=[varIndCol, anlysAbbrevCol, sampleIndCol], inplace=True)

# b. Duplicate every row (lots of duplicates!).
#    pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
dfExplParamSpecs = pd.concat([dfExplParamSpecs, dfExplParamSpecs], ignore_index=True)

# c. Neutral column 1.
#    NOTE(review): despite its name, 'AvecTronc' is True when BOTH truncation distances are null ;
#    left as is, since the column is only used here as a neutral pass-through one.
dfExplParamSpecs['AvecTronc'] = dfExplParamSpecs[['TrGche', 'TrDrte']].apply(lambda s: s.isnull().all(), axis='columns')

# d. Neutral column 2: species abbreviation (first 4 letters of each word, concatenated).
dfExplParamSpecs['AbrevEsp'] = dfExplParamSpecs['Espèce'].apply(lambda s: ''.join(m[:4] for m in s.split()))
dfExplParamSpecs

In [None]:
# Neutral columns not declared, duplicates kept.
(dfExplParamSpecs, userParamSpecCols,
 intParamSpecCols, unmUserParamSpecCols) = ads.DSAnalyser._explicitParamSpecs(
    dfExplParamSpecs=dfExplParamSpecs, int2UserSpecREs=int2UserSpecREs,
    sampleSelCols=sampleSelCols, abbrevCol=anlysAbbrevCol,
    abbrevBuilder=analysisAbbrev, anlysIndCol=varIndCol,
    sampleIndCol=sampleIndCol, dropDupes=False,
)

print(len(dfExplParamSpecs), userParamSpecCols, intParamSpecCols, unmUserParamSpecCols)

# Twice the original number of rows, and the 2 undeclared neutral columns reported as unmatched.
assert len(dfExplParamSpecs) == 2 * 48
assert userParamSpecCols == ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod']
assert intParamSpecCols == ['EstimKeyFn', 'EstimAdjustFn', 'MinDist', 'MaxDist', 'FitDistCuts']
assert unmUserParamSpecCols == ['AvecTronc', 'AbrevEsp']

In [None]:
# Neutral columns declared, duplicates dropped.
(dfExplParamSpecs, userParamSpecCols,
 intParamSpecCols, unmUserParamSpecCols) = ads.DSAnalyser._explicitParamSpecs(
    dfExplParamSpecs=dfExplParamSpecs, int2UserSpecREs=int2UserSpecREs,
    sampleSelCols=sampleSelCols, abbrevCol=anlysAbbrevCol,
    abbrevBuilder=analysisAbbrev, anlysIndCol=varIndCol,
    sampleIndCol=sampleIndCol,
    anlysSpecCustCols=['AvecTronc', 'AbrevEsp'],
    dropDupes=True,
)

print(len(dfExplParamSpecs), userParamSpecCols, intParamSpecCols, unmUserParamSpecCols)

# Back to the original number of rows, and no unmatched column any more.
assert len(dfExplParamSpecs) == 48
assert userParamSpecCols == ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod']
assert intParamSpecCols == ['EstimKeyFn', 'EstimAdjustFn', 'MinDist', 'MaxDist', 'FitDistCuts']
assert unmUserParamSpecCols == []

In [None]:
# Show the final explicit param. specs (neutral columns declared, duplicates dropped).
dfExplParamSpecs

## 10. MCDSZerothOrderTruncationOptimisation class and bases

### a. Data set

In [None]:
# Load the sample data set from its workbook, then show the number of sightings,
# and the number of those with a non-null distance.
sds = ads.SampleDataSet(
    source=pl.Path('refin') / 'ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.xlsx',
    decimalFields=['EFFORT', 'DISTANCE', 'NOMBRE'],
)
{'nSights': len(sds), 'nSightDist': len(sds.dfData['DISTANCE'].dropna())}

### b. MCDS engine

In [None]:
# MCDS engine shared by all the optimisations below, working in its own sub-folder of tmpDir.
eng = ads.MCDSEngine(workDir=tmpDir / 'mcds-zooption')

### c. Optimisations

In [None]:
# All truncation params variant, with absolute discrDistCuts (intervals given as dict, tuple and list) ;
# also exercises submit(times, onlyBest) and termExprValue.
zoption = ads.MCDSZerothOrderTruncationOptimisation(
    engine=eng, name='alarv', logData=False,
    sampleDataSet=sds, distanceField='DISTANCE',
    estimKeyFn='HNO', estimAdjustFn='COS',
    estimCriterion='AIC', cvInterval=95,
    minDist=(0, 50), maxDist=dict(min=150, max=200),
    fitDistCutsFctr=[0.5, 1.5], discrDistCuts=dict(min=3, max=8),
    expr2Optimise='chi2', minimiseExpr=False,
    maxIters=30, termExprValue=0.5,
)

zoption.submit(times=3, onlyBest=2)

zoption.getResults()

In [None]:
# All truncation params variant, with absolute fitDistCuts ; maxDist given as an ads.Interval.
zoption = ads.MCDSZerothOrderTruncationOptimisation(
    engine=eng, name='alarv', logData=False, autoClean=False,
    sampleDataSet=sds, distanceField='DISTANCE',
    estimKeyFn='HNO', estimAdjustFn='COS',
    estimCriterion='AIC', cvInterval=95,
    minDist=(0, 50), maxDist=ads.Interval(150, 200),
    fitDistCuts=(10, 20), discrDistCutsFctr=(0.5, 1.5),
    expr2Optimise='chi2', minimiseExpr=False, maxIters=6,
)

zoption.submit()

zoption.getResults()

In [None]:
# maxDist is the only (variant) truncation param.
zoption = ads.MCDSZerothOrderTruncationOptimisation(
    engine=eng, name='alarv', logData=True, autoClean=False,
    sampleDataSet=sds, distanceField='DISTANCE',
    estimKeyFn='HAZ', estimAdjustFn='POLY',
    estimCriterion='AIC', cvInterval=95,
    maxDist=dict(min=150, max=200),
    expr2Optimise='ks', minimiseExpr=False, maxIters=5,
)

zoption.submit()

zoption.getResults()

In [None]:
# minDist is the only (variant) truncation param. ; the others are absent.
zoption = ads.MCDSZerothOrderTruncationOptimisation(
    engine=eng, name='alarv', logData=False,
    sampleDataSet=sds, distanceField='DISTANCE',
    estimKeyFn='UNI', estimAdjustFn='POLY',
    estimCriterion='AIC', cvInterval=95,
    minDist=(0, 50),
    expr2Optimise='chi2*ks', minimiseExpr=False, maxIters=5,
)

zoption.submit()

zoption.getResults()

In [None]:
# Variant minDist, maxDist and fitDistCutsFctr, with a constant discrDistCuts.
zoption = ads.MCDSZerothOrderTruncationOptimisation(
    engine=eng, name='alarv', logData=False,
    sampleDataSet=sds, distanceField='DISTANCE',
    estimKeyFn='HNO', estimAdjustFn='COS',
    estimCriterion='AIC', cvInterval=95,
    minDist=(0, 50), maxDist=dict(min=150, max=200),
    fitDistCutsFctr=[0.5, 1.5], discrDistCuts=7,
    expr2Optimise='chi2', minimiseExpr=False,
    maxIters=30, termExprValue=0.5,
)

zoption.submit(times=3, onlyBest=2)

zoption.getResults()

In [None]:
# Variant minDist, maxDist and discrDistCutsFctr, with a constant fitDistCuts.
zoption = ads.MCDSZerothOrderTruncationOptimisation(
    engine=eng, name='alarv', logData=False, autoClean=False,
    sampleDataSet=sds, distanceField='DISTANCE',
    estimKeyFn='HNO', estimAdjustFn='COS',
    estimCriterion='AIC', cvInterval=95,
    minDist=(0, 50), maxDist=ads.Interval(150, 200),
    fitDistCuts=15, discrDistCutsFctr=(0.5, 1.5),
    expr2Optimise='chi2', minimiseExpr=False, maxIters=6,
)

zoption.submit()

zoption.getResults()

In [None]:
# Variant maxDist, constant minDist ; the other truncation params are absent.
zoption = ads.MCDSZerothOrderTruncationOptimisation(
    engine=eng, name='alarv', logData=True, autoClean=False,
    sampleDataSet=sds, distanceField='DISTANCE',
    estimKeyFn='HAZ', estimAdjustFn='POLY',
    estimCriterion='AIC', cvInterval=95,
    maxDist=dict(min=150, max=200), minDist=20,
    expr2Optimise='ks', minimiseExpr=False, maxIters=5,
)

zoption.submit()

zoption.getResults()

In [None]:
# Variant minDist, constant maxDist ; the other truncation params are absent.
zoption = ads.MCDSZerothOrderTruncationOptimisation(
    engine=eng, name='alarv', logData=False,
    sampleDataSet=sds, distanceField='DISTANCE',
    estimKeyFn='UNI', estimAdjustFn='POLY',
    estimCriterion='AIC', cvInterval=95,
    minDist=(0, 50), maxDist=200.0,
    expr2Optimise='chi2*ks', minimiseExpr=False, maxIters=5,
)

zoption.submit()

zoption.getResults()

In [None]:
# Error passed at setup time (constructor 'error' param.) => no real run.
zoption = ads.MCDSZerothOrderTruncationOptimisation(
    engine=eng, name='alarv', logData=False,
    error='Setup error !',
    sampleDataSet=sds, distanceField='DISTANCE',
    estimKeyFn='UNI', estimAdjustFn='POLY',
    estimCriterion='AIC', cvInterval=95,
    minDist=(0, 50),
    expr2Optimise='chi2*ks', minimiseExpr=False, maxIters=5,
)

zoption.submit()

zoption.getResults()

In [None]:
# Error passed at submit time (submit 'error' param.) => no real run.
zoption = ads.MCDSZerothOrderTruncationOptimisation(
    engine=eng, name='alarv', logData=False,
    sampleDataSet=sds, distanceField='DISTANCE',
    estimKeyFn='UNI', estimAdjustFn='POLY',
    estimCriterion='AIC', cvInterval=95,
    minDist=(0, 50),
    expr2Optimise='cvmuw', minimiseExpr=False, maxIters=5,
)

zoption.submit(error='Submit error !')

zoption.getResults()

### d. Done.

In [None]:
# Optimisations done: shut down the MCDS engine created for this section.
eng.shutdown()

## 11. DSParamsOptimiser abstract class 

(class and static methods)

In [None]:
# Short alias for the class whose class and static methods are tested below.
adspo = ads.DSParamsOptimiser

### a. _parseUserSpec

In [None]:
# Defs for param. spec. mini-language: these names are passed as the 'locals'
# of the _parseUserSpec calls in the cells below.
# NOTE(review): 'abs' and 'min' (and the 'min' / 'max' parameters) shadow Python builtins
# in the notebook namespace from here on ; they are kept as is because the following cells
# reference these functions by name.
auto = adspo.Auto()
def dist(min, max):
    # Distance interval, bounds converted to int.
    return adspo.DistInterval(int(min), int(max))
def quant(pct):
    # 'quant' outliers method, percentage converted to int.
    return adspo.OutliersMethod('quant', int(pct))
def tucquant(pct):
    # 'tucquant' outliers method, percentage converted to float.
    return adspo.OutliersMethod('tucquant', float(pct))
def mult(min, max):
    # Multiplier interval, bounds converted to float.
    return adspo.MultInterval(float(min), float(max))
def abs(min, max):
    # Absolute interval, bounds converted to int.
    return adspo.AbsInterval(int(min), int(max))
def min(expr):
    # Minimisation of the given expression, as a plain dict.
    return dict(op='min', expr=expr)

In [None]:
# Parse spec : no error (note: look at case ;-).
# Parse spec: no error expected, whatever the spec (note: look at the case of the specs ;-).
for spec in [5, 12.0, 'auto', 'Auto', 'dist(5, 12)', 'quant(8)', 'QUANT(12)',
             'tucquant(5)', 'mult(1.4, 7.3)', 'Abs(4, 10)']:
    r = adspo._parseUserSpec(
        spec,
        globals={'Auto': adspo.Auto,
                 'DistInterval': adspo.DistInterval,
                 'AbsInterval': adspo.AbsInterval,
                 'MultInterval': adspo.MultInterval,
                 'OutliersMethod': adspo.OutliersMethod},
        locals={'auto': auto, 'dist': dist, 'quant': quant, 'tucquant': tucquant,
                'mult': mult, 'abs': abs},
    )
    assert r[0] is None
    print(spec, '=>', ', '.join(map(str, r)))

In [None]:
# Parse spec: errors expected because of bad output types (none of the results is a dict).
for spec in [1, 6.0, 'auto', 'dist(5, 12)', 'quant(8)', 'tucquant(5)',
             'mult(1.4, 7.3)', 'abs(4, 10)']:
    r = adspo._parseUserSpec(
        spec,
        globals={'Auto': adspo.Auto,
                 'DistInterval': adspo.DistInterval,
                 'AbsInterval': adspo.AbsInterval,
                 'MultInterval': adspo.MultInterval,
                 'OutliersMethod': adspo.OutliersMethod},
        locals={'auto': auto, 'dist': dist, 'quant': quant, 'tucquant': tucquant,
                'mult': mult, 'abs': abs},
        errIfNotA=[dict],
    )
    assert r[1] is None
    print(spec, '=>', ', '.join(map(str, r)))

In [None]:
# Parse spec: null or empty specs give no error, and the nullOrEmpty value as a result.
for spec in [None, np.nan, '', '   ']:
    r = adspo._parseUserSpec(
        spec,
        globals={'Auto': adspo.Auto,
                 'DistInterval': adspo.DistInterval,
                 'AbsInterval': adspo.AbsInterval,
                 'MultInterval': adspo.MultInterval,
                 'OutliersMethod': adspo.OutliersMethod},
        locals={'auto': auto, 'dist': dist, 'quant': quant, 'tucquant': tucquant,
                'mult': mult, 'abs': abs},
        nullOrEmpty='rien',
        errIfNotA=[dict],  # Note that errIfNotA is ignored (feature).
    )
    assert r[0] is None and r[1] == 'rien'
    print(spec, '=>', ', '.join(map(str, r)))

In [None]:
# Parse spec: oneStrArg mode, no error expected (the whole argument reaches min() as a string).
for spec in ['min(ks*chi2/12)']:
    r = adspo._parseUserSpec(spec, globals={}, locals={'min': min}, oneStrArg=True)
    assert r[0] is None and r[1] == {'op': 'min', 'expr': 'ks*chi2/12'}
    print(spec, '=>', ', '.join(map(str, r)))

In [None]:
# Parse spec: errors expected (bad syntax, unknown function, bad number or type of args).
for spec in ['dist(5m, 12m)', 'quant(8%)', 'tucquant(t)', 'tuckey(5)',
             'mult(1,4, 7.3)', 'abs(4, \'m\')']:
    r = adspo._parseUserSpec(
        spec,
        globals={'Auto': adspo.Auto,
                 'DistInterval': adspo.DistInterval,
                 'AbsInterval': adspo.AbsInterval,
                 'MultInterval': adspo.MultInterval,
                 'OutliersMethod': adspo.OutliersMethod},
        locals={'auto': auto, 'dist': dist, 'quant': quant, 'tucquant': tucquant,
                'mult': mult, 'abs': abs},
    )
    assert r[1] is None
    print(spec, '=>', ', '.join(map(str, r)))

### b. _parseDistTruncationUserSpec

In [None]:
# No error: each spec must parse to the expected value, of an accepted type.
# Each case: the spec, the extra keyword args of the call, the expected parsed value.
for spec, parseKwargs, expected in [
        (2.0, dict(errIfNotA=[float]), 2.0),
        (7, dict(errIfNotA=[int]), 7),
        ('auto', dict(errIfNotA=[adspo.Auto]), adspo.Auto()),
        ('quant(5)', dict(errIfNotA=[adspo.OutliersMethod]), adspo.OutliersMethod('quant', 5)),
        ('abs(8, 12)', dict(errIfNotA=[adspo.AbsInterval]), adspo.AbsInterval(8, 12)),
        ('dist(0, 70)', dict(errIfNotA=[adspo.DistInterval]), adspo.DistInterval(0, 70)),
        ('mult(0.6, 1.2)', dict(errIfNotA=[adspo.MultInterval]), adspo.MultInterval(0.6, 1.2)),
        ('tucquant(2.5)', dict(), adspo.OutliersMethod('tucquant', 2.5)),  # Default errIfNotA
]:
    r = adspo._parseDistTruncationUserSpec(spec, **parseKwargs)
    print(r)
    assert r == (None, expected)

In [None]:
# Bad type errors: each spec parses to a type that is not among the accepted ones
# (accepted types given as a tuple or as a list).
for spec, okTypes in [
        ('auto', (adspo.AbsInterval, adspo.MultInterval)),
        ('quant(5)', [adspo.Auto]),
        ('abs(8, 12)', (adspo.OutliersMethod,)),
        ('mult(0.6, 1.2)', (adspo.DistInterval, adspo.OutliersMethod)),
        ('tucquant(2.5)', (adspo.DistInterval, adspo.MultInterval)),
]:
    r = adspo._parseDistTruncationUserSpec(spec, errIfNotA=okTypes)
    print(r[0])
    assert r[0] is not None and r[1] is None

In [None]:
# Parsing errors: each of these specs must give an error message and no parsed value.
for spec in ['autox', 'tuckey(5)', 'abs(12)', 'mult(0.6, x)', 'tucquant(2.5%)']:
    r = adspo._parseDistTruncationUserSpec(spec)
    print(r[0])
    assert r[0] is not None and r[1] is None

## 12. MCDSTruncationOptimiser abstract class

In [None]:
# Short alias for the class, used below to reach its IntSpecXxx constants.
adsto = ads.MCDSTruncationOptimiser

### a. Individualised data set

In [None]:
# Count columns of the field data set (dropped once the sightings are individualised).
countCols =  ['nMalAd10', 'nAutAd10', 'nMalAd5', 'nAutAd5']

def count2AdultCat(sCounts):
    """Adult category from a series of counts: 'm' if the name of the first
    strictly positive count contains 'Mal', 'a' otherwise."""
    firstPosCountName = sCounts[sCounts > 0].index[0]
    if 'Mal' in firstPosCountName:
        return 'm'
    return 'a'

def count2DurationCat(sCounts):
    """Duration category from a series of counts: '5mn' if the name of the first
    strictly positive count contains '5', '10mn' otherwise."""
    firstPosCountName = sCounts[sCounts > 0].index[0]
    if '5' in firstPosCountName:
        return '5mn'
    return '10mn'

# Load the field data, with the 2 mono-category columns computed from the counts.
fds = ads.FieldDataSet(
    source='refin/ACDC2019-Naturalist-ExtraitObsBrutesAvecDist.txt',
    importDecFields=['distMem'],
    countCols=countCols,
    addMonoCatCols={'Adulte': count2AdultCat, 'Durée': count2DurationCat},
)

# Individualise the sightings, and get rid of the now useless count columns.
dfObsIndiv = fds.individualise()
dfObsIndiv.drop(columns=countCols, inplace=True)

dfObsIndiv.tail()

In [None]:
# Column names and survey area info passed to the optimiser constructors below.
transectPlaceCols = ['Point']
passIdCol = 'Passage'
effortCol = 'Effort'

sampleDistCol = 'distMem'
sampleDecCols=[effortCol, sampleDistCol]

# Columns identifying a sample.
sampleCols = ['Espèce', passIdCol, 'Adulte', 'Durée']

# Names of the analysis index and analysis abbreviation columns.
varIndCol = 'IndAnlys'
anlysAbbrevCol = 'AbrevAnlys'

dSurveyArea = dict(Zone='ACDC', Surface='2400')

In [None]:
# Show samples: the distinct combinations of sample columns present in the individualised data.
dfObsIndiv[sampleCols].drop_duplicates()

### b. Ctor

In [None]:
# Check run method and time-out support: the constructor is expected to refuse
# the 'os.system' run method when an execution time limit is requested.
# Note: sampleCols (defined in this section) is used for sampleSelCols, so that this cell
# does not depend on a variable only defined in section 9.
try:
    optr = ads.MCDSTruncationOptimiser \
                    (dfObsIndiv, effortConstVal=1, dSurveyArea=dSurveyArea, 
                     transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                     sampleSelCols=sampleCols, sampleDecCols=sampleDecCols, sampleDistCol=sampleDistCol,
                     workDir=tmpDir / 'mcds-optr', runMethod='os.system', runTimeOut=120)
except AssertionError as exc:
    # Only the expected refusal is accepted: any other assertion failure is a real error.
    if re.search("Can't care about .+s execution time limit", str(exc)):
        print('Good: Expected refuse to work for incompatible params')
    else:
        raise
else:
    # No exception at all: the check must fail, rather than silently pass.
    raise AssertionError("Expected refusal of runMethod='os.system' with a runTimeOut, but none occurred")

In [None]:
# An operational optimiser, for the checks below.
optr = ads.MCDSTruncationOptimiser(
    dfObsIndiv, effortConstVal=1, dSurveyArea=dSurveyArea,
    transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
    sampleDecCols=sampleDecCols, sampleDistCol=sampleDistCol,
    distanceUnit='Meter', areaUnit='Hectare',
    surveyType='Point', distanceType='Radial', clustering=False,
    resultsHeadCols={'before': [varIndCol], 'sample': sampleCols, 'after': [anlysAbbrevCol]},
    abbrevCol=anlysAbbrevCol, workDir=tmpDir / 'mcds-optr', logData=False,
    # Default values for the analysis and optimisation parameters.
    defEstimKeyFn='HNO', defEstimAdjustFn='COS',
    defEstimCriterion='AIC', defCVInterval=95,
    defExpr2Optimise='chi2', defMinimiseExpr=False,
    defOutliersMethod='tucquant', defOutliersQuantCutPct=5,
    defFitDistCutsFctr={'min': 2/3, 'max': 3/2},
    defDiscrDistCutsFctr={'min': 1/3, 'max': 1},
    defSubmitTimes=4, defSubmitOnlyBest=2,
    dDefOptimCoreParams={'core': 'zoopt'},
)

### c. getAnalysisOptimExprParams

In [None]:
# Spec is present: it must be parsed into the minimise flag and the expression.
sAnIntSpec = pd.Series({adsto.IntSpecExpr2Optimise: 'min(ks*chi2/12)'})
r = optr.getAnalysisOptimExprParams(sAnIntSpec)
print(*r)
assert r[0] is None and r[1] == {'minimiseExpr': True, 'expr2Optimise': 'ks*chi2/12'}

In [None]:
# Spec is null: the default values given to the optimiser constructor must be used.
sAnIntSpec = pd.Series({adsto.IntSpecExpr2Optimise: None})
r = optr.getAnalysisOptimExprParams(sAnIntSpec)
print(*r)
assert r[0] is None and r[1] == {'minimiseExpr': False, 'expr2Optimise': 'chi2'}

In [None]:
# Spec is absent: the default values given to the optimiser constructor must be used.
# Explicit dtype: an empty pd.Series() without dtype emits a DeprecationWarning on pandas 1.x
# (which would turn into an exception when warnings are activated as errors).
sAnIntSpec = pd.Series(dtype=object)
r = optr.getAnalysisOptimExprParams(sAnIntSpec)
print(*r)
assert r[0] is None and r[1] == dict(minimiseExpr=False, expr2Optimise='chi2')

### d. getAnalysisFixedParams

In [None]:
# All specs present: each one must be found as is in the result.
sAnIntSpec = pd.Series({
    adsto.IntSpecEstimKeyFn: 'HNO',
    adsto.IntSpecEstimAdjustFn: 'POLY',
    adsto.IntSpecEstimCriterion: 'AIC',
    adsto.IntSpecCVInterval: 97,
})
r = optr.getAnalysisFixedParams(sAnIntSpec)
print(*r)
assert r[0] is None
assert r[1] == {'estimKeyFn': 'HNO', 'estimAdjustFn': 'POLY', 'estimCriterion': 'AIC', 'cvInterval': 97}

In [None]:
# Some specs absent => default values (as given to the optimiser constructor) for these ones.
sAnIntSpec = pd.Series({
    adsto.IntSpecEstimKeyFn: 'UNI',
    adsto.IntSpecEstimAdjustFn: 'POLY',
})
r = optr.getAnalysisFixedParams(sAnIntSpec)
print(*r)
assert r[0] is None
assert r[1] == {'estimKeyFn': 'UNI', 'estimAdjustFn': 'POLY', 'estimCriterion': 'AIC', 'cvInterval': 95}

### e. getAnalysisOptimedParams

In [None]:
# Get a "random" sample from the individualised data set, and its non-null distances.
sAnSpec = pd.Series({
    'Espèce': 'Alauda arvensis',
    'Passage': 'a+b',
    'Adulte': 'm+a',
    'Durée': '10mn',
})
sds = optr._mcDataSet.sampleDataSet(sAnSpec[sampleCols])
sSampleDistances = sds.dfData[sampleDistCol].dropna()
len(sSampleDistances)

In [None]:
# Some base figures for checking results: square root of the number of non-null distances,
# and smallest and largest distances.
sqd = np.sqrt(sSampleDistances.count())
dMin, dMax = sSampleDistances.min(), sSampleDistances.max()

In [None]:
# All present and variant (check computations) 1
# a. Call the method
sAnIntSpec = pd.Series({
    adsto.IntSpecMinDist: 'auto',
    adsto.IntSpecMaxDist: 'quant(5)',
    adsto.IntSpecFitDistCuts: 'abs(8, 12)',
    adsto.IntSpecDiscrDistCuts: 'mult(0.6, 1.2)',
    adsto.IntSpecOutliersMethod: 'tucquant(2.5)',
})
e, r = optr.getAnalysisOptimedParams(sAnIntSpec, sSampleDistances)

assert e is None

sr = str({k: str(v) for k, v in r.items()})
print('Actual result   :', sr)

# b. Compute the theoretical result
qLeft, qRight = np.percentile(sSampleDistances, [2.5, 95])

print('Base variables  :', {'sqd': sqd, 'dMin': dMin, 'dMax': dMax, 'qLeft': qLeft, 'qRight': qRight})

sol = {
    'minDist': ads.Interval(dMin, qLeft),
    'maxDist': ads.Interval(qRight, dMax),
    'fitDistCuts': ads.Interval(8, 12),
    'discrDistCuts': ads.Interval(int(round(sqd*0.6)), int(round(sqd*1.2))),
}

ssol = str({k: str(v) for k, v in sol.items()})
print('Theorical result:', ssol)

# c. Check "equality" (for some reason, the str representations must be used for comparing ...)
assert sr == ssol

In [None]:
# All present and variant (check computations) 2
# a. Call the method
sAnIntSpec = pd.Series({
    adsto.IntSpecMinDist: 'quant(5)',
    adsto.IntSpecMaxDist: 'auto',
    adsto.IntSpecFitDistCuts: 'mult(3/4, 5/4)',
    adsto.IntSpecDiscrDistCuts: 'abs(4, 6)',
    adsto.IntSpecOutliersMethod: 'tucquant(1)',
})
e, r = optr.getAnalysisOptimedParams(sAnIntSpec, sSampleDistances)

assert e is None

sr = str({k: str(v) for k, v in r.items()})
print('Actual result   :', sr)

# b. Compute the theoretical result
qLeft, qRight = np.percentile(sSampleDistances, [5, 99])

print('Base variables  :', {'sqd': sqd, 'dMin': dMin, 'dMax': dMax, 'qLeft': qLeft, 'qRight': qRight})

sol = {
    'minDist': ads.Interval(dMin, qLeft),
    'maxDist': ads.Interval(qRight, dMax),
    'fitDistCuts': ads.Interval(int(round(sqd*3/4)), int(round(sqd*5/4))),
    'discrDistCuts': ads.Interval(4, 6),
}

ssol = str({k: str(v) for k, v in sol.items()})
print('Theorical result:', ssol)

# c. Check "equality" (for some reason, the str representations must be used for comparing ...)
assert sr == ssol

In [None]:
# All present and variant (check computations) 3
# a. Call the method
sAnIntSpec = pd.Series({
    adsto.IntSpecMinDist: 'auto',
    adsto.IntSpecMaxDist: 'auto',
    adsto.IntSpecFitDistCuts: 'auto',
    adsto.IntSpecDiscrDistCuts: 'auto',
    adsto.IntSpecOutliersMethod: 'tucquant(2)',
})
e, r = optr.getAnalysisOptimedParams(sAnIntSpec, sSampleDistances)

assert e is None

sr = str({k: str(v) for k, v in r.items()})
print('Actual result   :', sr)

# b. Compute the theoretical result
qLeft, qRight = np.percentile(sSampleDistances, [2, 98])

print('Base variables  :', {'sqd': sqd, 'dMin': dMin, 'dMax': dMax, 'qLeft': qLeft, 'qRight': qRight})

sol = {
    'minDist': ads.Interval(dMin, qLeft),
    'maxDist': ads.Interval(qRight, dMax),
    'fitDistCuts': ads.Interval(int(round(sqd*2/3)), int(round(sqd*3/2))),
    'discrDistCuts': ads.Interval(int(round(sqd/3)), int(round(sqd))),
}

ssol = str({k: str(v) for k, v in sol.items()})
print('Theorical result:', ssol)

# c. Check "equality" (for some reason, the str representations must be used for comparing ...)
assert sr == ssol

In [None]:
# All present and variant (check computations) 4
# a. Call the method
sAnIntSpec = pd.Series({
    adsto.IntSpecMinDist: 'auto',
    adsto.IntSpecMaxDist: 'auto',
    adsto.IntSpecFitDistCuts: 'auto',
    adsto.IntSpecDiscrDistCuts: 'auto',
    adsto.IntSpecOutliersMethod: 'auto',
})
e, r = optr.getAnalysisOptimedParams(sAnIntSpec, sSampleDistances)

assert e is None

sr = str({k: str(v) for k, v in r.items()})
print('Actual result   :', sr)

# b. Compute the theoretical result
qLeft, qRight = np.percentile(sSampleDistances, [5, 95])

print('Base variables  :', {'sqd': sqd, 'dMin': dMin, 'dMax': dMax, 'qLeft': qLeft, 'qRight': qRight})

sol = {
    'minDist': ads.Interval(dMin, qLeft),
    'maxDist': ads.Interval(qRight, dMax),
    'fitDistCuts': ads.Interval(int(round(sqd*2/3)), int(round(sqd*3/2))),
    'discrDistCuts': ads.Interval(int(round(sqd/3)), int(round(sqd))),
}

ssol = str({k: str(v) for k, v in sol.items()})
print('Theorical result:', ssol)

# c. Check "equality" (for some reason, the str representations must be used for comparing ...)
assert sr == ssol

In [None]:
# All present, some variant, some consts (check computations) 1
# a. Call the method
sAnIntSpec = pd.Series({
    adsto.IntSpecMinDist: 12,
    adsto.IntSpecMaxDist: 'quant(5)',
    adsto.IntSpecFitDistCuts: 'abs(8, 12)',
    adsto.IntSpecDiscrDistCuts: 'mult(0.6, 1.2)',
    adsto.IntSpecOutliersMethod: 'tucquant(2.5)',
})
e, r = optr.getAnalysisOptimedParams(sAnIntSpec, sSampleDistances)

assert e is None

sr = str({k: str(v) for k, v in r.items()})
print('Actual result   :', sr)

# b. Compute the theoretical result
qLeft, qRight = np.percentile(sSampleDistances, [2.5, 95])

print('Base variables  :', {'sqd': sqd, 'dMin': dMin, 'dMax': dMax, 'qLeft': qLeft, 'qRight': qRight})

sol = {
    'minDist': 12,
    'maxDist': ads.Interval(qRight, dMax),
    'fitDistCuts': ads.Interval(8, 12),
    'discrDistCuts': ads.Interval(int(round(sqd*0.6)), int(round(sqd*1.2))),
}

ssol = str({k: str(v) for k, v in sol.items()})
print('Theorical result:', ssol)

# c. Check "equality" (for some reason, the str representations must be used for comparing ...)
assert sr == ssol

In [None]:
# All present, some variant, some consts (check computations) 2
# Here: constant max distance and absolute discretisation cuts ; quantile-based min distance,
# multiplier-based fit cuts.
# a. Call method
sAnIntSpec = pd.Series({ adsto.IntSpecMinDist:'quant(5)', adsto.IntSpecMaxDist:250.0,
                         adsto.IntSpecFitDistCuts:'mult(3/4, 5/4)', adsto.IntSpecDiscrDistCuts:'abs(4, 6)',
                         adsto.IntSpecOutliersMethod:'tucquant(1)'})
e, r = optr.getAnalysisOptimedParams(sAnIntSpec, sSampleDistances)

# No error expected.
assert e is None

# Stringify each returned value, in order to compare through string representations (see c.)
sr = str({ k:str(v) for k,v in r.items() })
print('Actual result   :', sr)

# b. Compute theoretical result
# Note: qRight is only printed below (the expected max distance is the constant 250.0)
qLeft, qRight = np.percentile(a=sSampleDistances, q=[5, 99])

# (sqd, dMin and dMax are not computed in this cell: they come from an earlier one)
print('Base variables  :', dict(sqd=sqd, dMin=dMin, dMax=dMax, qLeft=qLeft, qRight=qRight))

sol = dict(minDist=ads.Interval(dMin, qLeft), maxDist=250.0,
           fitDistCuts=ads.Interval(int(round(sqd*3/4)), int(round(sqd*5/4))), discrDistCuts=ads.Interval(4, 6))

ssol = str({ k:str(v) for k,v in sol.items() })
print('Theorical result:', ssol)

# c. Check "equality" (for some reason, must use str repr for comparison ...)
assert sr == ssol

In [None]:
# All present, some variant, some consts (check computations) 3
# Here: only the number of fit cuts is a constant ; outliers method 'tucquant(2)'.
# a. Call method
sAnIntSpec = pd.Series({ adsto.IntSpecMinDist:'auto', adsto.IntSpecMaxDist:'auto',
                         adsto.IntSpecFitDistCuts:17, adsto.IntSpecDiscrDistCuts:'auto',
                         adsto.IntSpecOutliersMethod:'tucquant(2)'})
e, r = optr.getAnalysisOptimedParams(sAnIntSpec, sSampleDistances)

# No error expected.
assert e is None

# Stringify each returned value, in order to compare through string representations (see c.)
sr = str({ k:str(v) for k,v in r.items() })
print('Actual result   :', sr)

# b. Compute theoretical result
# Outliers expected to be cut at the 2% / 98% percentiles of the sample distances
qLeft, qRight = np.percentile(a=sSampleDistances, q=[2, 98])

# (sqd, dMin and dMax are not computed in this cell: they come from an earlier one)
print('Base variables  :', dict(sqd=sqd, dMin=dMin, dMax=dMax, qLeft=qLeft, qRight=qRight))

sol = dict(minDist=ads.Interval(dMin, qLeft), maxDist=ads.Interval(qRight, dMax),
           fitDistCuts=17, discrDistCuts=ads.Interval(int(round(sqd/3)), int(round(sqd))))

ssol = str({ k:str(v) for k,v in sol.items() })
print('Theorical result:', ssol)

# c. Check "equality" (for some reason, must use str repr for comparison ...)
assert sr == ssol

In [None]:
# All present, some variant, some consts (check computations) 4
# Here: only the number of discretisation cuts is a constant ; everything else is 'auto'.
# a. Call method
sAnIntSpec = pd.Series({ adsto.IntSpecMinDist:'auto', adsto.IntSpecMaxDist:'auto',
                         adsto.IntSpecFitDistCuts:'auto', adsto.IntSpecDiscrDistCuts:6,
                         adsto.IntSpecOutliersMethod:'auto'})
e, r = optr.getAnalysisOptimedParams(sAnIntSpec, sSampleDistances)

# No error expected.
assert e is None

# Stringify each returned value, in order to compare through string representations (see c.)
sr = str({ k:str(v) for k,v in r.items() })
print('Actual result   :', sr)

# b. Compute theoretical result
# Outliers cut at the 5% / 95% percentiles of the sample distances
# (presumably the default outliers method of optr, built in an earlier cell - TODO confirm)
qLeft, qRight = np.percentile(a=sSampleDistances, q=[5, 95])

# (sqd, dMin and dMax are not computed in this cell: they come from an earlier one)
print('Base variables  :', dict(sqd=sqd, dMin=dMin, dMax=dMax, qLeft=qLeft, qRight=qRight))

sol = dict(minDist=ads.Interval(dMin, qLeft), maxDist=ads.Interval(qRight, dMax),
           fitDistCuts=ads.Interval(int(round(sqd*2/3)), int(round(sqd*3/2))), discrDistCuts=6)

ssol = str({ k:str(v) for k,v in sol.items() })
print('Theorical result:', ssol)

# c. Check "equality" (for some reason, must use str repr for comparison ...)
assert sr == ssol

### f. getOptimisationCoreParams

In [None]:
# Spec present in the series, but null (NaN) => default from ctor
# Expected: no error (r[0]), and only the optimisation core name as parameters (r[1])
sAnIntSpec = pd.Series({ adsto.IntSpecOptimisationCore: np.nan })
r = optr.getOptimisationCoreParams(sAnIntSpec)
print(*r)
assert r[0] is None and r[1] == dict(core='zoopt')

In [None]:
# Spec absent (empty series) => default from ctor
# Expected: no error (r[0]), and only the optimisation core name as parameters (r[1])
# Note: The dtype of the empty series is explicit, in order to avoid the pandas 1.x DeprecationWarning
#       about the default dtype of empty Series (an actual failure when warnings are turned into errors).
sAnIntSpec = pd.Series(dtype=object)
r = optr.getOptimisationCoreParams(sAnIntSpec)
print(*r)
assert r[0] is None and r[1] == dict(core='zoopt')

In [None]:
# Some specs present, with all default values ; string as last param.
# Expected: no error (r[0]), and only the optimisation core name as parameters (r[1])
sAnIntSpec = pd.Series({ adsto.IntSpecOptimisationCore: 'zoopt(mxi=0,a=racos)'})
r = optr.getOptimisationCoreParams(sAnIntSpec)
print(*r)
assert r[0] is None and r[1] == dict(core='zoopt')

In [None]:
# Some specs present, some with default values, some not, 1 non keyword param.
# Expected: no error (r[0]), and the positional value 80 returned as maxIters (r[1])
sAnIntSpec = pd.Series({ adsto.IntSpecOptimisationCore: 'zoopt(80, a=racos)'})
r = optr.getOptimisationCoreParams(sAnIntSpec)
print(*r)
assert r[0] is None and r[1] == dict(core='zoopt', maxIters=80)

In [None]:
# All specs present, no default value
# Expected: no error (r[0]), and the 4 short keywords (a, mxi, tv, mxr) returned under their long names (r[1])
sAnIntSpec = pd.Series({ adsto.IntSpecOptimisationCore: 'zoopt(a=sracos,mxi=450,tv=1,mxr=5)'})
r = optr.getOptimisationCoreParams(sAnIntSpec)
print(*r)
assert r[0] is None and r[1] == dict(core='zoopt', algorithm='sracos', maxIters=450, termExprValue=1, maxRetries=5)

### g. getOptimisationSubmitParams

In [None]:
# Spec present in the series, but null (NaN) => default from ctor
# Expected: no error (r[0]), and the default submit parameters (r[1])
sAnIntSpec = pd.Series({ adsto.IntSpecSubmitParams: np.nan })
r = optr.getOptimisationSubmitParams(sAnIntSpec)
print(*r)
assert r[0] is None and r[1] == dict(times=4, onlyBest=2)

In [None]:
# Spec absent (empty series) => default from ctor
# Expected: no error (r[0]), and the default submit parameters (r[1])
# Note: The dtype of the empty series is explicit, in order to avoid the pandas 1.x DeprecationWarning
#       about the default dtype of empty Series (an actual failure when warnings are turned into errors).
sAnIntSpec = pd.Series(dtype=object)
r = optr.getOptimisationSubmitParams(sAnIntSpec)
print(*r)
assert r[0] is None and r[1] == dict(times=4, onlyBest=2)

In [None]:
# Some specs present, with default values
# Expected: no error (r[0]) ; only the number of times is specified, and onlyBest is None (r[1])
sAnIntSpec = pd.Series({ adsto.IntSpecSubmitParams: 'times(n=9)'})
r = optr.getOptimisationSubmitParams(sAnIntSpec)
print(*r)
assert r[0] is None and r[1] == dict(times=9, onlyBest=None)

In [None]:
# All specs present, no default value
# Expected: no error (r[0]) ; positional value as times, keyword b as onlyBest (r[1])
sAnIntSpec = pd.Series({ adsto.IntSpecSubmitParams: 'times(100, b=22)'})
r = optr.getOptimisationSubmitParams(sAnIntSpec)
print(*r)
assert r[0] is None and r[1] == dict(times=100, onlyBest=22)

In [None]:
# Bad number of run times (n=0)
# Expected: no parameters (r[1]), and an error mentioning the constraint on run times (r[0])
sAnIntSpec = pd.Series({ adsto.IntSpecSubmitParams: 'times(n=0, b=22)'})
r = optr.getOptimisationSubmitParams(sAnIntSpec)
print(*r)
assert r[1] is None and str(r[0]).find('Run times must be > 0') >= 0

In [None]:
# Bad best kept values number (b=0)
# Expected: no parameters (r[1]), and an error mentioning the constraint on this number (r[0])
sAnIntSpec = pd.Series({ adsto.IntSpecSubmitParams: 'times(2, b=0)'})
r = optr.getOptimisationSubmitParams(sAnIntSpec)
print(*r)
assert r[1] is None and str(r[0]).find('Number of best kept values must be > 0') >= 0

## 13. MCDSAnalysisResultsSet

In [None]:
# Short alias for the class under test in this chapter
RS = ads.MCDSAnalysisResultsSet

### a. _indexOfDuplicates

In [None]:
# Test cases for duplicates detection on columns a and b (column c is not part of the comparison) ;
# column d documents the expected fate of each row, and is checked afterwards:
# no row starting with 'Duplicate' should remain after filtering.
df = pd.DataFrame([dict(a=1.000, b=2.00, c=3.0, d='To be kept: first so as a.round(1) == 1.0, whatever c, b == 2'),
                   dict(a=1.010, b=2.00, c=1.0, d='Duplicate: 2nd so as a.round(1) == 1.0, whatever c, b == 2'),
                   dict(a=1.049, b=2.00, c=2.0, d='Duplicate: 3rd so as a.round(1) == 1.0, whatever c, b == 2'),
                   dict(a=1.051, b=2.00, c=2.0, d='To be kept: first so as a.round(1) == 1.1, whatever c, b == 2'),
                   dict(a=1.060, b=2.00, c=2.0, d='Duplicate: 2nd so as a.round(1) == 1.1, whatever c, b == 2'),
                   dict(a=1.100, b=2.00, c=4.0, d='Duplicate: 3rd so as a.round(1) == 1.1, whatever c, b == 2'),
                   dict(a=1.151, b=2.00, c=5.0, d='To be kept: first so as a.round(1) == 1.2, whatever c, b == 2'),
                   dict(a=2.000, b=2.00, c=3.0, d='To be kept: first so as b == 2.0, whatever c, a == 2'),
                   dict(a=2.000, b=2.00, c=5.0, d='Duplicate: 2nd so as b == 2.0, whatever c, a == 2'),
                   # Description fixed: b is 2.01 in this row, not 2.0 (hence not a duplicate of the 2 rows above)
                   dict(a=2.000, b=2.01, c=9.0, d='To be kept: first so as b == 2.01, whatever c, a == 2'),
                   dict(a=2.000, b=1.9999999, c=3.0, d='To be kept: first so as b == 1.9999999, whatever c, a == 2')])

In [None]:
# Compute filter: index of the rows duplicating a previous one (keep='first'),
# when comparing columns a and b only, with a rounding spec of 1 decimal for column a
iDupes = RS._indexOfDuplicates(df, keep='first', subset=['a', 'b'], round2decs=dict(a=1))
iDupes

In [None]:
# Apply filter: drop the detected duplicates in place, and show what remains
df.drop(iDupes, inplace=True)
df

In [None]:
# Auto-check
# 1. The detected duplicates are the expected ones, in the expected order
assert all(iDupes == [1, 2, 4, 5, 8])
# 2. No remaining row is described as a duplicate
assert all('Duplicate' not in s for s in df.d)

### b. _indexOfWorstOneCriterion

In [None]:
# Filter settings
maxRes = 6  # Keep 6 best values at most.

s2filter = [0, 2, 3, 5]  # Ignore sample 1 and 4, add empty sample 5

# Test cases: one row per (sample Id s, criterion value a) pair,
# here listed sample by sample, in the order of the resulting rows.
df = pd.DataFrame([dict(s=sampId, a=critValue)
                   for sampId, critValues in [(0, [1.000, 0.010, 1.049, 1.051, 0.060, 1.100, 1.151,
                                                   2.000, 1.020, 1.500, 2.000, 1.010, 1.049, 0.051]),
                                              (1, [1.060, 1.100, 1.151]),
                                              (2, [3.000, 2.000, 6.000, 0.060, 1.100, 3.010, 2.200, 2.230]),
                                              (3, [1.100, 1.151, 2.000, 2.000]),
                                              (4, [2.000, 2.000, 2.000, 2.000, 2.000, 2.000])]
                   for critValue in critValues])

In [None]:
# Number of rows per sample, before filtering
df.s.value_counts().sort_index()

In [None]:
#df.sort_values(by=['s', 'a'])

In [None]:
# Compute filter: index of the rows to drop, for the samples listed in s2filter only,
# with criterion column a, ascendCrit=False and a target of maxRes results
# (presumably at most maxRes rows kept per sample, as checked by the auto-check cell - TODO confirm)
i2drop = RS._indexOfWorstOneCriterion(df, sampleIds=s2filter, sampleIdCol='s', critCol='a', ascendCrit=False, nTgtRes=maxRes)

i2drop

In [None]:
# Apply filter: drop the selected rows in place
df.drop(i2drop, inplace=True)

# Show what remains, sorted by sample and criterion value
df.sort_values(by=['s', 'a'])

In [None]:
# Number of rows per sample, after filtering
df.s.value_counts().sort_index()

In [None]:
# Auto-check
# 1. The dropped rows are the expected ones, in the expected order
assert all(i2drop == [2, 12, 8, 11, 0, 4, 13, 1, 21, 20])
# 2. No filtered sample has more than maxRes remaining rows
assert df[df.s.isin(s2filter)].s.value_counts().le(maxRes).all()
# 3. Max remaining criterion value of each filtered sample present in df (0, 2 and 3 ; 5 has no row)
assert df.loc[df.s.isin(s2filter)].groupby('s').a.max().le([2, 6, 2]).all()

### c. _indexOfWorstMultiOrderCriteria

In [None]:
# Filter settings: criteria columns, and criteria threshold
# (judging by the expected fates below, rows with all criteria >= supCrit get dropped - TODO confirm)
supCrit = 2
critCols = ['a', 'b']

# Test cases: sample Id (s), the 2 criteria (a, b), and the expected fate of the row (c)
df = pd.DataFrame(data=[(0, 1, 1, 'Kept thanks to a and b'),
                        (0, 0, 1, 'Kept thanks to a and b'),
                        (0, 2, 2, 'Dropped because of a and b'),
                        (0, 4, 3, 'Dropped because of a and b'),
                        (0, 3, 2, 'Dropped because of a and b'),
                        (0, 5, 1, 'Kept thanks to b'),
                        (1, 2, 4, 'Dropped because of a and b'),
                        (1, 1, 3, 'Kept thanks to a'),
                        (1, 4, 0, 'Kept thanks to b')],
                  columns=['s', 'a', 'b', 'c'])

df

In [None]:
# Compute filter: index of the rows to drop, given the criteria columns and threshold
i2drop = RS._indexOfWorstMultiOrderCriteria(df, critCols=critCols, supCrit=supCrit)

i2drop

In [None]:
# Apply filter: drop the selected rows in place
df.drop(i2drop, inplace=True)

# Show what remains, sorted by sample and first criterion
df.sort_values(by=['s', 'a'])

In [None]:
# Auto-check
# 1. The dropped rows are the expected ones, in the expected order
assert all(i2drop == [2, 3, 4, 6])
# 2. No remaining row is described as to be dropped
assert all('Dropped' not in s for s in df.c)

### d. filterSortSchemeId

In [None]:
# Results set instance used below for computing filter and sort scheme Ids
fsRes = ads.MCDSAnalysisResultsSet(sampleIndCol='Sample')

In [None]:
# De-duplication parameters shared by all the schemes below:
# columns to compare (dupSubset), and rounding specs for some of them (dDupRounds)
dupSubset = [RS.CLNObs, RS.CLEffort, RS.CLDeltaAic, RS.CLChi2, RS.CLKS, RS.CLCvMUw, RS.CLCvMCw, RS.CLDCv, 
             RS.CLPDetec, RS.CLPDetecMin, RS.CLPDetecMax, RS.CLDensity, RS.CLDensityMin, RS.CLDensityMax]
dDupRounds = {RS.CLDeltaAic: 1, RS.CLChi2: 2, RS.CLKS: 2, RS.CLCvMUw: 2, RS.CLCvMCw: 2, RS.CLDCv: 2, 
              RS.CLPDetec: 3, RS.CLPDetecMin: 3, RS.CLPDetecMax: 3, RS.CLDensity: 2, RS.CLDensityMin: 2, RS.CLDensityMax: 2}

# Scheme based on the filterSortOnExecCode method: name format without any placeholder
schEx = dict(nameFmt='ExCode', 
             method=RS.filterSortOnExecCode,
             deduplicate=dict(dupSubset=dupSubset, dDupRounds=dDupRounds),
             filterSort=dict(whichFinalQua=RS.CLCmbQuaBal1, ascFinalQua=False))

# Reference scheme based on the filterSortOnExCAicMulQua method:
# the name format has a placeholder for the sightRate filter-sort parameter
schACCQ1 = dict(nameFmt='ExAicMQua-r{sightRate:.1f}', 
                method=RS.filterSortOnExCAicMulQua,
                deduplicate=dict(dupSubset=dupSubset, dDupRounds=dDupRounds),
                filterSort=dict(sightRate=92.5, nBestAIC=3, nBestQua=1, 
                                whichBestQua=[RS.CLGrpOrdClTrChi2KSDCv, RS.CLGrpOrdClTrDCv, RS.CLGrpOrdClTrQuaBal1,
                                              RS.CLGrpOrdClTrQuaChi2, RS.CLGrpOrdClTrQuaKS, RS.CLGrpOrdClTrQuaDCv],
                                nFinalRes=12, whichFinalQua=RS.CLCmbQuaBal1, ascFinalQua=False))

# Variant: same as schACCQ1, except for sightRate (deep copies, in order not to alter schACCQ1)
schACCQ2 = copy.deepcopy(schACCQ1)
schACCQ2['filterSort']['sightRate'] = 93.0

# Variant: exact copy of schACCQ1
schACCQ3 = copy.deepcopy(schACCQ1)

# Variant: same as schACCQ1, except for nFinalRes
schACCQ4 = copy.deepcopy(schACCQ1)
schACCQ4['filterSort']['nFinalRes'] = 8

# Variant: same as schACCQ1 (through its copy schACCQ3), except for nFinalRes, again
schACCQ5 = copy.deepcopy(schACCQ3)
schACCQ5['filterSort']['nFinalRes'] = 5

In [None]:
# Auto-check
# * name format without placeholder => the Id is the name format itself
assert fsRes.filterSortSchemeId(schEx) == schEx['nameFmt']
# * identical schemes => same Id
assert fsRes.filterSortSchemeId(schACCQ3) == fsRes.filterSortSchemeId(schACCQ1)
# * different sightRate => different Ids
assert fsRes.filterSortSchemeId(schACCQ2) != fsRes.filterSortSchemeId(schACCQ1)
# * same sightRate but different nFinalRes => Ids starting with the Id of schACCQ1
#   (presumably followed by some distinguishing suffix - TODO confirm)
assert fsRes.filterSortSchemeId(schACCQ4).startswith(fsRes.filterSortSchemeId(schACCQ1)) 
assert fsRes.filterSortSchemeId(schACCQ5).startswith(fsRes.filterSortSchemeId(schACCQ1)) 

In [None]:
# Show the Id computed for each scheme (for visual inspection)
dict(schEx=fsRes.filterSortSchemeId(schEx), schACCQ1=fsRes.filterSortSchemeId(schACCQ1),
     schACCQ2=fsRes.filterSortSchemeId(schACCQ2), schACCQ3=fsRes.filterSortSchemeId(schACCQ3),
     schACCQ4=fsRes.filterSortSchemeId(schACCQ4), schACCQ5=fsRes.filterSortSchemeId(schACCQ5))

### z. TODO: Complete !!!

## 14. MCDS(Opt)AnalysisResultsSet

In [None]:
# Short alias for the class under test in this chapter (re-binds the RS alias of the previous chapter)
RS = ads.MCDSTruncOptanalysisResultsSet

### a. Prepare stuff for creating MCDSTruncOptanalysisResultsSet objects

In [None]:
# Source / Results data
# * columns identifying the transect place and the pass, and column giving the effort
transectPlaceCols = ['Point']
passIdCol = 'Passage'
effortCol = 'Effort'

# * distance column, and columns passed as sampleDecCols to the opt-analyser built below
sampleDistCol = 'Distance'
sampleDecCols = [effortCol, sampleDistCol]

# * sample index column, and columns whose values select a sample
sampleNumCol = 'NumEchant'
sampleSelCols = ['Espèce', passIdCol, 'Adulte', 'Durée']

sampleAbbrevCol = 'AbrevEchant'

# * optimisation index and abbreviation columns
optIndCol = 'IndOptim'
optAbbrevCol = 'AbrevOptim'

# * survey area description
dSurveyArea = dict(Zone='ACDC', Surface='2400')

In [None]:
# General DS analysis parameters
# * analysis index and abbreviation columns, and analysis parameter columns
varIndCol = 'NumAnlys'
anlysAbbrevCol = 'AbrevAnlys'
anlysParamCols = ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod']

# * units and survey description, passed as is to the opt-analyser built below
distanceUnit = 'Meter'
areaUnit = 'Hectare'
surveyType = 'Point'
distanceType = 'Radial'
clustering = False

In [None]:
# Results post-computation parameters
# (truncation interval specs, for left and right truncation distances, and associated epsilon ;
#  passed as is to the opt-analyser built below)
ldTruncIntrvSpecs = [dict(col='left', minDist=5.0, maxLen=5.0),
                     dict(col='right', minDist=25.0, maxLen=25.0)]
truncIntrvEpsilon = 1e-6

In [None]:
# Load individualised observations and actual transects
# (from 2 sheets of the same workbook)
indivObsFile = 'refin/ACDC2019-Naturalist-ExtraitObsIndiv.ods'

dfObsIndiv = ads.DataSet(indivObsFile, sheet='DonnéesIndiv').dfData

dfTransects = ads.DataSet(indivObsFile, sheet='Inventaires').dfData

# Show the number of rows loaded from each sheet
dict(indivObs=len(dfObsIndiv), transects=len(dfTransects))

In [None]:
# What's better for creating MCDS(Opt)AnalysisResultsSet objects than an MCDSTruncationOptanalyser instance ?
# (built from the data and parameters defined in the cells above ;
#  analysisAbbrev is the abbreviation builder defined at the beginning of the notebook)
optanlr = \
    ads.MCDSTruncationOptanalyser(dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea, 
                                  transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                                  sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols,
                                  sampleDistCol=sampleDistCol,
                                  abbrevCol=anlysAbbrevCol, abbrevBuilder=analysisAbbrev,
                                  anlysIndCol=varIndCol, sampleIndCol=sampleNumCol,
                                  distanceUnit=distanceUnit, areaUnit=areaUnit,
                                  surveyType=surveyType, distanceType=distanceType, clustering=clustering,
                                  resultsHeadCols=dict(before=[varIndCol, sampleNumCol], sample=sampleSelCols,
                                                       after=anlysParamCols + [anlysAbbrevCol]),
                                  ldTruncIntrvSpecs=ldTruncIntrvSpecs, truncIntrvEpsilon=truncIntrvEpsilon)

### b.  _filterOnExecCode

In [None]:
# Load results to play with ...
resFileName = 'refin/ACDC2019-Naturalist-UnitestOptResultats.ods'
print('Loading results from {} ...'.format(resFileName))

# Get a results set from the opt-analyser, and fill it from the file
results = optanlr.setupResults()

results.fromOpenDoc(resFileName, postComputed=True)  # Prevent re-post-computation : not a problem here, but longer

In [None]:
# Get results table (Note: No post-computed column used here, so ... recomputation authorised, but not needed, and slower)
# (a copy, as the filter below modifies the table it is given: see the before / after index comparison)
dfFilSorRes = results.getData(copy=True)
#dfFilSorRes = dfFilSorRes[dfFilSorRes[('header (head)', 'NumEchant', 'Value')] == 5].copy()  # Useful for debugging.

In [None]:
# Filter params: columns to compare for detecting duplicates, and rounding spec for each of them
dupSubset = [RS.CLDensity, RS.CLDensityMin, RS.CLDensityMax]
dDupRounds = {RS.CLDensity: 1, RS.CLDensityMin: 2, RS.CLDensityMax: 1}

In [None]:
# Save index before filtering (for later listing of the filtered-out results)
iBefore = dfFilSorRes.index
# Show the number of results before filtering
len(dfFilSorRes)

In [None]:
# Filter
filSorSteps = ads.analyser._FilterSortSteps(filSorSchId='ExCodeTst', resultsSet=results, lang='fr')  # Steps logger

# Note: the returned value (if any) is not used: the filtered results are read from dfFilSorRes afterwards
RS._filterOnExecCode(dfFilSorRes, filSorSteps, results.sampleIndCol,
                     dupSubset=dupSubset, dDupRounds=dDupRounds)

# Look at steps
filSorSteps.toList()

In [None]:
# List filtered-out results (index labels present before filtering, but no more after)
sFiltered = set(iBefore) - set(dfFilSorRes.index)
print(', '.join(str(i) for i in sFiltered))

In [None]:
# Auto-check
# Expected filtered-out results (index labels), sample by sample.
# Note: the unary '+' prefix does not change the value: it is only a visual marker
#       of the reason for the elimination (see end of line comments).
sExpected = {+0, 14, 17, 8, 9,        # sample 0: + => because of poor status elimination
             +21, 22, 23, 27,         # sample 1: otherwise, because of non-first duplicate
             31,                      # sample 2 ... etc.
             +41, +39,                # sample 3
             +56, 49, 53, 52, 57,     # sample 4
             66, 69, 68, 65, 64}      # sample 5
print(', '.join(str(i) for i in sExpected), end=' => ')

assert sFiltered == sExpected, 'Oh, oh ... not what we expected'

print('Yesssssss !')

### c. _filterOnAicMultiQua

In [None]:
# Load results to play with ...
resFileName = 'refin/ACDC2019-Naturalist-UnitestOptResultats.ods'
print('Loading results from {} ...'.format(resFileName))

# Get a results set from the opt-analyser, and fill it from the file
results = optanlr.setupResults()

results.fromOpenDoc(resFileName, postComputed=True)  # Prevent re-post-computation : we don't want it !

In [None]:
# Get results table without re-post computation : we want post-computed columns as in source workbook !
# (a copy, as the filter below modifies the table it is given: see the before / after index comparison)
dfFilSorRes = results.getData(copy=True)
#dfFilSorRes = dfFilSorRes[dfFilSorRes[('header (head)', 'NumEchant', 'Value')] == 5].copy()  # Useful for debugging

In [None]:
# Filter params (each passed below to _filterOnAicMultiQua through the keyword of the same name)
minSightRate = 92.0
nBestAicOrd = 2
nBestMQuaOrd = 1
whichBestMQuaOrd = [RS.CLGrpOrdClTrChi2KSDCv, RS.CLGrpOrdClTrQuaBal3, RS.CLGrpOrdClTrQuaChi2]
nFinalQua = 3
whichFinalQua = RS.CLCmbQuaBal3
ascFinalQua = False

In [None]:
# Save index before filtering (for later listing of the filtered-out results)
iBefore = dfFilSorRes.index
# Show the number of results before filtering
len(dfFilSorRes)

In [None]:
# Filter
filSorSteps = ads.analyser._FilterSortSteps(filSorSchId='ExAicMQuaTst', resultsSet=results, lang='fr')  # Steps logger

# Note: the returned value (if any) is not used: the filtered results are read from dfFilSorRes afterwards
RS._filterOnAicMultiQua(dfFilSorRes, filSorSteps, results.sampleIndCol,
                        minSightRate=minSightRate, nBestAicOrd=nBestAicOrd,
                        nBestMQuaOrd=nBestMQuaOrd, whichBestMQuaOrd=whichBestMQuaOrd,
                        nFinalQua=nFinalQua, whichFinalQua=whichFinalQua, ascFinalQua=ascFinalQua)

# Look at steps
filSorSteps.toList()

In [None]:
# List filtered-out results (index labels present before filtering, but no more after)
sFiltered = set(iBefore) - set(dfFilSorRes.index)
print(', '.join(str(i) for i in sFiltered))

In [None]:
# Auto-check filtered-out results
# (causes => +: lower AIC, 0+: not of best multi-qua. orders, -0: poor sight rate, other: no of N best finalQua)
# Note: the '+', '0+' and '-0' decorations do not change the values (unary plus, adding to or subtracting 0):
#       they are only visual markers of the cause of the elimination.
sExpected =  {+5, +8, +9, 0+16, 0+17, 0+18, 14-0, 19-0, 12, # sample 0
              +23, +24, 0+20, 0+21, +26, 30-0, 22,          # sample 1
              +31, 0+35, 33,                                # sample 2
              +43, 0+44, 41-0, 47-0, 39,                    # sample 3
              +49, 0+48, 50-0, 52-0, 54-0, 57-0, 56,        # sample 4
              +64, 0+65, 0+66, 63-0, 62}                    # sample 5
print(', '.join(str(i) for i in sExpected), end=' => ')

assert sFiltered == sExpected, 'Oh, oh ... not what we expected'

print('Yesssssss !')

### y. Non regression

TODO: 
* Complete this really incomplete and not working draft !!!
* Isn't the current state of code below more about integration tests ?

#### y.i. Load reference results

(generated once through valtests.ipynb/IV. Run truncation opt-analyses ...)

In [None]:
# Get a results set from the opt-analyser, for loading the reference results
refRes = optanlr.setupResults()

# NOTE(review): this is an .ods file, loaded below through fromExcel, whereas the other cells
#               of this chapter load their .ods files through fromOpenDoc - confirm fromExcel supports it.
resFileName = 'refout/ACDC2019-Naturalist-ExtraitOptResultats.ods'
print('Loading results from {} ...'.format(resFileName))

refRes.fromExcel(resFileName, postComputed=True)  # Prevent re-post-computation : this is our reference !

# NOTE(review): optanlr is shut down here, and once again in the "z. Finalisation" cell - confirm this is intended.
optanlr.shutdown()

In [None]:
# TODO: No, rather ... rather what ????
# Remove analyses with non-unique 'NumAnlys' (because of multiple optimisation tries)
# (to make comparison easier, sorry)
# * 3-level column labels of the analysis and sample numbers
numAnlysCols = ('header (head)', 'NumAnlys', 'Value')
numEchantCol = ('header (head)', 'NumEchant', 'Value')

# * boolean series: True for the rows whose 'NumAnlys' value is shared by more than 1 row
sb = refRes.dfData[[numAnlysCols, numEchantCol]].groupby([numAnlysCols]).transform(len)[numEchantCol] > 1
refRes.dropRows(sb)

# Show what remains
refRes.dfData

#### y.ii Trigger re-post-computation on a copy

(post-computations are the first thing we want to check for non regression)

In [None]:
# Set the level of the 'ads.anr' and 'ads.onr' loggers to DEBUG4 ('ads.dat' one left as is)
#ads.logger('ads.dat', level=ads.INFO, reset=True)
ads.logger('ads.anr', level=ads.DEBUG4, reset=True)
_ = ads.logger('ads.onr', level=ads.DEBUG4, reset=True)  # '_ =' prevents the notebook from displaying the returned value

In [None]:
# Work on a copy of the reference results, marked as not post-computed
res = refRes.copy()
res.setPostComputed(False)

In [None]:
# Trigger now !
# (accessing dfData is what is expected to run the post-computations, as res was marked as not post-computed)
res.dfData

#### y.iii Compare re-post-computed columns to reference.

TODO: Make this work ! As ... for the moment,
* 3 results get different truncation groups
* 29 results get different sort orders (for many or all of them)
* out of 54 total results !

In [None]:
# List the columns of the reference results
refRes.columns.to_list()

In [None]:
# Columns to use as an index for aligning the rows of the 2 results sets to compare
indexCols = [('header (head)', 'NumAnlys', 'Value'), ('header (tail)', 'TrGche', 'Value'),
 ('header (tail)', 'TrDrte', 'Value'),
 ('header (tail)', 'NbTrchMod', 'Value'),
 ('header (tail)', 'OptimTrunc', 'Value')]
# Columns to compare: those whose 1st level label is 'auto filter sort'
subsetCols=[col for col in refRes.dfData.columns if col[0] == 'auto filter sort']
subsetCols

In [None]:
# Show the differences between the reference and the re-post-computed results, for the selected columns,
# after aligning rows through the same sorted index
refRes.dfData.set_index(indexCols).sort_index()[subsetCols] \
    .compare(res.dfData.set_index(indexCols).sort_index()[subsetCols])

In [None]:
# Same comparison, but on the translated (French) tables, sorted and indexed by 'NumAnlys',
# and only for the columns whose name starts with 'Ordre', 'Qual' or 'Groupe'
# (dropCloser=14: presumably ignores the values closer than a 1e-14 relative difference - TODO confirm)
ads.DataSet.compareDataFrames(refRes.dfTransData('fr').sort_values(by='NumAnlys'),
                              res.dfTransData('fr').sort_values(by='NumAnlys'),
                              indexCols=['NumAnlys'],
                              subsetCols=[col for col in refRes.dfTransData('fr').columns
                                          if col.startswith('Ordre') or col.startswith('Qual') or col.startswith('Groupe')],
                              dropCloser=14)

### z. Finalisation

In [None]:
# Done with the opt-analyser of this chapter: shut it down
optanlr.shutdown()

# II. Integration tests

In [None]:
# Tweak trace levels.
ads.logger('ads.eng', level=ads.INFO, reset=True)
# (disabled by default: change the condition to True for DEBUG traces from the 'ads.dat' and 'ads.opr' loggers)
if False:
    ads.logger('ads.dat', level=ads.DEBUG, reset=True)
    ads.logger('ads.opr', level=ads.DEBUG, reset=True)

## 1. MCDSAnalyser : Run multiple analyses on real-life data (1/2)

### a. Individualised data set

In [None]:
# Count columns of the raw field data (passed to the FieldDataSet below, and dropped after individualisation)
countCols =  ['nMalAd10', 'nAutAd10', 'nMalAd5', 'nAutAd5']

def count2AdultCat(sCounts):
    """Adult category of an observation, from its counts.

    :param sCounts: counts series, indexed by count column name
    :return: 'm' if the name of the first column with a count > 0 contains 'Mal', otherwise 'a'
    """
    firstCountedCol = sCounts[sCounts > 0].index[0]
    if 'Mal' in firstCountedCol:
        return 'm'
    return 'a'

def count2DurationCat(sCounts):
    """Duration category of an observation, from its counts.

    :param sCounts: counts series, indexed by count column name
    :return: '5mn' if the name of the first column with a count > 0 contains '5', otherwise '10mn'
    """
    firstCountedCol = sCounts[sCounts > 0].index[0]
    if '5' in firstCountedCol:
        return '5mn'
    return '10mn'

# Load the raw field observations, with the given count columns
# and 2 mono-category columns to add (each computed by the given function)
fds = ads.FieldDataSet(source='refin/ACDC2019-Naturalist-ExtraitObsBrutesAvecDist.txt',
                       importDecFields=['distMem'], countCols=countCols,
                       addMonoCatCols={ 'Adulte': count2AdultCat, 'Durée': count2DurationCat })

# Individualise the observations
dfObsIndiv = fds.individualise()

# Count columns are no more needed
dfObsIndiv.drop(columns=countCols, inplace=True)

dfObsIndiv.tail()

### b. Explicit analysis specs

(old method: manual explicitation before run, and pass explicit specs to run ;
 see 2/2 below for the new simpler and recommended method, without prior explicitation)

In [None]:
# Columns identifying the transect place and the pass, and column giving the effort
transectPlaceCol = 'Point'
transectPlaceCols = [transectPlaceCol]
passIdCol = 'Passage'
effortCol = 'Effort'

# Distance column, and columns passed as sampleDecCols to the analysers / optimisers built below
sampleDistCol = 'distMem'
sampleDecCols=[effortCol, sampleDistCol]

# Columns whose values select a sample, and sample index column
sampleSelCols = ['Espèce', passIdCol, 'Adulte', 'Durée']
sampleIndCol = 'IndSamp'

# Analysis index and abbreviation columns
varIndCol = 'IndAnlys'
anlysAbbrevCol = 'AbrevAnlys'

# Custom pass-through column (added to the analysis specs below)
withTruncCol = 'AvecTronc'

# Survey area description
dSurveyArea = dict(Zone='ACDC', Surface='2400')

In [None]:
# Build the explicit analysis specs table from the specs workbook, keeping the listed specs
# and adding a computed abbreviation column
# (analysisAbbrev is the abbreviation builder defined at the beginning of the notebook)
dfAnlysExplSpecs = ads.DSAnalyser.explicitVariantSpecs('refin/ACDC2019-Naturalist-ExtraitSpecsAnalyses.xlsx', 
                                                        keep=['Echant1_impl', 'Echant2_impl', 'Modl_impl',
                                                              'Params1_expl', 'Params2_expl'],
                                                        varIndCol='IndAnlys',
                                                        #convertCols={ 'Durée': int }, # float 'cause of Excel
                                                        computedCols={anlysAbbrevCol: analysisAbbrev})

In [None]:
# Add a pass-through neutral column (for richer results):
# True for the analyses with neither left nor right truncation distance specified
# (vectorised row-wise test, rather than a Python lambda applied to each row).
# NOTE(review): the column is named 'AvecTronc' ("with truncation"), but it is True when
#               there is NO truncation at all - harmless for a neutral column, but confirm the intent.
dfAnlysExplSpecs[withTruncCol] = dfAnlysExplSpecs[['TrGche', 'TrDrte']].isnull().all(axis='columns')

In [None]:
# Shorten analyses list to go faster
# (disabled by default: change the condition to True to keep only the analyses of the 2 listed species)
if False:
    dfAnlysExplSpecs = dfAnlysExplSpecs[(dfAnlysExplSpecs['Espèce'].isin(['Luscinia megarhynchos', 'Turdus merula']))]
    len(dfAnlysExplSpecs)

In [None]:
# Show the explicit analysis specs
dfAnlysExplSpecs

### c. MCDSAnalyser object

In [None]:
# Build the MCDSAnalyser object
# * const effort per survey point x pass (= 1) => no need for passing transects infos (auto-generated)
# * custom pass-through column declared through anlysSpecCustCols, and requested in results head columns
# * tmpDir is defined in an earlier cell
anlysr = ads.MCDSAnalyser(dfObsIndiv, effortConstVal=1, dSurveyArea=dSurveyArea,
                          resultsHeadCols=dict(before=[varIndCol], sample=sampleSelCols, after=[withTruncCol, anlysAbbrevCol]),
                          transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                          sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols, anlysSpecCustCols=[withTruncCol],
                          distanceUnit='Meter', areaUnit='Hectare',
                          surveyType='Point', distanceType='Radial', clustering=False,
                          abbrevCol=anlysAbbrevCol, anlysIndCol=varIndCol, sampleIndCol=sampleIndCol,
                          workDir=tmpDir / 'mcds-anlr', runMethod='subprocess.run', logProgressEvery=5)

In [None]:
# Auto-check: expected number of items in the analyser specs
assert len(anlysr.specs) == 17

In [None]:
# Show the analyser specs
anlysr.specs

### d. Check analyses specs

In [None]:
# Check the explicit specs (and drop duplicates): get the resulting specs table,
# the user and internal parameter spec columns, the unmatched user columns, the verdict and its reasons
dfAnlysExplSpecs, userParamSpecCols, intParamSpecCols, unmUserParamSpecCols, verdict, reasons = \
    anlysr.explicitParamSpecs(dfExplParamSpecs=dfAnlysExplSpecs, dropDupes=True, check=True)

print(verdict, reasons, len(dfAnlysExplSpecs), userParamSpecCols, intParamSpecCols, unmUserParamSpecCols)

# Auto-check: 48 analyses, 5 user parameter columns all matched to an internal one, positive verdict, no reason
assert len(dfAnlysExplSpecs) == 48
assert userParamSpecCols == ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod']
assert intParamSpecCols == ['EstimKeyFn', 'EstimAdjustFn', 'MinDist', 'MaxDist', 'FitDistCuts']
assert unmUserParamSpecCols == []
assert verdict
assert not reasons

In [None]:
# Show the first rows of the checked explicit specs
dfAnlysExplSpecs.head()

### e. Run analyses

(parallel mode)

In [None]:
%%time

# Reference run times:
# 2021-01 to 2021-10: 4.7s, 4.9s, 5.2s Windows 10, 4-core i5-8350U, PCI-e SSD, "optimal performances" power scheme

# Analyses
results = anlysr.run(dfAnlysExplSpecs, threads=12)

#results = anlysr.run(dfAnlysExplSpecs.iloc[:2], threads=1)  # Small subset for a quicker run.

In [None]:
# Analyses done: shut down the analyser
anlysr.shutdown()

In [None]:
# Auto-check: the custom pass-through column made its way to the (translated) results
assert withTruncCol in results.dfTransData('fr').columns

In [None]:
# Show the results, with French column names
results.dfTransData('fr')

In [None]:
# Save the results to an Excel workbook (French version), in the analyser work folder
results.toExcel(pl.Path(anlysr.workDir) / 'unintst-mcds-anlyser-results-fr.xlsx', lang='fr')

In [None]:
# Show the specs attached to the results
results.specs

## 2. MCDSAnalyser : Run multiple analyses on real-life data (2/2)

(2nd, easier and recommended version, with analysis specs checks and auto-detection of analysis parameter columns)

### a. Individualised data set and analysis specs abbreviator

Run [1. MCDSAnalyser : Run multiple analyses on real-life data (1/2)](#1.-MCDSAnalyser-%3A-Run-multiple-analyses-on-real-life-data-(1%2F2)) / a. and b. before

### b. Build MCDSAnalyser object

In [None]:
# Build the MCDSAnalyser object
# * const effort per survey point x pass (= 1) => no need for passing transects infos (auto-generated)
# * tmpDir is defined in an earlier cell
anlysr = ads.MCDSAnalyser(dfObsIndiv, effortConstVal=1, dSurveyArea=dSurveyArea,
                          transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                          sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols,
                          abbrevCol=anlysAbbrevCol, abbrevBuilder=analysisAbbrev,
                          anlysIndCol=varIndCol, sampleIndCol=sampleIndCol,
                          distanceUnit='Meter', areaUnit='Hectare',
                          surveyType='Point', distanceType='Radial', clustering=False,
                          resultsHeadCols=dict(before=[varIndCol], sample=sampleSelCols, after=[anlysAbbrevCol]),
                          workDir=tmpDir / 'mcds-anlr', runMethod='subprocess.run', logProgressEvery=5)

### c. Check (and explicitate) analyses specs

In [None]:
# Workbook holding the implicit analysis specs
anlysSpecFile = 'refin/ACDC2019-Naturalist-ExtraitSpecsAnalyses.xlsx'

In [None]:
# Explicitate and check the implicit specs straight from the workbook (and drop duplicates): get the explicit
# specs table, the user and internal parameter spec columns, the unmatched user columns, the verdict and its reasons
dfAnlysExplSpecs, userParamSpecCols, intParamSpecCols, unmUserParamSpecCols, verdict, reasons = \
    anlysr.explicitParamSpecs(implParamSpecs=anlysSpecFile, dropDupes=True, check=True)

# Auto-check: 48 analyses, 5 user parameter columns all matched to an internal one, positive verdict, no reason
assert len(dfAnlysExplSpecs) == 48
assert userParamSpecCols == ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod']
assert intParamSpecCols == ['EstimKeyFn', 'EstimAdjustFn', 'MinDist', 'MaxDist', 'FitDistCuts']
assert unmUserParamSpecCols == []
assert verdict
assert not reasons

### d. Run analyses

(parallel mode, and straight from implicit specs)

In [None]:
%%time

# Reference run times:
# 2021-01 to 2021-10: 4.8s, 5.2s Windows 10, 4-core i5-8350U, PCI-e SSD, "optimal performances" power scheme

# Analyses (everything has been checked: go).
results = anlysr.run(implParamSpecs=anlysSpecFile, threads=12)

In [None]:
# Analyses done: shut down the analyser
anlysr.shutdown()

In [None]:
# Show the results, with French column names
results.dfTransData('fr')

In [None]:
# Save the results to an Excel workbook (French version), in the analyser work folder
results.toExcel(pl.Path(anlysr.workDir) / 'unintst-mcds-anlyser-results2-fr.xlsx', lang='fr')

## 3. MCDSPreAnalyser : Run multiple pre-analyses with real-life data

### Not implemented (See `valtest` notebook, chapter VIII)

## 4. MCDSZerothOrderTruncationOptimiser : Optimise truncation params on real-life data

Note: Only from explicit specs here.

### a. Individualised data set

Run [2. MCDSAnalyser : Run multiple analyses on real-life data (2/2)](#2.-MCDSAnalyser-%3A-Run-multiple-analyses-on-real-life-data-(2%2F2)) / a., b. and c. before (need for dfObsIndiv & dfAnlysExplSpecs)

In [None]:
# The analyser was only needed for building the data and specs: shut it down
anlysr.shutdown()

### b. Explicit optimisation specs

In [None]:
# Optimisation index and abbreviation columns, and custom pass-through column (species abbreviation)
optIndCol = 'IndOptim'
optAbbrevCol = 'AbrevOptim'
speAbbrevCol = 'AbrevEsp'

In [None]:
# Left part = standard analysis params without truncation specs, from the explicit analysis specs above:
# sample selection + model columns, de-duplicated and re-indexed from 0
dfOptimExplSpecs = dfAnlysExplSpecs[sampleSelCols + ['FonctionClé', 'SérieAjust']].drop_duplicates().reset_index(drop=True)
dfOptimExplSpecs

In [None]:
# Right part : as many as possible truncation optimisation params combinations
# Each column is a short pattern of values, repeated (except for MethOutliers, fully listed)
# in order to get 12 rows ; patterns have different lengths (2, 3, 4 or 6), for varied combinations.
dfMoreOptimCols = pd.DataFrame(dict(CritChx=[None, 'AIC']*6,
                                    IntervConf=[None, 95, 97]*4,
                                    TroncGche=['auto', None, 20, 'dist(5, 30)', 50.0, 'quant(3)']*2,
                                    TroncDrte=[None, 'auto', 'dist(150, 300)', 200.0, 'tucquant(2)', 250]*2,
                                    MethOutliers=[None, 'auto', None, None,
                                                  None, 'quant(6)', None, None,
                                                  None, 'tucquant(8)', None, None],
                                    NbTrModel=[None, 9.0, 'auto', 17, 'abs(5, 10)', 'mult(0.5,5/4)']*2,
                                    NbTrDiscr=[None, 'auto', 4, 'abs(5, 10)', 16.0, 'mult(0.5,5/4)']*2,
                                    ExprOpt=[None, 'max(chi2)', 'min(1-chi2)', 'max(chi2)',
                                             'max(ks)', 'max(cvmuw*cvmcw)']*2,
                                    MoteurOpt=[None, 'zoopt', 'zoopt(mxi=20, a=racos)',
                                               'zoopt(mxi=30, mxr=2, tv=0.5)']*3,
                                    ParExec=[None, 'times(2)', 'times(3, b=2)']*4))
dfMoreOptimCols

In [None]:
# Concat left and right parts (side by side: rows get aligned on their index)
dfOptimExplSpecs = pd.concat([dfOptimExplSpecs, dfMoreOptimCols], axis='columns')

In [None]:
# Add a neutral, pass-through column (from specs to results) : no real use, but for testing this useful feature
# (species abbreviation = concatenation of the first 4 characters of each word of the species name)
dfOptimExplSpecs[speAbbrevCol] = dfOptimExplSpecs['Espèce'].apply(lambda s: ''.join(m[:4] for m in s.split()))
dfOptimExplSpecs

In [None]:
# Artificially generate some duplicates (for testing auto-removal later :-)
# by appending the specs table to itself, with a brand new index.
# Note: pd.concat is used here because DataFrame.append was deprecated in pandas 1.4, and removed in pandas 2.0.
dfOptimExplSpecs = pd.concat([dfOptimExplSpecs, dfOptimExplSpecs], ignore_index=True)
len(dfOptimExplSpecs)

### c. MCDSZerothOrderTruncationOptimiser object

In [None]:
# Columns of dfOptimExplSpecs giving the analysis / optimisation parameters
optimParamsSpecsCols  = ['FonctionClé', 'SérieAjust', 'CritChx', 'IntervConf',
                         'TroncGche', 'TroncDrte', 'MethOutliers', 'NbTrModel', 'NbTrDiscr',
                         'ExprOpt', 'MoteurOpt', 'ParExec']

In [None]:
# Build the optimiser object
# * same data and columns description as for the analysers above, plus the distance column
# * custom pass-through column declared through anlysSpecCustCols, and requested in results head columns
# * def* parameters: default values for the analysis / optimisation parameters (presumably used
#   when not specified in the specs - TODO confirm)
# * tmpDir is defined in an earlier cell
zoptr = ads.MCDSZerothOrderTruncationOptimiser \
                (dfObsIndiv, effortConstVal=1, dSurveyArea=dSurveyArea, 
                 transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                 sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols, sampleDistCol=sampleDistCol,
                 anlysSpecCustCols=[speAbbrevCol], abbrevCol=optAbbrevCol, abbrevBuilder=analysisAbbrev,
                 anlysIndCol=optIndCol, sampleIndCol=sampleIndCol,
                 distanceUnit='Meter', areaUnit='Hectare',
                 surveyType='Point', distanceType='Radial', clustering=False,
                 resultsHeadCols=dict(before=[optIndCol], sample=sampleSelCols, after=optimParamsSpecsCols + [speAbbrevCol]),
                 workDir=tmpDir / 'mcds-optr', runMethod='os.system', runTimeOut=None,
                 logData=False, logProgressEvery=1, backupEvery=5,
                 defEstimKeyFn='HAZ', defEstimAdjustFn='POLY', defEstimCriterion='AIC', defCVInterval=93,
                 defExpr2Optimise='1-ks', defMinimiseExpr=True,
                 defOutliersMethod='quant', defOutliersQuantCutPct=5.5,
                 defFitDistCutsFctr=dict(min=1/2, max=4/3), defDiscrDistCutsFctr=dict(min=1/2, max=1.2),
                 defSubmitTimes=4, defSubmitOnlyBest=1,
                 defCoreMaxIters=45, defCoreTermExprValue=0.2, defCoreMaxRetries=1)

### d. Check optimisation specs

In [None]:
# Check the explicit optimisation specs (and drop duplicates): get the resulting specs table,
# the user and internal parameter spec columns, the unmatched user columns, the verdict and its reasons
dfOptimExplSpecs, userParamSpecCols, intParamSpecCols, unmUserParamSpecCols, verdict, reasons = \
    zoptr.explicitParamSpecs(dfExplParamSpecs=dfOptimExplSpecs, dropDupes=True, check=True)

# Auto-check: 12 optimisations left (artificial duplicates removed), all user parameter columns
# matched to an internal one, positive verdict, no reason
assert len(dfOptimExplSpecs) == 12
assert userParamSpecCols == optimParamsSpecsCols
assert intParamSpecCols == ['EstimKeyFn', 'EstimAdjustFn', 'EstimCriterion', 'CvInterval',
                            'MinDist', 'MaxDist', 'OutliersMethod', 'FitDistCuts', 'DiscrDistCuts',
                            'Expr2Optimise', 'OptimisationCore', 'SubmitParams']
assert unmUserParamSpecCols == []
assert verdict
assert not reasons

In [None]:
# Show the checked explicit optimisation specs
dfOptimExplSpecs

### e. Run optimisations

(parallel mode)

In [None]:
%%time

# Reference run times:
# Windows 10, 4-core i5-8350U, PCI-e SSD, "optimal performances" power scheme
# 2021-01: 12 optimisations, 1430 analyses, 12 threads : subprocess = 3mn13, system = 2mn35, 1mn54
# 2021-10-02: idem : system 2mn15

# Optimisations
results = zoptr.run(dfOptimExplSpecs, threads=12)

#results = zoptr.run(dfOptimExplSpecs.iloc[:3], threads=3)  # Small subset for quicker run.

In [None]:
# Shut the optimiser down, now that the run is over.
zoptr.shutdown()

In [None]:
# The speAbbrevCol column (listed in the 'after' results head columns) must be present
# in the 'fr' translated results data.
assert speAbbrevCol in results.dfTransData('fr').columns

In [None]:
# Display the 'fr' translated results table as the cell output.
results.dfTransData('fr')

In [None]:
# Export the results to an Excel workbook in the optimiser work folder (lang='fr').
results.toExcel(
    pl.Path(zoptr.workDir) / 'unintst-mcds-optimiser-results-fr.xlsx',
    lang='fr',
)

### f. Recovery : Run again optimisations, but from the last backup

(use case: crash, or mandatory/auto reboot of computer in the middle of a long optimisation run)

In [None]:
# TODO: Check presence, mtime and content (optims Id lists) of $workDir/optr-resbak-*.pickle.xz
#with lzma.open(fileName, 'rb') as file:
#    dfData, specs = pickle.load(file)
#    
#len(dfData), dfData.columns, len(dfData.columns), dfData.columns.duplicated().any()

In [None]:
# Re-create the optimiser object: it has to be a clone of the one whose run was backed up.
# NOTE(review): unlike the preceding constructor call (which passes runMethod='os.system',
# runTimeOut=None, logData=False and backupEvery=5), these 4 arguments are not passed here;
# presumably this does not matter for recovery - to be confirmed.
zoptr = ads.MCDSZerothOrderTruncationOptimiser(
    dfObsIndiv,
    effortConstVal=1,
    dSurveyArea=dSurveyArea,
    transectPlaceCols=transectPlaceCols,
    passIdCol=passIdCol,
    effortCol=effortCol,
    sampleSelCols=sampleSelCols,
    sampleDecCols=sampleDecCols,
    sampleDistCol=sampleDistCol,
    anlysSpecCustCols=[speAbbrevCol],
    abbrevCol=optAbbrevCol,
    abbrevBuilder=analysisAbbrev,
    anlysIndCol=optIndCol,
    sampleIndCol=sampleIndCol,
    distanceUnit='Meter',
    areaUnit='Hectare',
    surveyType='Point',
    distanceType='Radial',
    clustering=False,
    resultsHeadCols=dict(
        before=[optIndCol],
        sample=sampleSelCols,
        after=optimParamsSpecsCols + [speAbbrevCol],
    ),
    workDir=tmpDir / 'mcds-optr',
    logProgressEvery=1,
    # Default values (def* arguments), same as for the backed-up run.
    defEstimKeyFn='HAZ',
    defEstimAdjustFn='POLY',
    defEstimCriterion='AIC',
    defCVInterval=93,
    defExpr2Optimise='1-ks',
    defMinimiseExpr=True,
    defOutliersMethod='quant',
    defOutliersQuantCutPct=5.5,
    defFitDistCutsFctr=dict(min=1/2, max=4/3),
    defDiscrDistCutsFctr=dict(min=1/2, max=1.2),
    defSubmitTimes=4,
    defSubmitOnlyBest=1,
    defCoreMaxIters=45,
    defCoreTermExprValue=0.2,
    defCoreMaxRetries=1,
)

In [None]:
%%time

# Run the optimisations again in recovery mode (recover=True),
# using the exact same optimisation specs as for the backed-up run (MANDATORY).
results2 = zoptr.run(dfOptimExplSpecs, recover=True, threads=12)

#results2 = zoptr.run(dfOptimExplSpecs.iloc[:3], recover=True, threads=3)  # Small subset for quicker run.

In [None]:
# Shut the optimiser down, now that the recovery run is over.
zoptr.shutdown()

In [None]:
# Display the data table (dfData) of the recovery run results as the cell output.
results2.dfData

In [None]:
# TODO: Check that the first 10 results are equal in `results` and `results2`,
#       and check the number of results added by the recovery run.

## 4. MCDSTruncationOptAnalyser : Run multiple analyses with optimised truncation params, on real-life data

Not implemented here (but see [valtests.ipynb](valtests.ipynb#IV.-Run-truncation-opt-analyses-with-same-real-life-field-data))