<!-- Auto table of contents -->
<h1 class='tocIgnore'>AutoDS : Validation tests</h1>
<p>(for the <b>autods</b> module, a python interface to MCDS.exe, http://distancesampling.org/)</p>
<div style="overflow-y: auto">
  <h2 class='tocIgnore'>Table of contents</h2>
  <div id="toc"></div>
</div>

In [None]:
%%javascript
$.getScript('ipython_notebook_toc.js')

In [None]:
%matplotlib inline

In [None]:
import sys
import os
import pathlib as pl
import importlib as implib

import re

from collections import OrderedDict as odict, namedtuple as ntuple

import math
import numpy as np
import pandas as pd

from IPython.display import HTML, Markdown

import matplotlib.pyplot as plt

import plotly as ply
import plotly.graph_objs as plygo
import plotly.express as plyx

In [None]:
sys.path.insert(0, '..')

In [None]:
import autods as ads 

In [None]:
# Logging configuration.
# Handlers : the console (stdout) and the 'tmp/valtst.log' file.
ads.log.configure(handlers=[sys.stdout, 'tmp/valtst.log'], verbose=True, reset=True)

# matplotlib logger set to WARNING level (presumably to silence its lower level messages).
ads.logger('matplotlib', level=ads.WARNING, reset=True)

# Level for the 'ads' logger ; uncomment lines below to tune the level of its sub-loggers individually.
ads.logger('ads', level=ads.INFO1, reset=True)
#ads.logger('ads.eng', level=ads.WARNING, reset=True)  # Needs to be forced ('cause its level is set before ads, see engine.py)
#ads.logger('ads.exr', level=ads.DEBUG, reset=True)
#ads.logger('ads.rep', level=ads.DEBUG, reset=True)
#ads.logger('ads.opn', level=ads.DEBUG, reset=True)
#ads.logger('ads.opr', level=ads.DEBUG, reset=True)
#ads.logger('ads.anr', level=ads.DEBUG, reset=True)

# Logger used by the cells of this notebook.
logger = ads.logger('valtst', level=ads.DEBUG, reset=True)

In [None]:
# Activate Warnings as Exceptions
# (debugging aid, disabled by default : replace False by True to enable it)
if False:
    
    import warnings

    # Turn any warning into an exception ...
    warnings.filterwarnings(action='error')

    # ... but keep the default behaviour for warnings issued from modules matching these names
    # (NOTE(review): presumably the ones triggered through pd.read_excel - confirm)
    # pd.read_excel
    warnings.filterwarnings(action='default', module='etree')
    warnings.filterwarnings(action='default', module='xlrd')
    warnings.filterwarnings(action='default', module='defusedxml')

# Commons

In [None]:
# Short string for sample "identification"
def sampleAbbrev(sSample):
    """Build a short identification string for a sample.

    :param sSample: sample description (pandas Series-like), with 'Espèce', 'Passage',
                    'Adulte' and 'Durée' items
    :return: '<species>-<passage>-<adult>-<duration>' string, where <species> is made of
             the 4 first letters (title-cased) of each of the 2 first words of the species name,
             and where any '+' has been removed from the passage and adult values
    """
    # Species : 4 first letters of the 2 first words, title-cased, stuck together.
    speciesWords = sSample['Espèce'].split(' ')[:2]
    speciesAbbrev = ''
    for word in speciesWords:
        speciesAbbrev += word[:4].title()

    # Passage and adult : simply get rid of the '+' characters.
    passage = sSample.Passage.replace('+', '')
    adult = sSample.Adulte.replace('+', '')

    return '{}-{}-{}-{}'.format(speciesAbbrev, passage, adult, sSample['Durée'])

In [None]:
# Short string for analysis "identification"
def analysisAbbrev(sAnlys):
    """Build a short identification string for an analysis.

    :param sAnlys: analysis description (pandas Series-like), with the sample items needed by
                   sampleAbbrev, 'FonctionClé' and 'SérieAjust' items, and optional truncation items
    :return: '-'-joined string made of the sample abbreviation, the 3 first letters (lower-cased)
             of the key function and adjustment series, and 1 '<letter><value>' piece
             for each truncation parameter actually present and not null
    """
    # Pick the 1st candidate name present in the analysis index, or else the last one.
    def pickName(*candidates):
        for candidate in candidates[:-1]:
            if candidate in sAnlys.index:
                return candidate
        return candidates[-1]

    # Sample, then model abbreviations
    pieces = [sampleAbbrev(sAnlys)]
    pieces.append(sAnlys['FonctionClé'][:3].lower())
    pieces.append(sAnlys['SérieAjust'][:3].lower())

    # Truncation parameters abbreviations : prefix letter and actual column name (may vary across tables)
    truncParams = [('l', pickName('TrGche', 'TroncGche')),
                   ('r', pickName('TrDrte', 'TroncDrte')),
                   ('m', pickName('NbTrches', 'NbTrModel', 'NbTrchMod')),
                   ('d', 'NbTrDiscr')]
    for prefix, colName in truncParams:
        if colName not in sAnlys.index:
            continue
        value = sAnlys[colName]
        if pd.isnull(value):
            continue
        # Strings : only the 1st letter is kept ; numbers : truncated to integers.
        if isinstance(value, str):
            suffix = value[0].lower()
        else:
            suffix = int(value)
        pieces.append('{}{}'.format(prefix, suffix))

    return '-'.join(pieces)

# I. Run analyses with real life field data (1/2 : long code, long run)

With MCDSAnalysis class.

(for comparison to manually issued analyses with Distance 7.3)

## 1. Load analyses set specifications

In [None]:
# Load refout results table
refFileName = 'ACDC2019-Papyrus-ALAARV-TURMER-resultats-distance-73.xlsx'
dfRefRes = pd.read_excel(pl.Path('refout', refFileName))
# The row index becomes a column, renamed below as the analysis number.
dfRefRes.reset_index(inplace=True) # Generate analysis # (later need for original cases order)
dfRefRes.rename(columns=dict(index='AnlysNum', Name='Model'), inplace=True)

dfRefRes.head()

## 2. Build test cases

In [None]:
# Generate test cases definition code from refout results file (don't cheat : only input columns :-)
modelIdCols = ['Model']
modelParamCols = ['LTrunc', 'RTrunc', 'FitDistCuts', 'DiscrDistCuts']
sampleSelCols = ['Species', 'Periods', 'Prec.', 'Duration']
caseIdCols = ['AnlysNum', 'SampNum'] + sampleSelCols + modelIdCols

# Sample number : 1 per distinct combination of the sample selection columns, in order of 1st appearance.
dfRefRes['SampNum'] = dfRefRes.groupby(sampleSelCols, sort=False).ngroup()

# Only keep the case identification and model parameter columns (the inputs).
dfAnlysCases = dfRefRes[caseIdCols + modelParamCols].copy()

# Key function, deduced from the start of the model name : 'Unif...', 'Half...', or else hazard.
dfAnlysCases['KeyFn'] = \
    dfAnlysCases.Model.apply(lambda s: 'UNIFORM' if s.startswith('Unif') \
                                                 else 'HNORMAL' if s.startswith('Half') else 'HAZARD')
# Adjustment series, deduced from the model name : ' Cos', ' SimPoly', or else Hermite.
dfAnlysCases['AdjSer'] = \
    dfAnlysCases.Model.apply(lambda s: 'COSINE' if s.find(' Cos') > 0 \
                                                else 'POLY' if s.find(' SimPoly') > 0 else 'HERMITE')
# Input data file name, built from the species, the periods (AB, A or B),
# and the 1st word of the duration and of the precision.
dfAnlysCases['InFileName'] = \
    dfAnlysCases.apply(lambda sRow: 'ACDC2019-Papyrus-{}-{}-{}mn-{}dec-dist.txt' \
                                    .format(sRow.Species,
                                            'AB' if 'A+B' in sRow.Periods else 'A' if 'A' in sRow.Periods else 'B',
                                            sRow.Duration.split(' ')[0], sRow['Prec.'].split(' ')[0]),
                       axis='columns')
dfAnlysCases

In [None]:
#def nan2None(v):
#    return None if pd.isnull(v) else v
def distCutsFromSpecs(v):
    """Convert a distance cuts specification (as read from the specs table) to an analysis parameter.

    :param v: None / NaN (no cuts), an integer-valued number (int, numpy integer,
              or float with no fractional part, as Excel columns mixing numbers and blanks give),
              or a string of comma-separated numbers.
    :return: None for a null value, an int for a number, a list of floats for a string.
    :raises TypeError: for any other kind of value (ex: a float with a fractional part).
    :raises ValueError: for a string with an item that is not a number.
    """
    if pd.isnull(v):
        return None

    # Python integers : returned as is.
    if isinstance(v, int):
        return v

    # Numpy integers and integral floats : these are not int instances,
    # and formerly crashed below on the missing split method.
    if isinstance(v, np.integer):
        return int(v)
    if isinstance(v, (float, np.floating)) and float(v).is_integer():
        return int(v)

    # Comma-separated cut points.
    if isinstance(v, str):
        return [float(x) for x in v.split(',')]

    raise TypeError('Unsupported distance cuts specification: {!r}'.format(v))

## 3. Prepare analyses

In [None]:
decimalFields = ['Point transect*Survey effort', 'Observation*Radial distance']

In [None]:
# Analysis engine (sequential)
# Output goes to the 'tmp/mcds-out' work folder ; units and survey options are the same
# as those given to the results set built in a later cell.
mcds = ads.MCDSEngine(workDir=pl.Path('tmp', 'mcds-out'),
                      executor=None, # Non-parallel: ~7.5s elapsed on a Lenovo P52 (6-core i7-8850H with PCI-e SSD)
                      distanceUnit='Meter', areaUnit='Hectare',
                      surveyType='Point', distanceType='Radial', clustering=False)

In [None]:
# Frozen analysis parameters (a choice here)
KEstimCriterion = 'AIC'
KCVInterval = 95

In [None]:
# Results object construction
# Sample columns, as 3-level tuples.
sampCols = [('sample', col, 'Value') for col in sampleSelCols]
miSampCols = pd.MultiIndex.from_tuples(sampCols)

# Custom (header) columns : analysis and sample numbers, sample columns, and model name,
# in the same order as caseIdCols (needed for the translation table below).
custCols = [('sample', 'AnlysNum', 'Value'), ('sample', 'SampNum', 'Value')] + sampCols + [('model', 'Model', 'Value')]
miCustCols = pd.MultiIndex.from_tuples(custCols)

# Translation table of the custom columns : 1 row per column, 1 column per language.
dfCustColTrans = \
    pd.DataFrame(index=miCustCols,
                 data=dict(en=caseIdCols, fr=['NumAnlys', 'NumEchant', 'Espèce', 'Périodes', 'Préc.', 'Durée', 'Modèle']))

results = ads.MCDSAnalysisResultsSet(miCustomCols=miCustCols, miSampleCols=miSampCols, dfCustomColTrans=dfCustColTrans,
                                     distanceUnit='Meter', areaUnit='Hectare',
                                     surveyType='Point', distanceType='Radial', clustering=False)

In [None]:
computed = False

## 3a. Or : Really run analyses

In [None]:
# Shorten test cases and reference results lists, to go faster
# Warning: If you don't retain entire samples, later comparison will fail on Delta AIC values.
#selCaseInds = [0, 5, 7, 22, 31] # Some random cases, with incomplete samples.
#selCaseInds = dfAnlysCases[dfAnlysCases.SampNum.isin([3, 4])].index # A shorter selection, with complete samples.
selCaseInds = range(len(dfAnlysCases)) # All of them.

# Apply the same selection to the test cases and to the reference results.
nOrigAnlysCases = len(dfAnlysCases)
dfAnlysCases = dfAnlysCases.loc[selCaseInds]
dfRefRes = dfRefRes.loc[selCaseInds]

logger.info(f'Retained {len(selCaseInds)} out of {nOrigAnlysCases}.')

In [None]:
%%time

# Run all analyses
lastInFileName = None
for _, sCase in dfAnlysCases.iterrows():
    
    # Analysis name : input file name without its constant prefix and suffix,
    # followed by the lower-cased model name, where spaces and punctuation are replaced by '-'.
    nCase = sCase.AnlysNum
    name = sCase.InFileName[len('ACDC2019-Papyrus')+1:-len('-dist.txt')]
    name += '-' + sCase.Model.lower().translate(str.maketrans({c:'-' for c in ' ,.:;()/'}))
    logger.info(f'#{nCase+1:3d} {name} {sCase.KeyFn} {sCase.AdjSer}')
    
    # Create data set if not already done.
    # (only when the input file changes from the previous case)
    if lastInFileName != sCase.InFileName:
        sds = ads.SampleDataSet(pl.Path('refin', sCase.InFileName), decimalFields=decimalFields)
        lastInFileName = sCase.InFileName
        
    # Run analysis and get results
    anlys = ads.MCDSAnalysis(engine=mcds, sampleDataSet=sds, name=name, logData=True,
                             estimKeyFn=sCase.KeyFn, estimAdjustFn=sCase.AdjSer,
                             estimCriterion=KEstimCriterion, cvInterval=KCVInterval,
                             minDist=sCase.LTrunc, maxDist=sCase.RTrunc,
                             fitDistCuts=distCutsFromSpecs(sCase.FitDistCuts),
                             discrDistCuts=distCutsFromSpecs(sCase.DiscrDistCuts))

    anlys.submit()
    sResult = anlys.getResults()

    # Save results
    # (header = the len(caseIdCols) first items of the case : relies on dfAnlysCases columns order)
    sHead = pd.Series(data=[sCase[col] for col in sCase.index[:len(caseIdCols)]], index=miCustCols)

    results.append(sResult, sCustomHead=sHead)
    
# shutdown analysis engine
mcds.shutdown()

# Done.
computed = True

In [None]:
# Save results in case need for not recomputing them
resFileName = os.path.join(mcds.workDir, 'autods-validation-results.xlsx')

results.toExcel(resFileName, sheetName='AutoDSVal')

In [None]:
resFileName = os.path.join(mcds.workDir, 'autods-validation-results-en.xlsx')

results.toExcel(resFileName, sheetName='Auto', lang='en')

In [None]:
# Check translation
dfActTrRes = results.dfTransData('fr')

dfActTrRes.head().T.iloc[:30] #.at['TroncGche', 0]

## 3b. Or : Load analyses from a previous run

(already run and saved above)

In [None]:
# Only reload from disk when the analyses have not been run in this session.
if not computed:
    
    resFileName = os.path.join(mcds.workDir, 'autods-validation-results.xlsx')
    print('Loading results from {} ...'.format(resFileName))

    results.fromExcel(resFileName, sheetName='AutoDSVal')
    
    # shutdown analysis engine
    mcds.shutdown()

else:
    
    print('Just computed, not reloading ...')
    
print('... {} analyses to compare'.format(len(results)))

# II. Compare actual results to reference

(reference = manually run analyses with Distance software)

## 1. Extract actual results to compare

In [None]:
# Analysis results
dfActRes = results.dfData

dfActRes.head().T[:30]

In [None]:
# Select columns of auto-results and match them with reference ones, for comparison.
# (keys = 3-level column labels of the actual results, values = column names of the reference table)
dCompCols = \
{
    ('sample', 'AnlysNum', 'Value'):  'AnlysNum',
    ('sample', 'SampNum', 'Value'):   'SampNum',
    ('sample', 'Species', 'Value'):   'Species',
    ('sample', 'Periods', 'Value'):   'Periods',
    ('sample', 'Prec.', 'Value'):     'Prec.',
    ('sample', 'Duration', 'Value'):  'Duration',
    
    ('model',  'Model', 'Value'):         'Model',
    ('parameters', 'left truncation distance', 'Value'):           'LTrunc',
    ('parameters', 'right truncation distance', 'Value'):          'RTrunc',
    ('parameters', 'model fitting distance cut points', 'Value'):  'FitDistCuts',
    ('parameters', 'distance discretisation cut points', 'Value'): 'DiscrDistCuts',
    
    ('run output', 'run status', 'Value') : 'Status',
    #('run output', 'run time', 'Value') : 'Run', # Only for unit tests ref. generation just below
    
    ('detection probability', 'total number of parameters (m)', 'Value'): '# params',
    ('encounter rate', 'number of observations (n)', 'Value'): '# obs',
    
    #('detection probability', 'Delta AIC', 'Value'): 'Delta AIC',
    ('detection probability', 'AIC value', 'Value'): 'AIC',
    ('detection probability', 'chi-square test probability determined', 'Value')               : 'GOF Chi-p',
    ('detection probability', 'Kolmogorov-Smirnov test probability', 'Value')                  : 'GOF K-S p',
    ('detection probability', 'Cramér-von Mises (uniform weighting) test probability', 'Value'): 'GOF CvM (unif) p',
    ('detection probability', 'Cramér-von Mises (cosine weighting) test probability', 'Value') : 'GOF CvM (cos) p',
    
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Value'): 'ESW/EDR',
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Lcl')  : 'ESW/EDR LCL',
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Ucl')  : 'ESW/EDR UCL',
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Cv')   : 'ESW/EDR CV',
    
    ('density/abundance', 'density of animals', 'Value'): 'D',
    ('density/abundance', 'density of animals', 'Lcl')  : 'D LCL',
    ('density/abundance', 'density of animals', 'Ucl')  : 'D UCL',
    ('density/abundance', 'density of animals', 'Cv')   : 'D CV',
    
    ('detection probability', 'probability of detection (Pw)', 'Value'): 'P',
    ('detection probability', 'probability of detection (Pw)', 'Lcl')  : 'P LCL',
    ('detection probability', 'probability of detection (Pw)', 'Ucl')  : 'P UCL',
    ('detection probability', 'probability of detection (Pw)', 'Cv')   : 'P CV',
    ('detection probability', 'probability of detection (Pw)', 'Df')   : 'P DF',
}
len(dCompCols)

In [None]:
# Warning: Unused columns (full of NaNs) have been automatically removed
# (see last line of AnalysisResultsSet.dfData getter)
# => only keep the columns to compare that are actually present in the results.
dCompCols = { k: v for k, v in dCompCols.items() if k in dfActRes.columns }
len(dCompCols)

In [None]:
# So we need to cleanup modelParamCols too
modelParamCols = [id_ for id_ in modelParamCols if id_ in dCompCols.values()]
len(modelParamCols)

In [None]:
# Safe stringification of model params
# * needed for use in indexes (hashability)
# * needed to cope with to_excel/read_excel inconsistent None management
def modelParam2Str(par):
    """Return a canonical string for a model parameter value.

    :param par: a list of numbers, a null value (None, NaN), a string
                (possibly an already stringified list, like '[1, 2.5]' or '1,2.5'),
                or any other scalar
    :return: 'None' for a null value ; the str() of the list of floats for a list
             or for a string containing commas ; the string itself for any other string ;
             str(par) for anything else
    :raises ValueError: if an item of a list (or of a comma-separated string) is not a number
    """
    # Lists first : pd.isnull would return an array for them.
    if isinstance(par, list):
        return str([float(v) for v in par])

    if pd.isnull(par):
        return 'None'

    if isinstance(par, str):
        if ',' in par: # Assumed already somewhat stringified list
            return str([float(v) for v in par.strip('[]').split(',')])
        # Fix: strings with no comma formerly left the result unassigned (UnboundLocalError).
        return par

    return str(par)

In [None]:
# Select results columns and rename them as the reference is, for easier comparison
dfActRes4c = dfActRes[list(dCompCols.keys())].copy()
dfActRes4c.columns = [dCompCols[col] for col in dCompCols]
# NOTE(review): DataFrame.applymap is deprecated in recent pandas versions (DataFrame.map replaces it)
dfActRes4c[modelParamCols] = dfActRes4c[modelParamCols].applymap(modelParam2Str) # Hashable mandatory for indexing
# Index = case identification + (stringified) model parameters ; remaining columns = values to compare.
dfActRes4c.set_index(caseIdCols + modelParamCols, inplace=True)

dfActRes4c

In [None]:
# Select useful reference columns for comparison
dfRefRes4c = dfRefRes.copy()
# NOTE(review): DataFrame.applymap is deprecated in recent pandas versions (DataFrame.map replaces it)
dfRefRes4c[modelParamCols] = dfRefRes4c[modelParamCols].applymap(modelParam2Str) # Hashable mandatory for indexing
dfRefRes4c.set_index(caseIdCols + modelParamCols, inplace=True)
# Same columns, in the same order, as the actual results table.
dfRefRes4c = dfRefRes4c.reindex(columns=dfActRes4c.columns)

dfRefRes4c

In [None]:
#dfActRes4c.to_excel('tmp/act-res.xlsx')
#dfRefRes4c.to_excel('tmp/ref-res.xlsx')

## 2. Automated diagnosis

In [None]:
# First checks : equality of test case lists (index) and of column names (columns)
assert sorted(dfActRes4c.index)   == sorted(dfRefRes4c.index)
assert sorted(dfActRes4c.columns) == sorted(dfRefRes4c.columns)

In [None]:
# Actual / reference closeness measure :
#   round(-log10(abs(actual - reference) / max(abs(actual), abs(reference))), 1)
# = Compute the order of magnitude that separates the difference from the absolute max. of the two values
# The greater it is, the lower the relative difference
#    Ex: 3 = 10**3 ratio between the absolute max. of the two values and their difference,
#        +inf = NO difference at all,
#        0 = bad, one of the two is 0, and the other not,
# See unit test below.
def closeness(sRefAct):
    """Compute the closeness indicator of a (reference, actual) pair of values.

    :param sRefAct: 2-item Series-like (needs a to_list method) : the 2 numbers to compare
    :return: 0 if only 1 of the 2 values is NaN, or if any of them is infinite ;
             NaN if the 2 values are NaN ;
             +inf if the 2 values are equal ;
             otherwise the rounded (1 decimal) opposite of the log10 of their relative difference
    """
    ref, act = sRefAct.to_list()

    # Exactly 1 NaN among the 2 => all different
    if np.isnan(ref) != np.isnan(act):
        return 0

    # 1 or more inf => all different
    if np.isinf(ref) or np.isinf(act):
        return 0

    # Normal case : relative difference (left as is when 0 or NaN)
    gap = abs(ref - act)
    if gap != 0 and not np.isnan(gap):
        gap /= max(abs(ref), abs(act))

    if gap == 0:
        return np.inf

    return round(-np.log10(gap), 1)

In [None]:
# Actual / reference comparison : compute closeness indicator
# (result : same shape as the reference table, each value being replaced by its closeness to the actual one)
dfRelDif = dfRefRes4c.copy()
for col in dfRelDif.columns:
    # Temporary 'act' column : actual values, aligned on the reference index by the assignment.
    dfRelDif['act'] = dfActRes4c[col]
    dfRelDif[col] = dfRelDif[[col, 'act']].apply(closeness, axis='columns')
    dfRelDif.drop(columns='act', inplace=True)
    
dfRelDif

In [None]:
# Diagnosis : we only keep lines and columns with some relevant differences.
dfBadRelDif = dfRelDif.copy()
len(dfBadRelDif)

In [None]:
# 1. Suppress rows : Same status and NaNs in the remainder (if status == 0/3/4, execution error or no execution)
#    (same status <=> infinite closeness ; NaN closeness <=> NaN in both actual and reference)
valCols = [col for col in dfRelDif.columns if col != 'Status']
dfBadRelDif.drop(dfBadRelDif[(dfBadRelDif.Status.abs() == np.inf) & dfBadRelDif[valCols].isnull().all(axis='columns')].index,
                 axis='index', inplace=True)
# Expected remaining rows : hard-coded for the full list of test cases.
assert len(dfBadRelDif) == 29, len(dfBadRelDif)
anlysNums = dfBadRelDif.index.get_level_values('AnlysNum').to_list()
assert anlysNums == [0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17,
                     18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], anlysNums
print(len(dfBadRelDif), 'analyses:', ', '.join(map(str, anlysNums)))

In [None]:
# 2. Suppress rows : Status and all other columns == inf (<=> strict equality)
#    NB. Some very small differences observed when results have just been computed or when they have been
#        loaded from a previously saved Excel file (above 10**15 closeness value)
dfBadRelDif.drop(dfBadRelDif[dfBadRelDif.apply(np.isinf, axis='columns').all(axis='columns')].index,
                 axis='index', inplace=True)
# Expected remaining rows : hard-coded, and different whether results were just computed or reloaded from disk.
assert (computed and len(dfBadRelDif) == 26) or (not computed and len(dfBadRelDif) == 17), len(dfBadRelDif)
anlysNums = dfBadRelDif.index.get_level_values('AnlysNum').to_list()
assert (computed and anlysNums == [0, 1, 2, 5, 6, 7, 8, 9, 13, 14, 15, 17, 18,
                                   19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]) \
       or (not computed and anlysNums == [0, 1, 2, 7, 8, 9, 13, 14, 15, 19, 20, 23, 25, 27, 28, 29, 30]), \
       anlysNums
print(len(dfBadRelDif), 'analyses:', ', '.join(map(str, anlysNums)))

In [None]:
# 3. Suppress rows : Status and all other columns >= 15 (<=> nearly strict equality)
dfBadRelDif.drop(dfBadRelDif[(dfBadRelDif >= 15).all(axis='columns')].index, axis='index', inplace=True)
# Expected remaining rows : hard-coded for the full list of test cases.
assert len(dfBadRelDif) == 5, len(dfBadRelDif)
anlysNums = dfBadRelDif.index.get_level_values('AnlysNum').to_list()
assert anlysNums == [9, 20, 28, 29, 30], anlysNums
print(len(dfBadRelDif), 'analyses:', ', '.join(map(str, anlysNums)))

In [None]:
# 4. Suppress rows : Same status and all other columns >= 4 (<=> close to equality)
dfBadRelDif.drop(dfBadRelDif[(dfBadRelDif >= 4).all(axis='columns')].index, axis='index', inplace=True)
# Expected remaining rows : hard-coded for the full list of test cases.
assert len(dfBadRelDif) == 4, len(dfBadRelDif)
anlysNums = dfBadRelDif.index.get_level_values('AnlysNum').to_list()
assert anlysNums == [9, 20, 28, 30], anlysNums
print(len(dfBadRelDif), 'analyses:', ', '.join(map(str, anlysNums)))

In [None]:
# 5. Suppress rows : Same status and all other columns >= 4 (<=> close to equality)
#                    except for GOF KS and CvM, equal to NaN, because not computed when distances are discretised.
if 'DiscrDistCuts' in dfBadRelDif.index.names:
    # Columns to check : all but the GOF ones, except for the Chi2 one, that is kept.
    discrCols = [col for col in dfRelDif.columns if not col.startswith('GOF') or col.find('Chi') > 0]
    # NOTE(review): the model parameter index levels were stringified (modelParam2Str) before set_index,
    #               so the '!= -1' test below presumably never excludes any row - confirm the intended sentinel.
    df2Drop = (dfBadRelDif.index.get_level_values('DiscrDistCuts') != -1) & (dfBadRelDif[discrCols] >= 4).all(axis='columns')
    dfBadRelDif.drop(dfBadRelDif[df2Drop].index, axis='index', inplace=True)
# Expected remaining rows : hard-coded for the full list of test cases.
assert len(dfBadRelDif) == 2, len(dfBadRelDif)
anlysNums = dfBadRelDif.index.get_level_values('AnlysNum').to_list()
assert anlysNums == [9, 30], anlysNums
print(len(dfBadRelDif), 'analyses:', ', '.join(map(str, anlysNums)))

In [None]:
# Verdict (Cf. refFileName Excel file, sheet "DiffAuto" for explanations about the 2 different rows between Act/Ref)
dfBadRelDif.T

In [None]:
dfRefRes4c.loc[dfBadRelDif.index]

In [None]:
# Final verdict : number of test cases that survived all the row suppression steps.
nFails = len(dfBadRelDif.index)
if nFails <= 0:
    print('All test cases succeeded !')
else:
    print(f'Warning: {nFails} test case(s) failed ;')
    print(f' ... see sheet "DiffAuto" of {refFileName} for possible explanations.')

## 3. Save results to disk.

In [None]:
# Save all the comparison tables to 1 workbook in the engine work folder, 1 sheet per table.
resCompFileName = os.path.join(mcds.workDir, 'autods-validation-rescomp.xlsx')

with pd.ExcelWriter(resCompFileName) as xlsxWriter:

    # Reference and actual results, then closeness indicators (all, and bad ones only),
    # then reference and actual results for the bad rows only, and finally raw actual results.
    dfRefRes.to_excel(xlsxWriter, sheet_name='RefResults', index=True)
    dfActRes4c.reset_index().to_excel(xlsxWriter, sheet_name='ActResults', index=False)
    dfRelDif.reset_index().to_excel(xlsxWriter, sheet_name='Diff2Ref', index=False)
    dfBadRelDif.reset_index().to_excel(xlsxWriter, sheet_name='BadDiff2Ref', index=False)
    dfRefRes4c.loc[dfBadRelDif.index].reset_index().to_excel(xlsxWriter, sheet_name='RefResWithDiff', index=False)
    dfActRes4c.loc[dfBadRelDif.index].reset_index().to_excel(xlsxWriter, sheet_name='ActResWithDiff', index=False)
    dfActRes.to_excel(xlsxWriter, sheet_name='RawActResults', index=True)

In [None]:
dfActRes.head()

# III. Parallel run of same analyses

## 1. Prepare analyses

(same test cases and input data as previously, for easy comparison)

In [None]:
# Analysis executor : 6, 8, None threads => min elapsed = ~2s on a Lenovo P52 (6-core i7-8850H with PCI-e SSD)
parallelExecutor = ads.Executor(threads=6)

# Analysis engine
# (same work folder and options as the sequential one, but with the parallel executor above)
mcds = ads.MCDSEngine(workDir=pl.Path('tmp', 'mcds-out'), executor=parallelExecutor, 
                      distanceUnit='Meter', areaUnit='Hectare',
                      surveyType='Point', distanceType='Radial', clustering=False)

In [None]:
# Results object construction
parResults = ads.MCDSAnalysisResultsSet(miCustomCols=miCustCols, miSampleCols=miSampCols, dfCustomColTrans=dfCustColTrans, 
                                        distanceUnit='Meter', areaUnit='Hectare',
                                        surveyType='Point', distanceType='Radial', clustering=False)

In [None]:
computed = False

## 2a. Or : Really run analyses

In [None]:
# Shorten test cases and reference results lists, to go faster
# Warning: If you don't retain entire samples, later comparison will fail on Delta AIC values.
#selCaseInds = [0, 5, 7, 22, 31] # Some random cases, with incomplete samples.
#selCaseInds = dfAnlysCases[dfAnlysCases.SampNum.isin([3, 4])].index # A shorter selection, with complete samples.
selCaseInds = range(len(dfAnlysCases)) # All of them.

# Apply the same selection to the test cases and to the reference results.
nOrigAnlysCases = len(dfAnlysCases)
dfAnlysCases = dfAnlysCases.loc[selCaseInds]
dfRefRes = dfRefRes.loc[selCaseInds]

logger.info(f'Retained {len(selCaseInds)} out of {nOrigAnlysCases}.')

In [None]:
%%time

# Start running all analyses
lastInFileName = None
analyses = dict()  # Analysis objects, indexed by the object returned when submitting them.
for _, sCase in dfAnlysCases.iterrows():
    
    # Analysis name : input file name without its constant prefix and suffix,
    # followed by the lower-cased model name, where spaces and punctuation are replaced by '-'.
    nCase = sCase.AnlysNum
    name = sCase.InFileName[len('ACDC2019-Papyrus')+1:-len('-dist.txt')]
    name += '-' + sCase.Model.lower().translate(str.maketrans({c:'-' for c in ' ,.:;()/'}))
    logger.info(f'#{nCase+1:3d} {name} {sCase.KeyFn} {sCase.AdjSer}')
    
    # Create data set if not already done.
    # (only when the input file changes from the previous case)
    if lastInFileName != sCase.InFileName:
        sds = ads.SampleDataSet(pl.Path('refin', sCase.InFileName), decimalFields=decimalFields)
        lastInFileName = sCase.InFileName
        
    # Start running analysis in parallel (don't wait for it to finish, go on)
    # (results header = the len(caseIdCols) first items of the case, attached to the analysis as custom data)
    sResHead = pd.Series(data=[sCase[col] for col in sCase.index[:len(caseIdCols)]], index=miCustCols)

    anlys = ads.MCDSAnalysis(engine=mcds, sampleDataSet=sds, name=name, customData=sResHead, logData=True,
                             estimKeyFn=sCase.KeyFn, estimAdjustFn=sCase.AdjSer,
                             estimCriterion=KEstimCriterion, cvInterval=KCVInterval,
                             minDist=sCase.LTrunc, maxDist=sCase.RTrunc,
                             #minDist=nan2None(sCase.LTrunc), maxDist=nan2None(sCase.RTrunc),
                             fitDistCuts=distCutsFromSpecs(sCase.FitDistCuts), # TODO: do this when building dfAnlysCases
                             discrDistCuts=distCutsFromSpecs(sCase.DiscrDistCuts))
    anlysFut = anlys.submit()
    
    # Store analysis object and associated "future" for later use (should be running sooner or later).
    analyses[anlysFut] = anlys
    
logger.info('All analyses started ; now waiting for their end, and results ...')

# For each analysis as it gets completed (first completed => first yielded)
# (so results are not appended in the original test case order)
for anlysFut in parallelExecutor.asCompleted(analyses):

    # Retrieve analysis object from its associated future object
    anlys = analyses[anlysFut]
    
    # Get analysis results
    sResult = anlys.getResults()

    # Save results with header
    parResults.append(sResult, sCustomHead=anlys.customData)
    
# shutdown analysis engine
mcds.shutdown()

# Done.
computed = True

In [None]:
# Save results in case need for not recomputing them
resFileName = os.path.join(mcds.workDir, 'autods-validation-parallel-results.xlsx')

parResults.toExcel(resFileName, sheetName='AutoDSVal')

## 2b. Or : Load analyses from a previous run

(already run and saved above)

In [None]:
# Only reload from disk when the parallel analyses have not been run in this session.
if not computed:
    
    resFileName = os.path.join(mcds.workDir, 'autods-validation-parallel-results.xlsx')
    print('Loading results from {} ...'.format(resFileName))

    parResults.fromExcel(resFileName, sheetName='AutoDSVal')
    
    # shutdown analysis engine
    mcds.shutdown()

else:
    
    print('Just computed, not reloading ...')
    
print(f'... {len(parResults)} analyses to compare')

## 3. Compare parallel results to sequential ones

In [None]:
# Prepare sequential results for comparison
# (with translated = English column names)
dfSeqCmpRes = results.dfTransData('en')

dfSeqCmpRes.fillna(-9999, inplace=True) # Get rid of the NaN pb (because NaN != NaN :-)

dfSeqCmpRes.drop(columns=['RunTime', 'RunFolder'], inplace=True) # Run date-time and folder can never be the same

In [None]:
# Prepare parallel results for comparison
# (with translated = English column names)
dfParCmpRes = parResults.dfTransData('en')

dfParCmpRes.sort_values(by='AnlysNum', inplace=True) # Back to original test case order = sequential run order

dfParCmpRes.reset_index(inplace=True, drop=True) # Enforce same index as a consequence

dfParCmpRes.fillna(-9999, inplace=True) # And get rid of the NaN pb (because NaN != NaN :-)

dfParCmpRes.drop(columns=['RunTime', 'RunFolder'], inplace=True) # Run date-time and folder can never be the same

In [None]:
# Warning: Doesn't work if 1 of the 2 sets (not the 2) was loaded from disk (Excel numerical rounding stuff)
assert (dfSeqCmpRes == dfParCmpRes).all().all(), \
       'Oh, oh, something went differently when run parallely ... but due to one results set loaded from disk ?'

In [None]:
results = parResults

# IV. Excel and HTML reports

In [None]:
# Selection of the columns for the synthesis tables of the report
# (3-level labels of the results columns)
synthRepCols = \
[
    ('sample', 'AnlysNum', 'Value'),
    ('sample', 'SampNum', 'Value'),
    ('sample', 'Species', 'Value'),
    ('sample', 'Periods', 'Value'),
    ('sample', 'Prec.', 'Value'),
    ('sample', 'Duration', 'Value'),
    
    ('model', 'Model', 'Value'),
    ('parameters', 'left truncation distance', 'Value'),
    ('parameters', 'right truncation distance', 'Value'),
    ('parameters', 'model fitting distance cut points', 'Value'),
    ('parameters', 'distance discretisation cut points', 'Value'),
    
    ('run output', 'run status', 'Value'),
    
    ('encounter rate', 'number of observations (n)', 'Value'),
    ('encounter rate', 'effort (L or K or T)', 'Value'),
    
    ('detection probability', 'Delta AIC', 'Value'),
    ('detection probability', 'AIC value', 'Value'),
    ('detection probability', 'chi-square test probability determined', 'Value'),
    ('detection probability', 'Kolmogorov-Smirnov test probability', 'Value'),
    ('density/abundance', 'density of animals', 'Delta Cv'),
    ('density/abundance', 'density of animals', 'Cv'),
    
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Value'),
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Lcl'),
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Ucl'),
    
    ('density/abundance', 'density of animals', 'Value'),
    ('density/abundance', 'density of animals', 'Lcl'),
    ('density/abundance', 'density of animals', 'Ucl'),
    
    ('detection probability', 'probability of detection (Pw)', 'Value'),
    ('detection probability', 'probability of detection (Pw)', 'Lcl'),
    ('detection probability', 'probability of detection (Pw)', 'Ucl'),
    ('detection probability', 'probability of detection (Pw)', 'Df'),

    ('run output', 'run folder', 'Value'),
]

In [None]:
# Columns to sort the report tables by : sample number and selection columns first,
# then truncation distances, and finally Delta AIC.
sortRepCols = \
[('sample', 'SampNum', 'Value')] \
+ [('sample', col, 'Value') for col in sampleSelCols] \
+ [('parameters', 'left truncation distance', 'Value'),
   ('parameters', 'right truncation distance', 'Value'),
   ('detection probability', 'Delta AIC', 'Value')]
#   ('density/abundance', 'density of animals', 'Delta Cv')]

sortRepAscend = True

In [None]:
# Full report object : French language, generated in the engine work folder, from the results set
# (at this point, the parallel run one, as 'results' was rebound to it in a previous cell).
report = ads.MCDSResultsFullReport(resultsSet=results, synthCols=synthRepCols,
                                   sortCols=sortRepCols, sortAscend=sortRepAscend,
                                   title='Validation AutoDS : Analyses', subTitle='Rapport d\'analyse global',
                                   anlysSubTitle='Rapport détaillé', description='Qu\'ajouter de plus ?',
                                   keywords='autods, validation', pySources=['valtests.ipynb'],
                                   lang='fr', plotImgSize=(640, 320),
                                   tgtFolder=mcds.workDir, tgtPrefix='autods-validation-report')

In [None]:
htmlRep = report.toHtml() #generators=6) # Parallelism does not work for full reports (while it does for pre-reports !?)

HTML(f'Rapport HTML : <a href="{htmlRep}" target="blank">{htmlRep}</a>')

In [None]:
xlsxRep = report.toExcel()

HTML(f'Rapport Excel : <a href="{xlsxRep}" target="blank">{xlsxRep}</a>')

# V. Run and report pre-analyses (1/3 : long code, long duration)

(to help users set up the full analysis plan : first run simple trial analyses, then show the PDF and a few results)

## 1. Determine samples from input data

* in real life, we'd simply load the field-collected data, and deduce the individual "samples" from it ;
* but here, for testing, it's easier to deduce the samples from the manual analysis specification file.

In [None]:
# Create sample table from refout results table
refFileName = 'ACDC2019-Papyrus-ALAARV-TURMER-resultats-distance-73.xlsx'

sampleSelCols = ['Species', 'Periods', 'Prec.', 'Duration']

# Only load the sample selection columns.
dfSamples = pd.read_excel(pl.Path('refout', refFileName), usecols=sampleSelCols)
# NOTE(review): 'Name' is not among the loaded columns (see usecols above), so this rename looks like a no-op - confirm.
dfSamples.rename(columns=dict(Name='Model'), inplace=True)
# 1 row per distinct sample, with a new 0-based row index.
dfSamples.drop_duplicates(inplace=True)
dfSamples.reset_index(drop=True, inplace=True)

dfSamples.reset_index(inplace=True) # Generate sample # (later need for original sample order)
dfSamples.rename(columns=dict(index='SampleNum'), inplace=True)

# From now on, the sample number is part of the sample columns.
sampleSelCols = ['SampleNum'] + sampleSelCols

dfSamples

## 2. Prepare pre-analyses

In [None]:
decimalFields = ['Point transect*Survey effort', 'Observation*Radial distance']

In [None]:
# Analysis engine: Non-parallel executor here, 'cause MCDSPreAnalysis takes care of this !
# (no executor given ; output goes to the 'tmp/mcds-pout' work folder)
mcds = ads.MCDSEngine(workDir=pl.Path('tmp', 'mcds-pout'), 
                      distanceUnit='Meter', areaUnit='Hectare',
                      surveyType='Point', distanceType='Radial', clustering=False)

In [None]:
# Results object construction
# Custom (header) columns : the sample columns (including the sample number), as 3-level tuples.
custCols = [('sample', col, 'Value') for col in sampleSelCols]
miCustCols = pd.MultiIndex.from_tuples(custCols)
# Translation table of the custom columns : 1 row per column, 1 column per language.
dfCustColTrans = \
    pd.DataFrame(index=miCustCols,
                 data=dict(en=sampleSelCols, fr=['NumEchant', 'Espèce', 'Périodes', 'Préc.', 'Durée']))

# Warning: this rebinds 'results', previously used for the full analyses results.
results = ads.MCDSAnalysisResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans, 
                                     distanceUnit='Meter', areaUnit='Hectare',
                                     surveyType='Point', distanceType='Radial', clustering=False)

In [None]:
# Frozen pre-analysis parameters.
KPreEstimCrit = 'AIC'
KPreCVInterval = 95
# Model strategy : 1 entry per key function, in this order, all with the COSINE adjustment series.
KPreEstimModStrat = [dict(keyFn=kf, adjSr='COSINE', estCrit=KPreEstimCrit, cvInt=KPreCVInterval) \
                     for kf in['HNORMAL', 'HAZARD', 'UNIFORM', 'NEXPON']]

In [None]:
computed = False

## 3 Or : Really run pre-analyses

In [None]:
%%time

# Run all pre-analyses, one after the other
lastInFileName = None
for _, sSamp in dfSamples.iterrows():

    nSamp = sSamp.SampleNum

    # Sample id, built from the species, a periods code,
    # and the leading word of the 'Duration' and 'Prec.' fields
    if 'A+B' in sSamp.Periods:
        periodsCode = 'AB'
    elif 'A' in sSamp.Periods:
        periodsCode = 'A'
    else:
        periodsCode = 'B'
    durationHead = sSamp.Duration.split(' ')[0]
    precHead = sSamp['Prec.'].split(' ')[0]
    sampId = f'{sSamp.Species}-{periodsCode}-{durationHead}mn-{precHead}dec'
    logger.info(f'#{nSamp+1:3d} {sampId}')

    # Load the sample data set, unless it is the one already loaded for the previous sample
    inFileName = f'ACDC2019-Papyrus-{sampId}-dist.txt'
    if inFileName != lastInFileName:
        sds = ads.SampleDataSet(pl.Path('refin', inFileName), decimalFields=decimalFields)
        lastInFileName = inFileName

    # Submit the pre-analysis (no executor given: not a parallel run here)
    preAnlys = ads.MCDSPreAnalysis(engine=mcds, sampleDataSet=sds, name=sampId, executor=None,
                                   logData=False, modelStrategy=KPreEstimModStrat)
    preAnlys.submit()

    # Get its results
    sResult = preAnlys.getResults()

    # Store them, headed by the sample row relabelled with the custom columns multi-index
    sResHead = sSamp.copy()
    sResHead.index = miCustCols
    results.append(sResult, sCustomHead=sResHead)

# Shut down the analysis engine
mcds.shutdown()

# Flag the results as freshly computed (the "load" cell below then skips reloading)
computed = True

In [None]:
# Look at results
results.dfTransData('fr')[['NumEchant', 'Espèce', 'Périodes', 'Préc.', 'Durée', 'Fn Clé',
                           'Sér Ajust', 'CodEx', 'NObs', 'AIC', 'Chi2 P', 'KS P', 
                           'Densité', 'CoefVar Densité', 'Min Densité', 'Max Densité']]

In [None]:
# Save results to an Excel workbook in the engine work folder,
# so that they can be reloaded later without recomputing them
resFileName = os.path.join(mcds.workDir, 'autods-validation-preresults.xlsx')

results.toExcel(resFileName, sheetName='AutoDSVal')

## 3. Or : Load pre-analyses from a previous run

(already run and saved above)

In [None]:
# Reload the saved pre-analysis results, unless they were just computed above
if computed:

    print('Just computed, not reloading ...')

else:

    resFileName = os.path.join(mcds.workDir, 'autods-validation-preresults.xlsx')
    print(f'Loading pre-results from {resFileName} ...')

    results.fromExcel(resFileName, sheetName='AutoDSVal')

print(f'... {len(results)} pre-analyses loaded')

In [None]:
# Look at results
results.dfTransData('fr')[['NumEchant', 'Espèce', 'Périodes', 'Préc.', 'Durée', 'Fn Clé',
                           'Sér Ajust', 'CodEx', 'NObs', 'AIC', 'Chi2 P', 'KS P', 
                           'Densité', 'CoefVar Densité', 'Min Densité', 'Max Densité']]

## 4. Generate HTML and Excel pre-analyses reports

In [None]:
# Select the analysis results columns for the synthesis tables of the report
synthCols = \
[
    ('sample', 'SampleNum', 'Value'),
    ('sample', 'Species', 'Value'),
    ('sample', 'Periods', 'Value'),
    ('sample', 'Prec.', 'Value'),
    ('sample', 'Duration', 'Value'),
    
    ('parameters', 'estimator key function', 'Value'),
    ('parameters', 'estimator adjustment series', 'Value'),
    ('parameters', 'CV interval', 'Value'),
    ('parameters', 'left truncation distance', 'Value'),
    ('parameters', 'right truncation distance', 'Value'),
    ('parameters', 'model fitting distance cut points', 'Value'),
    ('parameters', 'distance discretisation cut points', 'Value'),
    
    ('run output', 'run status', 'Value'),
    
    ('encounter rate', 'number of observations (n)', 'Value'),
    ('encounter rate', 'effort (L or K or T)', 'Value'),
    
    ('detection probability', 'AIC value', 'Value'),
    ('detection probability', 'chi-square test probability determined', 'Value'),
    ('detection probability', 'Kolmogorov-Smirnov test probability', 'Value'),
    ('density/abundance', 'density of animals', 'Cv'),
    
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Value'),
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Lcl'),
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Ucl'),
    
    ('density/abundance', 'density of animals', 'Value'),
    ('density/abundance', 'density of animals', 'Lcl'),
    ('density/abundance', 'density of animals', 'Ucl'),
    
    ('detection probability', 'probability of detection (Pw)', 'Value'),
    ('detection probability', 'probability of detection (Pw)', 'Lcl'),
    ('detection probability', 'probability of detection (Pw)', 'Ucl'),
    ('detection probability', 'probability of detection (Pw)', 'Df'),

    ('run output', 'run folder', 'Value'),
]

In [None]:
# Analysis results columns to show in each of the 3 textual columns
# of the synthesis pre-report: sample, parameters and results.
sampleCols = [('sample', name, 'Value')
              for name in ('SampleNum', 'Species', 'Periods', 'Prec.', 'Duration')]

paramCols = [('parameters', name, 'Value')
             for name in ('estimator key function', 'estimator adjustment series', 'CV interval')]

resultCols = (
    [('run output', 'run status', 'Value')]
    + [('encounter rate', name, 'Value')
       for name in ('number of observations (n)', 'effort (L or K or T)')]
    + [('detection probability', name, 'Value')
       for name in ('AIC value',
                    'chi-square test probability determined',
                    'Kolmogorov-Smirnov test probability',
                    'probability of detection (Pw)',
                    'effective strip width (ESW) or effective detection radius (EDR)')]
    + [('density/abundance', 'density of animals', stat)
       for stat in ('Cv', 'Value', 'Lcl', 'Ucl')]
)

In [None]:
#results = parResults

In [None]:
preReport = ads.MCDSResultsPreReport(resultsSet=results,
                                     title='Validation AutoDS : Pré-analyses', subTitle='Rapport de pré-analyse',
                                     anlysSubTitle='Détail des pré-analyses', description='Qu\'ajouter de plus ?',
                                     keywords='autods, validation', synthPlotsHeight=384, lang='fr',
                                     sampleCols=sampleCols, paramCols=paramCols,
                                     resultCols=resultCols, anlysSynthCols=synthCols,
                                     tgtFolder=mcds.workDir, tgtPrefix='autods-validation-prereport')

In [None]:
htmlPreRep = preReport.toHtml(generators=6)

HTML(f'Pré-rapport HTML : <a href="{htmlPreRep}" target="blank">{htmlPreRep}</a>')

In [None]:
xlsxPreRep = preReport.toExcel()

HTML(f'Rapport Excel : <a href="{xlsxPreRep}" target="blank">{xlsxPreRep}</a>')

# VI. Parallel run of same pre-analyses (2/3 : long code, short duration)

And compare results to sequential run's.

## 1. Prepare analyses

In [None]:
# Analysis engine : Non-parallel executor here: MCDSPreAnalysis takes care of this !
mcds = ads.MCDSEngine(workDir=pl.Path('tmp', 'mcds-ppout'), 
                      distanceUnit='Meter', areaUnit='Hectare',
                      surveyType='Point', distanceType='Radial', clustering=False)

In [None]:
# Results object construction
parResults = ads.MCDSAnalysisResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans, 
                                        distanceUnit='Meter', areaUnit='Hectare',
                                        surveyType='Point', distanceType='Radial', clustering=False)

In [None]:
computed = False

## 2 Or : Really run pre-analyses

In [None]:
# Pre-analysis executor (kind of overkill here, with only 5 pre-analyses ... but still works twice as rapidly !).
parallelExecutor = ads.Executor(threads=6)

In [None]:
%%time

# Submit all pre-analyses to the parallel executor
lastInFileName = None
preAnalyses = {}
for _, sSamp in dfSamples.iterrows():

    nSamp = sSamp.SampleNum

    # Sample id, built from the species, a periods code,
    # and the leading word of the 'Duration' and 'Prec.' fields
    if 'A+B' in sSamp.Periods:
        periodsCode = 'AB'
    elif 'A' in sSamp.Periods:
        periodsCode = 'A'
    else:
        periodsCode = 'B'
    durationHead = sSamp.Duration.split(' ')[0]
    precHead = sSamp['Prec.'].split(' ')[0]
    sampId = f'{sSamp.Species}-{periodsCode}-{durationHead}mn-{precHead}dec'
    logger.info(f'#{nSamp+1:3d} {sampId}')

    # Load the sample data set, unless it is the one already loaded for the previous sample
    inFileName = f'ACDC2019-Papyrus-{sampId}-dist.txt'
    if inFileName != lastInFileName:
        sds = ads.SampleDataSet(pl.Path('refin', inFileName), decimalFields=decimalFields)
        lastInFileName = inFileName

    # Results header: the sample row, relabelled with the custom columns multi-index
    sResHead = sSamp.copy()
    sResHead.index = miCustCols

    # Start running the pre-analysis, without waiting for its end
    preAnlys = ads.MCDSPreAnalysis(engine=mcds, sampleDataSet=sds, name=sampId,
                                   customData=sResHead, executor=parallelExecutor,
                                   logData=False, modelStrategy=KPreEstimModStrat)
    preAnlysFut = preAnlys.submit()

    # Remember which pre-analysis object each "future" belongs to
    preAnalyses[preAnlysFut] = preAnlys

logger.info('All pre-analyses started ; now waiting for their end, and results ...')

# Collect results in completion order (first completed => first yielded)
for preAnlysFut in parallelExecutor.asCompleted(preAnalyses):

    # Back from the future to its pre-analysis object
    preAnlys = preAnalyses[preAnlysFut]

    # Get the results, and store them with the header carried by the pre-analysis
    sResult = preAnlys.getResults()
    parResults.append(sResult, sCustomHead=preAnlys.customData)

# Shut down the executor, then the analysis engine
parallelExecutor.shutdown()
mcds.shutdown()

# Flag the results as freshly computed (the "load" cell below then skips reloading)
computed = True

In [None]:
# Look at results
parResults.dfTransData('fr')[['NumEchant', 'Espèce', 'Périodes', 'Préc.', 'Durée', 'Fn Clé',
                              'Sér Ajust', 'CodEx', 'NObs', 'AIC', 'Chi2 P', 'KS P', 
                              'Densité', 'CoefVar Densité', 'Min Densité', 'Max Densité']]

In [None]:
# Save results in case need for not recomputing them
resFileName = os.path.join(mcds.workDir, 'autods-validation-preresults-par.xlsx')

parResults.toExcel(resFileName, sheetName='AutoDSVal')

## 2. Or : Load analyses from a previous run

(already run and saved above)

In [None]:
# Reload the saved parallel pre-analysis results, unless they were just computed above
if computed:

    print('Just computed, not reloading ...')

else:

    resFileName = os.path.join(mcds.workDir, 'autods-validation-preresults-par.xlsx')
    print(f'Loading pre-results from {resFileName} ...')

    parResults.fromExcel(resFileName, sheetName='AutoDSVal')

print(f'... {len(parResults)} pre-analyses loaded')

In [None]:
# Look at results
parResults.dfTransData('fr')[['NumEchant', 'Espèce', 'Périodes', 'Préc.', 'Durée', 'Fn Clé',
                              'Sér Ajust', 'CodEx', 'NObs', 'AIC', 'Chi2 P', 'KS P', 
                              'Densité', 'CoefVar Densité', 'Min Densité', 'Max Densité']]

## 3. Compare parallel results to sequential ones

In [None]:
# Prepare sequential results for comparison
dfSeqCmpRes = results.dfTransData('en')

dfSeqCmpRes.fillna(-9999, inplace=True) # Get rid of the Nan pb (because NaN != NaN :-)

dfSeqCmpRes.drop(columns=['RunTime', 'RunFolder'], inplace=True) # Run date-time and folder can never be the same

In [None]:
# Prepare parallel results for comparison with the sequential ones
dfParCmpRes = parResults.dfTransData('en')

dfParCmpRes.sort_values(by='SampleNum', inplace=True) # Back to original sample order = sequential run order

dfParCmpRes.reset_index(inplace=True, drop=True) # Rebuild a 0-based index after sorting, to match the sequential one

dfParCmpRes.fillna(-9999, inplace=True) # Replace NaNs by a sentinel value (because NaN != NaN, the == test would fail)

dfParCmpRes.drop(columns=['RunTime', 'RunFolder'], inplace=True) # Run date-time and folder can never be the same

In [None]:
assert (dfSeqCmpRes == dfParCmpRes).all().all(), 'Oh, oh, something went differently when run parallely ...'

# VII. Run analyses with real life field data (2/2 : short code + fast run)

Thanks to MCDSAnalyser class.

In [None]:
transectPlaceCols = ['Point']
passIdCol = 'Passage'
effortCol = 'Effort'

sampleDecCols = [effortCol, 'Distance']

sampleNumCol = 'NumEchant'
sampleSelCols = ['Espèce', passIdCol, 'Adulte', 'Durée']

varIndCol = 'NumAnlys'
anlysAbbrevCol = 'AbrevAnlys'

dSurveyArea = dict(Zone='ACDC', Surface='2400')

## 1. Individuals data set

In [None]:
dfObsIndiv = ads.DataSet('refin/ACDC2019-Naturalist-ExtraitObsIndiv.ods', sheet='DonnéesIndiv').dfData
len(dfObsIndiv)

In [None]:
dfObsIndiv.head()

In [None]:
{ col: dfObsIndiv[col].unique() for col in ['Observateur', 'Point', 'Passage', 'Adulte', 'Durée', 'Espèce'] }

## 2. Actual transects

(can't deduce them from data, some points are missing because of data selection)

In [None]:
dfTransects = ads.DataSet('refin/ACDC2019-Naturalist-ExtraitObsIndiv.ods', sheet='Inventaires').dfData
len(dfTransects)

In [None]:
dfTransects

## 3. Analyses specs

In [None]:
dfAnlysSpecs = ads.Analyser.explicitVariantSpecs('refin/ACDC2019-Naturalist-ExtraitSpecsAnalyses.xlsx', 
                                                 keep=['Echant1_impl', 'Echant2_impl', 'Modl_impl',
                                                       'Params1_expl', 'Params2_expl'],
                                                 varIndCol=varIndCol,
                                                 #convertCols={ 'Durée': int }, # float 'cause of Excel
                                                 computedCols={ anlysAbbrevCol: analysisAbbrev })

len(dfAnlysSpecs)

In [None]:
# For faster debugging : reduce work.
#dfAnlysSpecs = dfAnlysSpecs[(dfAnlysSpecs.Passage == 'a+b') & (dfAnlysSpecs.Adulte == 'm') \
#                            & (dfAnlysSpecs['Durée'] == '10mn') \
#                            & ((dfAnlysSpecs.TrGche.isnull()) | (dfAnlysSpecs.TrGche < 20)) \
#                            & ((dfAnlysSpecs.TrDrte.isnull()) | (dfAnlysSpecs.TrDrte <= 500))]
#len(dfAnlysSpecs)

In [None]:
dfAnlysSpecs

In [None]:
# Recall analysis set without truncation params
dfAnlysSpecs[['Espèce', 'Passage', 'Adulte', 'Durée', 'FonctionClé', 'SérieAjust']].drop_duplicates().reset_index(drop=True)

In [None]:
computed = False

In [None]:
workDir = pl.Path('tmp/mcds-anlr')

## 4A. Or : Really run analyses

### a. MCDS Analyser object

In [None]:
anlysr = ads.MCDSAnalyser(dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea,
                          transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                          sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols,
                          abbrevCol=anlysAbbrevCol, anlysIndCol=varIndCol, sampleIndCol=sampleNumCol,
                          distanceUnit='Meter', areaUnit='Hectare',
                          surveyType='Point', distanceType='Radial', clustering=False,
                          resultsHeadCols=dict(before=[varIndCol, sampleNumCol], sample=sampleSelCols,
                                               after=[anlysAbbrevCol]),
                          workDir=workDir, logProgressEvery=5,
                          defEstimCriterion='AIC', defCVInterval=95)

### b. Check analysis explicit specs

In [None]:
dfAnlysSpecs, userParamSpecCols, intParamSpecCols, unmUserParamSpecCols, verdict, reasons = \
    anlysr.explicitParamSpecs(dfExplParamSpecs=dfAnlysSpecs, dropDupes=True, check=True)

assert len(dfAnlysSpecs) == 48
assert userParamSpecCols == ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod']
assert intParamSpecCols == ['EstimKeyFn', 'EstimAdjustFn', 'MinDist', 'MaxDist', 'FitDistCuts']
assert unmUserParamSpecCols == []
assert verdict
assert not reasons

In [None]:
dfAnlysSpecs.head()

### c. Run analyses

In [None]:
%%time

# Analyses : min=5, max=11s elapsed for 64 analyses on a Lenovo P52 (6-core i7-8850H with PCI-e SSD)
results = anlysr.run(dfAnlysSpecs, threads=6)

computed = True

In [None]:
anlysr.shutdown()

In [None]:
results.dfTransData('fr')

In [None]:
results.dfData

### d. Save results for later reload or examination

In [None]:
results.toExcel(workDir / 'valtests-mcds-anlyser-results.xlsx')

In [None]:
#results.toExcel(workDir / 'valtests-mcds-anlyser-results-fr.xlsx', lang='fr')

## 4B. Or : Load analyses from a previous run

(already run and saved above)

In [None]:
# Reload the saved analysis results, unless they were just computed above
if computed:

    print('Just computed, not reloading ...')

else:

    # An analyser object knows how to build an empty results object ...
    anlysr = ads.MCDSAnalyser(dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea,
                              transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                              sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols,
                              abbrevCol=anlysAbbrevCol, anlysIndCol=varIndCol, sampleIndCol=sampleNumCol,
                              distanceUnit='Meter', areaUnit='Hectare',
                              surveyType='Point', distanceType='Radial', clustering=False,
                              resultsHeadCols=dict(before=[varIndCol, sampleNumCol], sample=sampleSelCols,
                                                   after=[anlysAbbrevCol]))
    results = anlysr.setupResults()
    anlysr.shutdown()

    # ... which is then filled from the workbook saved by the run above
    resFileName = workDir / 'valtests-mcds-anlyser-results.xlsx'
    print(f'Loading results from {resFileName} ...')

    results.fromExcel(resFileName)

print(f'... {len(results)} analyses to compare')

## 5. Compare results to reference

(reference generated with the same kind of "long" code as in III above, but on another data set)

In [None]:
# Load reference
# 1. Clone results _without_ data.
rsRef = results.copy(withData=False)

# 2. Load it with reference data
rsRef.fromOpenDoc('refout/ACDC2019-Naturalist-ExtraitResultats.ods')

rsRef.dfData

In [None]:
# Compare (ignore sample and analysis indexes, no use here).
customCols = results.miCustomCols.to_list()

# Index columns: the custom columns whose 1st level contains '(sample)' + the analysis parameters.
indexCols = [col for col in customCols if '(sample)' in col[0]] \
            + [('parameters', 'estimator key function', 'Value'),
               ('parameters', 'estimator adjustment series', 'Value'),
               ('parameters', 'left truncation distance', 'Value'),
               ('parameters', 'right truncation distance', 'Value'),
               ('parameters', 'model fitting distance cut points', 'Value')]

# Columns left out of the comparison: index columns, other custom columns, and some
# parameter / run output / detection probability columns. Built once, not once per results column.
excludedCols = indexCols \
               + [col for col in customCols if '(sample)' not in col[0]] \
               + [('parameters', 'estimator selection criterion', 'Value'),
                  ('parameters', 'CV interval', 'Value'),
                  ('run output', 'run time', 'Value'),
                  ('run output', 'run folder', 'Value'),
                  ('detection probability', 'key function type', 'Value'),
                  ('detection probability', 'adjustment series type', 'Value')]

# Compared columns: all the other ones.
subsetCols = [col for col in results.dfData.columns.to_list() if col not in excludedCols]

dfDiff = rsRef.compare(results, indexCols=indexCols, subsetCols=subsetCols, dropCloser=12, dropNans=True)

assert dfDiff.empty, 'No, no, no : not the same ...'

print('Yessssss !')

In [None]:
# To be perfectly honest ... there are some 10**-12/15 glitches (due to worksheet I/O ?)
rsRef.compare(results, indexCols=indexCols, subsetCols=subsetCols, dropCloser=14, dropNans=True)

# VIII. Run pre-analyses (3/3 : short code, short duration)

And compare results to sequential run's.

Note: 2 modes here, with explicit or implicit sample specification.

In [None]:
transectPlaceCols = ['Point']
passIdCol = 'Passage'
effortCol = 'Effort'

sampleDecCols = [effortCol, 'Distance']

sampleNumCol = 'NumEchant'
sampleSelCols = ['Espèce', passIdCol, 'Adulte', 'Durée']

sampleAbbrevCol = 'AbrevEchant'

speciesAbbrevCol = 'AbrevEsp'

dSurveyArea = dict(Zone='ACDC', Surface='2400')

## 1. Individuals data set

In [None]:
dfObsIndiv = ads.DataSet('refin/ACDC2019-Naturalist-ExtraitObsIndiv.ods', sheet='DonnéesIndiv').dfData
dfObsIndiv

In [None]:
{ col: dfObsIndiv[col].unique() for col in ['Observateur', 'Point', 'Passage', 'Adulte', 'Durée', 'Espèce'] }

## 2. Actual transects

(can't deduce them from data, some points are missing because of data selection)

In [None]:
dfTransects = ads.DataSet('refin/ACDC2019-Naturalist-ExtraitObsIndiv.ods', sheet='Inventaires').dfData
dfTransects

## 3. Samples to pre-analyse

In [None]:
# Implicit variants
varEspeces = ['Sylvia atricapilla', 'Turdus merula', 'Luscinia megarhynchos'] # 1 species variant ... per species <8-]

varPassages = ['b', 'a+b'] # Pass b, or a+b => 2 variants
varAdultes = ['m'] # Males only => 1 variant
varDurees = ['5mn', '10mn'] # First 5 mn, or the whole 10 mn => 2 variants

In [None]:
# Make the variants explicit, or not
# a. Implicit specs
dImplSampleSpecs = { 'Espèce': varEspeces, 'Passage': varPassages, 'Adulte': varAdultes, 'Durée':   varDurees }

specsAreExplicit = True
if specsAreExplicit:
    
    # b. Explicit combinations
    dfExplSampleSpecs = ads.Analyser.explicitVariantSpecs(dict(_impl=dImplSampleSpecs))
    #dfExplSampleSpecs = ads.Analyser.explicitPartialVariantSpecs(dImplSampleSpecs) # Just the same, but less generic.

    # c. Add sample order column (useful for reports, as pre-analyses are run in parallel !) -- currently disabled.
    #dfExplSampleSpecs.reset_index(drop=False, inplace=True)
    #dfExplSampleSpecs.rename(columns=dict(index=sampleNumCol), inplace=True)

    # d. Add sample abbreviation column (mainly for analysis traces) -- currently disabled.
    #dfExplSampleSpecs[sampleAbbrevCol] = dfExplSampleSpecs.apply(sampleAbbrev, axis='columns')

    # e. Add a neutral, pass-through column (from sample specs to results):
    #    the first 4 letters of each word of the species name, concatenated.
    dfExplSampleSpecs[speciesAbbrevCol] = dfExplSampleSpecs['Espèce'].apply(lambda s: ''.join(m[:4] for m in s.split()))
    
    print(dfExplSampleSpecs)
    
else:
    
    # b. Keep the specs implicit : the run will make them explicit automatically
    implSampleSpecs = dict(_impl=dImplSampleSpecs)
    
    print(implSampleSpecs)    

In [None]:
computed = False

In [None]:
workDir = pl.Path('tmp/mcds-anlr')

## 4A. Or : Really run pre-analyses

### a. MCDSPreAnalyser object

In [None]:
anlysr = ads.MCDSPreAnalyser(dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea,
                             transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                             sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols, sampleSpecCustCols=[speciesAbbrevCol],
                             abbrevCol=sampleAbbrevCol, abbrevBuilder=sampleAbbrev, sampleIndCol=sampleNumCol,
                             distanceUnit='Meter', areaUnit='Hectare',
                             surveyType='Point', distanceType='Radial', clustering=False,
                             resultsHeadCols=dict(before=[sampleNumCol], sample=sampleSelCols,
                                                  after=([speciesAbbrevCol] if specsAreExplicit else []) + [sampleAbbrevCol]),
                             workDir=workDir, logProgressEvery=5)

### b. Check pre-analyses specs

In [None]:
dfExplSampleSpecs, userParamSpecCols, intParamSpecCols, unmUserParamSpecCols, verdict, reasons = \
    anlysr.explicitParamSpecs(dfExplParamSpecs=dfExplSampleSpecs if specsAreExplicit else None,
                              implParamSpecs=implSampleSpecs if not specsAreExplicit else None,
                              dropDupes=True, check=True)

print(verdict, reasons, len(dfExplSampleSpecs), userParamSpecCols, intParamSpecCols, unmUserParamSpecCols)

assert len(dfExplSampleSpecs) == 12
assert userParamSpecCols == [] # No analysis params here (auto. generated by PreAnalyser)
assert intParamSpecCols == [] # Idem
assert unmUserParamSpecCols == []
assert verdict
assert not reasons

### c. Run pre-analyses

In [None]:
# Model fall-down strategy: every key function, for each adjustment series in turn
dModelStrategy = [
    {'keyFn': keyFn, 'adjSr': adjSr, 'estCrit': 'AIC', 'cvInt': 95}
    for adjSr in ('COSINE', 'POLY', 'HERMITE')
    for keyFn in ('HNORMAL', 'HAZARD', 'UNIFORM', 'NEXPON')
]

In [None]:
%%time

# Analyses : 50s to ~1mn elapsed for 12 samples, 6-12 threads on a Lenovo T490 (4-core i5-8xxx with PCI-e SSD)
results = anlysr.run(dfExplSampleSpecs if specsAreExplicit else None,
                     implSampleSpecs=implSampleSpecs if not specsAreExplicit else None, 
                     dModelStrategy=dModelStrategy, threads=12)

computed = True

In [None]:
anlysr.shutdown()

In [None]:
assert not specsAreExplicit or speciesAbbrevCol in results.dfTransData('fr').columns

In [None]:
results.dfTransData('fr')

In [None]:
results.dfData

### d. Save results for later reload or examination

In [None]:
results.toExcel(workDir / 'valtests-mcds-preanlyser-results.xlsx')

In [None]:
#results.toExcel(workDir / 'valtests-mcds-preanlyser-results-fr.xlsx', lang='fr')

## 4B. Or : Load pre-analyses from a previous run

(already run and saved above)

In [None]:
# Reload the saved pre-analysis results, unless they were just computed above
if not computed:
    
    # An analyser object knows how to build an empty results object ...
    # NOTE(review): sampleSelCols, sampleIndCol, sampleSpecCustCols and abbrevBuilder are passed
    # to the pre-analyser of the run cell (4A.a) but not here -- confirm the defaults suit setupResults().
    anlysr = ads.MCDSPreAnalyser(dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea,
                                 abbrevCol=sampleAbbrevCol,
                                 resultsHeadCols=dict(before=[sampleNumCol], sample=sampleSelCols,
                                                      after=([speciesAbbrevCol] if specsAreExplicit else [])
                                                            + [sampleAbbrevCol]),
                                 transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                                 sampleDecCols=sampleDecCols,
                                 distanceUnit='Meter', areaUnit='Hectare',
                                 surveyType='Point', distanceType='Radial', clustering=False)
    
    results = anlysr.setupResults()
    
    anlysr.shutdown()

    # Load results from the workbook saved by the pre-analyser run above
    # (and not the analyser's 'valtests-mcds-anlyser-results.xlsx', which lives in the same work folder)
    resFileName = workDir / 'valtests-mcds-preanlyser-results.xlsx'
    print('Loading results from {} ...'.format(resFileName))

    results.fromExcel(resFileName)
    
else:
    
    print('Just computed, not reloading ...')
    
print('... {} analyses to compare'.format(len(results)))

## 5. Compare results to reference

(reference generated with the same kind of "long" code as in III above, but on another data set)

In [None]:
# Load reference
# 1. Clone results _without_ data.
rsRef = results.copy(withData=False)

# 2. Load it with reference data
rsRef.fromOpenDoc('refout/ACDC2019-Naturalist-ExtraitPreResultats.ods')

rsRef.dfData

In [None]:
# Compare (ignore sample and analysis indexes, no use here).
customCols = results.miCustomCols.to_list()

# Index columns: the custom columns whose 1st level contains '(sample)' + the model parameters.
indexCols = [col for col in customCols if '(sample)' in col[0]] \
            + [('parameters', 'estimator key function', 'Value'),
               ('parameters', 'estimator adjustment series', 'Value')]

# Columns left out of the comparison: index columns, other custom columns, and some
# parameter / run output / detection probability columns. Built once, not once per results column.
excludedCols = indexCols \
               + [col for col in customCols if '(sample)' not in col[0]] \
               + [('parameters', 'estimator selection criterion', 'Value'),
                  ('parameters', 'CV interval', 'Value'),
                  ('run output', 'run time', 'Value'),
                  ('run output', 'run folder', 'Value'),
                  ('detection probability', 'key function type', 'Value'),
                  ('detection probability', 'adjustment series type', 'Value')]

# Compared columns: all the other ones.
subsetCols = [col for col in results.dfData.columns.to_list() if col not in excludedCols]

dfDiff = rsRef.compare(results, indexCols=indexCols, subsetCols=subsetCols, dropCloser=13, dropNans=True)

assert dfDiff.empty, 'Oh oh ... some differences !'

print('Yessssss !')

In [None]:
# To be perfectly honest ... there are some 10**-15/-16 glitches (due to worksheet I/O ?)
rsRef.compare(results, indexCols=indexCols, subsetCols=subsetCols, dropNans=True)

# IX. Truncation optimisation (short code and fast run)

Thanks to MCDSZerothOrderTruncationOptimiser class.

In [None]:
def optimAbbrev(sAnlys):
    """Build a short dash-separated identifier for an optimisation / analysis spec row.

    :param sAnlys: row-like object giving 'Espèce', 'Durée', 'FonctionClé' and 'SérieAjust'
        by key, and Passage and Adulte as attributes (e.g. a pandas Series)

    :return: '<species>-<passage>-<adult>-<duration>-<key fn>-<adjust. series>' string
    """

    # Species: first 4 letters, title-cased, of each of the first 2 words
    speciesWords = sAnlys['Espèce'].split(' ')[:2]
    species = ''.join([word[:4].title() for word in speciesWords])

    # Sample part: '+' signs removed from pass and adult specs
    passage = str(sAnlys.Passage.replace('+', ''))
    adult = str(sAnlys.Adulte.replace('+', ''))
    duration = str(sAnlys['Durée'])

    # Model part: first 3 letters, lower-cased, of key function and adjustment series
    keyFn = sAnlys['FonctionClé'][:3].lower()
    adjSeries = sAnlys['SérieAjust'][:3].lower()

    return '-'.join([species, passage, adult, duration, keyFn, adjSeries])

## 0. Optimiser parameters

In [None]:
# Source / Results data
transectPlaceCols = ['Point']
passIdCol = 'Passage'
effortCol = 'Effort'

sampleDistCol = 'Distance'
sampleDecCols = [effortCol, sampleDistCol]

sampleNumCol = 'NumEchant'
sampleSelCols = ['Espèce', passIdCol, 'Adulte', 'Durée']

sampleAbbrevCol = 'AbrevEchant'

optIndCol = 'IndOptim'
optAbbrevCol = 'AbrevOptim'

dSurveyArea = dict(Zone='ACDC', Surface='2400')

In [None]:
# General DS analysis parameters
distanceUnit = 'Meter'
areaUnit = 'Hectare'
surveyType = 'Point'
distanceType = 'Radial'
clustering = False

In [None]:
# Default optimisation params: estimator model, selection criterion, CV interval,
# and distance truncation / cut points (None = not set).
defEstimKeyFn = 'HNORMAL'
defEstimAdjustFn = 'COSINE'
defEstimCriterion = 'AIC'
defCVInterval = 95
defMinDist = None
defMaxDist = None  # Fixed: a stray trailing comma was making this the 1-tuple (None,) instead of None
defFitDistCuts = None
defDiscrDistCuts = None

defExpr2Optimise = 'chi2'
defMinimiseExpr = False
defOutliersMethod = 'tucquant'
defOutliersQuantCutPct = 7
defFitDistCutsFctr = ads.Interval(min=0.6, max=1.4)
defDiscrDistCutsFctr = ads.Interval(min=0.5, max=1.2)

defSubmitTimes = 1
defSubmitOnlyBest = None

defCoreEngine = 'zoopt'
defCoreMaxIters = 100
defCoreTermExprValue = None
defCoreAlgorithm = 'racos'
defCoreMaxRetries = 0

[X. Run opt-analyses with real life field data (short code)](#X.-Run-opt-analyses-with-real-life-field-data-(short-code))

## 1. Individuals data set

In [None]:
dfObsIndiv = ads.DataSet('refin/ACDC2019-Naturalist-ExtraitObsIndiv.ods', sheet='DonnéesIndiv').dfData
dfObsIndiv

In [None]:
{ col: dfObsIndiv[col].unique() for col in ['Observateur', 'Point', 'Passage', 'Adulte', 'Durée', 'Espèce'] }

## 2. Actual transects

(can't deduce them from data, some points are missing because of data selection)

In [None]:
dfTransects = ads.DataSet('refin/ACDC2019-Naturalist-ExtraitObsIndiv.ods', sheet='Inventaires').dfData
dfTransects

## 3. Samples and analyses to optimise

In [None]:
computed = False

In [None]:
workDir = pl.Path('tmp/mcds-optr')

### a. For testing all optimisation parameters

In [None]:
varOpt = '-all'

In [None]:
dfOptimExplSpecs = ads.Analyser.explicitVariantSpecs('refin/ACDC2019-Naturalist-ExtraitSpecsAnalyses.xlsx', 
                                                     ignore=['Params3_expl'])

# No use of these cols, as we'll compute them !
dfOptimExplSpecs = dfOptimExplSpecs.drop(columns=['TrGche', 'TrDrte', 'NbTrchMod']).drop_duplicates() \
                    .reset_index(drop=True)

nOptimExplSpecs = len(dfOptimExplSpecs)

In [None]:
# Add optim. params
dfMoreOptimCols = pd.DataFrame([dict(CritChx='AIC', IntervConf=95,
                                     TrGche='auto', TrDrte='auto', MethOutliers='tucquant(2.5)',
                                     NbTrchMod='mult(2/3, 3/2)', NbTrDiscr=None,
                                     #TroncGche='auto', TroncDrte='auto', MethOutliers='tucquant(2.5)',
                                     #NbTrModel='mult(2/3, 3/2)', NbTrDiscr=None,
                                     ExprOpt='max(chi2)', MoteurOpt='zoopt(160)')]*len(dfOptimExplSpecs))

dfOptimExplSpecs = pd.concat([dfOptimExplSpecs, dfMoreOptimCols], axis='columns')
dfOptimExplSpecs

In [None]:
nOptimedAnlyses = nOptimExplSpecs

In [None]:
# Columns of dfOptimExplSpecs giving the analysis / optimisation parameters
optimParamSpecCols  = ['FonctionClé', 'SérieAjust', 'CritChx', 'IntervConf',
                       'TrGche', 'TrDrte', 'MethOutliers', 'NbTrchMod', 'NbTrDiscr',
                       #'TroncGche', 'TroncDrte', 'MethOutliers', 'NbTrModel', 'NbTrDiscr',
                       'ExprOpt', 'MoteurOpt']

# And the same, in internal version (same number of items, same order)
intOptimParamSpecCols = ['EstimKeyFn', 'EstimAdjustFn', 'EstimCriterion', 'CvInterval',
                          'MinDist', 'MaxDist', 'OutliersMethod', 'FitDistCuts', 'DiscrDistCuts',
                          'Expr2Optimise', 'OptimisationCore']

### b. Or: Only main optimisation parameters

* for comparison with X below,
* for comparing results goodness with various optimisation parameters, in XI below.

In [None]:
varOpt = '-main'

In [None]:
# Explicit opt-analysis variant specs from the reference workbook (ignoring 'Params1_expl' and 'Params2_expl')
dfOptimExplSpecs = ads.Analyser.explicitVariantSpecs(
    'refin/ACDC2019-Naturalist-ExtraitSpecsOptanalyses.xlsx', ignore=['Params1_expl', 'Params2_expl'])

# Get rid of the rows where all of the truncation / multi-optimisation columns are empty
bsNothingToOptim = dfOptimExplSpecs[['TrGche', 'TrDrte', 'NbTrchMod', 'MultiOpt']].isnull().all(axis='columns')
dfOptimExplSpecs.drop(index=dfOptimExplSpecs[bsNothingToOptim].index, inplace=True)

nOptimExplSpecs = len(dfOptimExplSpecs)

dfOptimExplSpecs

In [None]:
nOptimedAnlyses = 22  # See MultiOpt col

In [None]:
# Columns of dfOptimExplSpecs that hold the analysis / optimisation parameters
# (compared below to the user parameter columns returned by explicitParamSpecs)
optimParamSpecCols  = ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod', 'MultiOpt']

# And the internal-name version
# (compared below to the internal parameter columns returned by explicitParamSpecs)
intOptimParamSpecCols = ['EstimKeyFn', 'EstimAdjustFn', 'MinDist', 'MaxDist', 'FitDistCuts', 'SubmitParams']

## 4A. Or : Really run optimisations

### a. MCDS Zeroth Order Truncation Optimiser object

In [None]:
zoptr = ads.MCDSZerothOrderTruncationOptimiser \
                (dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea, 
                 transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                 sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols, sampleDistCol=sampleDistCol,
                 abbrevCol=optAbbrevCol, abbrevBuilder=optimAbbrev,
                 anlysIndCol=optIndCol, sampleIndCol=sampleNumCol,
                 distanceUnit=distanceUnit, areaUnit=areaUnit,
                 surveyType=surveyType, distanceType=distanceType, clustering=clustering,
                 resultsHeadCols=dict(before=[optIndCol], sample=sampleSelCols, after=optimParamSpecCols),
                 workDir='/tmp',
                 defEstimKeyFn=defEstimKeyFn, defEstimAdjustFn=defEstimAdjustFn,
                 defEstimCriterion=defEstimCriterion, defCVInterval=defCVInterval,
                 defExpr2Optimise=defExpr2Optimise, defMinimiseExpr=defMinimiseExpr,
                 defOutliersMethod=defOutliersMethod, defOutliersQuantCutPct=defOutliersQuantCutPct,
                 defFitDistCutsFctr=defFitDistCutsFctr, defDiscrDistCutsFctr=defDiscrDistCutsFctr,
                 defSubmitTimes=defSubmitTimes, defSubmitOnlyBest=defSubmitOnlyBest,
                 defCoreMaxIters=defCoreMaxIters, defCoreTermExprValue=defCoreTermExprValue,
                 defCoreAlgorithm=defCoreAlgorithm, defCoreMaxRetries=defCoreMaxRetries)

### b. Run optimisations

In [None]:
dfOptimExplSpecs, userParamSpecCols, intParamSpecCols, unmUserParamSpecCols, checkVerdict, checkErrors = \
    zoptr.explicitParamSpecs(dfExplParamSpecs=dfOptimExplSpecs, dropDupes=True, check=True)

assert len(dfOptimExplSpecs) == nOptimExplSpecs
assert userParamSpecCols == optimParamSpecCols
assert intParamSpecCols == intOptimParamSpecCols
assert unmUserParamSpecCols == []
assert checkVerdict
assert not checkErrors

In [None]:
%%time

# Analyses
results = zoptr.run(dfOptimExplSpecs, threads=12)

computed = True

#### Cas 3a : 12 analyses specs (12 parallel threads) on a Lenovo T490 4 HT Cores

Parameters from refin/ACDC2019-Naturalist-ExtraitSpecsOptanalyses.xlsx, with Params1_expl and Params2_expl removed.

MoteurOpt='zoopt(120)'

2020-08-21 22:41:56,626 2880 analyses => 22 results, Wall time: 4min 21s

#### Cas 3b : 12 analyses specs (6 parallel threads) on a Lenovo T490 4 HT Cores

CritChx='AIC', IntervConf=95, TroncGche='auto', TroncDrte='auto', MethOutliers='tucquant(2.5)',
NbTrModel='mult(2/3, 3/2)', NbTrDiscr=None, ExprOpt='max(chi2)'

MoteurOpt='zoopt(160)'

2020-06-29 21:18:04,727 Wall time: 4min 31s

MoteurOpt='zoopt(250, tv=0.6)'

2020-06-28 19:23:38,868 Wall time: 9min 19s

#### Cas 3b : 12 analyses specs (12 parallel threads) on a Lenovo T490 4 HT Cores

CritChx='AIC', IntervConf=95, TroncGche='auto', TroncDrte='auto', MethOutliers='tucquant(2.5)',
NbTrModel='mult(2/3, 3/2)', NbTrDiscr=None, ExprOpt='max(chi2)'

MoteurOpt='zoopt(160)'

2020-07-18 15:22:47,289 Wall time: 3min 51s

In [None]:
zoptr.shutdown()

In [None]:
results.dfTransData('fr')

### c. Save results for later reload or examination

In [None]:
results.toExcel(workDir / f'valtests-mcds-optimiser{varOpt}-results.xlsx')

In [None]:
#results.toExcel(workDir / 'valtests-mcds-optimiser-results-fr.xlsx', lang='fr')

## 4B. Or : Load optimisation results from a previous run

(already run and saved above)

In [None]:
if computed:

    print('Just computed, not reloading ...')

else:

    # An optimiser object knows how to build an empty results object:
    # create one just for that, and shut it down right after.
    zoptr = ads.MCDSZerothOrderTruncationOptimiser(
        dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea,
        transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
        sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols, sampleDistCol=sampleDistCol,
        abbrevCol=optAbbrevCol, abbrevBuilder=optimAbbrev,
        anlysIndCol=optIndCol, sampleIndCol=sampleNumCol,
        distanceUnit=distanceUnit, areaUnit=areaUnit,
        surveyType=surveyType, distanceType=distanceType, clustering=clustering,
        resultsHeadCols=dict(before=[optIndCol], sample=sampleSelCols, after=optimParamSpecCols))
    results = zoptr.setupResults()
    zoptr.shutdown()

    # Then fill it from the file saved by a previous run.
    resFileName = workDir / f'valtests-mcds-optimiser{varOpt}-results.xlsx'
    print('Loading results from {} ...'.format(resFileName))
    results.fromExcel(resFileName)

print('... {} optimisations to compare'.format(len(results)))

## 5. Deduce analyses specs from optimisation results

In [None]:
varIndCol = 'NumAnlys'
anlysAbbrevCol = 'AbrevAnlys'

In [None]:
# From the optimiser results, extract the sample and analysis params, the optimised truncation
# values (minDist, maxDist, fitDistCuts) and the truncation optimisation specs they come from.
optTgtCols = ['TrGche', 'TrDrte', 'NbTrchMod']
sampAnlysCols = ['Espèce', 'Passage', 'Adulte', 'Durée', 'FonctionClé', 'SérieAjust']
optimResCols = ['minDist', 'maxDist', 'fitDistCuts']
dfAnlysSpecs = results.dfData[sampAnlysCols + optimResCols + optTgtCols].copy()

# The analysis abbreviation is built while the optim. specs are still there
# (i.e. from the specs, not from the optimised values).
dfAnlysSpecs[anlysAbbrevCol] = dfAnlysSpecs.apply(analysisAbbrev, axis='columns')

# Then the optim. specs can go, and the optimised values take over their column names for the analyses.
dfAnlysSpecs = dfAnlysSpecs.drop(columns=optTgtCols) \
                           .rename(columns={'minDist': 'TrGche', 'maxDist': 'TrDrte', 'fitDistCuts': 'NbTrchMod'})

In [None]:
# But non-optimised truncation parameters are not in optimiser result columns (minDist, maxDist, fitDistCuts, ...) ...
# so we have to get them back from optimisation specs (TrGche, TrDrte, NbTrchMod, ...)

# String specs are optimisation params, numerical ones are already determined truncation params.
# => boolean frame, True where the spec cell is a string, i.e. where the optimised value has to be kept.
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 (renamed DataFrame.map).
bdfToBeKeptSpecCells = results.dfData[optTgtCols].applymap(lambda v: isinstance(v, str))

# where() keeps the current cell (the optimised value) when the condition is True,
# and takes the cell of 'other' (the original spec value) when it is False.
dfAnlysSpecs[optTgtCols] = dfAnlysSpecs[optTgtCols].where(bdfToBeKeptSpecCells,
                                                          other=results.dfData[optTgtCols])

In [None]:
dfAnlysSpecs

In [None]:
workDir = pl.Path('tmp/mcds-anaftopt')

In [None]:
computed = False

## 6A. Or : Really run analyses

### a. MCDS Analyser object

In [None]:
anlysParamCols = ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod']

anlysr = ads.MCDSAnalyser(dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea,
                          transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                          sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols,
                          abbrevCol=anlysAbbrevCol, anlysIndCol=varIndCol, sampleIndCol=sampleNumCol,
                          distanceUnit=distanceUnit, areaUnit=areaUnit,
                          surveyType=surveyType, distanceType=distanceType, clustering=clustering,
                          resultsHeadCols=dict(before=[varIndCol, sampleNumCol], sample=sampleSelCols,
                                               after=anlysParamCols + [anlysAbbrevCol]),
                          workDir=workDir, logProgressEvery=1,
                          defEstimKeyFn=defEstimKeyFn, defEstimAdjustFn=defEstimAdjustFn,
                          defEstimCriterion=defEstimCriterion, defCVInterval=defCVInterval,
                          defMinDist=defMinDist, defMaxDist=defMaxDist,
                          defFitDistCuts=defFitDistCuts, defDiscrDistCuts=defDiscrDistCuts)

### b. Check analysis explicit specs

In [None]:
dfAnlysSpecs, userParamSpecCols, intParamSpecCols, unmUserParamSpecCols, verdict, reasons = \
    anlysr.explicitParamSpecs(dfExplParamSpecs=dfAnlysSpecs, dropDupes=True, check=True)

assert len(dfAnlysSpecs) == nOptimedAnlyses, f'{len(dfAnlysSpecs)} != {nOptimedAnlyses}'
assert userParamSpecCols == ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod'], str(userParamSpecCols)
assert intParamSpecCols == ['EstimKeyFn', 'EstimAdjustFn', 'MinDist', 'MaxDist', 'FitDistCuts'], str(intParamSpecCols)
assert unmUserParamSpecCols == []
assert verdict
assert not reasons, str(reasons)

### c. Run analyses

In [None]:
%%time

# Analyses : min=5, max=11s elapsed for 64 analyses with 6 threads on a Lenovo P52 (6-HT-core i7-8850H with PCI-e SSD)
# Analyses : ~2.1s elapsed for 22 analyses with 6-12 threads on a Lenovo T490 (4-HT-core i5-8365U with PCI-e SSD)
results = anlysr.run(dfAnlysSpecs, threads=12)

computed = True

In [None]:
anlysr.shutdown()

In [None]:
results.dfTransData('fr')

In [None]:
results.dfData

### d. Save results for later reload or examination

In [None]:
results.toExcel(workDir / f'valtests-mcds-analyser-afteropt{varOpt}-results.xlsx')

In [None]:
#results.toExcel(workDir / 'valtests-mcds-analyser-afteropt-fr.xlsx', lang='fr')

## 6B. Or : Load analyses from a previous run

(already run and saved above)

In [None]:
if not computed:
    
    # An analyser object knowns how to build an empty results object ...
    anlysr = ads.MCDSAnalyser(dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea,
                              resultsHeadCols=dict(before=[varIndCol, sampleNumCol], sample=sampleSelCols,
                                                   after=[anlysAbbrevCol]),
                              transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                              sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols,
                              abbrevCol=anlysAbbrevCol, anlysIndCol=varIndCol, sampleIndCol=sampleNumCol,
                              distanceUnit=distanceUnit, areaUnit=areaUnit,
                              surveyType=surveyType, distanceType=distanceType, clustering=clustering)
    
    results = anlysr.setupResults()
    
    anlysr.shutdown()
    
    # Load results from file.
    resFileName = workDir / f'valtests-mcds-analyser-afteropt{varOpt}-results.xlsx'
    print('Loading results from {} ...'.format(resFileName))

    results.fromExcel(resFileName)
    
else:
    
    print('Just computed, not reloading ...')
    
print('... {} analyses to compare'.format(len(results)))

# X. Run opt-analyses with real life field data (short code)

i.e. analyses with ready-to-go (constant values) parameters, and some others with to-be-computed parameters (through optimisation)

Thanks to MCDSTruncationOptanalyser class.

## 0. Optanalyser parameters

In [None]:
# Data description: column names and survey area, passed below to the opt-analyser constructor
transectPlaceCols = ['Point']
passIdCol = 'Passage'
effortCol = 'Effort'

sampleDistCol = 'Distance'
sampleDecCols = [effortCol, sampleDistCol]

sampleNumCol = 'NumEchant'
sampleSelCols = ['Espèce', passIdCol, 'Adulte', 'Durée']

sampleAbbrevCol = 'AbrevEchant'

# Analysis index, abbreviation and parameter columns (used for the results header columns)
varIndCol = 'NumAnlys'
anlysAbbrevCol = 'AbrevAnlys'
anlysParamCols = ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod']

# NOTE(review): the surface is given as a string, not a number -- confirm this is what is expected
dSurveyArea = dict(Zone='ACDC', Surface='2400')

In [None]:
# The analyses to run (with optimisation specs inside, where needed)
optanlysSpecFile = 'refin/ACDC2019-Naturalist-ExtraitSpecsOptanalyses.xlsx'
# NOTE(review): this second assignment overrides the one above, so the '.tmp' workbook is the one
# actually used below -- confirm this is intended, and not a debugging leftover.
optanlysSpecFile = 'refin/ACDC2019-Naturalist-ExtraitSpecsOptanalyses.tmp.xlsx'
#optanlysSpecFile = '../donnees/acdc/ACDC2019-Naturalist-ExtraitSpecsOptanalyses-reduit.ods'

In [None]:
# Autres paramètres
dDefSubmitOtherParams = dict()

**WARNING**: Run IX.0 first, for the other parameters!

[IX. Truncation optimisation (short code and fast run)](#IX.-Truncation-optimisation-(short-code-and-fast-run))

## 1. Individuals data set

In [None]:
# Les données individualisées et transects
indivObsFile = 'refin/ACDC2019-Naturalist-ExtraitObsIndiv.ods'

In [None]:
dfObsIndiv = ads.DataSet(indivObsFile, sheet='DonnéesIndiv').dfData
len(dfObsIndiv)

In [None]:
dfObsIndiv.head()

In [None]:
{ col: dfObsIndiv[col].unique() for col in ['Observateur', 'Point', 'Passage', 'Adulte', 'Durée', 'Espèce'] }

## 2. Actual transects

(can't deduce them from data, some points are missing because of data selection)

In [None]:
dfTransects = ads.DataSet(indivObsFile, sheet='Inventaires').dfData
len(dfTransects)

In [None]:
dfTransects

In [None]:
workDir = pl.Path('tmp/mcds-optanlr')

In [None]:
optimTruncCol = ads.MCDSTruncationOptanalyser.OptimTruncFlagCol

In [None]:
computed = False

## 3A. Or : Really run opt-analyses

### a. MCDS Opt-Analyser object

In [None]:
optanlr = \
    ads.MCDSTruncationOptanalyser(dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea, 
                                  transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                                  sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols, sampleDistCol=sampleDistCol,
                                  abbrevCol=anlysAbbrevCol, abbrevBuilder=analysisAbbrev,
                                  anlysIndCol=varIndCol, sampleIndCol=sampleNumCol,
                                  distanceUnit=distanceUnit, areaUnit=areaUnit,
                                  surveyType=surveyType, distanceType=distanceType, clustering=clustering,
                                  resultsHeadCols=dict(before=[varIndCol, sampleNumCol], sample=sampleSelCols,
                                                       after=anlysParamCols + [optimTruncCol, anlysAbbrevCol]),
                                  workDir=workDir, logAnlysProgressEvery=5, logOptimProgressEvery=3,
                                  defEstimKeyFn=defEstimKeyFn, defEstimAdjustFn=defEstimAdjustFn,
                                  defEstimCriterion=defEstimCriterion, defCVInterval=defCVInterval,
                                  defExpr2Optimise=defExpr2Optimise, defMinimiseExpr=defMinimiseExpr,
                                  defOutliersMethod=defOutliersMethod, defOutliersQuantCutPct=defOutliersQuantCutPct,
                                  defFitDistCutsFctr=defFitDistCutsFctr, defDiscrDistCutsFctr=defDiscrDistCutsFctr,
                                  defSubmitTimes=defSubmitTimes, defSubmitOnlyBest=defSubmitOnlyBest,
                                  dDefSubmitOtherParams=dDefSubmitOtherParams,
                                  dDefOptimCoreParams=dict(core=defCoreEngine, maxIters=defCoreMaxIters,
                                                           termExprValue=defCoreTermExprValue,
                                                           algorithm=defCoreAlgorithm, maxRetries=defCoreMaxRetries))

### b. Check opt-analyses specs

In [None]:
dfAnlysSpecs, userParamSpecCols, intParamSpecCols, unmUserParamSpecCols, verdict, reasons = \
    optanlr.explicitParamSpecs(implParamSpecs=optanlysSpecFile, dropDupes=True, check=True)

In [None]:
assert len(dfAnlysSpecs) == 60
assert userParamSpecCols == ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod', 'MultiOpt']
assert intParamSpecCols == ['EstimKeyFn', 'EstimAdjustFn', 'MinDist', 'MaxDist', 'FitDistCuts', 'SubmitParams']
assert unmUserParamSpecCols == []
assert verdict
assert not reasons

In [None]:
print(len(dfAnlysSpecs))
if not verdict:
    print(reasons)
    print(userParamSpecCols, intParamSpecCols, unmUserParamSpecCols)

### c. Run opt-analyses

In [None]:
#dfAnlysSpecs.loc[51:53]

In [None]:
%%time

results = optanlr.run(implParamSpecs=optanlysSpecFile, threads=12)
#results = optanlr.run(dfExplParamSpecs=dfAnlysSpecs, threads=12)  # A small sample, for a quicker check

computed = True

In [None]:
optanlr.shutdown()

In [None]:
assert optimTruncCol in results.dfTransData('fr').columns

In [None]:
results.dfTransData('fr')

In [None]:
results.dfData

In [None]:
results.dfTransData('fr').to_excel('tmp/res-tst.xlsx')

### d. Save results for later reload or examination

In [None]:
results.toExcel(workDir / 'valtests-mcds-optanlyser-results.xlsx')

In [None]:
#results.toExcel(workDir / 'valtests-mcds-optanlyser-results-fr.xlsx', lang='fr')

In [None]:
#results.fromExcel(workDir / 'valtests-mcds-optanlyser-results.xlsx', specs=False)

## 3B. Or : Load opt-analyses results from a previous run

(already run and saved above)

In [None]:
if not computed:
    
    # An opt-analyser object knowns how to build an empty results object ...
    optanlr = \
        ads.MCDSTruncationOptanalyser(dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea, 
                                      transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                                      sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols,
                                      sampleDistCol=sampleDistCol,
                                      abbrevCol=anlysAbbrevCol, abbrevBuilder=analysisAbbrev,
                                      anlysIndCol=varIndCol, sampleIndCol=sampleNumCol,
                                      distanceUnit=distanceUnit, areaUnit=areaUnit,
                                      surveyType=surveyType, distanceType=distanceType, clustering=clustering,
                                      resultsHeadCols=dict(before=[varIndCol, sampleNumCol], sample=sampleSelCols,
                                                           after=anlysParamCols + [optimTruncCol, anlysAbbrevCol]))

    results = optanlr.setupResults()
    
    optanlr.shutdown()
    
    # Load results from file.
    resFileName = workDir / 'valtests-mcds-optanlyser-results.xlsx'
    print('Loading results from {} ...'.format(resFileName))

    results.fromExcel(resFileName)
    
else:
    
    print('Just computed, not reloading ...')
    
print('... {} analyses to compare'.format(len(results)))

## 4. Compare results to reference

(reference generated with the same kind of "long" code as in III above, but on another data set)

### a. Load reference unoptimised analyses results from file

In [None]:
# Load unoptimised reference
# 1. Clone results _without_ data.
rsUnoptRef = results.copy(withData=False)

# 2. Load it with reference data
rsUnoptRef.fromOpenDoc('refout/ACDC2019-Naturalist-ExtraitResultats.ods')

unoptAnlysAbbrevs = list(rsUnoptRef.dfData[('header (tail)', anlysAbbrevCol, 'Value')])

len(unoptAnlysAbbrevs)

### b. Separate actual optanalysis results in 2 sets : optimised, and unoptimised

In [None]:
# Unoptimised results.
rsUnoptRes = results.copy()

rsUnoptRes.dropRows(~rsUnoptRes.dfData[('header (tail)', anlysAbbrevCol, 'Value')].isin(unoptAnlysAbbrevs))

#rsUnoptRes.dfTransData('fr').to_excel('tmp/res.xlsx')

In [None]:
# Optimised results.
rsOptRes = results.copy()

rsOptRes.dropRows(rsOptRes.dfData[('header (tail)', anlysAbbrevCol, 'Value')].isin(unoptAnlysAbbrevs))

In [None]:
dict(unoptRef=len(rsUnoptRef), unoptRes=len(rsUnoptRes), optRes=len(rsOptRes), allRes=len(results))

### c. Compare "unoptimised" analyses results to reference

In [None]:
# Compare actual "unoptimised" results to the reference (ignore sample and analysis indexes, no use here).

# Custom (header) columns of the results set, listed only once.
customCols = rsUnoptRes.miCustomCols.to_list()

# Columns used to match rows of the 2 sets: sample columns + analysis parameters + analysis abbreviation.
indexCols = [col for col in customCols if '(sample)' in col[0]] \
            + [('parameters', 'estimator key function', 'Value'),
               ('parameters', 'estimator adjustment series', 'Value'),
               ('parameters', 'left truncation distance', 'Value'),
               ('parameters', 'right truncation distance', 'Value'),
               ('parameters', 'model fitting distance cut points', 'Value'),
               ('header (tail)', anlysAbbrevCol, 'Value')]

# Columns not to compare: the index ones, the other custom ones, and a few parameter / run output /
# detection probability columns. Built once here (it used to be rebuilt for every candidate column,
# through an inner comprehension that shadowed the outer 'col').
excludedCols = indexCols \
               + [col for col in customCols if '(sample)' not in col[0]] \
               + [('parameters', 'estimator selection criterion', 'Value'),
                  ('parameters', 'CV interval', 'Value'),
                  ('run output', 'run time', 'Value'),
                  ('run output', 'run folder', 'Value'),
                  ('detection probability', 'Delta AIC', 'Value'),
                  ('detection probability', 'key function type', 'Value'),
                  ('detection probability', 'adjustment series type', 'Value')]

# All the remaining columns are compared.
subsetCols = [col for col in rsUnoptRes.columns.to_list() if col not in excludedCols]

dfDiff = rsUnoptRef.compare(rsUnoptRes, indexCols=indexCols, subsetCols=subsetCols, dropCloser=14, dropNans=True)

assert dfDiff.empty, 'No, no, no : not the same ...'

print('Yessssss !')

In [None]:
# To be perfectly honest, some 10^-15 differences (when some results loaded from Excel, some other not).
rsUnoptRef.compare(rsUnoptRes, indexCols=indexCols, subsetCols=subsetCols, dropCloser=15, dropNans=True)

### d. Compare "with optimisation" results to "reference"

(reference = analyses results computed in IX.6)

In [None]:
# Load optimised reference (analysis results with truncation params computed through optimisation)
# 1. Clone results _without_ data.
rsOptRef = results.copy(withData=False)

# 2. Load it with reference data
rsOptRef.fromExcel(f'tmp/mcds-anaftopt/valtests-mcds-analyser-afteropt{varOpt}-results.xlsx')

In [None]:
# Sort rows for each analysis optim param specs ... by left truncation distance first
miSortCols = [('header (tail)', 'AbrevAnlys', 'Value'),
              ('parameters', 'left truncation distance', 'Value'),
              ('parameters', 'right truncation distance', 'Value'),
              ('parameters', 'model fitting distance cut points', 'Value')]

rsOptRes.sortRows(by=miSortCols)
rsOptRef.sortRows(by=miSortCols)

In [None]:
# Switch to simple (translated to 'fr') columns for both result sets,
# and number the (already sorted) analyses from 0 in each of them
miAnlysNumCol = 'NumAnlys'
dfOptRes, dfOptRef = rsOptRes.dfTransData('fr'), rsOptRef.dfTransData('fr')
dfOptRes[miAnlysNumCol] = list(range(len(dfOptRes)))
dfOptRef[miAnlysNumCol] = list(range(len(dfOptRef)))

In [None]:
# Check that order is "compatible" between reference and actual results
miAnlysAbrevCol = 'AbrevAnlys'

assert dfOptRes[miAnlysAbrevCol].to_list() == dfOptRef[miAnlysAbrevCol].to_list()

In [None]:
# Save to disk for visual checks / comparison
#dfOptRes.to_excel('tmp/opt-res-fr.xlsx')
#dfOptRef.to_excel('tmp/opt-ref-fr.xlsx')

In [None]:
# Compare a simple subset of analyses results ...
indexCols = [miAnlysNumCol, miAnlysAbrevCol]
subsetCols = ['AIC', 'PDetec', 'EDR/ESW', 'Densité']

dfDiff = ads.ResultsSet.compareDataFrames(dfOptRes, dfOptRef, indexCols=indexCols, subsetCols=subsetCols, dropNans=True)

dfDiff

In [None]:
# Some diff. stats: per-column min, max and mean of the comparison table (1 row per statistic).
# For the mean, infinite values are replaced by 16 first, so that it stays finite
# (presumably inf = "no difference at all", and 16 ~ number of significant digits of a float -- TODO confirm).
dfDiffStats = pd.DataFrame(data=[dfDiff.min(), dfDiff.max(), dfDiff.replace(np.inf, 16).mean()],
                           index=['min', 'max', 'mean'])
dfDiffStats

In [None]:
# Not too bad if less than 10% mean difference (100 / 10**1 = 10%) !
# (checked on the worst column, i.e. the one with the smallest mean)
assert dfDiffStats.loc['mean'].min() >= 1.0

# And actually at most P % difference
100 / 10**dfDiffStats.loc['mean'].min()

In [None]:
# Save to disk after "merging" ref and actual results, again for visual checks

# Tag each row with its origin, as a new 1st column (in place: this cell can't be run twice in a row).
dfOptRef.insert(0, 'x', 'ref')
dfOptRes.insert(0, 'x', 'res')

# Stack reference rows over actual ones
# (DataFrame.append was removed in pandas 2.0: pd.concat does the same job with any version).
dfOptComp = pd.concat([dfOptRef, dfOptRes], sort=False)

# Put the 'ref' and 'res' rows of each analysis next to each other.
dfOptComp.sort_values(by=['NumAnlys', 'x'], inplace=True)

dfOptComp.to_excel('tmp/opt-comp.xlsx')

### e. Some history of computations difference stats with various 'maxIters' values

In [None]:
# Keep stats for history ... copy/paste results below ...
print('**maxIters={} (N=?): max delta = {:.2f} %**'.format(defCoreMaxIters, 100 / 10**dfDiffStats.loc['mean'].min()))
print()
print(dfDiffStats.to_markdown())

In [None]:
%%html
<style>
table {float:left}
</style>

**maxIters=120 (N=3) : max delta = 6.1 %, 1.6 %, 1.7 %**

|Exec1 |       AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|----------:|---------:|----------:|----------:|
| min  |   1.1     |  0.2     |   0.9     |   0.6     |
| max  | inf       |  5.1     | inf       |   6.5     |
| mean |   2.37273 |  1.21364 |   2.15909 |   1.47273 |

|Exec2 |       AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|----------:|---------:|----------:|----------:|
| min  |   1.1     |  0.2     |   0.8     |   0.6     |
| max  | inf       |  5       | inf       | inf       |
| mean |   3.15455 |  1.79545 |   2.82273 |   2.47273 |

|Exec3 |     AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|--------:|---------:|----------:|----------:|
| min  | 1.1     |  0.3     |   0.6     |   0.4     |
| max  | 6.6     |  4.9     |   5.2     |   4.9     |
| mean | 2.57727 |  1.76818 |   2.21364 |   1.92273 |

**maxIters=250 (N=3) : max delta = 0.83 %, 3.4 %, 0.53 %**

|Exec1 |       AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|----------:|---------:|----------:|----------:|
| min  |   1.1     |  0.4     |   0.8     |   0.6     |
| max  | inf       |  5.9     | inf       | inf       |
| mean |   4.39545 |  2.08182 |   2.95455 |   2.68636 |

|Exec2 |     AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|--------:|---------:|----------:|----------:|
| min  | 1       |  0.4     |   0.5     |      0.3  |
| max  | 6.7     |  5.4     |   5.7     |      5.5  |
| mean | 2.18636 |  1.46818 |   1.82273 |      1.55 |

|Exec3 |       AIC |    PDetec |   EDR/ESW |   Densité |
|:-----|----------:|----------:|----------:|----------:|
| min  |   1       |   0.3     |   0.9     |   0.6     |
| max  | inf       | inf       | inf       | inf       |
| mean |   3.76818 |   2.27727 |   3.50909 |   3.24091 |

**maxIters=400 (N=4) : max delta = 2.6 %, 2.9 %, 1.9 %, 1.8 %**

|Exec1 |       AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|----------:|---------:|----------:|----------:|
| min  |   1.1     |  0.1     |   0.5     |   0.3     |
| max  | inf       |  6.7     | inf       |   6.4     |
| mean |   3.03182 |  1.57727 |   2.65455 |   1.89091 |

|Exec2 |       AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|----------:|---------:|----------:|----------:|
| min  |   1.1     |  0.3     |   0.9     |   0.6     |
| max  | inf       |  4.3     |   5       |   4.7     |
| mean |   2.79091 |  1.54091 |   2.08182 |   1.80909 |

|Exec3 |       AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|----------:|---------:|----------:|----------:|
| min  |   1.1     |  0.2     |   0.5     |   0.3     |
| max  | inf       |  6.7     | inf       |  15.9     |
| mean |   3.40455 |  1.71818 |   2.46364 |   2.24545 |

|Exec4 |       AIC |   PDetec |   EDR/ESW |   Densité |
|:-----|----------:|---------:|----------:|----------:|
| min  |   1.1     |  0.2     |      0.8  |   0.6     |
| max  |   6.7     |  4.9     |      5.2  |   4.9     |
| mean |   2.66818 |  1.74091 |      2.45 |   2.18636 |

# XI. Truncation optimisation : Study on parameter variants

Objective: How to choose key parameters ?
* how many outliers ?
* how many max iters ?
* correlation with number of sightings ?

## 1. Data set, samples, transects, analyses

In [None]:
# First, run IX. 0, 1 and 2 above.

[IX. Truncation optimisation (short code / fast run)](#IX.-Truncation-optimisation-(short-code-%2B-fast-run))

In [None]:
# Explicit opt-analysis variant specs from the reference workbook (ignoring 'Params1_expl' and 'Params2_expl')
dfOptimExplSpecs = ads.Analyser.explicitVariantSpecs(
    'refin/ACDC2019-Naturalist-ExtraitSpecsOptanalyses.xlsx', ignore=['Params1_expl', 'Params2_expl'])

# Get rid of the rows where all of the truncation / multi-optimisation columns are empty
bsNothingToOptim = dfOptimExplSpecs[['TrGche', 'TrDrte', 'NbTrchMod', 'MultiOpt']].isnull().all(axis='columns')
dfOptimExplSpecs.drop(index=dfOptimExplSpecs[bsNothingToOptim].index, inplace=True)

# ... and then of these columns themselves, for all the remaining rows
for optimSpecCol in ['TrGche', 'TrDrte', 'NbTrchMod', 'MultiOpt']:
    del dfOptimExplSpecs[optimSpecCol]

nOptimExplSpecs = len(dfOptimExplSpecs)

dfOptimExplSpecs

## 2. Parameter variants plan

In [None]:
nTimes = 20

In [None]:
expr2MaxPlan = ['chi2', 'ks']

In [None]:
outliersPctPlan = [2.5, 5.0]

In [None]:
maxItersPlan = [50, 100, 150, 200, 250, 300]

In [None]:
computed = False

## 3A. Or: Run optimisations according to the plan

In [None]:
nParSets = len(expr2MaxPlan) * len(outliersPctPlan) * len(maxItersPlan)
nOpt2Run = len(dfOptimExplSpecs) * nTimes * nParSets
print(f'About to run {nOpt2Run} optimisations !')

In [None]:
# Columns of dfOptimExplSpecs that hold the analysis / optimisation parameters
# NOTE(review): 'TroncGche', 'TroncDrte' and 'NbTrModel' are the names that were commented out
# and replaced by 'TrGche', 'TrDrte' and 'NbTrchMod' in IX.3.a above -- confirm they are still supported.
optimParamsSpecsCols  = ['FonctionClé', 'SérieAjust', 'CritChx', 'IntervConf',
                         'TroncGche', 'TroncDrte', 'MethOutliers', 'NbTrModel', 'NbTrDiscr',
                         'ExprOpt', 'ParExec', 'MoteurOpt']

In [None]:
%%time

# Results of each parameter set, to be concatenated at the end.
ldfResults = list()

# All the combinations of plan parameters, flattened, in the order of the former nested loops
# (expr2Max varies slowest, maxIters fastest).
paramSetsPlan = [(expr2Max, olrsPct, maxIters)
                 for expr2Max in expr2MaxPlan for olrsPct in outliersPctPlan for maxIters in maxItersPlan]

# enumerate() keeps the progress counter up to date by itself: it used to be incremented by hand,
# through a misspelled and never initialised name (=> NameError at the end of the 1st iteration).
for nParSetInd, (expr2Max, olrsPct, maxIters) in enumerate(paramSetsPlan, start=1):

    logger.info(f'Params set {nParSetInd}/{nParSets}: {expr2Max=}, {nTimes=}, {maxIters=}, {olrsPct=:.1f}')

    # Prepare optim. params (same values for every spec row).
    dfMoreOptimCols = \
        pd.DataFrame([dict(CritChx='AIC', IntervConf=95,
                           TroncGche='auto', TroncDrte='auto',
                           MethOutliers=f'tucquant({olrsPct:.1f})',
                           NbTrModel='mult(2/3, 3/2)', NbTrDiscr=None,
                           ExprOpt=f'max({expr2Max})', ParExec=f'times({nTimes})',
                           MoteurOpt=f'zoopt({maxIters})')]*len(dfOptimExplSpecs))

    # Index reset needed for a row to row concatenation (rows were dropped from dfOptimExplSpecs above).
    dfOptVarExplSpecs = pd.concat([dfOptimExplSpecs.reset_index(drop=True), dfMoreOptimCols], axis='columns')

    # Run optimisation (a new optimiser for each parameter set).
    zoptr = ads.MCDSZerothOrderTruncationOptimiser \
                    (dfObsIndiv, effortConstVal=1, dSurveyArea=dSurveyArea, 
                     transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                     sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols, sampleDistCol=sampleDistCol,
                     abbrevCol=optAbbrevCol, abbrevBuilder=optimAbbrev,
                     anlysIndCol=optIndCol, sampleIndCol=sampleNumCol,
                     distanceUnit='Meter', areaUnit='Hectare',
                     surveyType='Point', distanceType='Radial', clustering=False,
                     resultsHeadCols=dict(before=[optIndCol], sample=sampleSelCols, after=optimParamsSpecsCols),
                     workDir='/tmp', logData=False,                 
                     defCoreMaxIters=120)

    results = zoptr.run(dfOptVarExplSpecs, threads=12)

    zoptr.shutdown()

    # Save results for this run
    ldfResults.append(results.dfData)

# Done : concat and save results.
dfResults = pd.concat(ldfResults, ignore_index=True)

resFileName = 'tmp/valtests-mcds-opter-res4stats.xlsx'
dfResults.to_excel(resFileName, index=False)
logger.info(f'Results saved to {resFileName}')

computed = True

## 3B. Or : Load results from a previous run

(already run and saved above)

In [None]:
# When this cell is run first (nothing computed in this session), 'computed' does not exist yet
computed = globals().get('computed', False)

if computed:

    print('Just computed, not reloading ...')

else:

    # Load the results saved by a previous run
    # (an older one is available in 'tmp/valtests-mcds-opter-res4stats-20200705.xlsx').
    resFileName = 'tmp/valtests-mcds-opter-res4stats-20201103.xlsx'
    print('Loading results from {} ...'.format(resFileName))

    dfResults = pd.read_excel(resFileName)

print('... {} results to process'.format(len(dfResults)))

## 4. First stats on optimisation results

* raw stats : mean and std
* first correlations : number of analyses / optimised criterion

In [None]:
len(dfResults), dfResults.columns

In [None]:
dfResults.head()

In [None]:
optResCols = ['minDist', 'maxDist', 'fitDistCuts', 'chi2', 'ks', 'chi2*ks']
#groupCols = [col for col in dfResults.columns if col not in optResCols]
groupCols = ['Espèce', 'Passage', 'Adulte', 'Durée', 'FonctionClé', 'SérieAjust', 'MethOutliers', 'ExprOpt',
             'MinDist', 'MaxDist', 'FitDistCuts', 'NFunEvals']

### a. Raw stats : mean, std

In [None]:
# 'mean' and 'std' only make sense on numeric columns, and pandas >= 2.0 raises a TypeError
# (instead of silently dropping the column, as older versions did) when asked to aggregate
# a non-numeric one : so explicitly restrict the stats to the numeric, non-grouping columns.
statCols = [col for col in dfResults.select_dtypes(include='number').columns if col not in groupCols]
dfStats = dfResults.groupby(groupCols)[statCols].agg(['mean', 'std'])
dfStats

In [None]:
resFileName = 'tmp/valtests-mcds-opter-stats.xlsx'
dfStats.reset_index().to_excel(resFileName)

### b. Visual correlations

In [None]:
# Bin NFunEvals for the plots below : round each value up to the next multiple of 50.
dfResults['NFunEvalsR'] = (50 * np.ceil(dfResults.NFunEvals / 50)).astype('int64')

In [None]:
# One hexbin density plot per species : binned number of function evaluations against chi2.
for esp in dfResults['Espèce'].unique():
    dfEsp = dfResults.loc[dfResults['Espèce'] == esp]
    axes = dfEsp.plot.hexbin(x='chi2', y='NFunEvalsR', gridsize=(20, 6), figsize=(14, 3))
    axes.set_title(f'{esp} : chi2 / NFunEvals')

In [None]:
plt.close()

In [None]:
# One hexbin density plot per species : binned number of function evaluations against ks.
for esp in dfResults['Espèce'].unique():
    dfEsp = dfResults.loc[dfResults['Espèce'] == esp]
    axes = dfEsp.plot.hexbin(x='ks', y='NFunEvalsR', gridsize=(20, 6), figsize=(14, 3))
    axes.set_title(f'{esp} : ks / NFunEvals')

In [None]:
plt.close()

In [None]:
# Extract, as a float, what stands between the leading 'tucquant(' and the last character
# of the outliers method spec
# (assumes every MethOutliers value is a string like 'tucquant(<number>)' — TODO confirm).
dfResults['Outliers'] = dfResults.MethOutliers.apply(lambda s: float(s[len('tucquant('):-1]))

In [None]:
_ = dfResults.plot.scatter(y='Outliers', x='chi2', figsize=(14, 2))

In [None]:
_ = dfResults.plot.scatter(y='Outliers', x='ks', figsize=(14, 2))

In [None]:
dfResults.head()

In [None]:
plyx.violin(dfResults, x='chi2', y='NFunEvalsR', facet_row='Outliers', color="Espèce", orientation='h', height=1000)

In [None]:
plyx.violin(dfResults, x='ks', y='NFunEvalsR', facet_row='Outliers', color="Espèce", orientation='h', height=1000)

### c. Computed correlations

(linear, i.e. Pearson correlation coefficients)

In [None]:
def pearsonCorr(df, x, y):

    """Pearson (linear) correlation coefficient between 2 columns of a table.

    Parameters:
    :param df: the table (pd.DataFrame)
    :param x: name of the 1st column
    :param y: name of the 2nd column

    :return: a 1-item pd.Series, with the coefficient under the 'corr' label
             (NaN when it is undefined, e.g. when one of the columns is constant)
    """

    # 2x2 covariance matrix : variances on the diagonal, covariance elsewhere.
    cv = np.cov(df[x].values, df[y].values)

    # Pearson's r = cov(x, y) / (std(x) * std(y)) : normalise by the product of
    # the standard deviations, i.e. by the square root of the product of the variances
    # (and not by the product of the variances themselves).
    return pd.Series(dict(corr=cv[0, 1] / np.sqrt(cv[0, 0] * cv[1, 1])))

In [None]:
# Number of analyses run / optimisation criterion
groupCols = ['Espèce', 'Passage', 'Adulte', 'Durée', 'FonctionClé', 'SérieAjust', 'MethOutliers', 'ExprOpt']

# Correlation NFunEvals / chi2, per group, for the optimisations that maximised chi2
df = dfResults.loc[dfResults.ExprOpt == 'max(chi2)',
                   groupCols + ['NFunEvals', 'chi2']].groupby(groupCols).apply(pearsonCorr, x='NFunEvals', y='chi2')
df.rename(columns=dict(corr='NFun/Expr'), inplace=True)
dfCorr = df.copy()

# Correlation NFunEvals / ks, per group, for the optimisations that maximised ks
df = dfResults.loc[dfResults.ExprOpt == 'max(ks)',
                   groupCols + ['NFunEvals', 'ks']].groupby(groupCols).apply(pearsonCorr, x='NFunEvals', y='ks')
df.rename(columns=dict(corr='NFun/Expr'), inplace=True)

# Stack both tables (pd.concat, as DataFrame.append is no more available from pandas 2.0)
dfCorr = pd.concat([dfCorr, df])

dfCorr.describe()

In [None]:
dfCorr[dfCorr.index.get_level_values('ExprOpt') == 'max(ks)'].sort_values(by='NFun/Expr', ascending=False)

In [None]:
dfCorr[dfCorr.index.get_level_values('ExprOpt') == 'max(chi2)'].sort_values(by='NFun/Expr', ascending=False)

In [None]:
# % of outliers excluded / optimisation criterion
groupCols = ['Espèce', 'Passage', 'Adulte', 'Durée']

# Correlation Outliers / chi2, per group, for the optimisations that maximised chi2
df = dfResults.loc[dfResults.ExprOpt == 'max(chi2)',
                   groupCols + ['Outliers', 'chi2']].groupby(groupCols).apply(pearsonCorr, x='Outliers', y='chi2')
df.rename(columns=dict(corr='Outliers/Expr'), inplace=True)
dfCorr = df.copy()

# Correlation Outliers / ks, per group, for the optimisations that maximised ks
df = dfResults.loc[dfResults.ExprOpt == 'max(ks)',
                   groupCols + ['Outliers', 'ks']].groupby(groupCols).apply(pearsonCorr, x='Outliers', y='ks')
df.rename(columns=dict(corr='Outliers/Expr'), inplace=True)

# Stack both tables (pd.concat, as DataFrame.append is no more available from pandas 2.0)
dfCorr = pd.concat([dfCorr, df])

dfCorr

## 5. Run analyses with optimised truncations

(to get the actual numbers of sightings retained)

### a. Deduce analyses specs from optimisation results

In [None]:
dfResults.columns

In [None]:
varIndCol = 'NumAnlys'
anlysAbbrevCol = 'AbrevAnlys'

In [None]:
# Build the analysis specs table from the optimiser results : sample and analysis parameters,
# and above all the optimised truncation parameter values (minDist, maxDist, fitDistCuts).
optTgtCols = ['TroncGche', 'TroncDrte', 'NbTrModel']
otherOptTgtCols = ['Outliers', 'NFunEvals']
dfAnlysSpecs = dfResults[['Espèce', 'Passage', 'Adulte', 'Durée', 'FonctionClé', 'SérieAjust',
                          'minDist', 'maxDist', 'fitDistCuts', *optTgtCols, *otherOptTgtCols]].copy()

# The analysis abbreviation is computed while the truncation optimisation specs are still there
# (it is built from these specs, not from the optimised results).
dfAnlysSpecs[anlysAbbrevCol] = dfAnlysSpecs.apply(analysisAbbrev, axis='columns')

# Then, these specs are no more needed, and the optimised truncation parameters
# can take the column names expected for analysis.
dfAnlysSpecs = dfAnlysSpecs.drop(columns=optTgtCols) \
                           .rename(columns=dict(minDist='TrGche', maxDist='TrDrte', fitDistCuts='NbTrchMod'))

dfAnlysSpecs

In [None]:
workDir = pl.Path('tmp/mcds-optstats')

In [None]:
computed = False

### b. Or : Really run analyses

In [None]:
# i. MCDS Analyser object
# Names of the analysis parameter columns of the specs table (built in 5.a above)
anlysParamCols = ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod']

# Analyser on the individual observations, working in workDir ; the results head columns are listed
# as 3 groups (before / sample / after), the last one including the analysis parameters,
# the analysis abbreviation and the 'Outliers' and 'NFunEvals' columns of the specs.
# NOTE(review): all def* values, units and survey settings come from names defined earlier in the notebook.
anlysr = ads.MCDSAnalyser(dfObsIndiv, effortConstVal=1, dSurveyArea=dSurveyArea,
                          transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                          sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols,
                          abbrevCol=anlysAbbrevCol, anlysIndCol=varIndCol, sampleIndCol=sampleNumCol,
                          distanceUnit=distanceUnit, areaUnit=areaUnit,
                          surveyType=surveyType, distanceType=distanceType, clustering=clustering,
                          resultsHeadCols=dict(before=[varIndCol, sampleNumCol], sample=sampleSelCols,
                                               after=anlysParamCols + [anlysAbbrevCol, 'Outliers', 'NFunEvals']),
                          workDir=workDir,
                          defEstimKeyFn=defEstimKeyFn, defEstimAdjustFn=defEstimAdjustFn,
                          defEstimCriterion=defEstimCriterion, defCVInterval=defCVInterval,
                          defMinDist=defMinDist, defMaxDist=defMaxDist,
                          defFitDistCuts=defFitDistCuts, defDiscrDistCuts=defDiscrDistCuts)

In [None]:
# ii. Make the analysis specs explicit, and check them
(dfAnlysSpecs, userParamSpecCols, intParamSpecCols,
 unmUserParamSpecCols, verdict, reasons) = anlysr.explicitParamSpecs(dfExplParamSpecs=dfAnlysSpecs,
                                                                     dropDupes=True, check=True)

# As many analyses as optimisation results
assert len(dfAnlysSpecs) == len(dfResults)

# Every user parameter column has been recognised and matched to an internal one
assert userParamSpecCols == ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod'], str(userParamSpecCols)
assert intParamSpecCols == ['EstimKeyFn', 'EstimAdjustFn', 'MinDist', 'MaxDist', 'FitDistCuts'], str(intParamSpecCols)
assert unmUserParamSpecCols == []

# Specs are OK, with no reason to complain
assert verdict
assert not reasons, str(reasons)

In [None]:
%%time

# iii. Run analyses

# Analyses : 20mn for 8640 analyses with 12 threads on a Lenovo T490 (4-HT-core i5-8365U with PCI-e SSD)
results = anlysr.run(dfAnlysSpecs, threads=12)

computed = True

In [None]:
anlysr.shutdown()

In [None]:
results.dfTransData('fr')

In [None]:
# iiii. Save results for later reload or examination
results.toExcel(workDir / 'valtests-mcds-analyser-afteropt-results.xlsx')

In [None]:
#results.toExcel(workDir / 'valtests-mcds-analyser-afteropt-fr.xlsx', lang='fr')

### b. Or : Load analyses from a previous run

(already run and saved above)

In [None]:
if computed:

    print('Just computed, not reloading ...')

else:

    # An analyser object knows how to build an empty results object ...
    anlysr = ads.MCDSAnalyser(dfObsIndiv, effortConstVal=1, dSurveyArea=dSurveyArea,
                              transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                              sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols,
                              abbrevCol=anlysAbbrevCol, anlysIndCol=varIndCol, sampleIndCol=sampleNumCol,
                              distanceUnit=distanceUnit, areaUnit=areaUnit,
                              surveyType=surveyType, distanceType=distanceType, clustering=clustering,
                              resultsHeadCols=dict(before=[varIndCol, sampleNumCol], sample=sampleSelCols,
                                                   after=[anlysAbbrevCol]))

    results = anlysr.setupResults()

    # ... and it is of no more use once this is done.
    anlysr.shutdown()

    # Fill this results object from the file saved by a previous run.
    resFileName = workDir / 'valtests-mcds-analyser-afteropt-results.xlsx'
    print('Loading results from {} ...'.format(resFileName))

    results.fromExcel(resFileName)

print('... {} analyses to study'.format(len(results)))

In [None]:
dfAnRes = results.dfTransData('fr')
dfAnRes

## 6. Other stats on analysis results

Through NObs mainly

In [None]:
dfAnRes.columns

In [None]:
# One hexbin density plot per species : number of observations against the 'Chi2 P' column.
for esp in dfAnRes['Espèce'].unique():
    dfEsp = dfAnRes.loc[dfAnRes['Espèce'] == esp]
    axes = dfEsp.plot.hexbin(x='Chi2 P', y='NObs', gridsize=(20, 6), figsize=(14, 3))
    axes.set_title(f'{esp} : chi2 / NObs')

In [None]:
plt.close()

In [None]:
# One hexbin density plot per species : number of observations against the 'KS P' column.
for esp in dfAnRes['Espèce'].unique():
    dfEsp = dfAnRes.loc[dfAnRes['Espèce'] == esp]
    axes = dfEsp.plot.hexbin(x='KS P', y='NObs', gridsize=(20, 6), figsize=(14, 3))
    axes.set_title(f'{esp} : KS / NObs')

In [None]:
plt.close()

In [None]:
# Bin NFunEvals for the plots below : round each value up to the next multiple of 50.
dfAnRes['NFunEvalsR'] = (50 * np.ceil(dfAnRes.NFunEvals / 50)).astype('int64')

In [None]:
plyx.scatter(dfAnRes[['Espèce', 'Chi2 P', 'NObs', 'Outliers', 'NFunEvalsR']].dropna(subset=['NObs']),
             x='Chi2 P', y='NObs', facet_col='Outliers', facet_row='NFunEvalsR', color='Espèce', height=1200)

In [None]:
plyx.scatter(dfAnRes[['Espèce', 'KS P', 'NObs', 'Outliers', 'NFunEvalsR']].dropna(subset=['NObs']),
             x='KS P', y='NObs', facet_col='Outliers', facet_row='NFunEvalsR', color='Espèce', height=1200)

In [None]:
plyx.scatter(dfAnRes[['Espèce', 'Chi2 P', 'NObs', 'Outliers', 'NFunEvals']].dropna(subset=['NObs']),
            x='NObs', y='Chi2 P', facet_col='Outliers', facet_row='Espèce', color='NFunEvals', height=700)

In [None]:
plyx.scatter(dfAnRes[['Espèce', 'KS P', 'NObs', 'Outliers', 'NFunEvals']].dropna(subset=['NObs']),
             x='NObs', y='KS P', facet_col='Outliers', facet_row='Espèce', color='NFunEvals', height=700)

# Development

## ResultsSet.append

Updated version thanks to pd.DataFrame.append(pd.Series) study above

In [None]:
def append(dfData, sdfResult, sCustomHead):

    """Append 1 result row (Series) or N result rows (DataFrame) to a results table.

    Relies on pd.concat only, as Series.append and DataFrame.append
    are no more available from pandas 2.0.

    Parameters:
    :param dfData: the results table to append to (pd.DataFrame, possibly with no column yet)
    :param sdfResult: the row (pd.Series) or rows (pd.DataFrame) to append
    :param sCustomHead: pd.Series of leading values to prepend to the row (or to each of the rows)
                        before appending, or None for no such head

    :return: the resulting table (dfData is never modified in place) ; rows are re-indexed from 0,
             except when dfData had no column : sdfResult (with its head) is then returned as a table
    """

    if sCustomHead is not None:
        if isinstance(sdfResult, pd.Series):
            sdfResult = pd.concat([sCustomHead, sdfResult])
        else: # DataFrame
            # Repeat the head once per row, and give it the index of sdfResult : the column-wise
            # concat aligns rows on their index, whatever it is (not only the default 0..N-1 one).
            dfCustomHead = pd.DataFrame([sCustomHead]*len(sdfResult))
            dfCustomHead.index = sdfResult.index
            sdfResult = pd.concat([dfCustomHead, sdfResult], axis='columns')

    # A Series to append is 1 row of the table : make it a 1-row DataFrame.
    if isinstance(sdfResult, pd.Series):
        sdfResult = pd.DataFrame([sdfResult])

    # Normal append if dfData has columns ; otherwise, simply start from sdfResult,
    # which keeps its original column types.
    if dfData.columns.empty:
        dfData = sdfResult
    else:
        dfData = pd.concat([dfData, sdfResult], ignore_index=True)

    return dfData

### a. Initialise DataFrame

In [None]:
# Empty
df = pd.DataFrame()

In [None]:
# Not empty, mono-index columns
df = pd.DataFrame([dict(a=1, b=2.5, c='x', x='a', y=1, z=1.78),
                   dict(a=2, b=4.5, c='y', x='b', y=2, z=5.88889)])
df

In [None]:
# Not empty, multi-index columns
df = pd.DataFrame([{('a', 'z'): 1, ('b', 'y'): 2.5, ('c', 'x'): 'x', ('x', 'w'): 'a', ('y', 'v'): 1, ('z', 'u'): 1.78},
                   {('a', 'z'): 2, ('b', 'y'): 4.5, ('c', 'x'): 'y', ('x', 'w'): 'b', ('y', 'v'): 2, ('z', 'u'): 5.88889}])
df.columns = pd.MultiIndex.from_tuples(df.columns)
df

### b. Initialise Series / DataFrame to append

In [None]:
# Mono-index
sh = pd.Series(dict(a=3, b=5.978, c='w'))

In [None]:
sr = pd.Series(dict(x='c', y=4, z=9.567))

In [None]:
sr = pd.DataFrame([dict(x='d', y=9, z=12.9),
                   dict(x='e', y=8, z=7.778)])

In [None]:
# Multi-index
sh = pd.Series({('a', 'z'): 3, ('b', 'y'): 5.978, ('c', 'x'): 'w'})

In [None]:
sr = pd.Series({('x', 'w'): 'c', ('y', 'v'): 4, ('z', 'u'): 9.567})

In [None]:
sr = pd.DataFrame([{('x', 'w'): 'd', ('y', 'v'): 9, ('z', 'u'): 12.9},
                   {('x', 'w'): 'e', ('y', 'v'): 8, ('z', 'u'): 7.778}])

### c. Append Series / DataFrame to DataFrame

In [None]:
df = append(df, sr, sh)
df

### d. See what's happening

In [None]:
df.dtypes

# Sandbox

In [None]:
type(sh.a)