<!-- Auto table of contents -->
<h1 class='tocIgnore'>AutoDS : Validation tests archives</h1>
<p>(for the <b>autods</b> module, a python interface to MCDS.exe, http://distancesampling.org/)</p>
<p>For up-to-date validation tests, see <a href="./valtests.ipynb" target="_blank">valtests.ipynb</a></p>
<div style="overflow-y: auto">
  <h2 class='tocIgnore'>Table of contents</h2>
  <div id="toc"></div>
</div>

In [2]:
%%javascript
$.getScript('ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [3]:
%matplotlib inline

In [4]:
import sys
import os
import pathlib as pl

import re

import numpy as np
import pandas as pd

from IPython.display import HTML, Markdown

import matplotlib.pyplot as plt

#import plotly as ply
#import plotly.graph_objs as plygo
#import plotly.express as plyx

In [5]:
sys.path.insert(0, '..')

In [6]:
import autods as ads

ads.runtime

2021-09-02 08:31:39,135 ads.eng INFO0	Found MCDS.exe here: C:\git\perso\autods\Distance 7\MCDS.exe.


{'platform': 'win32',
 'cpython': '3.8.2 | packaged by conda-forge | (default, Apr 24 2020, 07:34:03) [MSC v.1916 64 bit (AMD64)]',
 'numpy': '1.19.4',
 'pandas': '1.2.5',
 'pickle': '4.0',
 'zoopt': '0.4.0',
 'matplotlib': '3.4.2'}

In [7]:
# Logging configuration.
ads.log.configure(handlers=[sys.stdout, 'tmp/valarc.log'], verbose=True, reset=True)

ads.logger('matplotlib', level=ads.WARNING, reset=True)

ads.logger('ads', level=ads.INFO, reset=True)
#ads.logger('ads.eng', level=ads.INFO, reset=True)
#ads.logger('ads.exr', level=ads.DEBUG, reset=True)
#ads.logger('ads.dat', level=ads.DEBUG, reset=True)
ads.logger('ads.rep', level=ads.INFO1, reset=True)
#ads.logger('ads.opn', level=ads.DEBUG, reset=True)
#ads.logger('ads.opr', level=ads.DEBUG, reset=True)
#ads.logger('ads.anr', level=ads.DEBUG, reset=True)
ads.logger('ads.onr', level=ads.DEBUG1, reset=True)

logger = ads.logger('valarc', level=ads.DEBUG, reset=True)

2021-09-02 08:31:39,578 root INFO0	Will log to Stream(stdout), File(tmp/valarc.log)


# I. Run analyses with real life field data (1/2 : long code, long run)

Note: Don't use this low level method : MCDSAnalyser is here for that now.

Here we use directly MCDSAnalysis class.

(for comparison to manually issued analyses with Distance 7.3)

## 1. Load analyses set specifications

In [None]:
# Load refout results table
refFileName = 'ACDC2019-Papyrus-ALAARV-TURMER-resultats-distance-73.xlsx'
dfRefRes = pd.read_excel(pl.Path('refout', refFileName))
dfRefRes.reset_index(inplace=True) # Generate analysis # (later need for original cases order)
dfRefRes.rename(columns=dict(index='AnlysNum', Name='Model'), inplace=True)

dfRefRes.head()

## 2. Build test cases

In [None]:
# Build the test cases table from the refout results file,
# using only its input columns (no cheating with the reference results :-)
modelIdCols = ['Model']
modelParamCols = ['LTrunc', 'RTrunc', 'FitDistCuts', 'DiscrDistCuts']
sampleSelCols = ['Species', 'Periods', 'Prec.', 'Duration']
caseIdCols = ['AnlysNum', 'SampNum', *sampleSelCols, *modelIdCols]

# Sample number = group number of each distinct sample selection, in order of first appearance.
dfRefRes['SampNum'] = dfRefRes.groupby(sampleSelCols, sort=False).ngroup()

dfAnlysCases = dfRefRes[caseIdCols + modelParamCols].copy()


def _keyFnOfModel(model):
    """Key function name, deduced from the prefix of the model name."""
    if model.startswith('Unif'):
        return 'UNIFORM'
    if model.startswith('Half'):
        return 'HNORMAL'
    return 'HAZARD'


def _adjSerOfModel(model):
    """Adjustment series name, deduced from the keyword found inside the model name."""
    if model.find(' Cos') > 0:
        return 'COSINE'
    if model.find(' SimPoly') > 0:
        return 'POLY'
    return 'HERMITE'


def _inFileNameOfCase(sRow):
    """Input data file name of a test case, built from its sample selection columns."""
    if 'A+B' in sRow.Periods:
        periods = 'AB'
    elif 'A' in sRow.Periods:
        periods = 'A'
    else:
        periods = 'B'
    # Only the 1st word of the duration and precision specs goes into the file name.
    duration = sRow.Duration.split(' ')[0]
    precision = sRow['Prec.'].split(' ')[0]
    return 'ACDC2019-Papyrus-{}-{}-{}mn-{}dec-dist.txt'.format(sRow.Species, periods, duration, precision)


dfAnlysCases['KeyFn'] = dfAnlysCases.Model.apply(_keyFnOfModel)
dfAnlysCases['AdjSer'] = dfAnlysCases.Model.apply(_adjSerOfModel)
dfAnlysCases['InFileName'] = dfAnlysCases.apply(_inFileNameOfCase, axis='columns')
dfAnlysCases

In [None]:
#def nan2None(v):
#    return None if pd.isnull(v) else v
def distCutsFromSpecs(v):
    """Convert a distance cuts specification cell to the value passed on to the analysis.

    :param v: one of
        * None or NaN: no cuts specified,
        * an integer number (python int, numpy integer, or float with an integral value),
        * a string of comma-separated numbers, e.g. '10, 20.5, 40'.

    :return: None, or the number as a python int, or the list of the numbers as floats.

    :raise ValueError: for a floating point number with a non-integral value,
                       or a string item that cannot be converted to float.
    """
    if pd.isnull(v):
        return None
    if isinstance(v, int):
        return v
    if isinstance(v, np.integer):
        return int(v)
    # pandas turns an integer column into a float one as soon as it holds a NaN:
    # accept such integral floats rather than failing on the .split() below.
    if isinstance(v, (float, np.floating)):
        if float(v).is_integer():
            return int(v)
        raise ValueError(f'Unsupported non-integer number as a distance cuts spec: {v!r}')
    return [float(x) for x in v.split(',')]

## 3. Prepare analyses

In [None]:
decimalFields = ['Point transect*Survey effort', 'Observation*Radial distance']

In [None]:
# Analysis engine (sequential)
mcds = ads.MCDSEngine(workDir=pl.Path('tmp', 'mcds-out'),
                      executor=None, # Non-parallel: ~7.5s elapsed on a Lenovo P52 (6-core i7-8850H with PCI-e SSD)
                      distanceUnit='Meter', areaUnit='Hectare',
                      surveyType='Point', distanceType='Radial', clustering=False)

In [None]:
# Frozen analysis parameters (a choice here)
KEstimCriterion = 'AIC'
KCVInterval = 95

In [None]:
# Results object construction
sampCols = [('sample', col, 'Value') for col in sampleSelCols]
miSampCols = pd.MultiIndex.from_tuples(sampCols)

sampIndCol = ('sample', 'SampNum', 'Value')
custCols = [('sample', 'AnlysNum', 'Value'), sampIndCol] + sampCols + [('model', 'Model', 'Value')]
miCustCols = pd.MultiIndex.from_tuples(custCols)

dfCustColTrans = \
    pd.DataFrame(index=miCustCols,
                 data=dict(en=caseIdCols, fr=['NumAnlys', 'NumEchant', 'Espèce', 'Périodes', 'Préc.', 'Durée', 'Modèle']))

results = ads.MCDSAnalysisResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans,
                                     miSampleCols=miSampCols, sampleIndCol=sampIndCol,
                                     distanceUnit='Meter', areaUnit='Hectare',
                                     surveyType='Point', distanceType='Radial', clustering=False)

In [None]:
computed = False

## 3a. Or : Really run analyses

In [None]:
# Shorten test cases and reference results lists, to go faster
# Warning: If you don't retain entire samples, later comparison will fail on Delta AIC values.
# NB: This cell overwrites dfAnlysCases and dfRefRes : once shortened, re-run the cells
#     that build them to get the full lists back.
#selCaseInds = [0, 5, 7, 22, 31] # Some random cases, with incomplete samples.
#selCaseInds = dfAnlysCases[dfAnlysCases.SampNum.isin([3, 4])].index # A shorter selection, with complete samples.
selCaseInds = range(len(dfAnlysCases)) # All of them.

# Number of cases before selection (for the log message below).
nOrigAnlysCases = len(dfAnlysCases)
# Same selection on both tables, to keep test cases and reference results aligned.
dfAnlysCases = dfAnlysCases.loc[selCaseInds]
dfRefRes = dfRefRes.loc[selCaseInds]

logger.info(f'Retained {len(selCaseInds)} out of {nOrigAnlysCases}.')

In [None]:
%%time

# Run all analyses, one after the other (each one is submitted, then its results are fetched,
# before going on to the next one), and append their results to the `results` set.
lastInFileName = None
for _, sCase in dfAnlysCases.iterrows():
    
    nCase = sCase.AnlysNum
    # Analysis name = input file name without its common prefix and suffix ...
    name = sCase.InFileName[len('ACDC2019-Papyrus')+1:-len('-dist.txt')]
    # ... followed by the lower-cased model name, spaces and punctuation replaced by dashes.
    name += '-' + sCase.Model.lower().translate(str.maketrans({c:'-' for c in ' ,.:;()/'}))
    logger.info(f'#{nCase+1:3d} {name} {sCase.KeyFn} {sCase.AdjSer}')
    
    # Create data set if not already done.
    # (only reloaded when the input file changes from one case to the next)
    if lastInFileName != sCase.InFileName:
        sds = ads.SampleDataSet(pl.Path('refin', sCase.InFileName), decimalFields=decimalFields)
        lastInFileName = sCase.InFileName
        
    # Run analysis and get results
    anlys = ads.MCDSAnalysis(engine=mcds, sampleDataSet=sds, name=name, logData=True,
                             estimKeyFn=sCase.KeyFn, estimAdjustFn=sCase.AdjSer,
                             estimCriterion=KEstimCriterion, cvInterval=KCVInterval,
                             minDist=sCase.LTrunc, maxDist=sCase.RTrunc,
                             fitDistCuts=distCutsFromSpecs(sCase.FitDistCuts),
                             discrDistCuts=distCutsFromSpecs(sCase.DiscrDistCuts))

    anlys.submit()
    sResult = anlys.getResults()

    # Save results
    # The custom header holds the values of the first len(caseIdCols) columns of the case row,
    # i.e. the case identification columns (dfAnlysCases columns start with caseIdCols),
    # re-indexed with the custom columns multi-index of the results set.
    sHead = pd.Series(data=[sCase[col] for col in sCase.index[:len(caseIdCols)]], index=miCustCols)

    results.append(sResult, sCustomHead=sHead)
    
# shutdown analysis engine
mcds.shutdown()

# Done.
# (flag checked by the "load from a previous run" cell, to skip reloading the results from disk)
computed = True

In [None]:
# Save results in case need for not recomputing them
resFileName = os.path.join(mcds.workDir, 'autods-validation-results.xlsx')

results.toExcel(resFileName, sheetName='AutoDSVal')

In [None]:
resFileName = os.path.join(mcds.workDir, 'autods-validation-results-en.xlsx')

results.toExcel(resFileName, sheetName='Auto', lang='en')

In [None]:
# Check translation
dfActTrRes = results.dfTransData('fr')

dfActTrRes.head().T.iloc[:30] #.at['TroncGche', 0]

## 3b. Or : Load analyses from a previous run

(already run and saved above)

In [None]:
#computed = False

In [None]:
if not computed:
    
    resFileName = os.path.join(mcds.workDir, 'autods-validation-results.xlsx')
    print('Loading results from {} ...'.format(resFileName))

    results.fromExcel(resFileName, sheetName='AutoDSVal')
    
    # shutdown analysis engine
    mcds.shutdown()

else:
    
    print('Just computed, not reloading ...')
    
print('... {} analyses to compare'.format(len(results)))

# II. Compare actual results to reference

(reference = manually run analyses with Distance software)

## 1. Extract actual results to compare

In [None]:
# Analysis results
dfActRes = results.dfData

dfActRes.head().T[:30]

In [None]:
# Select columns of auto-results and match them with reference ones, for comparison.
dCompCols = \
{
    ('sample', 'AnlysNum', 'Value'):  'AnlysNum',
    ('sample', 'SampNum', 'Value'):   'SampNum',
    ('sample', 'Species', 'Value'):   'Species',
    ('sample', 'Periods', 'Value'):   'Periods',
    ('sample', 'Prec.', 'Value'):     'Prec.',
    ('sample', 'Duration', 'Value'):  'Duration',
    
    ('model',  'Model', 'Value'):         'Model',
    ('parameters', 'left truncation distance', 'Value'):           'LTrunc',
    ('parameters', 'right truncation distance', 'Value'):          'RTrunc',
    ('parameters', 'model fitting distance cut points', 'Value'):  'FitDistCuts',
    ('parameters', 'distance discretisation cut points', 'Value'): 'DiscrDistCuts',
    
    ('run output', 'run status', 'Value'): 'Status',
    #('run output', 'run time', 'Value'): 'Run', # Only for unintests ref. generation just below
    
    ('detection probability', 'total number of parameters (m)', 'Value'): '# params',
    ('encounter rate', 'number of observations (n)', 'Value'): '# obs',
    
    #('detection probability', 'Delta AIC', 'Value'): 'Delta AIC',
    ('detection probability', 'AIC value', 'Value'): 'AIC',
    ('detection probability', 'chi-square test probability determined', 'Value')               : 'GOF Chi-p',
    ('detection probability', 'Kolmogorov-Smirnov test probability', 'Value')                  : 'GOF K-S p',
    ('detection probability', 'Cramér-von Mises (uniform weighting) test probability', 'Value'): 'GOF CvM (unif) p',
    ('detection probability', 'Cramér-von Mises (cosine weighting) test probability', 'Value') : 'GOF CvM (cos) p',
    
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Value'): 'ESW/EDR',
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Lcl')  : 'ESW/EDR LCL',
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Ucl')  : 'ESW/EDR UCL',
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Cv')   : 'ESW/EDR CV',
    
    ('density/abundance', 'density of animals', 'Value'): 'D',
    ('density/abundance', 'density of animals', 'Lcl')  : 'D LCL',
    ('density/abundance', 'density of animals', 'Ucl')  : 'D UCL',
    ('density/abundance', 'density of animals', 'Cv')   : 'D CV',
    
    ('detection probability', 'probability of detection (Pw)', 'Value'): 'P',
    ('detection probability', 'probability of detection (Pw)', 'Lcl')  : 'P LCL',
    ('detection probability', 'probability of detection (Pw)', 'Ucl')  : 'P UCL',
    ('detection probability', 'probability of detection (Pw)', 'Cv')   : 'P CV',
    ('detection probability', 'probability of detection (Pw)', 'Df')   : 'P DF',
}
len(dCompCols)

In [None]:
# Warning: Unused columns (full of NaNs) have been automatically removed
# (see last line of AnalysisResultsSet.dfData getter)
dCompCols = { k: v for k, v in dCompCols.items() if k in dfActRes.columns }
len(dCompCols)

In [None]:
# So we need to cleanup modelParamCols too
modelParamCols = [id_ for id_ in modelParamCols if id_ in dCompCols.values()]
len(modelParamCols)

In [None]:
# Safe stringification of model params
# * needed for use in indexes (hashability)
# * needed to cope with to_excel/read_excel inconsistent None management
def modelParam2Str(par):
    """Return a canonical string for a model parameter value.

    :param par: the parameter value:
        * a list => str() of the list of its items converted to float,
        * None or NaN => 'None',
        * a string spelling a list (comma-separated and / or enclosed in square brackets,
          as a list gets once written to, then read back from a workbook)
          => same result as for the list it spells,
        * any other string => the string itself,
        * anything else => str() of it.

    :return: the string

    :raise ValueError: if an item of a list-like string cannot be converted to float.
    """
    if isinstance(par, list):
        return str([float(v) for v in par])
    if pd.isnull(par):
        return 'None'
    if isinstance(par, str):
        spec = par.strip()
        # Already somewhat stringified list: with commas, or bracketed (1 item or empty list, no comma then).
        if ',' in spec or (spec.startswith('[') and spec.endswith(']')):
            items = spec.strip('[]').strip()
            return str([float(v) for v in items.split(',')] if items else [])
        # Any other string: nothing to normalise
        # (this case used to raise UnboundLocalError, no result being assigned).
        return par
    return str(par)

In [None]:
# Select results columns and rename them as the reference is, for easier comparison
dfActRes4c = dfActRes[list(dCompCols.keys())].copy()
dfActRes4c.columns = [dCompCols[col] for col in dCompCols]
dfActRes4c[modelParamCols] = dfActRes4c[modelParamCols].applymap(modelParam2Str) # Hashable mandatory for indexing
dfActRes4c.set_index(caseIdCols + modelParamCols, inplace=True)

dfActRes4c

In [None]:
# Select usefull reference columns for comparison
dfRefRes4c = dfRefRes.copy()
dfRefRes4c[modelParamCols] = dfRefRes4c[modelParamCols].applymap(modelParam2Str) # Hashable mandatory for indexing
dfRefRes4c.set_index(caseIdCols + modelParamCols, inplace=True)
dfRefRes4c = dfRefRes4c.reindex(columns=dfActRes4c.columns)

dfRefRes4c

In [None]:
#dfActRes4c.to_excel('tmp/act-res.xlsx')
#dfRefRes4c.to_excel('tmp/ref-res.xlsx')

## 2. Automated diagnosis

Note: Since then, ads.DataSet.compare has been developed based on this prototype ... use it !

In [None]:
# First checks : equality of test case lists (index) and of column names (columns)
assert sorted(dfActRes4c.index)   == sorted(dfRefRes4c.index)
assert sorted(dfActRes4c.columns) == sorted(dfRefRes4c.columns)

In [None]:
# Actual / reference closeness measure.
# See unitary test below.
def closeness(sRefAct):
    """Order of magnitude separating the difference of 2 values from the greatest of their absolute values.

    closeness = round(-log10(abs(x - y) / max(abs(x), abs(y))), 1)

    The greater it is, the lower the relative difference. Examples:
    * 3 = 10**3 ratio between the absolute max. of the two values and their difference,
    * +inf = NO difference at all,
    * 0 = bad: one of the two is 0, and the other not.

    :param sRefAct: 2-item series holding the 2 values to compare

    :return: 0 if exactly one value is NaN, or if at least one is infinite ("all different"),
             NaN if both values are NaN,
             +inf if the values are equal,
             the closeness computed as above otherwise.
    """
    ref, act = sRefAct.to_list()

    # Special cases: exactly 1 NaN among the 2 => all different ...
    if np.isnan(ref) != np.isnan(act):
        return 0

    # ... and so for 1 or more infinite values.
    if np.isinf(ref) or np.isinf(act):
        return 0

    # Normal case: relative difference (left as is when null or NaN, i.e. both values NaN).
    gap = abs(ref - act)
    if gap != 0 and not np.isnan(gap):
        gap = gap / max(abs(ref), abs(act))

    if gap == 0:
        return np.inf

    return round(-np.log10(gap), 1)

In [None]:
# Actual / reference comparison : compute closeness indicator
dfRelDif = dfRefRes4c.copy()
for col in dfRelDif.columns:
    dfRelDif['act'] = dfActRes4c[col]
    dfRelDif[col] = dfRelDif[[col, 'act']].apply(closeness, axis='columns')
    dfRelDif.drop(columns='act', inplace=True)
    
dfRelDif

In [None]:
# Diagnosis : we only keep lines and columns with some relevant differences.
dfBadRelDif = dfRelDif.copy()
len(dfBadRelDif)

In [None]:
# 1. Suppress rows : Same status and NaNs in the remainder (if status == 0/3/4, execution error or no execution)
valCols = [col for col in dfRelDif.columns if col != 'Status']
dfBadRelDif.drop(dfBadRelDif[(dfBadRelDif.Status.abs() == np.inf) & dfBadRelDif[valCols].isnull().all(axis='columns')].index,
                 axis='index', inplace=True)
assert len(dfBadRelDif) == 29, len(dfBadRelDif)
anlysNums = dfBadRelDif.index.get_level_values('AnlysNum').to_list()
assert anlysNums == [0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17,
                     18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], anlysNums
print(len(dfBadRelDif), 'analyses:', ', '.join(map(str, anlysNums)))

In [None]:
# 2. Suppress rows : Status and all other columns == inf (<=> strict equality)
#    NB. Some very small differences observed when results have just been computed or when they have been
#        loaded from a previously saved Excel file (above 10**15 closeness value)
dfBadRelDif.drop(dfBadRelDif[dfBadRelDif.apply(np.isinf, axis='columns').all(axis='columns')].index,
                 axis='index', inplace=True)
assert (computed and len(dfBadRelDif) <= 26) or (not computed and len(dfBadRelDif) <= 17), len(dfBadRelDif)
anlysNums = dfBadRelDif.index.get_level_values('AnlysNum').to_list()
assert (computed and all(anlysNum in [0, 1, 2, 5, 6, 7, 8, 9, 13, 14, 15, 17, 18,
                                       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
                         for anlysNum in anlysNums)) \
       or (not computed and all(anlysNum in [0, 1, 2, 7, 8, 9, 13, 14, 15, 19, 20, 23, 25, 27, 28, 29, 30]
                                for anlysNum in anlysNums)), \
       anlysNums
print(len(dfBadRelDif), 'analyses:', ', '.join(map(str, anlysNums)))

In [None]:
# 3. Suppress rows : Status and all other columns >= à 15 (<=> nearly strict equality)
dfBadRelDif.drop(dfBadRelDif[(dfBadRelDif >= 15).all(axis='columns')].index, axis='index', inplace=True)
assert len(dfBadRelDif) == 5, len(dfBadRelDif)
anlysNums = dfBadRelDif.index.get_level_values('AnlysNum').to_list()
assert all(anlysNum in [9, 20, 28, 29, 30] for anlysNum in anlysNums), anlysNums
print(len(dfBadRelDif), 'analyses:', ', '.join(map(str, anlysNums)))

In [None]:
# 4. Suppress rows : Same status and all other columns >= 4 (<=> close to equality)
dfBadRelDif.drop(dfBadRelDif[(dfBadRelDif >= 4).all(axis='columns')].index, axis='index', inplace=True)
assert len(dfBadRelDif) == 4, len(dfBadRelDif)
anlysNums = dfBadRelDif.index.get_level_values('AnlysNum').to_list()
assert all(anlysNum in [9, 20, 28, 30] for anlysNum in anlysNums), anlysNums
print(len(dfBadRelDif), 'analyses:', ', '.join(map(str, anlysNums)))

In [None]:
# 5. Suppress rows : Same status and all other columns >= 4 (<=> close to equality)
#                    except for GOF KS and CvM, equal to NaN, because not computed when distances are discretised.
# NOTE(review): the 'DiscrDistCuts' index level holds strings produced by modelParam2Str
#               ('None' when no cut was specified), so the `!= -1` test below looks always True:
#               confirm whether `!= 'None'` was meant (changing it may change the row counts asserted below).
if 'DiscrDistCuts' in dfBadRelDif.index.names:
    # Columns to check here: all but the GOF ones, except for the Chi2 GOF one, which is retained.
    discrCols = [col for col in dfRelDif.columns if not col.startswith('GOF') or col.find('Chi') > 0]
    # Rows to drop (boolean mask, despite the df prefix): closeness >= 4 for every checked column.
    df2Drop = (dfBadRelDif.index.get_level_values('DiscrDistCuts') != -1) & (dfBadRelDif[discrCols] >= 4).all(axis='columns')
    dfBadRelDif.drop(dfBadRelDif[df2Drop].index, axis='index', inplace=True)
# Expected remainder: exactly 2 rows, among analyses number 9 and 30.
assert len(dfBadRelDif) == 2, len(dfBadRelDif)
anlysNums = dfBadRelDif.index.get_level_values('AnlysNum').to_list()
assert all(anlysNum in [9, 30] for anlysNum in anlysNums), anlysNums
print(len(dfBadRelDif), 'analyses:', ', '.join(map(str, anlysNums)))

In [None]:
# Verdict (Cf. refFileName Excel file, sheet "DiffAuto" for explanations about the 2 different rows between Act/Ref)
dfBadRelDif.T

In [None]:
dfRefRes4c.loc[dfBadRelDif.index]

In [None]:
nFails = len(dfBadRelDif.index)
if nFails > 0:
    print(f'Warning: {nFails} test case(s) failed ;')
    print(f' ... see sheet "DiffAuto" of {refFileName} for possible explanations.')
else:
    print('All test cases succeeded !')

## 3. Save results to disk.

In [None]:
resCompFileName = os.path.join(mcds.workDir, 'autods-validation-rescomp.xlsx')

with pd.ExcelWriter(resCompFileName) as xlsxWriter:

    dfRefRes.to_excel(xlsxWriter, sheet_name='RefResults', index=True)
    dfActRes4c.reset_index().to_excel(xlsxWriter, sheet_name='ActResults', index=False)
    dfRelDif.reset_index().to_excel(xlsxWriter, sheet_name='Diff2Ref', index=False)
    dfBadRelDif.reset_index().to_excel(xlsxWriter, sheet_name='BadDiff2Ref', index=False)
    dfRefRes4c.loc[dfBadRelDif.index].reset_index().to_excel(xlsxWriter, sheet_name='RefResWithDiff', index=False)
    dfActRes4c.loc[dfBadRelDif.index].reset_index().to_excel(xlsxWriter, sheet_name='ActResWithDiff', index=False)
    dfActRes.to_excel(xlsxWriter, sheet_name='RawActResults', index=True)

## 4. Build HTML and Excel reports

See [IV. Excel and HTML reports](#IV.-Excel-and-HTML-reports)

# III. Parallel run of same analyses

Note: Don't use this low level method : MCDSAnalyser is here for that now.

Here, we directly call MCDSAnalysis.

## 1. Prepare analyses

(same test cases and input data as previously, for easy comparison)

In [None]:
# Analysis executor : 6, 8, None threads => min elapsed = ~2s on a Lenovo P52 (6-core i7-8850H with PCI-e SSD)
parallelExecutor = ads.Executor(threads=6)

# Analysis engine
mcds = ads.MCDSEngine(workDir=pl.Path('tmp') / 'mcds-pout', executor=parallelExecutor, 
                      distanceUnit='Meter', areaUnit='Hectare',
                      surveyType='Point', distanceType='Radial', clustering=False)

In [None]:
# Results object construction
parResults = ads.MCDSAnalysisResultsSet(miCustomCols=miCustCols, miSampleCols=miSampCols, dfCustomColTrans=dfCustColTrans, 
                                        distanceUnit='Meter', areaUnit='Hectare', sampleIndCol=sampIndCol,
                                        surveyType='Point', distanceType='Radial', clustering=False)

In [None]:
computed = False

## 2a. Or : Really run analyses

In [None]:
# Shorten test cases and reference results lists, to go faster
# Warning: If you don't retain entire samples, later comparison will fail on Delta AIC values.
# NB: This cell overwrites dfAnlysCases and dfRefRes again : the selection applies
#     to whatever these tables hold when the cell is run.
#selCaseInds = [0, 5, 7, 22, 31] # Some random cases, with incomplete samples.
#selCaseInds = dfAnlysCases[dfAnlysCases.SampNum.isin([3, 4])].index # A shorter selection, with complete samples.
selCaseInds = range(len(dfAnlysCases)) # All of them.

# Number of cases before selection (for the log message below).
nOrigAnlysCases = len(dfAnlysCases)
# Same selection on both tables, to keep test cases and reference results aligned.
dfAnlysCases = dfAnlysCases.loc[selCaseInds]
dfRefRes = dfRefRes.loc[selCaseInds]

logger.info(f'Retained {len(selCaseInds)} out of {nOrigAnlysCases}.')

In [None]:
%%time

# Start running all analyses
# (1st loop: submit them all, without waiting ; 2nd loop: collect results as analyses get completed)
lastInFileName = None
analyses = dict()  # Submitted analyses, indexed by the object returned by their submit() call.
for _, sCase in dfAnlysCases.iterrows():
    
    nCase = sCase.AnlysNum
    # Analysis name = input file name without its common prefix and suffix ...
    name = sCase.InFileName[len('ACDC2019-Papyrus')+1:-len('-dist.txt')]
    # ... followed by the lower-cased model name, spaces and punctuation replaced by dashes.
    name += '-' + sCase.Model.lower().translate(str.maketrans({c:'-' for c in ' ,.:;()/'}))
    logger.info(f'#{nCase+1:3d} {name} {sCase.KeyFn} {sCase.AdjSer}')
    
    # Create data set if not already done.
    # (only reloaded when the input file changes from one case to the next)
    if lastInFileName != sCase.InFileName:
        sds = ads.SampleDataSet(pl.Path('refin', sCase.InFileName), decimalFields=decimalFields)
        lastInFileName = sCase.InFileName
        
    # Start running analysis in parallel (don't wait for it's finished, go on)
    # The results header (values of the first len(caseIdCols) columns of the case row) is attached
    # to the analysis object as custom data, and read back when collecting results,
    # as completion order is not submission order.
    sResHead = pd.Series(data=[sCase[col] for col in sCase.index[:len(caseIdCols)]], index=miCustCols)

    anlys = ads.MCDSAnalysis(engine=mcds, sampleDataSet=sds, name=name, customData=sResHead, logData=True,
                             estimKeyFn=sCase.KeyFn, estimAdjustFn=sCase.AdjSer,
                             estimCriterion=KEstimCriterion, cvInterval=KCVInterval,
                             minDist=sCase.LTrunc, maxDist=sCase.RTrunc,
                             #minDist=nan2None(sCase.LTrunc), maxDist=nan2None(sCase.RTrunc),
                             fitDistCuts=distCutsFromSpecs(sCase.FitDistCuts), # TODO: do this when building dfAnlysCases
                             discrDistCuts=distCutsFromSpecs(sCase.DiscrDistCuts))
    anlysFut = anlys.submit()
    
    # Store analysis object and associated "future" for later use (should be running soon or later).
    analyses[anlysFut] = anlys
    
logger.info('All analyses started ; now waiting for their end, and results ...')

# For each analysis as it gets completed (first completed => first yielded)
for anlysFut in parallelExecutor.asCompleted(analyses):

    # Retrieve analysis object from its associated future object
    anlys = analyses[anlysFut]
    
    # Get analysis results
    sResult = anlys.getResults()

    # Save results with header
    parResults.append(sResult, sCustomHead=anlys.customData)
    
# shutdown analysis engine
mcds.shutdown()

# Done.
# (flag checked by the "load from a previous run" cell, to skip reloading the results from disk)
computed = True

In [None]:
# Save results in case need for not recomputing them
resFileName = os.path.join(mcds.workDir, 'autods-validation-results.xlsx')

parResults.toExcel(resFileName, sheetName='AutoDSVal')

## 2b. Or : Load analyses from a previous run

(already run and saved above)

In [None]:
if not computed:
    
    resFileName = os.path.join(mcds.workDir, 'autods-validation-results.xlsx')
    print('Loading results from {} ...'.format(resFileName))

    parResults.fromExcel(resFileName, sheetName='AutoDSVal')
    
    # shutdown analysis engine
    mcds.shutdown()

else:
    
    print('Just computed, not reloading ...')
    
print(f'... {len(parResults)} analyses to compare')

## 3. Compare parallel results to sequential ones

In [None]:
# Prepare sequential results for comparison
dfSeqCmpRes = results.dfTransData('en')

dfSeqCmpRes.fillna(-9999, inplace=True) # Get rid of the Nan pb (because NaN != NaN :-)

# Start date-time and elapsed time and folder can never be the same
dfSeqCmpRes.drop(columns=['StartTime', 'ElapsedTime', 'RunFolder'], inplace=True)

In [None]:
# Prepare parallel results for comparison
dfParCmpRes = parResults.dfTransData('en')

dfParCmpRes.sort_values(by='AnlysNum', inplace=True) # Back to original test case order = sequential run order

dfParCmpRes.reset_index(inplace=True, drop=True) # Enforce same index as a consequence

dfParCmpRes.fillna(-9999, inplace=True) # And get rid of the Nan pb (because NaN != Nan :-)

# Start date-time and elapsed time and folder can never be the same
dfParCmpRes.drop(columns=['StartTime', 'ElapsedTime', 'RunFolder'], inplace=True)

In [None]:
# Warning: Doesn't work if 1 of the 2 sets (not both) was loaded from disk (Excel numerical rounding stuff)
assert (dfSeqCmpRes == dfParCmpRes).all().all(), \
       'Oh, oh, something went differently when run parallely ... but due to one results set loaded from disk ?'

In [None]:
dfSeqCmpRes.compare(dfParCmpRes)

# IV. Excel and HTML reports

In [None]:
# Run this if you want to report parallel results,
# or NOT if you want to report sequential results.
# Guarded, so that running this cell more than once does not overwrite
# the saved sequential results with the parallel ones.
if results is not parResults:
    seqResults = results
results = parResults

In [None]:
# Sélection des colonnes pour les tableaux de synthèse du rapport
synthRepCols = \
[
    ('sample', 'AnlysNum', 'Value'),
    ('sample', 'Species', 'Value'),
    ('sample', 'Periods', 'Value'),
    ('sample', 'Prec.', 'Value'),
    ('sample', 'Duration', 'Value'),
    ('sample', 'SampNum', 'Value'),
    ('sample stats', 'total number of observations', 'Value'),
    ('sample stats', 'maximal observation distance', 'Value'),
    
    ('model', 'Model', 'Value'),
    ('parameters', 'left truncation distance', 'Value'),
    ('parameters', 'right truncation distance', 'Value'),
    ('parameters', 'model fitting distance cut points', 'Value'),
    ('parameters', 'distance discretisation cut points', 'Value'),
    
    ('run output', 'run status', 'Value'),
    
    ('encounter rate', 'number of observations (n)', 'Value'),
    ('encounter rate', 'effort (L or K or T)', 'Value'),
    
    ('detection probability', 'Delta AIC', 'Value'),
    ('detection probability', 'AIC value', 'Value'),
    ('detection probability', 'chi-square test probability determined', 'Value'),
    ('detection probability', 'Kolmogorov-Smirnov test probability', 'Value'),
    ('density/abundance', 'density of animals', 'Delta Cv'),
    ('density/abundance', 'density of animals', 'Cv'),
    
    ('encounter rate', 'observation rate', 'Value'),
    ('combined quality', 'balanced 1', 'Value'),
    ('combined quality', 'balanced 2', 'Value'),
    ('combined quality', 'balanced 3', 'Value'),
    ('combined quality', 'more Chi2', 'Value'),
    ('combined quality', 'more KS', 'Value'),
    ('combined quality', 'more DCv', 'Value'),
    
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Value'),
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Lcl'),
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Ucl'),
    
    ('density/abundance', 'density of animals', 'Value'),
    ('density/abundance', 'density of animals', 'Lcl'),
    ('density/abundance', 'density of animals', 'Ucl'),
    
    ('detection probability', 'probability of detection (Pw)', 'Value'),
    ('detection probability', 'probability of detection (Pw)', 'Lcl'),
    ('detection probability', 'probability of detection (Pw)', 'Ucl'),
    ('detection probability', 'probability of detection (Pw)', 'Df'),

    ('run output', 'run folder', 'Value'),
]

In [None]:
# Select analysis results columns for the 3 textual columns of the synthesis pre-report
sampleRepCols = \
[
    ('sample', 'SampNum', 'Value'),
    ('sample', 'Species', 'Value'),
    ('sample', 'Periods', 'Value'),
    ('sample', 'Prec.', 'Value'),
    ('sample', 'Duration', 'Value'),
    ('sample', 'AnlysNum', 'Value'),
    ('sample stats', 'total number of observations', 'Value'),
    ('sample stats', 'maximal observation distance', 'Value'),
]

paramRepCols = \
[
    ('parameters', 'estimator key function', 'Value'),
    ('parameters', 'estimator adjustment series', 'Value'),
    ('parameters', 'left truncation distance', 'Value'),
    ('parameters', 'right truncation distance', 'Value'),
]
    
resultRepCols = \
[
    ('run output', 'run status', 'Value'),
    
    ('encounter rate', 'number of observations (n)', 'Value'),
    ('encounter rate', 'left truncation distance', 'Value'),
    ('encounter rate', 'right truncation distance (w)', 'Value'),
    ('encounter rate', 'effort (L or K or T)', 'Value'),
    
    ('detection probability', 'AIC value', 'Value'),
    ('detection probability', 'chi-square test probability determined', 'Value'),
    ('detection probability', 'Kolmogorov-Smirnov test probability', 'Value'),
    ('combined quality', 'balanced 1', 'Value'),
    ('combined quality', 'balanced 2', 'Value'),
    ('combined quality', 'balanced 3', 'Value'),
    ('detection probability', 'probability of detection (Pw)', 'Value'),
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Value'),

    ('density/abundance', 'density of animals', 'Cv'),
    ('density/abundance', 'density of animals', 'Value'),
    ('density/abundance', 'density of animals', 'Lcl'),
    ('density/abundance', 'density of animals', 'Ucl'),
]

In [None]:
sortRepCols = \
[('sample', 'SampNum', 'Value'), ('sample', 'AnlysNum', 'Value')] \
+ [('sample', col, 'Value') for col in sampleSelCols] \
+ [('parameters', 'left truncation distance', 'Value'),
   ('parameters', 'right truncation distance', 'Value'),
   ('detection probability', 'Delta AIC', 'Value')]
#   ('density/abundance', 'density of animals', 'Delta Cv')]

sortRepAscend = True

In [None]:
# Build the full report on the analysis results.
# The description says "parallel" or "sequential" depending on whether the engine work folder name ends with 'pout'.
report = ads.MCDSResultsFullReport(
    resultsSet=results,
    title='Validation AutoDS : Analyses automatiques',
    subTitle="Rapport d'analyse global",
    anlysSubTitle='Rapport détaillé',
    description="Résultats d'analyses exécutées "
                + ('en parallèle' if mcds.workDir.name.endswith('pout') else 'séquentiellement'),
    keywords='autods, validation',
    pySources=['valtests.ipynb'],
    lang='fr',
    sampleCols=sampleRepCols, paramCols=paramRepCols,
    resultCols=resultRepCols, synthCols=synthRepCols,
    sortCols=sortRepCols, sortAscend=sortRepAscend,
    superSynthPlotsHeight=288,
    # plotImgSize=(640, 400), plotLineWidth=1, plotDotWidth=4,
    # plotFontSizes=dict(title=11, axes=10, ticks=9, legend=10),
    tgtFolder=mcds.workDir,
    tgtPrefix='autods-validation-report',
)

In [None]:
# Generate the Excel version of the report (toExcel's return value is used as the link target below).
xlsxRep = report.toExcel()

# target="_blank" (with the leading underscore) is the HTML keyword for "open in a new tab / window";
# a plain "blank" only names a browsing context, that every such link would then re-use.
HTML(f'Rapport Excel : <a href="{xlsxRep}" target="_blank">{xlsxRep}</a>')

In [None]:
%%time

# Lenovo T490  (4-core i5-8350U with PCI-e SSD) 6 generators (2021-02-13) : 38s (n=3)
htmlRep = report.toHtml(generators=6)

HTML(f'Rapport HTML : <a href="{htmlRep}" target="blank">{htmlRep}</a>')

# V. Run and report pre-analyses (1/2 : long code, long duration)

(to help users set up the full analysis plan: first run simple trial analyses, then show the PDF and a few results)

On same input data as for I, II, III.

Note: Don't use this low-level method: MCDSPreAnalyser is here for that now.

Here we directly call MCDSPreAnalysis class.

## 1. Determine samples from input data

* in real life, we'd simply load field-collected data, and deduce the individual "samples" from it;
* but here, for testing, it's easier to deduce the samples from the manual analysis specification file.

In [None]:
# Create sample table from refout results table
refFileName = 'ACDC2019-Papyrus-ALAARV-TURMER-resultats-distance-73.xlsx'

sampleSelCols = ['Species', 'Periods', 'Prec.', 'Duration']
sampleIndCol = 'SampleNum'

# One row per distinct sample; the fresh 0-based index is then turned into the sample number column
# (needed later to get back to the original sample order).
# Note: only the sampleSelCols columns are read from the workbook, so there is no 'Name' column
# to rename to 'Model' here (that rename was a no-op).
dfSamples = (
    pd.read_excel(pl.Path('refout', refFileName), usecols=sampleSelCols)
      .drop_duplicates()
      .reset_index(drop=True)
      .reset_index()
      .rename(columns=dict(index=sampleIndCol))
)

# From now on, the sample number is part of the sample selection columns.
sampleSelCols = [sampleIndCol] + sampleSelCols

dfSamples

## 2. Prepare pre-analyses

In [None]:
# Field names passed as `decimalFields` to ads.SampleDataSet when loading the sample data files below.
decimalFields = ['Point transect*Survey effort', 'Observation*Radial distance']

In [None]:
# Analysis engine: no parallel executor at this level, because MCDSPreAnalysis takes care of this !
mcds = ads.MCDSEngine(
    workDir=pl.Path('tmp', 'mcds-preout'),
    surveyType='Point', distanceType='Radial', clustering=False,
    distanceUnit='Meter', areaUnit='Hectare',
)

In [None]:
# Results object construction

# a. Custom (sample) header columns, as 3-level tuples, and the matching multi-index
custCols = [('sample', col, 'Value') for col in sampleSelCols]
miCustCols = pd.MultiIndex.from_tuples(custCols)

# b. The custom column (tuple) that holds the sample number
sampMIndCol = next(mCol for mCol in custCols if mCol[1] == sampleIndCol)

# c. English and French names of the custom columns
dfCustColTrans = pd.DataFrame(
    data={'en': sampleSelCols, 'fr': ['NumEchant', 'Espèce', 'Périodes', 'Préc.', 'Durée']},
    index=miCustCols,
)

# d. The results set itself, with the same survey parameters as the engine
preResults = ads.MCDSPreAnalysisResultsSet(
    miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans, sampleIndCol=sampMIndCol,
    surveyType='Point', distanceType='Radial', clustering=False,
    distanceUnit='Meter', areaUnit='Hectare',
)

In [None]:
# Pre-analysis model strategy: one entry per key function, all with the COSINE adjustment series
# and the same estimator criterion and CV interval.
KPreEstimCrit = 'AIC'
KPreCVInterval = 95
KPreEstimModStrat = [
    {'keyFn': keyFn, 'adjSr': 'COSINE', 'estCrit': KPreEstimCrit, 'cvInt': KPreCVInterval}
    for keyFn in ('HNORMAL', 'HAZARD', 'UNIFORM', 'NEXPON')
]

In [None]:
# "Pre-analyses computed in this session" flag: set to True at the end of the run cell below,
# and checked by the loading cell to decide whether results must be reloaded from file.
computed = False

## 3. Or : Really run pre-analyses

In [None]:
%%time

# Run all analyses
lastInFileName = None
for _, sSamp in dfSamples.iterrows():
    
    nSamp = sSamp.SampleNum
    sampId = '{}-{}-{}mn-{}dec' \
             .format(sSamp.Species,
                     'AB' if 'A+B' in sSamp.Periods else 'A' if 'A' in sSamp.Periods else 'B',
                     sSamp.Duration.split(' ')[0], sSamp['Prec.'].split(' ')[0])
    logger.info(f'#{nSamp+1:3d} {sampId}')
    
    # Create data set if not already done.
    inFileName = 'ACDC2019-Papyrus-{}-dist.txt'.format(sampId)
    if lastInFileName != inFileName:
        sds = ads.SampleDataSet(pl.Path('refin', inFileName), decimalFields=decimalFields)
        lastInFileName = inFileName
        
    # Run analysis: Not parallel runs for now ... see below.
    preAnlys = ads.MCDSPreAnalysis(engine=mcds, sampleDataSet=sds, name=sampId, executor=None,
                                   logData=False, modelStrategy=KPreEstimModStrat)
    preAnlys.submit()
    
    # Get results (wait for it's finished)
    sResult = preAnlys.getResults()

    # Save results
    sResHead = sSamp.copy()
    sResHead.index = miCustCols
    preResults.append(sResult, sCustomHead=sResHead)
    
# shutdown analysis engine
mcds.shutdown()

# Done.
computed = True

In [None]:
# Overview of the main results columns (French column names).
preResults.dfTransData('fr')[[
    'NumEchant', 'Espèce', 'Périodes', 'Préc.', 'Durée',
    'Fn Clé', 'Sér Ajust', 'CodEx', 'NObs',
    'AIC', 'Chi2 P', 'KS P',
    'Densité', 'CoefVar Densité', 'Min Densité', 'Max Densité',
]]

In [None]:
# Save the results to an Excel workbook in the engine work folder,
# so that they can be reloaded later without recomputing them.
resFileName = os.path.join(mcds.workDir, 'autods-validation-preresults.xlsx')

preResults.toExcel(resFileName, sheetName='AutoDSVal')

## 3. Or : Load pre-analyses results from a previous run

(already run and saved above)

In [None]:
# Reload the pre-results saved by a previous run, unless they have just been computed above.
if computed:

    print('Just computed, not reloading ...')

else:

    resFileName = os.path.join(mcds.workDir, 'autods-validation-preresults.xlsx')
    print(f'Loading pre-results from {resFileName} ...')

    preResults.fromExcel(resFileName, sheetName='AutoDSVal')

print(f'... {len(preResults)} pre-analyses loaded')

In [None]:
# Overview of the main results columns (French column names).
preResults.dfTransData('fr')[[
    'NumEchant', 'Espèce', 'Périodes', 'Préc.', 'Durée',
    'Fn Clé', 'Sér Ajust', 'CodEx', 'NObs',
    'AIC', 'Chi2 P', 'KS P',
    'Densité', 'CoefVar Densité', 'Min Densité', 'Max Densité',
]]

## 4. HTML and Excel pre-analyses reports

In [None]:
# Run this if you want to report parallel results,
# or NOT if you want to report sequential results.
# Note: parPreResults is only built in section VI below, so this cell does nothing
# (instead of failing with a NameError) as long as that section has not been run.
if 'parPreResults' not in globals():
    print('No parallel pre-results available yet: still reporting the sequential ones.')
elif preResults is not parPreResults:
    # Keep a handle on the sequential results before switching
    # (skipped when already switched, so that re-running this cell does not lose them).
    seqPreResults = preResults
    preResults = parPreResults

In [None]:
# Columns selected for the synthesis tables of the pre-report, in display order
# (passed as `synthCols` when building the pre-report below).
synthPreRepCols = \
[
    # Sample identification and statistics
    ('sample', 'SampleNum', 'Value'),
    ('sample', 'Species', 'Value'),
    ('sample', 'Periods', 'Value'),
    ('sample', 'Prec.', 'Value'),
    ('sample', 'Duration', 'Value'),
    ('sample stats', 'total number of observations', 'Value'),
    ('sample stats', 'maximal observation distance', 'Value'),
    
    # Analysis parameters
    ('parameters', 'estimator key function', 'Value'),
    ('parameters', 'estimator adjustment series', 'Value'),
    ('parameters', 'model fitting distance cut points', 'Value'),
    ('parameters', 'distance discretisation cut points', 'Value'),
    
    ('run output', 'run status', 'Value'),
    
    ('encounter rate', 'number of observations (n)', 'Value'),
    ('encounter rate', 'right truncation distance (w)', 'Value'),
    ('encounter rate', 'effort (L or K or T)', 'Value'),
    
    # Fit criteria and combined quality indicators
    ('detection probability', 'AIC value', 'Value'),
    ('detection probability', 'chi-square test probability determined', 'Value'),
    ('detection probability', 'Kolmogorov-Smirnov test probability', 'Value'),
    ('density/abundance', 'density of animals', 'Cv'),
    ('encounter rate', 'observation rate', 'Value'),
    ('combined quality', 'balanced 1', 'Value'),
    ('combined quality', 'balanced 2', 'Value'),
    ('combined quality', 'balanced 3', 'Value'),
    ('combined quality', 'more Chi2', 'Value'),
    ('combined quality', 'more KS', 'Value'),
    ('combined quality', 'more DCv', 'Value'),
    
    
    # Estimates, each with its lower (Lcl) and upper (Ucl) limit columns
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Value'),
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Lcl'),
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Ucl'),
    
    ('density/abundance', 'density of animals', 'Value'),
    ('density/abundance', 'density of animals', 'Lcl'),
    ('density/abundance', 'density of animals', 'Ucl'),
    
    ('detection probability', 'probability of detection (Pw)', 'Value'),
    ('detection probability', 'probability of detection (Pw)', 'Lcl'),
    ('detection probability', 'probability of detection (Pw)', 'Ucl'),
    ('detection probability', 'probability of detection (Pw)', 'Df'),

    ('run output', 'run folder', 'Value'),
]

In [None]:
# Select analysis results columns for the 3 textual columns of the synthesis pre-report

# 1. Sample description (passed as `sampleCols` when building the pre-report below)
samplePreRepCols = \
[
    ('sample', 'SampleNum', 'Value'),
    ('sample', 'Species', 'Value'),
    ('sample', 'Periods', 'Value'),
    ('sample', 'Prec.', 'Value'),
    ('sample', 'Duration', 'Value'),
    ('sample stats', 'total number of observations', 'Value'),
    ('sample stats', 'maximal observation distance', 'Value'),
]

# 2. Analysis parameters (passed as `paramCols`)
paramPreRepCols = \
[
    ('parameters', 'estimator key function', 'Value'),
    ('parameters', 'estimator adjustment series', 'Value'),
    ('parameters', 'CV interval', 'Value')
]
    
# 3. Analysis results (passed as `resultCols`)
resultPreRepCols = \
[
    ('run output', 'run status', 'Value'),
    
    ('encounter rate', 'number of observations (n)', 'Value'),
    ('encounter rate', 'effort (L or K or T)', 'Value'),
    
    ('detection probability', 'AIC value', 'Value'),
    ('detection probability', 'chi-square test probability determined', 'Value'),
    ('detection probability', 'Kolmogorov-Smirnov test probability', 'Value'),
    ('detection probability', 'probability of detection (Pw)', 'Value'),
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Value'),

    ('encounter rate', 'observation rate', 'Value'),
    ('combined quality', 'balanced 1', 'Value'),
    ('combined quality', 'balanced 2', 'Value'),
    ('combined quality', 'balanced 3', 'Value'),
    ('combined quality', 'more Chi2', 'Value'),
    ('combined quality', 'more KS', 'Value'),
    ('combined quality', 'more DCv', 'Value'),
    
    ('density/abundance', 'density of animals', 'Cv'),
    ('density/abundance', 'density of animals', 'Value'),
    ('density/abundance', 'density of animals', 'Lcl'),
    ('density/abundance', 'density of animals', 'Ucl'),
]

In [None]:
# Pre-report sort order: by sample number, ascending
# (passed as `sortCols` and `sortAscend` when building the pre-report below).
sortPreRepCols = [('sample', 'SampleNum', 'Value')]

sortPreRepAscend = True

In [None]:
# Build the pre-report on the pre-analysis results.
# The description says "parallel" or "sequential" depending on whether the engine work folder name ends with 'pout'.
preReport = ads.MCDSResultsPreReport(
    resultsSet=preResults,
    title='Validation AutoDS : Pré-analyses',
    subTitle='Rapport de pré-analyse',
    anlysSubTitle='Détail des pré-analyses',
    description="Résultats de pré-analyses exécutées "
                + ('en parallèle' if mcds.workDir.name.endswith('pout') else 'séquentiellement'),
    keywords='autods, validation',
    lang='fr',
    superSynthPlotsHeight=288,
    # plotImgSize=(640, 400), plotLineWidth=1, plotDotWidth=4,
    # plotFontSizes=dict(title=11, axes=10, ticks=9, legend=10),
    sampleCols=samplePreRepCols, paramCols=paramPreRepCols,
    resultCols=resultPreRepCols, synthCols=synthPreRepCols,
    sortCols=sortPreRepCols, sortAscend=sortPreRepAscend,
    tgtFolder=mcds.workDir,
    tgtPrefix='autods-validation-prereport',
)

In [None]:
# Generate the Excel version of the pre-report (toExcel's return value is used as the link target below).
xlsxPreRep = preReport.toExcel()

# target="_blank" (with the leading underscore) is the HTML keyword for "open in a new tab / window";
# a plain "blank" only names a browsing context, that every such link would then re-use.
HTML(f'Rapport Excel : <a href="{xlsxPreRep}" target="_blank">{xlsxPreRep}</a>')

In [None]:
# Generate the HTML version of the pre-report (toHtml's return value is used as the link target below).
htmlPreRep = preReport.toHtml()

# target="_blank" (with the leading underscore) is the HTML keyword for "open in a new tab / window";
# a plain "blank" only names a browsing context, that every such link would then re-use.
HTML(f'Pré-rapport HTML : <a href="{htmlPreRep}" target="_blank">{htmlPreRep}</a>')

In [None]:
# Display the `specs` attribute of the pre-analysis results set
# (presumably the analysis specifications stored along with the results -- TODO confirm).
preResults.specs

# VI. Parallel run of same pre-analyses (2/2 : long code, short duration)

And compare results to sequential run's.

Note: Don't use this low-level method: MCDSPreAnalyser is here for that now.

Here we directly call MCDSPreAnalysis class.

## 1. Prepare analyses

In [None]:
# Analysis engine: no parallel executor at this level, because MCDSPreAnalysis takes care of this !
mcds = ads.MCDSEngine(
    workDir=pl.Path('tmp', 'mcds-prepout'),
    surveyType='Point', distanceType='Radial', clustering=False,
    distanceUnit='Meter', areaUnit='Hectare',
)

In [None]:
# Results set for the parallel run: same custom columns, translations and survey parameters
# as for the sequential run (miCustCols, dfCustColTrans and sampMIndCol come from section V).
parPreResults = ads.MCDSPreAnalysisResultsSet(
    miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans, sampleIndCol=sampMIndCol,
    surveyType='Point', distanceType='Radial', clustering=False,
    distanceUnit='Meter', areaUnit='Hectare',
)

In [None]:
# "Pre-analyses computed in this session" flag: set to True at the end of the parallel run cell below,
# and checked by the loading cell to decide whether results must be reloaded from file.
computed = False

## 2. Or : Really run pre-analyses

In [None]:
# Pre-analysis executor (kind of overkill here, with only 5 pre-analyses ... but still works twice as rapidly !).
# Created with 6 threads; it is passed to each MCDSPreAnalysis and shut down at the end of the run cell below.
parallelExecutor = ads.Executor(threads=6)

In [None]:
%%time

# Run all analyses
lastInFileName = None
parPreAnalyses = dict()
for _, sSamp in dfSamples.iterrows():
    
    nSamp = sSamp.SampleNum
    sampId = '{}-{}-{}mn-{}dec' \
             .format(sSamp.Species,
                     'AB' if 'A+B' in sSamp.Periods else 'A' if 'A' in sSamp.Periods else 'B',
                     sSamp.Duration.split(' ')[0], sSamp['Prec.'].split(' ')[0])
    logger.info(f'#{nSamp+1:3d} {sampId}')
    
    # Create data set if not already done.
    inFileName = 'ACDC2019-Papyrus-{}-dist.txt'.format(sampId)
    if lastInFileName != inFileName:
        sds = ads.SampleDataSet(pl.Path('refin', inFileName), decimalFields=decimalFields)
        lastInFileName = inFileName
        
    # Start running analysis (but don't wait for it's finished)
    sResHead = sSamp.copy()
    sResHead.index = miCustCols
    
    preAnlys = ads.MCDSPreAnalysis(engine=mcds, sampleDataSet=sds, name=sampId,
                                   customData=sResHead, executor=parallelExecutor,
                                   logData=False, modelStrategy=KPreEstimModStrat)
    preAnlysFut = preAnlys.submit()
    
    # Store analysis object and associated "future" for later use (should be running soon or later).
    parPreAnalyses[preAnlysFut] = preAnlys
    
logger.info('All pre-analyses started ; now waiting for their end, and results ...')

# For each analysis as it gets completed (first completed => first yielded)
for preAnlysFut in parallelExecutor.asCompleted(parPreAnalyses):

    # Retrieve pre-analysis object from its associated future object
    preAnlys = parPreAnalyses[preAnlysFut]
    
    # Get pre-analysis results
    sResult = preAnlys.getResults()

    # Save results with header
    parPreResults.append(sResult, sCustomHead=preAnlys.customData)
    
# shutdown executor
parallelExecutor.shutdown()

# shutdown analysis engine
mcds.shutdown()

# Done.
computed = True

In [None]:
# Overview of the main results columns (French column names).
parPreResults.dfTransData('fr')[[
    'NumEchant', 'Espèce', 'Périodes', 'Préc.', 'Durée',
    'Fn Clé', 'Sér Ajust', 'CodEx', 'NObs',
    'AIC', 'Chi2 P', 'KS P',
    'Densité', 'CoefVar Densité', 'Min Densité', 'Max Densité',
]]

In [None]:
# Save the results to an Excel workbook in the engine work folder,
# so that they can be reloaded later without recomputing them.
resFileName = os.path.join(mcds.workDir, 'autods-validation-preresults-par.xlsx')

parPreResults.toExcel(resFileName, sheetName='AutoDSVal')

## 2. Or : Load analyses from a previous run

(already run and saved above)

In [None]:
# Reload the parallel pre-results saved by a previous run, unless they have just been computed above.
if computed:

    print('Just computed, not reloading ...')

else:

    resFileName = os.path.join(mcds.workDir, 'autods-validation-preresults-par.xlsx')
    print(f'Loading pre-results from {resFileName} ...')

    parPreResults.fromExcel(resFileName, sheetName='AutoDSVal')

print(f'... {len(parPreResults)} pre-analyses loaded')

In [None]:
# Overview of the main results columns (French column names).
parPreResults.dfTransData('fr')[[
    'NumEchant', 'Espèce', 'Périodes', 'Préc.', 'Durée',
    'Fn Clé', 'Sér Ajust', 'CodEx', 'NObs',
    'AIC', 'Chi2 P', 'KS P',
    'Densité', 'CoefVar Densité', 'Min Densité', 'Max Densité',
]]

## 3. Compare parallel results to sequential ones

In [None]:
# Prepare sequential results for comparison:
# * NaNs replaced by a sentinel value (because NaN != NaN :-),
# * start date-time, elapsed time and run folder dropped, as they can never be the same.
# NOTE(review): `preResults` must still be the sequential results set here, i.e. the optional
# "report parallel results" swap cell of section V.4 must not have been run -- confirm.
dfSeqCmpRes = (
    preResults.dfTransData('en')
    .fillna(-9999)
    .drop(columns=['StartTime', 'ElapsedTime', 'RunFolder'])
)

In [None]:
# Prepare parallel results for comparison:
# * back to the original test case order = sequential run order, with the same index as a consequence,
# * NaNs replaced by a sentinel value (because NaN != NaN :-),
# * start date-time, elapsed time and run folder dropped, as they can never be the same.
dfParCmpRes = (
    parPreResults.dfTransData('en')
    .sort_values(by='SampleNum')
    .reset_index(drop=True)
    .fillna(-9999)
    .drop(columns=['StartTime', 'ElapsedTime', 'RunFolder'])
)

In [None]:
# Every cell of the sequential and parallel comparison tables must be equal.
assert (dfSeqCmpRes == dfParCmpRes).all().all(), 'Oh, oh, something went differently when run parallely ...'

## 4. Build HTML and Excel pre-analyses reports

See [4. HTML and Excel pre-analyses reports](#4.-HTML-and-Excel-pre-analyses-reports)

# VII. Truncation optimisation (short code and fast run)

Thanks to the MCDSZerothOrderTruncationOptimiser class.

Note: Don't use this low-level method: the MCDSTruncationOptanalyser class is now here for that (far easier, and shorter code).

In [8]:
# Debug toggle (disabled): change to True to set the 'ads.opr' and 'ads.dat' loggers to DEBUG level.
if False:
    ads.logger('ads.opr', level=ads.DEBUG, reset=True)
    ads.logger('ads.dat', level=ads.DEBUG, reset=True)

In [9]:
def optimAbbrev(sAnlys):

    """Build the abbreviation of an optimisation, from its sample and model specs.

    Parameters:
    :param sAnlys: specs of the optimisation, with 'Espèce', 'Passage', 'Adulte', 'Durée',
                   'FonctionClé' and 'SérieAjust' items ('Passage' and 'Adulte' also read as attributes)

    :return: dash-separated string, e.g. 'SylvAtri-ab-m-5mn-hno-cos'
    """

    # Species: first 4 letters of each of the first 2 words, title-cased.
    species = ''.join(part[:4].title() for part in sAnlys['Espèce'].split(' ')[:2])

    # Sample fields, '+' signs removed from pass and adult ones.
    sampFields = map(str, (species, sAnlys.Passage.replace('+', ''),
                           sAnlys.Adulte.replace('+', ''), sAnlys['Durée']))

    # Model fields: first 3 letters, lower-cased, of the key function and adjustment series.
    modelFields = (sAnlys[key][:3].lower() for key in ('FonctionClé', 'SérieAjust'))

    return '-'.join([*sampFields, *modelFields])

## 0. Optimiser parameters

In [10]:
# Source / Results data: column names

# a. Transects: pass identifier, survey effort and place columns
passIdCol = 'Passage'
effortCol = 'Effort'
transectPlaceCols = ['Point']

# b. Samples: distance and decimal columns, then number, selection and abbreviation columns
sampleDistCol = 'Distance'
sampleDecCols = [effortCol, sampleDistCol]

sampleNumCol = 'NumEchant'
sampleSelCols = ['Espèce', passIdCol, 'Adulte', 'Durée']
sampleAbbrevCol = 'AbrevEchant'

# c. Optimisations: index and abbreviation columns
optIndCol, optAbbrevCol = 'IndOptim', 'AbrevOptim'

# d. Survey area description (the surface is given as a string)
dSurveyArea = {'Zone': 'ACDC', 'Surface': '2400'}

In [11]:
# General DS analysis parameters (passed to the optimiser constructor below).
distanceUnit = 'Meter'
areaUnit = 'Hectare'
surveyType = 'Point'
distanceType = 'Radial'
clustering = False

In [12]:
# Default optimisation params.

# a. Default analysis parameters: estimator, CV interval, truncation distances and distance cuts.
defEstimKeyFn = 'HNORMAL'
defEstimAdjustFn = 'COSINE'
defEstimCriterion = 'AIC'
defCVInterval = 95
defMinDist = None
defMaxDist = None  # No trailing comma here: `None, ` would make it the 1-item tuple (None,)
defFitDistCuts = None
defDiscrDistCuts = None

# Default optimisation target and outliers / distance cuts settings (passed to the optimiser constructor below).
defExpr2Optimise = 'chi2'
defMinimiseExpr = False
defOutliersMethod = 'tucquant'
defOutliersQuantCutPct = 7
defFitDistCutsFctr = ads.Interval(min=0.6, max=1.4)
defDiscrDistCutsFctr = ads.Interval(min=0.5, max=1.2)

# Default submission settings.
defSubmitTimes = 1
defSubmitOnlyBest = None

# Default settings of the optimisation core.
# NOTE(review): defCoreEngine is not passed to the optimiser constructor call below,
# unlike the other defCore* values -- confirm whether this is intended.
defCoreEngine = 'zoopt'
defCoreMaxIters = 100
defCoreTermExprValue = None
defCoreAlgorithm = 'racos'
defCoreMaxRetries = 0

## 1. Individuals data set

In [13]:
# Load the individual observations from the 'DonnéesIndiv' sheet of the input workbook, and show them.
dfObsIndiv = ads.DataSet('refin/ACDC2019-Naturalist-ExtraitObsIndiv.ods', sheet='DonnéesIndiv').dfData
dfObsIndiv

2021-09-02 08:34:11,344 ads.dat INFO0	Loaded 1543 total rows in data set ...
2021-09-02 08:34:11,345 ads.dat INFO0	... found columns: [Observateur|Point|Passage|DateHeure|Espèce|Distance|Adulte|Durée]


Unnamed: 0,Observateur,Point,Passage,DateHeure,Espèce,Distance,Adulte,Durée
0,A,146,a,2019-05-02 08:00:00,Turdus merula,43.418829,m,10mn
1,A,146,a,2019-05-02 08:00:00,Turdus merula,43.418829,m,10mn
2,A,146,a,2019-05-02 08:00:00,Turdus merula,43.418829,m,5mn
3,A,146,a,2019-05-02 08:00:00,Turdus merula,43.418829,m,5mn
4,A,146,a,2019-05-02 08:01:00,Luscinia megarhynchos,76.630008,m,10mn
...,...,...,...,...,...,...,...,...
1538,H,216,b,2019-05-25 10:17:00,Turdus merula,278.261431,m,10mn
1539,H,216,b,2019-05-25 10:17:00,Turdus merula,278.261431,m,5mn
1540,H,216,b,2019-05-25 10:23:00,Turdus merula,110.957560,m,10mn
1541,H,216,b,2019-05-25 10:15:00,Sylvia atricapilla,66.591277,m,10mn


In [14]:
# Quick check of the data: the distinct values found in each of these columns.
{ col: dfObsIndiv[col].unique() for col in ['Observateur', 'Point', 'Passage', 'Adulte', 'Durée', 'Espèce'] }

{'Observateur': array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'], dtype=object),
 'Point': array([146, 162, 129, 113, 130, 147, 301, 300, 299, 280, 262, 281, 263,
        282, 196, 198, 194, 197, 202, 218, 201, 199, 219, 200,  90,  91,
         88, 105, 106, 122,  89, 141, 161, 123, 125, 142, 212, 145, 213,
        229, 144, 143, 110, 112, 126, 128, 127, 109, 166, 185, 183, 184,
        182, 165, 164, 148, 163, 159, 158, 157, 174, 192, 175, 176, 193,
         56,  57,  58,  59,  41,  60,  40,  39,  23,  42, 210, 228, 246,
        245, 284, 265, 247, 266, 177, 160, 178, 180, 179, 181, 215, 216,
        250, 233, 232, 195, 211], dtype=int64),
 'Passage': array(['a', 'b'], dtype=object),
 'Adulte': array(['m'], dtype=object),
 'Durée': array(['10mn', '5mn'], dtype=object),
 'Espèce': array(['Turdus merula', 'Luscinia megarhynchos', 'Sylvia atricapilla'],
       dtype=object)}

## 2. Actual transects

(can't deduce them from data, some points are missing because of data selection)

In [15]:
# Load the actual transects from the 'Inventaires' sheet of the same input workbook, and show them.
dfTransects = ads.DataSet('refin/ACDC2019-Naturalist-ExtraitObsIndiv.ods', sheet='Inventaires').dfData
dfTransects

2021-09-02 08:34:12,699 ads.dat INFO0	Loaded 190 total rows in data set ...
2021-09-02 08:34:12,700 ads.dat INFO0	... found columns: [Point|Observateur|Date|Passage|Effort]


Unnamed: 0,Point,Observateur,Date,Passage,Effort
0,23,G,2019-04-13,a,1
1,23,G,2019-06-01,b,1
2,39,G,2019-04-13,a,1
3,39,G,2019-06-01,b,1
4,40,G,2019-04-13,a,1
...,...,...,...,...,...
185,299,B,2019-06-08,b,1
186,300,B,2019-05-01,a,1
187,300,B,2019-06-08,b,1
188,301,B,2019-05-01,a,1


## 3. Samples and analyses to optimise

In [16]:
# "Optimisations computed in this session" flag: set to True by the run cell below once done.
computed = False

In [17]:
# Work folder passed to the optimiser constructor below.
workDir = pl.Path('tmp') / 'mcds-optr'

### a. For testing all optimisation parameters

In [24]:
# Tag of this variant ("all optimisation parameters").
# NOTE(review): not used in the nearby cells; presumably a suffix for output names further down -- confirm.
varOpt = '-all'

In [19]:
# Load the explicit analysis variant specs from the specs workbook
# (ignore=['Params3_expl'] is passed as is to explicitVariantSpecs).
dfRawOptimExplSpecs = ads.Analyser.explicitVariantSpecs(
    'refin/ACDC2019-Naturalist-ExtraitSpecsAnalyses.xlsx', ignore=['Params3_expl'])

# No use of these cols, as we'll compute them !
dfRawOptimExplSpecs = dfRawOptimExplSpecs.drop(columns=['TrGche', 'TrDrte', 'NbTrchMod'])

# Without them, some specs are now duplicates: keep only the first of each, and renumber rows.
dfRawOptimExplSpecs = dfRawOptimExplSpecs.drop_duplicates().reset_index(drop=True)

nOptimExplSpecs = len(dfRawOptimExplSpecs)

In [20]:
# Add optim. params
dfMoreOptimCols = pd.DataFrame([dict(CritChx='AIC', IntervConf=95,
                                     TrGche='auto', TrDrte='auto', MethOutliers='tucquant(2.5)',
                                     NbTrchMod='mult(2/3, 3/2)', NbTrDiscr=None,
                                     #TroncGche='auto', TroncDrte='auto', MethOutliers='tucquant(2.5)',
                                     #NbTrModel='mult(2/3, 3/2)', NbTrDiscr=None,
                                     ExprOpt='max(chi2)', MoteurOpt='zoopt(160)')]*len(dfRawOptimExplSpecs))

dfRawOptimExplSpecs = pd.concat([dfRawOptimExplSpecs, dfMoreOptimCols], axis='columns')
dfRawOptimExplSpecs

Unnamed: 0,Espèce,Passage,Adulte,Durée,FonctionClé,SérieAjust,CritChx,IntervConf,TrGche,TrDrte,MethOutliers,NbTrchMod,NbTrDiscr,ExprOpt,MoteurOpt
0,Sylvia atricapilla,a+b,m,5mn,HNORMAL,COSINE,AIC,95,auto,auto,tucquant(2.5),"mult(2/3, 3/2)",,max(chi2),zoopt(160)
1,Sylvia atricapilla,a+b,m,5mn,HAZARD,COSINE,AIC,95,auto,auto,tucquant(2.5),"mult(2/3, 3/2)",,max(chi2),zoopt(160)
2,Sylvia atricapilla,a+b,m,10mn,HNORMAL,COSINE,AIC,95,auto,auto,tucquant(2.5),"mult(2/3, 3/2)",,max(chi2),zoopt(160)
3,Sylvia atricapilla,a+b,m,10mn,HAZARD,COSINE,AIC,95,auto,auto,tucquant(2.5),"mult(2/3, 3/2)",,max(chi2),zoopt(160)
4,Turdus merula,a+b,m,5mn,HNORMAL,COSINE,AIC,95,auto,auto,tucquant(2.5),"mult(2/3, 3/2)",,max(chi2),zoopt(160)
5,Turdus merula,a+b,m,5mn,HAZARD,COSINE,AIC,95,auto,auto,tucquant(2.5),"mult(2/3, 3/2)",,max(chi2),zoopt(160)
6,Turdus merula,a+b,m,10mn,HNORMAL,COSINE,AIC,95,auto,auto,tucquant(2.5),"mult(2/3, 3/2)",,max(chi2),zoopt(160)
7,Turdus merula,a+b,m,10mn,HAZARD,COSINE,AIC,95,auto,auto,tucquant(2.5),"mult(2/3, 3/2)",,max(chi2),zoopt(160)
8,Luscinia megarhynchos,b,m,5mn,HNORMAL,COSINE,AIC,95,auto,auto,tucquant(2.5),"mult(2/3, 3/2)",,max(chi2),zoopt(160)
9,Luscinia megarhynchos,b,m,5mn,HAZARD,COSINE,AIC,95,auto,auto,tucquant(2.5),"mult(2/3, 3/2)",,max(chi2),zoopt(160)


In [21]:
# Number of optimised analyses for this variant: same as the number of explicit specs
# (presumably the expected number of results, for later checks -- TODO confirm).
nOptimedAnlyses = nOptimExplSpecs

In [22]:
# Columns of dfRawOptimExplSpecs that give the analysis / optimisation parameters
# (compared below to the user parameter columns returned by explicitParamSpecs).
optimParamSpecCols  = ['FonctionClé', 'SérieAjust', 'CritChx', 'IntervConf',
                       'TrGche', 'TrDrte', 'MethOutliers', 'NbTrchMod', 'NbTrDiscr',
                       #'TroncGche', 'TroncDrte', 'MethOutliers', 'NbTrModel', 'NbTrDiscr',
                       'ExprOpt', 'MoteurOpt']

# And their internal-name version (compared below to the internal columns returned by explicitParamSpecs).
intOptimParamSpecCols = ['EstimKeyFn', 'EstimAdjustFn', 'EstimCriterion', 'CvInterval',
                          'MinDist', 'MaxDist', 'OutliersMethod', 'FitDistCuts', 'DiscrDistCuts',
                          'Expr2Optimise', 'OptimisationCore']

### b. Or: Only main optimisation parameters

* for comparison with X below,
* for comparing results goodness with various optimisation parameters, in XI below.

In [None]:
# Tag of this variant ("only main optimisation parameters").
# NOTE(review): not used in the nearby cells; presumably a suffix for output names further down -- confirm.
varOpt = '-main'

In [None]:
# Load the explicit opt-analysis variant specs from the specs workbook
# (ignore=['Params1_expl', 'Params2_expl'] is passed as is to explicitVariantSpecs).
dfRawOptimExplSpecs = ads.Analyser.explicitVariantSpecs(
    'refin/ACDC2019-Naturalist-ExtraitSpecsOptanalyses.xlsx', ignore=['Params1_expl', 'Params2_expl'])

# Drop the rows where all 4 truncation / multi-optimisation columns are null.
dfRawOptimExplSpecs = dfRawOptimExplSpecs.drop(
    index=dfRawOptimExplSpecs[
        dfRawOptimExplSpecs[['TrGche', 'TrDrte', 'NbTrchMod', 'MultiOpt']].isnull().all(axis='columns')
    ].index
)

dfRawOptimExplSpecs

In [None]:
# Number of distinct explicit specs (compared below to the number of specs kept by explicitParamSpecs).
nOptimExplSpecs = len(dfRawOptimExplSpecs) - dfRawOptimExplSpecs.duplicated().sum()  # Duplicates will be removed

# Hard-coded number of optimised analyses for this variant, to be kept in sync with the specs workbook.
nOptimedAnlyses = 22  # See MultiOpt col

In [None]:
# Columns of dfRawOptimExplSpecs that give the analysis / optimisation parameters
# (compared below to the user parameter columns returned by explicitParamSpecs).
optimParamSpecCols  = ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod', 'MultiOpt']

# And their internal-name version (compared below to the internal columns returned by explicitParamSpecs).
intOptimParamSpecCols = ['EstimKeyFn', 'EstimAdjustFn', 'MinDist', 'MaxDist', 'FitDistCuts', 'SubmitParams']

## 4A. Or : Really run optimisations

### a. MCDS Zeroth Order Truncation Optimiser object

In [25]:
# Build the truncation optimiser from:
# * the individuals data set, the transects and the survey area description,
# * the column names and the abbreviation builder defined above,
# * the general DS analysis parameters,
# * the work folder and run settings (runMethod='os.system', no time-out, progress log and backup periods),
# * the default analysis / optimisation parameters defined above.
# NOTE(review): defMinDist, defMaxDist, defFitDistCuts, defDiscrDistCuts and defCoreEngine, also defined above,
# are not passed here -- confirm whether this is intended.
zoptr = ads.MCDSZerothOrderTruncationOptimiser \
                (dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea, 
                 transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                 sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols, sampleDistCol=sampleDistCol,
                 abbrevCol=optAbbrevCol, abbrevBuilder=optimAbbrev,
                 anlysIndCol=optIndCol, sampleIndCol=sampleNumCol,
                 distanceUnit=distanceUnit, areaUnit=areaUnit,
                 surveyType=surveyType, distanceType=distanceType, clustering=clustering,
                 resultsHeadCols=dict(before=[optIndCol], sample=sampleSelCols, after=optimParamSpecCols),
                 workDir=workDir, runMethod='os.system', runTimeOut=None, logProgressEvery=1, backupEvery=5,
                 defEstimKeyFn=defEstimKeyFn, defEstimAdjustFn=defEstimAdjustFn,
                 defEstimCriterion=defEstimCriterion, defCVInterval=defCVInterval,
                 defExpr2Optimise=defExpr2Optimise, defMinimiseExpr=defMinimiseExpr,
                 defOutliersMethod=defOutliersMethod, defOutliersQuantCutPct=defOutliersQuantCutPct,
                 defFitDistCutsFctr=defFitDistCutsFctr, defDiscrDistCutsFctr=defDiscrDistCutsFctr,
                 defSubmitTimes=defSubmitTimes, defSubmitOnlyBest=defSubmitOnlyBest,
                 defCoreMaxIters=defCoreMaxIters, defCoreTermExprValue=defCoreTermExprValue,
                 defCoreAlgorithm=defCoreAlgorithm, defCoreMaxRetries=defCoreMaxRetries)

2021-09-02 08:35:05,919 ads.dat INFO0	Loaded 1543 total rows in data set ...
2021-09-02 08:35:05,921 ads.dat INFO0	... found columns: [Observateur|Point|Passage|DateHeure|Espèce|Distance|Adulte|Durée]
2021-09-02 08:35:05,922 ads.dat INFO0	Individuals data : 1543 sightings, 190 transects


### b. Run optimisations

In [26]:
# Have the optimiser make the specs explicit, dropping duplicates and checking them (dropDupes / check options).
dfOptimExplSpecs, userParamSpecCols, intParamSpecCols, unmUserParamSpecCols, checkVerdict, checkErrors = \
    zoptr.explicitParamSpecs(dfExplParamSpecs=dfRawOptimExplSpecs, dropDupes=True, check=True)

# Show what was obtained, next to what is expected.
print(dict(checkVerdict=checkVerdict, checkErrors=checkErrors, 
           nActualOptimExplSpecs=len(dfOptimExplSpecs), nExpectedOptimExplSpecs=nOptimExplSpecs,
           optimParamSpecCols=optimParamSpecCols, intOptimParamSpecCols=intOptimParamSpecCols,
           unmUserParamSpecCols=unmUserParamSpecCols))

# Check: expected number of specs, expected user and internal parameter columns,
# no unmatched user column (as the variable name suggests), positive check verdict and no check error.
assert len(dfOptimExplSpecs) == nOptimExplSpecs
assert userParamSpecCols == optimParamSpecCols
assert intParamSpecCols == intOptimParamSpecCols
assert unmUserParamSpecCols == []
assert checkVerdict
assert not checkErrors

2021-09-02 08:35:07,598 ads.anr INFO0	Dropped 0 last duplicate specs of 12, on [Espèce, Passage, Adulte, Durée, FonctionClé, SérieAjust, CritChx, IntervConf, TrGche, TrDrte, MethOutliers, NbTrchMod, NbTrDiscr, ExprOpt, MoteurOpt] columns
{'checkVerdict': True, 'checkErrors': [], 'nActualOptimExplSpecs': 12, 'nExpectedOptimExplSpecs': 12, 'optimParamSpecCols': ['FonctionClé', 'SérieAjust', 'CritChx', 'IntervConf', 'TrGche', 'TrDrte', 'MethOutliers', 'NbTrchMod', 'NbTrDiscr', 'ExprOpt', 'MoteurOpt'], 'intOptimParamSpecCols': ['EstimKeyFn', 'EstimAdjustFn', 'EstimCriterion', 'CvInterval', 'MinDist', 'MaxDist', 'OutliersMethod', 'FitDistCuts', 'DiscrDistCuts', 'Expr2Optimise', 'OptimisationCore'], 'unmUserParamSpecCols': []}


In [27]:
# Show the explicit specs returned by explicitParamSpecs above.
dfOptimExplSpecs

Unnamed: 0,Espèce,Passage,Adulte,Durée,FonctionClé,SérieAjust,CritChx,IntervConf,TrGche,TrDrte,MethOutliers,NbTrchMod,NbTrDiscr,ExprOpt,MoteurOpt,AbrevOptim,IndOptim,NumEchant
0,Sylvia atricapilla,a+b,m,5mn,HNORMAL,COSINE,AIC,95,auto,auto,tucquant(2.5),"mult(2/3, 3/2)",,max(chi2),zoopt(160),SylvAtri-ab-m-5mn-hno-cos,0,0
1,Sylvia atricapilla,a+b,m,5mn,HAZARD,COSINE,AIC,95,auto,auto,tucquant(2.5),"mult(2/3, 3/2)",,max(chi2),zoopt(160),SylvAtri-ab-m-5mn-haz-cos,1,0
2,Sylvia atricapilla,a+b,m,10mn,HNORMAL,COSINE,AIC,95,auto,auto,tucquant(2.5),"mult(2/3, 3/2)",,max(chi2),zoopt(160),SylvAtri-ab-m-10mn-hno-cos,2,1
3,Sylvia atricapilla,a+b,m,10mn,HAZARD,COSINE,AIC,95,auto,auto,tucquant(2.5),"mult(2/3, 3/2)",,max(chi2),zoopt(160),SylvAtri-ab-m-10mn-haz-cos,3,1
4,Turdus merula,a+b,m,5mn,HNORMAL,COSINE,AIC,95,auto,auto,tucquant(2.5),"mult(2/3, 3/2)",,max(chi2),zoopt(160),TurdMeru-ab-m-5mn-hno-cos,4,2
5,Turdus merula,a+b,m,5mn,HAZARD,COSINE,AIC,95,auto,auto,tucquant(2.5),"mult(2/3, 3/2)",,max(chi2),zoopt(160),TurdMeru-ab-m-5mn-haz-cos,5,2
6,Turdus merula,a+b,m,10mn,HNORMAL,COSINE,AIC,95,auto,auto,tucquant(2.5),"mult(2/3, 3/2)",,max(chi2),zoopt(160),TurdMeru-ab-m-10mn-hno-cos,6,3
7,Turdus merula,a+b,m,10mn,HAZARD,COSINE,AIC,95,auto,auto,tucquant(2.5),"mult(2/3, 3/2)",,max(chi2),zoopt(160),TurdMeru-ab-m-10mn-haz-cos,7,3
8,Luscinia megarhynchos,b,m,5mn,HNORMAL,COSINE,AIC,95,auto,auto,tucquant(2.5),"mult(2/3, 3/2)",,max(chi2),zoopt(160),LuscMega-b-m-5mn-hno-cos,8,4
9,Luscinia megarhynchos,b,m,5mn,HAZARD,COSINE,AIC,95,auto,auto,tucquant(2.5),"mult(2/3, 3/2)",,max(chi2),zoopt(160),LuscMega-b-m-5mn-haz-cos,9,4


In [28]:
%%time

# Run the optimisations for all the explicit specs, with threads=12.
results = zoptr.run(dfOptimExplSpecs, threads=12)

# Done: results are now available in memory.
computed = True

2021-09-02 08:35:10,105 ads.eng INFO0	DSEngine work folder: C:\git\perso\autods\tmp\mcds-optr
2021-09-02 08:35:10,109 ads.anr INFO0	Dropped 0 last duplicate specs of 12, on [Espèce, Passage, Adulte, Durée, FonctionClé, SérieAjust, CritChx, IntervConf, TrGche, TrDrte, MethOutliers, NbTrchMod, NbTrDiscr, ExprOpt, MoteurOpt] columns
2021-09-02 08:35:10,112 ads.opr INFO0	Running MCDS truncation optimisations for 12 analyses specs (12 parallel threads) ...
2021-09-02 08:35:10,114 ads.opr INFO0	#1/12: SylvAtri-ab-m-5mn-hno-cos (Id 0)
2021-09-02 08:35:10,133 ads.dat INFO0	Loaded 276 total rows in data set ...
2021-09-02 08:35:10,134 ads.dat INFO0	... found columns: [Observateur|Point|Passage|DateHeure|Espèce|Distance|Adulte|Durée|Effort|Zone|Surface]
2021-09-02 08:35:10,136 ads.dat INFO0	Sample data : 276 sightings = 261 individuals + 15 absence rows
2021-09-02 08:35:10,140 ads.opn INFO0	ZOTrOptimisation({'minDist': Parameter(name='MinDist', interval=[10.843323181859, 22.12912149316365], cont

[zoopt] expected remaining running time: 00:02:38
[zoopt] expected remaining running time: 00:03:08
[zoopt] expected remaining running time: 00:03:18
[zoopt] expected remaining running time: 00:02:47
[zoopt] expected remaining running time: 00:03:10
[zoopt] expected remaining running time: 00:03:24
[zoopt] expected remaining running time: 00:03:32
[zoopt] expected remaining running time: 00:03:44
[zoopt] expected remaining running time: 00:07:31
[zoopt] x: [40.42035415622268, 984.3765547187332, 5]
[zoopt] value: -0.8066809
[zoopt] x: [17.291720088212642, 479.38829225525956, 19]
[zoopt] value: -0.3229441
[zoopt] x: [7.160333494003399, 455.4583465747577, 15]
[zoopt] value: -0.7893247
[zoopt] x: [41.24782286823763, 632.1681011761734, 11]
[zoopt] value: -0.861603
[zoopt] x: [21.44846976885328, 311.3066557655076, 14]
[zoopt] value: -0.4094353
2021-09-02 08:38:15,393 ads.dat INFO0	5x34 results rows x columns and 2 specs saved to tmp\mcds-optr\optr-resbak-1.pickle.xz in 0.028s
[zoopt] x: [11.

**Target:**
* Computer: Lenovo T490 Core i5 8365U 4 HT Cores, Ruindows 10
* runMethod: os.system

**Variant 3a: "all"**

* 22 analyses specs (12 parallel threads)
* Paramètres dans refin/ACDC2019-Naturalist-ExtraitSpecsOptanalyses.xlsx nettoyé de Param1_expl et Param2_expl.
* MoteurOpt='zoopt(120)' => 2020-08-21 22:41:56,626 2880 analyses => 22 results, Wall time: 4min 21s
* MoteurOpt='zoopt(160)' => 2021-08-22 1920 analyses => 12 results, Wall time: 2min 44s

**Variant 3b: "main"**

* 12 analyses specs (6 parallel threads)
* CritChx='AIC', IntervConf=95, TroncGche='auto', TroncDrte='auto', MethOutliers='tucquant(2.5)', NbTrModel='mult(2/3, 3/2)', NbTrDiscr=None, ExprOpt='max(chi2)'
* MoteurOpt='zoopt(160)' => 2020-06-29 21:18:04,727 Wall time: 4min 31s
* MoteurOpt='zoopt(250, tv=0.6)' => 2020-06-28 19:23:38,868 Wall time: 9min 19s

**Variant 3b: "main"**

* 12 analyses specs (12 parallel threads)
* CritChx='AIC', IntervConf=95, TroncGche='auto', TroncDrte='auto', MethOutliers='tucquant(2.5)', NbTrModel='mult(2/3, 3/2)', NbTrDiscr=None, ExprOpt='max(chi2)'
* MoteurOpt='zoopt(160)' => 2020-07-18 15:22:47,289 Wall time: 3min 51s

**Target:**
* Computer: Lenovo T490 Core i5 8365U 4 HT Cores, Ruindows 10
* runMethod: subprocess.run

**Variant 3a: "all"**

* 12 analyses specs (12 parallel threads)
* Paramètres dans refin/ACDC2019-Naturalist-ExtraitSpecsOptanalyses.xlsx nettoyé de Param1_expl et Param2_expl.
* default: CritChx='AIC', IntervConf=95, MethOutliers='tucquant(7)', NbTrModel='mult(0.6, 1.4)', NbTrDiscr='mult(0.5, 1.2)', ExprOpt='max(chi2)'
* MoteurOpt='zoopt(100)' => 2021-01-14 10:52:28,040 1920 analyses => 12 results, Wall time: 3min 45s
* MoteurOpt='zoopt(100)' => 2021-08-22 10:52:28,040 1920 analyses => 12 results, Wall time: 3min 45s

**Variant 3b: "main"**

* 16 analyses specs (12 parallel threads)
* default: CritChx='AIC', IntervConf=95, MethOutliers='tucquant(7)', NbTrModel='mult(0.6, 1.4)', NbTrDiscr='mult(0.5, 1.2)', ExprOpt='max(chi2)'
* MoteurOpt='zoopt(100)' => 2021-01-14 10:30:19,463 2400 analyses => 22 results, Wall time: 4min 28s
* MoteurOpt='zoopt(100)' => 2021-08-22 18:55:55,701 2400 analyses => 22 results, Wall time: 3min 31s

In [29]:
zoptr.shutdown()

In [30]:
results.dfTransData('fr')

Unnamed: 0,IndOptim,Espèce,Passage,Adulte,Durée,FonctionClé,SérieAjust,CritChx,IntervConf,TrGche,...,Tranch Dist Mod,Tranch Dist Discr,SetupStatus,SubmitStatus,NFunEvals,MeanFunElapd,minDist,maxDist,fitDistCuts,chi2
0,8,Luscinia megarhynchos,b,m,5mn,HNORMAL,COSINE,AIC,95,auto,...,"[5, 11]",,,,160.0,1.015248,40.420354,984.376555,5.0,0.806681
1,5,Turdus merula,a+b,m,5mn,HAZARD,COSINE,AIC,95,auto,...,"[10, 23]",,,,160.0,1.146336,17.29172,479.388292,19.0,0.322944
2,4,Turdus merula,a+b,m,5mn,HNORMAL,COSINE,AIC,95,auto,...,"[10, 23]",,,,160.0,1.15166,7.160333,455.458347,15.0,0.789325
3,10,Luscinia megarhynchos,b,m,10mn,HNORMAL,COSINE,AIC,95,auto,...,"[6, 14]",,,,160.0,1.15184,41.247823,632.168101,11.0,0.861603
4,0,Sylvia atricapilla,a+b,m,5mn,HNORMAL,COSINE,AIC,95,auto,...,"[11, 24]",,,,160.0,1.15659,21.44847,311.306656,14.0,0.409435
5,2,Sylvia atricapilla,a+b,m,10mn,HNORMAL,COSINE,AIC,95,auto,...,"[13, 30]",,,,160.0,1.161211,11.291141,290.687103,24.0,0.134032
6,7,Turdus merula,a+b,m,10mn,HAZARD,COSINE,AIC,95,auto,...,"[13, 30]",,,,160.0,1.191375,5.495316,417.513987,14.0,0.520992
7,1,Sylvia atricapilla,a+b,m,5mn,HAZARD,COSINE,AIC,95,auto,...,"[11, 24]",,,,160.0,1.214987,21.700359,379.711278,12.0,0.313249
8,6,Turdus merula,a+b,m,10mn,HNORMAL,COSINE,AIC,95,auto,...,"[13, 30]",,,,160.0,1.24231,14.600095,665.093774,23.0,0.679625
9,3,Sylvia atricapilla,a+b,m,10mn,HAZARD,COSINE,AIC,95,auto,...,"[13, 30]",,,,160.0,1.266454,19.729553,337.733848,17.0,0.193909


### c. Save results for later reload or examination

In [31]:
results.toExcel(workDir / f'valtests-mcds-optimiser{varOpt}-results.xlsx')

2021-09-02 08:40:55,291 ads.dat INFO0	12x34 results rows x columns and 2 specs saved to tmp\mcds-optr\valtests-mcds-optimiser-all-results.xlsx in 0.099s


In [None]:
#results.toExcel(workDir / 'valtests-mcds-optimiser-results-fr.xlsx', lang='fr')

## 4B. Or : Restart optimisation from recovery file

(already run above)

### a. MCDS Zeroth Order Truncation Optimiser object

In [None]:
# Re-create the optimiser before restarting the optimisations with `recover=True` (see the run cell below).
# Warning: Must be a real clone of the above one, otherwise recovery may not work !
# NOTE(review): recovery presumably relies on the results backup files written into workDir
#               during the previous run (see "optr-resbak-*.pickle.xz" in the log above) - to be confirmed.
zoptr = ads.MCDSZerothOrderTruncationOptimiser \
                (dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea, 
                 transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                 sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols, sampleDistCol=sampleDistCol,
                 abbrevCol=optAbbrevCol, abbrevBuilder=optimAbbrev,
                 anlysIndCol=optIndCol, sampleIndCol=sampleNumCol,
                 distanceUnit=distanceUnit, areaUnit=areaUnit,
                 surveyType=surveyType, distanceType=distanceType, clustering=clustering,
                 resultsHeadCols=dict(before=[optIndCol], sample=sampleSelCols, after=optimParamSpecCols),
                 workDir=workDir, runMethod='os.system', runTimeOut=None, logProgressEvery=1,
                 defEstimKeyFn=defEstimKeyFn, defEstimAdjustFn=defEstimAdjustFn,
                 defEstimCriterion=defEstimCriterion, defCVInterval=defCVInterval,
                 defExpr2Optimise=defExpr2Optimise, defMinimiseExpr=defMinimiseExpr,
                 defOutliersMethod=defOutliersMethod, defOutliersQuantCutPct=defOutliersQuantCutPct,
                 defFitDistCutsFctr=defFitDistCutsFctr, defDiscrDistCutsFctr=defDiscrDistCutsFctr,
                 defSubmitTimes=defSubmitTimes, defSubmitOnlyBest=defSubmitOnlyBest,
                 defCoreMaxIters=defCoreMaxIters, defCoreTermExprValue=defCoreTermExprValue,
                 defCoreAlgorithm=defCoreAlgorithm, defCoreMaxRetries=defCoreMaxRetries)

### b. Run optimisations

In [None]:
dfOptimExplSpecs

In [None]:
%%time

# Optimisations with recovery
results2 = zoptr.run(dfOptimExplSpecs, recover=True, threads=12)

computed = True

In [None]:
zoptr.shutdown()

In [None]:
results2.dfTransData('fr')

### c. Save results for later reload or examination

In [None]:
results2.toExcel(workDir / f'valtests-mcds-optimiser{varOpt}-results2.xlsx')

## 4C. Or : Load optimisation results from a previous run

(already run and saved above)

In [None]:
# Reload the optimisation results from the Excel file saved in 4A.c, unless they have just been computed
# (`computed` flag set by the run cells above).
if not computed:
    
    # An optimiser object knows how to build an empty results object ...
    zoptr = ads.MCDSZerothOrderTruncationOptimiser \
                    (dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea, 
                     transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                     sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols, sampleDistCol=sampleDistCol,
                     abbrevCol=optAbbrevCol, abbrevBuilder=optimAbbrev,
                     anlysIndCol=optIndCol, sampleIndCol=sampleNumCol,
                     distanceUnit=distanceUnit, areaUnit=areaUnit,
                     surveyType=surveyType, distanceType=distanceType, clustering=clustering,
                     resultsHeadCols=dict(before=[optIndCol], sample=sampleSelCols, after=optimParamSpecCols))
    
    # TODO: use new loadFrom param to load data !!!
    # ... so use it to get an empty results object, to be filled in from the file just below.
    results = zoptr.setupResults()
    
    # Load results from file (same file name as the one used for saving in 4A.c).
    resFileName = workDir / f'valtests-mcds-optimiser{varOpt}-results.xlsx'
    print('Loading results from {} ...'.format(resFileName))

    results.fromExcel(resFileName)
    
else:
    
    print('Just computed, not reloading ...')
    
# In both cases, report how many optimisation results are available for the next steps.
print('... {} optimisations to analyse'.format(len(results)))

## 5. Deduce analyses specs from optimisation results

In [32]:
# Short string for sample "identification"
def sampleAbbrev(sSample):
    """Build the short identification string of a sample.

    Parameters:
    :param sSample: the sample, as a pd.Series with at least 'Espèce', 'Passage', 'Adulte'
                    and 'Durée' string items.

    :return: '<species>-<pass>-<adult>-<duration>', where <species> is the concatenation
             of the title-cased 4 first letters of each of the 2 first words of 'Espèce',
             and where any '+' is removed from <pass> and <adult>
             (ex: 'Sylvia atricapilla', 'a+b', 'm', '5mn' => 'SylvAtri-ab-m-5mn').
    """

    # Species code: only the 2 first (space separated) words of the name are used.
    speciesCode = ''
    for namePart in sSample['Espèce'].split(' ')[:2]:
        speciesCode += namePart[:4].title()

    # Pass and adult codes: simply get rid of the '+' separators.
    passCode = sSample.Passage.replace('+', '')
    adultCode = sSample.Adulte.replace('+', '')

    return f"{speciesCode}-{passCode}-{adultCode}-{sSample['Durée']}"

In [33]:
# Short string for analysis "identification"
def analysisAbbrev(sAnlys):
    """Build the short identification string of an analysis.

    The result is the '-' separated concatenation of:
    * the sample abbreviation (see sampleAbbrev),
    * the 3 first letters (lower-cased) of the 'FonctionClé' and 'SérieAjust' items,
    * for each truncation parameter present in sAnlys and not null, a 1-letter prefix
      ('l' = left truncation, 'r' = right truncation, 'm' = model distance cuts,
       'd' = discretisation distance cuts) followed by the lower-cased 1st character
      of the value when it is a string, or otherwise by the value converted to an int.

    Parameters:
    :param sAnlys: the analysis specs, as a pd.Series (must hold the items needed by sampleAbbrev,
                   plus 'FonctionClé' and 'SérieAjust'; truncation items are optional).

    :return: the abbreviation (ex: 'LuscMega-b-m-5mn-hno-cos-la-ra-mm')
    """

    def firstKnownCol(*colNames):
        # First of the given names that is present in the sAnlys index,
        # or else the last one, whether present or not (fall-back).
        for colName in colNames[:-1]:
            if colName in sAnlys.index:
                return colName
        return colNames[-1]

    # Sample abbreviation first, then model abbreviation.
    parts = [sampleAbbrev(sAnlys)]
    parts.extend(sAnlys[colName][:3].lower() for colName in ('FonctionClé', 'SérieAjust'))

    # Truncation parameters: several naming conventions are supported for the columns.
    truncCols = (('l', firstKnownCol('TrGche', 'TroncGche')),
                 ('r', firstKnownCol('TrDrte', 'TroncDrte')),
                 ('m', firstKnownCol('NbTrches', 'NbTrModel', 'NbTrchMod')),
                 ('d', 'NbTrDiscr'))
    for prefix, colName in truncCols:
        if colName not in sAnlys.index:
            continue
        value = sAnlys[colName]
        if pd.isnull(value):
            continue
        code = value[0].lower() if isinstance(value, str) else int(value)
        parts.append(f'{prefix}{code}')

    return '-'.join(parts)

In [34]:
varIndCol = 'NumAnlys'
anlysAbbrevCol = 'AbrevAnlys'

In [35]:
# Build the analysis specs from the optimiser results: sample and model columns,
# plus the optimised truncation values (minDist, maxDist, fitDistCuts).
# The truncation optimisation specs columns are also extracted, but only temporarily (see below).
optTgtCols = ['TrGche', 'TrDrte', 'NbTrchMod']
#optTgtCols = ['TroncGche', 'TroncDrte', 'NbTrModel']
dfAnlysSpecs = results.dfData[['Espèce', 'Passage', 'Adulte', 'Durée', 'FonctionClé', 'SérieAjust',
                               'minDist', 'maxDist', 'fitDistCuts'] + optTgtCols].copy()

# The analysis abbreviation is built from the truncation optimisation specs (not from the optimised values),
# so it has to be computed before these specs columns get dropped.
dfAnlysSpecs[anlysAbbrevCol] = dfAnlysSpecs.apply(analysisAbbrev, axis='columns')

# Then drop the optimisation specs (no more needed), and give their names
# to the optimised truncation value columns, as expected for the analyses.
dfAnlysSpecs = dfAnlysSpecs.drop(columns=optTgtCols) \
                           .rename(columns={'minDist': 'TrGche', 'maxDist': 'TrDrte',
                                            'fitDistCuts': 'NbTrchMod'})

In [36]:
# But non-optimised truncation parameters are not in optimiser result columns (minDist, maxDist, fitDistCuts, ...) ...
# so we have to get them back from optimisation specs (TrGche, TrDrte, NbTrchMod, ...)

# String specs are optimisation params, numerical ones are already determined truncation params.
# => Boolean mask, True for the cells that were actually optimised (string specs).
bdfToBeKeptSpecCells = results.dfData[optTgtCols].applymap(lambda v: isinstance(v, str))

# Keep the optimised values where the mask is True, and elsewhere, take the value from the optimisation specs
# (at this stage, dfAnlysSpecs[optTgtCols] holds the renamed minDist, maxDist and fitDistCuts columns).
dfAnlysSpecs[optTgtCols] = dfAnlysSpecs[optTgtCols].where(bdfToBeKeptSpecCells,
                                                          other=results.dfData[optTgtCols])

In [37]:
dfAnlysSpecs

Unnamed: 0,Espèce,Passage,Adulte,Durée,FonctionClé,SérieAjust,TrGche,TrDrte,NbTrchMod,AbrevAnlys
0,Luscinia megarhynchos,b,m,5mn,HNORMAL,COSINE,40.420354,984.376555,5.0,LuscMega-b-m-5mn-hno-cos-la-ra-mm
1,Turdus merula,a+b,m,5mn,HAZARD,COSINE,17.29172,479.388292,19.0,TurdMeru-ab-m-5mn-haz-cos-la-ra-mm
2,Turdus merula,a+b,m,5mn,HNORMAL,COSINE,7.160333,455.458347,15.0,TurdMeru-ab-m-5mn-hno-cos-la-ra-mm
3,Luscinia megarhynchos,b,m,10mn,HNORMAL,COSINE,41.247823,632.168101,11.0,LuscMega-b-m-10mn-hno-cos-la-ra-mm
4,Sylvia atricapilla,a+b,m,5mn,HNORMAL,COSINE,21.44847,311.306656,14.0,SylvAtri-ab-m-5mn-hno-cos-la-ra-mm
5,Sylvia atricapilla,a+b,m,10mn,HNORMAL,COSINE,11.291141,290.687103,24.0,SylvAtri-ab-m-10mn-hno-cos-la-ra-mm
6,Turdus merula,a+b,m,10mn,HAZARD,COSINE,5.495316,417.513987,14.0,TurdMeru-ab-m-10mn-haz-cos-la-ra-mm
7,Sylvia atricapilla,a+b,m,5mn,HAZARD,COSINE,21.700359,379.711278,12.0,SylvAtri-ab-m-5mn-haz-cos-la-ra-mm
8,Turdus merula,a+b,m,10mn,HNORMAL,COSINE,14.600095,665.093774,23.0,TurdMeru-ab-m-10mn-hno-cos-la-ra-mm
9,Sylvia atricapilla,a+b,m,10mn,HAZARD,COSINE,19.729553,337.733848,17.0,SylvAtri-ab-m-10mn-haz-cos-la-ra-mm


In [38]:
workDir = pl.Path('tmp/mcds-anaftopt')

In [39]:
computed = False

## 6A. Or : Really run analyses

(now truncation parameters have been auto-computed)

### a. MCDS Analyser object

In [40]:
# Analysis parameter columns of the specs table: also reported in the results header columns,
# right after the sample columns and before the analysis abbreviation (see resultsHeadCols below).
anlysParamCols = ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod']

# The analyser, built from the individuals data (dfObsIndiv), the transects and the survey area description;
# the def* parameters are passed through from variables defined earlier in the notebook
# (NOTE(review): presumably the defaults used when a spec leaves the parameter unset - to be confirmed).
anlysr = ads.MCDSAnalyser(dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea,
                          transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                          sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols,
                          abbrevCol=anlysAbbrevCol, anlysIndCol=varIndCol, sampleIndCol=sampleNumCol,
                          distanceUnit=distanceUnit, areaUnit=areaUnit,
                          surveyType=surveyType, distanceType=distanceType, clustering=clustering,
                          resultsHeadCols=dict(before=[varIndCol, sampleNumCol], sample=sampleSelCols,
                                               after=anlysParamCols + [anlysAbbrevCol]),
                          workDir=workDir, logProgressEvery=1,
                          defEstimKeyFn=defEstimKeyFn, defEstimAdjustFn=defEstimAdjustFn,
                          defEstimCriterion=defEstimCriterion, defCVInterval=defCVInterval,
                          defMinDist=defMinDist, defMaxDist=defMaxDist,
                          defFitDistCuts=defFitDistCuts, defDiscrDistCuts=defDiscrDistCuts)

2021-09-02 08:48:37,309 ads.dat INFO0	Loaded 1543 total rows in data set ...
2021-09-02 08:48:37,309 ads.dat INFO0	... found columns: [Observateur|Point|Passage|DateHeure|Espèce|Distance|Adulte|Durée]
2021-09-02 08:48:37,311 ads.dat INFO0	Individuals data : 1543 sightings, 190 transects


### b. Check analysis explicit specs

In [41]:
# Make the analysis specs explicit, with duplicates dropped (dropDupes=True) and checks enabled (check=True).
# Warning: dfAnlysSpecs is replaced by the returned specs table.
dfAnlysSpecs, userParamSpecCols, intParamSpecCols, unmUserParamSpecCols, verdict, reasons = \
    anlysr.explicitParamSpecs(dfExplParamSpecs=dfAnlysSpecs, dropDupes=True, check=True)

# Expected: as many analyses as nOptimedAnlyses (defined earlier in the notebook;
# NOTE(review): presumably the number of optimisations run above - to be confirmed) ...
assert len(dfAnlysSpecs) == nOptimedAnlyses, f'{len(dfAnlysSpecs)} != {nOptimedAnlyses}'
# ... the 5 user parameter columns, and their 5 internal parameter names, in this order ...
assert userParamSpecCols == ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod'], str(userParamSpecCols)
assert intParamSpecCols == ['EstimKeyFn', 'EstimAdjustFn', 'MinDist', 'MaxDist', 'FitDistCuts'], str(intParamSpecCols)
# ... no unmatched user parameter column, a positive verdict and no reported reason.
assert unmUserParamSpecCols == []
assert verdict
assert not reasons, str(reasons)

2021-09-02 08:48:41,176 ads.anr INFO0	Dropped 0 last duplicate specs of 12, on [Espèce, Passage, Adulte, Durée, FonctionClé, SérieAjust, TrGche, TrDrte, NbTrchMod] columns


### c. Run analyses

In [42]:
%%time

# Elapsed times observed on past runs:
# Analyses : min=5, max=11s elapsed for 64 analyses with 6 threads on a Lenovo P52 (6-HT-core i7-8850H with PCI-e SSD)
# Analyses : min=2.1, max=2.5s elapsed for 22 analyses with 6-12 threads on a Lenovo T490 (4-HT-core i5-8365U with PCI-e SSD)
# Analyses : 1.5s elapsed for 12 analyses with 6-12 threads on a Lenovo T490 (4-HT-core i5-8365U with PCI-e SSD)
# Warning: `results` held the optimiser results up to here; from now on, it holds the analyser results.
results = anlysr.run(dfAnlysSpecs, threads=12)

# Flag read by the "6B. Or : Load analyses results from a previous run" cell below:
# when True, the results are not reloaded from file.
computed = True

2021-09-02 08:48:42,403 ads.eng INFO0	DSEngine work folder: C:\git\perso\autods\tmp\mcds-anaftopt
2021-09-02 08:48:42,406 ads.anr INFO0	Dropped 0 last duplicate specs of 12, on [Espèce, Passage, Adulte, Durée, FonctionClé, SérieAjust, TrGche, TrDrte, NbTrchMod] columns
2021-09-02 08:48:42,408 ads.anr INFO0	Running 12 MCDS analyses (12 parallel threads) ...
2021-09-02 08:48:42,410 ads.anr INFO0	#1/12 : LuscMega-b-m-5mn-hno-cos-la-ra-mm
2021-09-02 08:48:42,430 ads.dat INFO0	Loaded 109 total rows in data set ...
2021-09-02 08:48:42,431 ads.dat INFO0	... found columns: [Observateur|Point|Passage|DateHeure|Espèce|Distance|Adulte|Durée|Effort|Zone|Surface]
2021-09-02 08:48:42,433 ads.dat INFO0	Sample data : 109 sightings = 57 individuals + 52 absence rows
2021-09-02 08:48:42,438 ads.anr INFO0	#2/12 : TurdMeru-ab-m-5mn-haz-cos-la-ra-mm
2021-09-02 08:48:42,461 ads.dat INFO0	Loaded 244 total rows in data set ...
2021-09-02 08:48:42,462 ads.dat INFO0	... found columns: [Observateur|Point|Passage

In [43]:
anlysr.shutdown()

In [44]:
results.dfTransData('fr')

Unnamed: 0,NumAnlys,NumEchant,Espèce,Passage,Adulte,Durée,FonctionClé,SérieAjust,TrGche,TrDrte,...,Ordre Tronc Proch Qual Equi KS+,Ordre Tronc Proch Qual Equi DCv+,Ordre Global Chi2 KS DCv,Ordre Global Qual Equi 1,Ordre Global Qual Equi 2,Ordre Global Qual Equi 3,Ordre Global Qual Equi Chi2+,Ordre Global Qual Equi KS+,Ordre Global Qual Equi DCv+,Ordre Global DeltaAIC Chi2 KS DCv
6,0,0,Luscinia megarhynchos,b,m,5mn,HNORMAL,COSINE,40.420354,984.376555,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1,1,Turdus merula,a+b,m,5mn,HAZARD,COSINE,17.29172,479.388292,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,2,1,Turdus merula,a+b,m,5mn,HNORMAL,COSINE,7.160333,455.458347,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3,2,Luscinia megarhynchos,b,m,10mn,HNORMAL,COSINE,41.247823,632.168101,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,4,3,Sylvia atricapilla,a+b,m,5mn,HNORMAL,COSINE,21.44847,311.306656,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,5,4,Sylvia atricapilla,a+b,m,10mn,HNORMAL,COSINE,11.291141,290.687103,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
0,6,5,Turdus merula,a+b,m,10mn,HAZARD,COSINE,5.495316,417.513987,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
7,7,3,Sylvia atricapilla,a+b,m,5mn,HAZARD,COSINE,21.700359,379.711278,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,8,5,Turdus merula,a+b,m,10mn,HNORMAL,COSINE,14.600095,665.093774,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,9,4,Sylvia atricapilla,a+b,m,10mn,HAZARD,COSINE,19.729553,337.733848,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [45]:
results.dfData

Unnamed: 0_level_0,header (head),header (head),header (sample),header (sample),header (sample),header (sample),header (tail),header (tail),header (tail),header (tail),...,auto filter sort,auto filter sort,auto filter sort,auto filter sort,auto filter sort,auto filter sort,auto filter sort,auto filter sort,auto filter sort,auto filter sort
Unnamed: 0_level_1,NumAnlys,NumEchant,Espèce,Passage,Adulte,Durée,FonctionClé,SérieAjust,TrGche,TrDrte,...,Bal. quality KS+ (close trunc),Bal. quality DCv+ (close trunc),Chi2 KS DCv (global),Bal. quality 1 (global),Bal. quality 2 (global),Bal. quality 3 (global),Bal. quality Chi2+ (global),Bal. quality KS+ (global),Bal. quality DCv+ (global),DeltaAIC Chi2 KS DCv (global)
Unnamed: 0_level_2,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,...,Order,Order,Order,Order,Order,Order,Order,Order,Order,Order
6,0,0,Luscinia megarhynchos,b,m,5mn,HNORMAL,COSINE,40.420354,984.376555,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1,1,Turdus merula,a+b,m,5mn,HAZARD,COSINE,17.29172,479.388292,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,2,1,Turdus merula,a+b,m,5mn,HNORMAL,COSINE,7.160333,455.458347,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3,2,Luscinia megarhynchos,b,m,10mn,HNORMAL,COSINE,41.247823,632.168101,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,4,3,Sylvia atricapilla,a+b,m,5mn,HNORMAL,COSINE,21.44847,311.306656,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,5,4,Sylvia atricapilla,a+b,m,10mn,HNORMAL,COSINE,11.291141,290.687103,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
0,6,5,Turdus merula,a+b,m,10mn,HAZARD,COSINE,5.495316,417.513987,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
7,7,3,Sylvia atricapilla,a+b,m,5mn,HAZARD,COSINE,21.700359,379.711278,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,8,5,Turdus merula,a+b,m,10mn,HNORMAL,COSINE,14.600095,665.093774,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,9,4,Sylvia atricapilla,a+b,m,10mn,HAZARD,COSINE,19.729553,337.733848,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### d. Save results for later reload or examination

In [46]:
results.toExcel(workDir / f'valtests-mcds-analyser-afteropt{varOpt}-results.xlsx')

2021-09-02 08:48:48,171 ads.dat INFO0	12x114 results rows x columns and 3 specs saved to tmp\mcds-anaftopt\valtests-mcds-analyser-afteropt-all-results.xlsx in 0.173s


In [47]:
#results.toExcel(workDir / 'valtests-mcds-analyser-afteropt-fr.xlsx', lang='fr')

## 6B. Or : Load analyses results from a previous run

(already run and saved above)

In [None]:
# Reload the analyses results from the Excel file saved in 6A.d, unless they have just been computed
# (`computed` flag set by the run cell above).
if not computed:
    
    # An analyser object knows how to build an empty results object ...
    anlysr = ads.MCDSAnalyser(dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea,
                              resultsHeadCols=dict(before=[varIndCol, sampleNumCol], sample=sampleSelCols,
                                                   after=[anlysAbbrevCol]),
                              transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                              sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols,
                              abbrevCol=anlysAbbrevCol, anlysIndCol=varIndCol, sampleIndCol=sampleNumCol,
                              distanceUnit=distanceUnit, areaUnit=areaUnit,
                              surveyType=surveyType, distanceType=distanceType, clustering=clustering)
    
    # ... so use it to get an empty results object, to be filled in from the file just below.
    results = anlysr.setupResults()
    
    # Load results from file (same file name as the one used for saving in 6A.d).
    resFileName = workDir / f'valtests-mcds-analyser-afteropt{varOpt}-results.xlsx'
    print('Loading results from {} ...'.format(resFileName))

    results.fromExcel(resFileName)
    
else:
    
    print('Just computed, not reloading ...')
    
# In both cases, report how many analyses results are available for the next steps.
print('... {} analyses to compare'.format(len(results)))

# VIII. Truncation optimisation : Study on parameter variants

Objective: How to choose key parameters ?
* how many outliers ?
* how many max iters ?
* correlation with number of sightings ?

In [None]:
import plotly.express as plyx

## 1. Data set, samples, transects, analyses

Warning: First run [VII. Truncation optimisation (short code / fast run)](#VII.-Truncation-optimisation-(short-code-and-fast-run)) 0, 1 and 2 above

In [None]:
# Load the explicit variant specs from the workbook, ignoring 'Params1_expl' and 'Params2_expl'
# (NOTE(review): presumably the names of 2 sheets of the workbook - to be confirmed).
dfOptimExplSpecs = ads.Analyser.explicitVariantSpecs('refin/ACDC2019-Naturalist-ExtraitSpecsOptanalyses.xlsx', 
                                                     ignore=['Params1_expl', 'Params2_expl'])

# Drop the rows where TrGche, TrDrte, NbTrchMod and MultiOpt are all null.
dfOptimExplSpecs.drop(dfOptimExplSpecs[dfOptimExplSpecs[['TrGche', 'TrDrte', 'NbTrchMod', 'MultiOpt']]
                                           .isnull().all(axis='columns')].index,
                      inplace=True)

# Then drop these 4 columns themselves: the optimisation parameter columns get added later,
# for each parameter set of the variants plan (see 3A below).
dfOptimExplSpecs.drop(columns=['TrGche', 'TrDrte', 'NbTrchMod', 'MultiOpt'], inplace=True)

nOptimExplSpecs = len(dfOptimExplSpecs)

dfOptimExplSpecs

## 2. Parameter variants plan

In [None]:
nTimes = 20

In [None]:
expr2MaxPlan = ['chi2', 'ks']

In [None]:
outliersPctPlan = [2.5, 5.0]

In [None]:
maxItersPlan = [50, 100, 150, 200, 250, 300]

In [None]:
computed = False

## 3A. Or: Run optimisations according to the plan

In [None]:
nParSets = len(expr2MaxPlan) * len(outliersPctPlan) * len(maxItersPlan)
nOpt2Run = len(dfOptimExplSpecs) * nTimes * nParSets
print(f'About to run {nOpt2Run} optimisations !')

In [None]:
# Columns of the optimisation specs table that give the analysis / optimisation parameters
# (passed as the 'after' header columns of the results, when building the optimiser below).
optimParamsSpecsCols  = ['FonctionClé', 'SérieAjust', 'CritChx', 'IntervConf',
                         'TroncGche', 'TroncDrte', 'MethOutliers', 'NbTrModel', 'NbTrDiscr',
                         'ExprOpt', 'ParExec', 'MoteurOpt']

In [None]:
%%time

# Run the optimisations for every parameter set of the plan, i.e. every combination of
# expression to maximise x outliers percentage x max. number of iterations,
# then concat. the results of all the parameter sets and save them to an Excel file.
ldfResults = list()

# 1-based index of the current parameter set (for progress logging only).
nParSetInd = 1
for expr2Max in expr2MaxPlan:

    for olrsPct in outliersPctPlan:

        for maxIters in maxItersPlan:

            logger.info(f'Params set {nParSetInd}/{nParSets}: {expr2Max=}, {nTimes=}, {maxIters=}, {olrsPct=:.1f}')

            # Prepare optim. params: the same ones for every row of the specs table.
            dfMoreOptimCols = \
                pd.DataFrame([dict(CritChx='AIC', IntervConf=95,
                                   TroncGche='auto', TroncDrte='auto',
                                   MethOutliers=f'tucquant({olrsPct:.1f})',
                                   NbTrModel='mult(2/3, 3/2)', NbTrDiscr=None,
                                   ExprOpt=f'max({expr2Max})', ParExec=f'times({nTimes})',
                                   MoteurOpt=f'zoopt({maxIters})')]*len(dfOptimExplSpecs))

            # Index reset needed for a row to row concat. with the (0 to N-1 indexed) params table.
            dfOptVarExplSpecs = pd.concat([dfOptimExplSpecs.reset_index(drop=True), dfMoreOptimCols], axis='columns')

            # Run optimisation.
            zoptr = ads.MCDSZerothOrderTruncationOptimiser \
                            (dfObsIndiv, effortConstVal=1, dSurveyArea=dSurveyArea,
                             transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                             sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols, sampleDistCol=sampleDistCol,
                             abbrevCol=optAbbrevCol, abbrevBuilder=optimAbbrev,
                             anlysIndCol=optIndCol, sampleIndCol=sampleNumCol,
                             distanceUnit='Meter', areaUnit='Hectare',
                             surveyType='Point', distanceType='Radial', clustering=False,
                             resultsHeadCols=dict(before=[optIndCol], sample=sampleSelCols, after=optimParamsSpecsCols),
                             workDir='/tmp', logData=False,
                             defCoreMaxIters=120)

            results = zoptr.run(dfOptVarExplSpecs, threads=12)

            zoptr.shutdown()

            # Save results for this run
            ldfResults.append(results.dfData)

            # Next parameter set
            # (bug fix: was `nParSet += 1`, an undefined name => NameError at the end of the 1st iteration).
            nParSetInd += 1

# Done : concat and save results.
dfResults = pd.concat(ldfResults, ignore_index=True)

resFileName = 'tmp/valtests-mcds-opter-res4stats.xlsx'
dfResults.to_excel(resFileName, index=False)
logger.info(f'Results saved to {resFileName}')

# Flag read by the "3B. Or : Load results from a previous run" cell below:
# when True, the results are not reloaded from file.
computed = True

## 3B. Or : Load results from a previous run

(already run and saved above)

In [None]:
# Reload the optimisation results from a previously saved Excel file, unless they have just been computed.
# This cell may be run without running any of the cells that set the `computed` flag:
# in this case, assume that nothing has been computed.
if 'computed' not in dir():
    computed = False
if not computed:
    
    # Load results from file
    # (warning: a dated file, not the 'tmp/valtests-mcds-opter-res4stats.xlsx' one saved by 3A above).
    #resFileName = 'tmp/valtests-mcds-opter-res4stats-20200705.xlsx'
    resFileName = 'tmp/valtests-mcds-opter-res4stats-20201103.xlsx'
    print('Loading results from {} ...'.format(resFileName))

    dfResults = pd.read_excel(resFileName)
    
else:
    
    print('Just computed, not reloading ...')
    
# In both cases, report how many results are available for the stats below.
print('... {} results to process'.format(len(dfResults)))

## 4. First stats on optimisation results

* raw stats : mean and std
* first correlations : number of analyses / optimised criterion

In [None]:
len(dfResults), dfResults.columns

In [None]:
dfResults.head()

In [None]:
optResCols = ['minDist', 'maxDist', 'fitDistCuts', 'chi2', 'ks', 'chi2*ks']
#groupCols = [col for col in dfResults.columns if col not in optResCols]
groupCols = ['Espèce', 'Passage', 'Adulte', 'Durée', 'FonctionClé', 'SérieAjust', 'MethOutliers', 'ExprOpt',
             'MinDist', 'MaxDist', 'FitDistCuts', 'NFunEvals']

### a. Raw stats : mean, std

In [None]:
dfStats = dfResults.groupby(groupCols).agg(['mean', 'std'])
dfStats

In [None]:
resFileName = 'tmp/valtests-mcds-opter-stats.xlsx'
dfStats.reset_index().to_excel(resFileName)

### b. Visual correlations

In [None]:
dfResults['NFunEvalsR'] = dfResults.NFunEvals.apply(lambda v: int(50*np.ceil(v/50)))

In [None]:
for esp in dfResults['Espèce'].unique():
    axes = dfResults[dfResults['Espèce'] == esp].plot.hexbin(y='NFunEvalsR', x='chi2', gridsize=(20, 6), figsize=(14, 3))
    axes.set_title(f'{esp} : chi2 / NFunEvals')

In [None]:
plt.close()

In [None]:
for esp in dfResults['Espèce'].unique():
    axes = dfResults[dfResults['Espèce'] == esp].plot.hexbin(y='NFunEvalsR', x='ks', gridsize=(20, 6), figsize=(14, 3))
    axes.set_title(f'{esp} : ks / NFunEvals')

In [None]:
plt.close()

In [None]:
dfResults['Outliers'] = dfResults.MethOutliers.apply(lambda s: float(s[len('tucquant('):-1]))

In [None]:
_ = dfResults.plot.scatter(y='Outliers', x='chi2', figsize=(14, 2))

In [None]:
_ = dfResults.plot.scatter(y='Outliers', x='ks', figsize=(14, 2))

In [None]:
dfResults.head()

In [None]:
plyx.violin(dfResults, x='chi2', y='NFunEvalsR', facet_row='Outliers', color="Espèce", orientation='h', height=1000)

In [None]:
plyx.violin(dfResults, x='ks', y='NFunEvalsR', facet_row='Outliers', color="Espèce", orientation='h', height=1000)

### c. Computed correlations

(linear, i.e. Pearson's)

In [None]:
def pearsonCorr(df, x, y):
    """Compute the Pearson (linear) correlation coefficient between 2 columns of a table.

    Bug fix: the covariance is now divided by the product of the standard deviations,
    sqrt(var(x) * var(y)), and no more by the product of the variances, var(x) * var(y),
    which was not giving a correlation coefficient (not even guaranteed to be in [-1, 1]).

    Parameters:
    :param df: the table (pd.DataFrame), or any sub-table of it as passed by groupby(...).apply
    :param x: name of the 1st column (numerical values)
    :param y: name of the 2nd column (numerical values)

    :return: a pd.Series with 1 single 'corr' item: the coefficient, in [-1, 1],
             or NaN if it is not defined (any of the 2 columns has a null variance,
             or there are less than 2 rows).
    """

    # 2x2 covariance matrix: variances on the diagonal, covariance elsewhere.
    cv = np.cov(df[x].values, df[y].values)

    # Product of the standard deviations (null or NaN => undefined coefficient).
    stdProd = np.sqrt(cv[0, 0] * cv[1, 1])

    return pd.Series(dict(corr=cv[0, 1] / stdProd if stdProd > 0 else np.nan))

In [None]:
# Correlation between the nb of analyses run (NFunEvals) and the optimisation criterion,
# for each sample, model, outliers method and optimised expression.
groupCols = ['Espèce', 'Passage', 'Adulte', 'Durée', 'FonctionClé', 'SérieAjust', 'MethOutliers', 'ExprOpt']

# a. Optimisations maximising chi2
df = dfResults.loc[dfResults.ExprOpt == 'max(chi2)',
                   groupCols + ['NFunEvals', 'chi2']].groupby(groupCols).apply(pearsonCorr, x='NFunEvals', y='chi2')
df.rename(columns=dict(corr='NFun/Expr'), inplace=True)
dfCorr = df.copy()

# b. Optimisations maximising ks
df = dfResults.loc[dfResults.ExprOpt == 'max(ks)',
                   groupCols + ['NFunEvals', 'ks']].groupby(groupCols).apply(pearsonCorr, x='NFunEvals', y='ks')
df.rename(columns=dict(corr='NFun/Expr'), inplace=True)

# c. All together (pd.concat rather than DataFrame.append, that was deprecated in pandas 1.4, and removed in 2.0).
dfCorr = pd.concat([dfCorr, df])

dfCorr.describe()

In [None]:
dfCorr[dfCorr.index.get_level_values('ExprOpt') == 'max(ks)'].sort_values(by='NFun/Expr', ascending=False)

In [None]:
dfCorr[dfCorr.index.get_level_values('ExprOpt') == 'max(chi2)'].sort_values(by='NFun/Expr', ascending=False)

In [None]:
# Correlation between the % of outliers excluded and the optimisation criterion, for each sample.
groupCols = ['Espèce', 'Passage', 'Adulte', 'Durée']

# a. Optimisations maximising chi2
df = dfResults.loc[dfResults.ExprOpt == 'max(chi2)',
                   groupCols + ['Outliers', 'chi2']].groupby(groupCols).apply(pearsonCorr, x='Outliers', y='chi2')
df.rename(columns=dict(corr='Outliers/Expr'), inplace=True)
dfCorr = df.copy()

# b. Optimisations maximising ks
df = dfResults.loc[dfResults.ExprOpt == 'max(ks)',
                   groupCols + ['Outliers', 'ks']].groupby(groupCols).apply(pearsonCorr, x='Outliers', y='ks')
df.rename(columns=dict(corr='Outliers/Expr'), inplace=True)

# c. All together (pd.concat rather than DataFrame.append, that was deprecated in pandas 1.4, and removed in 2.0).
dfCorr = pd.concat([dfCorr, df])

dfCorr

## 5. Run analyses with optimised truncations

(to get the actual numbers of sightings retained)

### a. Deduce analyses specs from optimisation results

In [None]:
dfResults.columns

In [None]:
varIndCol = 'NumAnlys'
anlysAbbrevCol = 'AbrevAnlys'

In [None]:
# Build the analysis specs from the optimisation results table: sample and model columns,
# the optimised truncation values (minDist, maxDist, fitDistCuts), and the studied parameters
# (Outliers, NFunEvals); the truncation optimisation specs columns are also extracted, but only temporarily.
#optTgtCols = ['TrGche', 'TrDrte', 'NbTrchMod']
optTgtCols = ['TroncGche', 'TroncDrte', 'NbTrModel']
otherOptTgtCols = ['Outliers', 'NFunEvals']
dfAnlysSpecs = dfResults[['Espèce', 'Passage', 'Adulte', 'Durée', 'FonctionClé', 'SérieAjust',
                          'minDist', 'maxDist', 'fitDistCuts'] + optTgtCols + otherOptTgtCols].copy()

# The analysis abbreviation is built from the truncation optimisation specs (not from the optimised values),
# so it has to be computed before these specs columns get dropped.
dfAnlysSpecs[anlysAbbrevCol] = dfAnlysSpecs.apply(analysisAbbrev, axis='columns')

# Then drop the optimisation specs (no more needed), and rename
# the optimised truncation value columns as expected for the analyses.
dfAnlysSpecs = dfAnlysSpecs.drop(columns=optTgtCols) \
                           .rename(columns={'minDist': 'TrGche', 'maxDist': 'TrDrte',
                                            'fitDistCuts': 'NbTrchMod'})

dfAnlysSpecs

In [None]:
# Working folder (relative to the notebook) : given to the analyser as workDir,
# and where the results workbook is saved to / reloaded from.
workDir = pl.Path('tmp/mcds-optstats')

In [None]:
# Flag set to True once the analyses have really been run in this session ;
# when still False, the "load" cell below reloads the results from a previous run instead.
computed = False

### b. Or : Really run analyses

In [None]:
# i. MCDS Analyser object
# Analysis parameter columns of dfAnlysSpecs (model + optimised truncation params, as renamed above).
anlysParamCols = ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod']

# Built from the individualised observations (dfObsIndiv) and the survey / sample settings defined
# earlier in the notebook ; results head columns are : analysis and sample index columns "before",
# sample selection columns, and "after" them, the analysis params, the abbreviation,
# plus the 'Outliers' and 'NFunEvals' columns carried over from the optimisation results.
anlysr = ads.MCDSAnalyser(dfObsIndiv, effortConstVal=1, dSurveyArea=dSurveyArea,
                          transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                          sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols,
                          abbrevCol=anlysAbbrevCol, anlysIndCol=varIndCol, sampleIndCol=sampleNumCol,
                          distanceUnit=distanceUnit, areaUnit=areaUnit,
                          surveyType=surveyType, distanceType=distanceType, clustering=clustering,
                          resultsHeadCols=dict(before=[varIndCol, sampleNumCol], sample=sampleSelCols,
                                               after=anlysParamCols + [anlysAbbrevCol, 'Outliers', 'NFunEvals']),
                          workDir=workDir,
                          defEstimKeyFn=defEstimKeyFn, defEstimAdjustFn=defEstimAdjustFn,
                          defEstimCriterion=defEstimCriterion, defCVInterval=defCVInterval,
                          defMinDist=defMinDist, defMaxDist=defMaxDist,
                          defFitDistCuts=defFitDistCuts, defDiscrDistCuts=defDiscrDistCuts)

In [None]:
# ii. Check analysis explicit specs
# Note: dfAnlysSpecs is rebound to the table returned by explicitParamSpecs (called with dropDupes=True),
#       so this cell is not neutral when re-run.
dfAnlysSpecs, userParamSpecCols, intParamSpecCols, unmUserParamSpecCols, verdict, reasons = \
    anlysr.explicitParamSpecs(dfExplParamSpecs=dfAnlysSpecs, dropDupes=True, check=True)

# Still one spec row per optimisation result (presumably : no duplicate was dropped - TODO confirm).
assert len(dfAnlysSpecs) == len(dfResults)
# User parameter columns and their internal counterparts are the expected ones, in the expected order.
assert userParamSpecCols == ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod'], str(userParamSpecCols)
assert intParamSpecCols == ['EstimKeyFn', 'EstimAdjustFn', 'MinDist', 'MaxDist', 'FitDistCuts'], str(intParamSpecCols)
# No unmatched user parameter column, a positive check verdict, and nothing reported as a reason.
assert unmUserParamSpecCols == []
assert verdict
assert not reasons, str(reasons)

In [None]:
%%time

# iii. Run analyses

# Reference timing : about 20 min for 8640 analyses with 12 threads
# on a Lenovo T490 (4-HT-core i5-8365U with PCI-e SSD)
results = anlysr.run(dfAnlysSpecs, threads=12)

# Tell the "load" cell below that results were just computed (no need to reload them from file).
computed = True

In [None]:
# Analyses are done : shut the analyser down
# (presumably releases its execution resources - TODO confirm in autods).
anlysr.shutdown()

In [None]:
# Display the results table through dfTransData('fr')
# ('fr' presumably selects French column names - TODO confirm in autods).
results.dfTransData('fr')

In [None]:
# iv. Save results for later reload or examination
# (same workbook as the one read back by the "load from a previous run" cell below)
results.toExcel(workDir / 'valtests-mcds-analyser-afteropt-results.xlsx')

In [None]:
#results.toExcel(workDir / 'valtests-mcds-analyser-afteropt-fr.xlsx', lang='fr')

### b. Or : Load analyses from a previous run

(already run and saved above)

In [None]:
# Get `results` ready for the stats below : keep the ones just computed in this session, if any,
# otherwise reload the workbook saved by a previous run.
if computed:

    print('Just computed, not reloading ...')

else:

    # An analyser object knows how to build an empty results object ...
    # NOTE(review): the 'after' head columns list is shorter here than for the analyser that really runs
    # the analyses (no analysis params, 'Outliers' or 'NFunEvals') ; check that the reloaded results still
    # hold the 'Outliers' and 'NFunEvals' columns read by the stats cells further down.
    anlysr = ads.MCDSAnalyser(dfObsIndiv, effortConstVal=1, dSurveyArea=dSurveyArea,
                              transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                              sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols,
                              abbrevCol=anlysAbbrevCol, anlysIndCol=varIndCol, sampleIndCol=sampleNumCol,
                              distanceUnit=distanceUnit, areaUnit=areaUnit,
                              surveyType=surveyType, distanceType=distanceType, clustering=clustering,
                              resultsHeadCols=dict(before=[varIndCol, sampleNumCol], sample=sampleSelCols,
                                                   after=[anlysAbbrevCol]))

    results = anlysr.setupResults()

    # ... to be filled in from the saved workbook.
    resFileName = workDir / 'valtests-mcds-analyser-afteropt-results.xlsx'
    print('Loading results from {} ...'.format(resFileName))

    results.fromExcel(resFileName)

print('... {} analyses to study'.format(len(results)))

In [None]:
# Results table used by all the stats cells of the next section
# (obtained through dfTransData('fr') ; 'fr' presumably selects French column names - TODO confirm).
dfAnRes = results.dfTransData('fr')
dfAnRes

## 6. Other stats on analysis results

Through NObs mainly

In [None]:
# Reminder of the columns available in the analysis results table.
dfAnRes.columns

In [None]:
# One hexagonal-bin plot per species : 'Chi2 P' (x axis) against 'NObs' (y axis).
for esp in dfAnRes['Espèce'].unique():
    axes = dfAnRes.loc[dfAnRes['Espèce'] == esp] \
                  .plot.hexbin(x='Chi2 P', y='NObs', gridsize=(20, 6), figsize=(14, 3))
    axes.set_title(f'{esp} : chi2 / NObs')

In [None]:
# Close the current matplotlib figure (left over from the plots above).
plt.close()

In [None]:
# One hexagonal-bin plot per species : 'KS P' (x axis) against 'NObs' (y axis).
for esp in dfAnRes['Espèce'].unique():
    axes = dfAnRes.loc[dfAnRes['Espèce'] == esp] \
                  .plot.hexbin(x='KS P', y='NObs', gridsize=(20, 6), figsize=(14, 3))
    axes.set_title(f'{esp} : KS / NObs')

In [None]:
# Close the current matplotlib figure (left over from the plots above).
plt.close()

In [None]:
# NFunEvals rounded up to the next multiple of 50, as an int (used as a facet variable below).
# Note: adds a column to dfAnRes in place ; int() would fail on a NaN NFunEvals value.
dfAnRes['NFunEvalsR'] = dfAnRes.NFunEvals.apply(lambda v: int(50*np.ceil(v/50)))

In [None]:
# 'NObs' against 'Chi2 P' (rows without NObs dropped), 1 facet per ('NFunEvalsR' row, 'Outliers' column),
# coloured by species.
# NOTE(review): `plyx` is plotly.express, whose import is commented out in the imports cell at the top
# of the notebook ; unless it is imported elsewhere, this cell fails with a NameError on a fresh kernel.
plyx.scatter(dfAnRes[['Espèce', 'Chi2 P', 'NObs', 'Outliers', 'NFunEvalsR']].dropna(subset=['NObs']),
             x='Chi2 P', y='NObs', facet_col='Outliers', facet_row='NFunEvalsR', color='Espèce', height=1200)

In [None]:
# 'NObs' against 'KS P' (rows without NObs dropped), 1 facet per ('NFunEvalsR' row, 'Outliers' column),
# coloured by species.
# NOTE(review): `plyx` is plotly.express, whose import is commented out in the imports cell at the top
# of the notebook ; unless it is imported elsewhere, this cell fails with a NameError on a fresh kernel.
plyx.scatter(dfAnRes[['Espèce', 'KS P', 'NObs', 'Outliers', 'NFunEvalsR']].dropna(subset=['NObs']),
             x='KS P', y='NObs', facet_col='Outliers', facet_row='NFunEvalsR', color='Espèce', height=1200)

In [None]:
# 'Chi2 P' against 'NObs' (rows without NObs dropped), 1 facet per (species row, 'Outliers' column),
# coloured by 'NFunEvals'.
# NOTE(review): `plyx` is plotly.express, whose import is commented out in the imports cell at the top
# of the notebook ; unless it is imported elsewhere, this cell fails with a NameError on a fresh kernel.
plyx.scatter(dfAnRes[['Espèce', 'Chi2 P', 'NObs', 'Outliers', 'NFunEvals']].dropna(subset=['NObs']),
            x='NObs', y='Chi2 P', facet_col='Outliers', facet_row='Espèce', color='NFunEvals', height=700)

In [None]:
# 'KS P' against 'NObs' (rows without NObs dropped), 1 facet per (species row, 'Outliers' column),
# coloured by 'NFunEvals'.
# NOTE(review): `plyx` is plotly.express, whose import is commented out in the imports cell at the top
# of the notebook ; unless it is imported elsewhere, this cell fails with a NameError on a fresh kernel.
plyx.scatter(dfAnRes[['Espèce', 'KS P', 'NObs', 'Outliers', 'NFunEvals']].dropna(subset=['NObs']),
             x='NObs', y='KS P', facet_col='Outliers', facet_row='Espèce', color='NFunEvals', height=700)

# Sandbox

In [None]:
# NOTE(review): leftover sandbox / debugging cell ; `sh` is not defined in the visible part of the notebook,
# so this likely fails on a fresh kernel - consider removing it.
type(sh.a)