<!-- Auto table of contents -->
<h1 class='tocIgnore'>Development archives: auto-filtered and sorted results / reports</h1>

**pyaudisam**: Automation of Distance Sampling analyses with [Distance software](http://distancesampling.org/)

Copyright (C) 2021 Jean-Philippe Meuret

This program is free software: you can redistribute it and/or modify it under the terms
of the GNU General Public License as published by the Free Software Foundation,
either version 3 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program.
If not, see https://www.gnu.org/licenses/.

<div style="overflow-y: auto">
  <h2 class='tocIgnore'>Table of contents</h2>
  <div id="toc"></div>
</div>

In [None]:
%%javascript
$.getScript('ipython_notebook_toc.js')

In [None]:
%matplotlib inline

In [None]:
import sys
import os
import pathlib as pl

import re

import concurrent.futures as cofu

import math
import numpy as np
import pandas as pd

from tqdm import tqdm

from IPython.display import HTML

In [None]:
# Activate Warnings as Exception
#import warnings
#warnings.filterwarnings('error')

In [None]:
sys.path.insert(0, '..')

In [None]:
import pyaudisam as ads

ads.runtime

In [None]:
# Create temporary directory if not yet done.
tmpDir = pl.Path('tmp')
tmpDir.mkdir(exist_ok=True)

tmpDir.absolute().as_posix()

In [None]:
# Logging configuration.
ads.log.configure(handlers=[sys.stdout, tmpDir / 'devarc2.log'], reset=True,
                  loggers=[dict(name='matplotlib', level=ads.WARNING),
                           dict(name='ads', level=ads.INFO),
                           dict(name='ads.dat', level=ads.INFO2),
                           dict(name='ads.eng', level=ads.INFO2),
                           dict(name='ads.onr', level=ads.DEBUG),
                           dict(name='ads.anr', level=ads.DEBUG1)])

logger = ads.logger('devarc2', level=ads.DEBUG)

# Commons

In [None]:
# v. Sample and analysis identification.
def sampleAbbrev(sSamp):
    """Build a short, human-readable abbreviation for a sample.

    The abbreviation is the '-' separated concatenation of:
    * the species abbreviation: first 4 letters (title-cased) of each of the 2 first words
      of the species name (column named after the notebook-level `colEspece`),
    * the pass (column named after the notebook-level `colPassage`), without any '+',
      if present, not null and not empty,
    * the 'Durée' then 'Adulte' values, without any '+', if present and not null.

    Parameters:
    :param sSamp: pd.Series describing the sample (index = column names)

    :return: the abbreviation string
    """
    abbrvs = [''.join(word[:4].title() for word in sSamp[colEspece].split(' ')[:2])]

    # Use colPassage for reading the value too (not a hard-coded 'Passage' attribute),
    # in order to stay consistent with the presence check.
    if colPassage in sSamp.index and not pd.isnull(sSamp[colPassage]) and sSamp[colPassage]:
        abbrvs.append(sSamp[colPassage].replace('+', ''))

    # Null values are skipped, as for the pass (calling .replace on a NaN would raise).
    for col in ('Durée', 'Adulte'):
        if col in sSamp.index and not pd.isnull(sSamp[col]):
            abbrvs.append(sSamp[col].replace('+', ''))

    return '-'.join(abbrvs)

def analysisAbbrev(sAnlys):
    """Build a short, human-readable abbreviation for an analysis.

    The abbreviation is the '-' separated concatenation of:
    * the sample abbreviation, as returned by sampleAbbrev(sAnlys),
    * the 3 first letters (lower-cased) of the 'FonctionClé' and 'SérieAjust' values,
    * for each truncation parameter column that is present and not null, a one-letter prefix
      ('l', 'r', 'm', 'd') followed by the lower-cased first character of the value
      if it is a string, or by its integer conversion otherwise.

    Parameters:
    :param sAnlys: pd.Series describing the analysis (index = column names)

    :return: the abbreviation string
    """
    # (prefix, preferred column name, fall-back column name when the preferred one is absent)
    truncCols = (('l', 'TrGche', 'TroncGche'),
                 ('r', 'TrDrte', 'TroncDrte'),
                 ('m', 'NbTrModel', 'NbTrchMod'),
                 ('d', 'NbTrDiscr', 'NbTrDiscr'))

    # Sample, then model abbreviations
    parts = [sampleAbbrev(sAnlys)]
    parts.append(sAnlys['FonctionClé'][:3].lower())
    parts.append(sAnlys['SérieAjust'][:3].lower())

    # Truncation parameters abbreviations
    for prefix, prefName, fallbackName in truncCols:
        colName = prefName if prefName in sAnlys.index else fallbackName
        if colName not in sAnlys.index or pd.isnull(sAnlys[colName]):
            continue
        value = sAnlys[colName]
        token = value[0].lower() if isinstance(value, str) else int(value)
        parts.append(f'{prefix}{token}')

    return '-'.join(parts)

# Load optanalysis results

Tooling for subsequent tests, comparison, etc ... (see below).

In [None]:
# Select target study
dossier = pl.Path('../../perso/donnees/acdc')

nomEtude = 'ACDC2019'
sousEtude = '-Nat'
#sousEtude = '-Pap'

varEtude = ''

## 1. Select results file to load

In [None]:
# List eligible folders for the chosen study
resFileName = f'{nomEtude}{sousEtude}-OptAnalyses{varEtude}-resultats.xlsx'

resFolders = [fn.name for fn in dossier.glob('[0-9]'*6+'-'+'[0-9]'*4) if (fn / resFileName).is_file()]

logger.info('Rapports historiques disponibles: {}'.format(', '.join(resFolders)))

In [None]:
# Select the results folder to process (fail with an explicit message if none was found).
if not resFolders:
    raise FileNotFoundError(f'No dated results folder holding {pl.Path(resFileName).name}'
                            f' found in {dossier.as_posix()}')
workDir = dossier / resFolders[0]  # <=== Choose the results folder here.

# Always work from the bare file name: resFileName is re-bound below to a full path,
# and re-joining this full path would build a wrong one if this cell were simply re-run.
resBaseFileName = pl.Path(resFileName).name

updatedResFileNameExists = (tmpDir / resBaseFileName).is_file()
if not updatedResFileNameExists:

    # No updated results file in the temporary folder: select the historical one.
    resFileName = workDir / resBaseFileName

    logger.info(f'Fichier choisi : {resFileName.as_posix()}')

else:

    # An updated results file (with sample stats : see below) exists: select this one.
    resFileName = tmpDir / resBaseFileName

    logger.info(f'... mais résultats à jour aussi: {resFileName.as_posix()}')

## 2. Build an MCDSTruncOptanalysisResultsSet object to load file

In [None]:
# Build an empty MCDSTruncOptanalysisResultsSet from an MCDSTruncationOptanalyser object

# i. Load individualised data with computed distances + transects info.
fpn = dossier / f'{nomEtude}{sousEtude}-ObsIndivDist.xlsx'
with pd.ExcelFile(fpn) as xlsFile:
    dfObsCatIndiv = pd.read_excel(xlsFile, sheet_name='Donnees')
    dfTransects = pd.read_excel(xlsFile, sheet_name='Inventaires')

print(dict(etude=nomEtude+sousEtude, donnees=len(dfObsCatIndiv), inventaires=len(dfTransects)))

In [None]:
# ii. Data description: names of the columns used in the following cells.
colEspece = 'Espèce'
colPassage = 'Passage'
colDistance = 'Distance'
transectPlaceCols = ['Point']
passIdCol = colPassage

# Guard against an effortCol previously defined with another value in the kernel namespace.
assert 'effortCol' not in dir() or effortCol == 'Effort'  # In rare cases, needs to be defined before here, but the same way !
effortCol = 'Effort'

colsSpeSelEchant = ['Adulte', 'Durée']  # Sample selection columns: in addition to species and pass ones.
sampleDistCol = colDistance
sampleDecCols = [effortCol, sampleDistCol]

sampleNumCol = 'Echant'
# Full list of sample selection columns: species, pass, then the additional ones.
sampleSelCols = [colEspece, passIdCol] + colsSpeSelEchant

In [None]:
# iii. Analysis params.
distanceUnit = 'Meter'
areaUnit = 'Sq. Kilometer'
surveyType = 'Point'
distanceType = 'Radial'

groupage = False
effortConst = 1 # Const effort value = 1 per pass on each points

dZoneEtude = dict(Zone='ACDC', Surface=24) # Area unit = Sq. Kilometer (see areaUnit above)

In [None]:
# iv. Opt-analyses parameters.
anlysIndCol = 'Analyse'
anlysAbbrevCol = 'Abrev. Analyse'
anlysParamCols = ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod']

In [None]:
# vi. Distance truncation grouping parameters (for best analysis selection after some simplification)
ldTruncIntrvSpecs = [dict(col='left', minDist=5.0, maxLen=5.0),  dict(col='right', minDist=25.0, maxLen=25.0)]
truncIntrvEpsilon = 1e-6

In [None]:
# vii. At last, the empty results set object !
optanlr = \
    ads.MCDSTruncationOptanalyser(dfObsCatIndiv, dfTransects=dfTransects,
                                  effortConstVal=effortConst, dSurveyArea=dZoneEtude, 
                                  transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                                  sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols, sampleDistCol=sampleDistCol,
                                  abbrevCol=anlysAbbrevCol, abbrevBuilder=analysisAbbrev,
                                  anlysIndCol=anlysIndCol, sampleIndCol=sampleNumCol,
                                  distanceUnit=distanceUnit, areaUnit=areaUnit,
                                  surveyType=surveyType, distanceType=distanceType, clustering=groupage,
                                  ldTruncIntrvSpecs=ldTruncIntrvSpecs, truncIntrvEpsilon=truncIntrvEpsilon,
                                  resultsHeadCols=dict(before=[anlysIndCol, sampleNumCol], sample=sampleSelCols,
                                                       after=anlysParamCols + [anlysAbbrevCol]))

results = optanlr.setupResults()

## 3. Load file

In [None]:
results.fromFile(resFileName, postComputed=False)  # Use postComputed=True to avoid updating post-computed columns on load.

## 4. Backward compatibility: complete columns set

In [None]:
# Only when no updated results file was found in the temporary folder,
# and the loaded results hold no 'sample stats' top-level column group.
if not updatedResFileNameExists and 'sample stats' not in results._dfData.columns.unique(level=0):
    
    # If needed, add sample stats a posteriori
    # (these stats had not been implemented when the historical results were saved to disk)
    dfSampleStats = pd.read_excel(dossier / f'{nomEtude}{sousEtude}-StatsEchantillons.xlsx')

    # Move the 'NTot Obs' column right before the 'Distance Min' one:
    # rename it to a temporary name, re-insert a copy at the target position, then drop the temporary column.
    dfSampleStats.rename(columns={'NTot Obs': 'NTot Obs0'}, inplace=True)
    dfSampleStats.insert(dfSampleStats.columns.to_list().index('Distance Min'), 'NTot Obs', dfSampleStats['NTot Obs0'])
    dfSampleStats.drop(columns=['NTot Obs0'], inplace=True)

    # Relabel all columns with 3-level labels: the 4 sample identification columns first,
    # then ads.MCDSEngine.MIStatSampCols.
    # NOTE(review): assumes the stats file columns are exactly in this order and number - TODO confirm.
    miSampleCols = pd.MultiIndex.from_tuples([('header (sample)', colEspece, 'Value'),
                                              ('header (sample)', colPassage, 'Value'),
                                              ('header (sample)', colsSpeSelEchant[0], 'Value'),
                                              ('header (sample)', colsSpeSelEchant[1], 'Value')])
    dfSampleStats.columns = miSampleCols.append(ads.MCDSEngine.MIStatSampCols)

    # Append the stats columns to the results, matching rows on the sample identification columns.
    # Note: reads the private _dfData attribute, but assigns through the dfData one.
    results.dfData = results._dfData.join(dfSampleStats.set_index(miSampleCols.to_list()), on=miSampleCols.to_list())
        
    # Save updated results for later use.
    logger.info('Ecriture des resultats mis à jour: {} ...'.format((tmpDir / resFileName.name).as_posix()))
    results.toExcel(tmpDir / resFileName.name)  # Note: This actually triggers post-computations.

In [None]:
# In case not yet done, trigger post-compute columns (re)-computation (create/update quality indicators and sort orders).
dfActRes = results.dfTransData('fr')
dfActRes

## 5. Figures and checks

In [None]:
# Some facts and figures
dfActRes['NbTot Pars'].value_counts(), dfActRes['NbPars FnClé'].value_counts(), dfActRes['NbPars SérAjust'].value_counts(), 

In [None]:
# Some checks before going on.
assert dfActRes.Analyse.nunique() == len(dfActRes)

# Load / generate filter & sort reports

Tooling for subsequent tests, comparison, etc ... (see below).

Early 2021 prototype version, or industrialised version, ... for comparison, non-regression tests, quality tests ...

## 1. Or: Load early 2021 prototype report

In [None]:
# Select target study
dossier = pl.Path('../../perso/donnees/acdc')

nomEtude = 'ACDC2019'
sousEtude = '-Nat'
#sousEtude = '-Pap'

varEtude = ''

In [None]:
# Select target report
refRepFileName = f'{nomEtude}{sousEtude}-OptAnalyses{varEtude}-raptousech.ods'

resFolders = [fn.name for fn in dossier.glob('[0-9]'*6+'-'+'[0-9]'*4) if (fn / refRepFileName).is_file()]

logger.info('Rapports prototypes disponibles : ' + ', '.join(resFolders))

In [None]:
workDir = dossier / resFolders[0]  # <=== Choisir le dossier de résultats ici.

refRepFileName = workDir / refRepFileName

logger.info(f'Choix {refRepFileName.as_posix()}')

In [None]:
%%time

# Load report (all sheets)
logger.info(f'Lecture du rapport prototype choisi ...')

ddfProtoRep = pd.read_excel(refRepFileName, sheet_name=None)

protoSheetPrefix = ''

print('=> feuilles : ' + ', '.join(ddfProtoRep.keys()))

In [None]:
# Rename columns from prototype scheme to the industrial one to compare with later if needed.
# Note: This cell can be safely re-run (already renamed columns are accepted and left as is).
# TODO: Add EN version
DRefRep2IndusResCols = {'Fn Clé Mod': 'FonctionClé', 'Sér Ajust Mod': 'SérieAjust',
                        'Dist Tronc Gche': 'TrGche', 'Dist Tronc Drte': 'TrDrte',
                        'Tranch Dist Mod': 'NbTrchMod',
                        
                        'Sélection Qual Equi': 'Pré-sélection Qual Equi 1',
                        
                        'Qual Equi': 'Qual Equi 1',
                        'Qual Chi2': 'Qual Chi2+',
                        'Qual DCV': 'Qual DCv+',
                        'Qual KS': 'Qual KS+',
                        
                        'Grp Dist Tronc Gche': 'Groupe Tronc Gche',
                        'Grp Dist Tronc Drte': 'Groupe Tronc Drte',
                        
                        'Meil AIC Tronc Id': 'Ordre Tronc Ident AIC',
                        
                        'Meil CKCv Tronc Proch'     : 'Ordre Tronc Proch Chi2 KS DCv',
                        'Meil CVDens Tronc Proch'   : 'Ordre Tronc Proch DCv',
                        'Meil Qual Equi Tronc Proch': 'Ordre Tronc Proch Qual Equi 1',
                        'Meil Qual Chi2 Tronc Proch': 'Ordre Tronc Proch Qual Equi Chi2+',
                        'Meil Qual KS Tronc Proch'  : 'Ordre Tronc Proch Qual Equi KS+',
                        'Meil Qual DCV Tronc Proch' : 'Ordre Tronc Proch Qual Equi DCv+',
                        
                        'Ord CKCv'     : 'Ordre Global Chi2 KS DCv',
                        'Ord Qual Equi': 'Ordre Global Qual Equi 1',
                        'Ord Qual Chi2': 'Ordre Global Qual Equi Chi2+',
                        'Ord Qual KS'  : 'Ordre Global Qual Equi KS+',
                        'Ord Qual DCV' : 'Ordre Global Qual Equi DCv+',
                        'Ord Simpl Tronc': 'Ordre Global DeltaAIC Chi2 KS DCv'}

for sn in ddfProtoRep.keys():
    
    # These sheets are not renamed (no check either).
    if sn in ['paramètres', 'échantillons']:
        continue
    
    dfFSSheet = ddfProtoRep[sn]

    # Each column to rename must be there, either under its prototype name (first run of this cell),
    # or under its industrial name (re-run after a first renaming, done in-place below):
    # the assertion message lists the columns that are missing under both names.
    missingCols = [col for col, indusCol in DRefRep2IndusResCols.items()
                   if col not in dfFSSheet.columns and indusCol not in dfFSSheet.columns]
    assert not missingCols, ', '.join(missingCols)

    # In-place, in order to update the sheet stored in ddfProtoRep.
    dfFSSheet.rename(columns=DRefRep2IndusResCols, inplace=True)

## 2. Or: Load existing industrialised report

Warning: May not be up-to-date with analysis results post-computation and filter-sorting evolutions

In [None]:
# Select target study
dossier = pl.Path('../../perso/donnees/acdc')

nomEtude = 'ACDC2019'
sousEtude = '-Nat'
#sousEtude = '-Pap'

varEtude = ''

In [None]:
# File name patterns of the industrialised reports (Excel, then OpenDocument formats).
repFileNamePrefix = f'{nomEtude}{sousEtude}-OptAnalyses{varEtude}-rapport'
repFileNameGlobs = [repFileNamePrefix + '.xlsx', repFileNamePrefix + '*.ods']

# Look for them in the sub-folders named after the <6 digits>-<4 digits> pattern.
datedFolderGlob = '[0-9]' * 6 + '-' + '[0-9]' * 4

possRepFilePaths = []
for fnGlob in repFileNameGlobs:
    possRepFilePaths.extend(dossier.glob(f'{datedFolderGlob}/{fnGlob}'))

# Log the numbered list of found reports (number = index to use for selecting one).
logger.info('Rapports industrialisés disponibles :')
for ind, fpn in enumerate(possRepFilePaths):
    logger.info('#{} {}'.format(ind, fpn.as_posix()))

In [None]:
repFileName = possRepFilePaths[0]  # <=== Choisir le fichier de rapport ici.

workDir = repFileName.parent

logger.info(f'Choix {repFileName.as_posix()}')

In [None]:
%%time

# Load report (all sheets)
logger.info(f'Lecture du rapport industrialisé choisi ...')

ddfIndusRep = pd.read_excel(repFileName, sheet_name=None)

indusSheetPrefix = 'MFTA-'

logger.info('=> feuilles : ' + ', '.join(ddfIndusRep.keys()))

Jump to [MCDSTruncOptAnalysisResultsSet non-regressions tests and checks](#MCDSTruncOptAnalysisResultsSet-non-regressions-tests-and-checks)

## 3. Or: Generate industrialised filter & sort report from existing results

### a. Select and load source results

To do so, run [Load optanalysis results](#Load-optanalysis-results)

### b. Apply filter and sort schemes for the report

In [None]:
# Define applicable filter and sort schemes
# Warning: Carefully check all parameters in case you are targeting some kind of comparison ...
R = ads.MCDSTruncOptanalysisResultsSet

whichBestQua = [R.CLGrpOrdClTrChi2KSDCv, R.CLGrpOrdClTrDCv, R.CLGrpOrdClTrQuaBal3,  # <= WARNING: Which QuaBal<n> ?
                R.CLGrpOrdClTrQuaChi2, R.CLGrpOrdClTrQuaKS, R.CLGrpOrdClTrQuaDCv]

whichFinalQua = R.CLCmbQuaBal3  # <= WARNING: Which QuaBal<n> ?
ascFinalQua = False

# Columns used for spotting duplicates, and number of decimals some of them are given for that.
dupSubset = [R.CLNObs, R.CLEffort, R.CLDeltaAic, R.CLChi2, R.CLKS, R.CLCvMUw, R.CLCvMCw, R.CLDCv, 
             R.CLPDetec, R.CLPDetecMin, R.CLPDetecMax, R.CLDensity, R.CLDensityMin, R.CLDensityMax]
dDupRounds = {R.CLDeltaAic: 1, R.CLChi2: 2, R.CLKS: 2, R.CLCvMUw: 2, R.CLCvMCw: 2, R.CLDCv: 2, 
              R.CLPDetec: 3, R.CLPDetecMin: 3, R.CLPDetecMax: 3, R.CLDensity: 2, R.CLDensityMin: 2, R.CLDensityMax: 2}

# 1 scheme for the "execution code" method, followed by 3 schemes for the "ExCAicMulQua" method:
# these last 3 only differ by their (sightRate, nBestAIC, nFinalRes) parameters
# (their preselNum being equal to their nFinalRes).
DFilSorRepSchemes = \
    [dict(method=R.filterSortOnExecCode,
          deduplicate=dict(dupSubset=dupSubset, dDupRounds=dDupRounds),
          filterSort=dict(whichFinalQua=whichFinalQua, ascFinalQua=ascFinalQua),
          preselCols=[R.CLCmbQuaBal1, R.CLCmbQuaBal2, R.CLCmbQuaBal3],
          preselThrhs=0.2, preselAscs=False, preselNum=15)] \
    + [dict(method=R.filterSortOnExCAicMulQua,
            deduplicate=dict(dupSubset=dupSubset, dDupRounds=dDupRounds),
            filterSort=dict(sightRate=sightRate, nBestAIC=nBestAIC, nBestQua=1, whichBestQua=whichBestQua,
                            nFinalRes=nFinalRes, whichFinalQua=whichFinalQua, ascFinalQua=ascFinalQua),
            preselCols=[R.CLCmbQuaBal1, R.CLCmbQuaBal2, R.CLCmbQuaBal3],
            preselThrhs=0.2, preselAscs=False, preselNum=nFinalRes)
       for sightRate, nBestAIC, nFinalRes in [(92.5, 3, 12), (95, 2, 10), (97.5, 2, 8)]]

# Start from a clean state before registering the schemes.
results.filSorCache.clear()
results.dFilSorSchemes.clear()

# Display the Ids of the schemes.
[results.filSorSchemeId(scheme) for scheme in DFilSorRepSchemes]

In [None]:
# Apply target filter and sort schemes
ddfIndusRep = dict()
dIndusFSSteps = dict()
for scheme in DFilSorRepSchemes:
    filSorSchId, dfFilSorRes, filSorSteps = \
        results.dfFilSorData(scheme=scheme, columns=None, lang='fr', rebuild=False)
    ddfIndusRep[filSorSchId] = dfFilSorRes
    dIndusFSSteps[filSorSchId] = filSorSteps
    
indusSheetPrefix = ''

In [None]:
# Add full results sheet (just as in real filter & sort reports)
ddfIndusRep['Détails'] = results.dfTransData('fr')

In [None]:
print(', '.join(ddfIndusRep.keys()))

# MCDSTruncOptAnalysisResultsSet non regressions tests and checks

To do so, run [Load optanalysis results](#Load-optanalysis-results) or [Load / generate filter & sort reports](#Load-%2F-generate-filter-%26-sort-reports) for each results set to load

## 1. Load results to compare / check

### a. Reference results

#### Or: Load results from _RESULTS_ file

through [Load optanalysis results](#Load-optanalysis-results) and then ...

In [None]:
dfRefRes = results.dfTransData('fr')

#### Or: Load results from _REPORT_ file

through [2. Or: Load existing industrialised report](#2.-Or%3A-Load-existing-industrialised-report) or
[3. Or: Generate industrialised filter & sort report from existing results](#3.-Or%3A-Generate-industrialised-filter-%26-sort-report-from-existing-results) and then ...

In [None]:
print(', '.join(ddfIndusRep.keys()))

In [None]:
dfRefRes = ddfIndusRep['Détails']

In [None]:
dfRefRes.head()

### b. Target results

#### Or: Load results from _RESULTS_ file

through [Load optanalysis results](#Load-optanalysis-results) and then ...

In [None]:
dfActRes = results.dfTransData('fr')

#### Or: Load results from _REPORT_ file

through [2. Or: Load existing industrialised report](#2.-Or%3A-Load-existing-industrialised-report) or
[3. Or: Generate industrialised filter & sort report from existing results](#3.-Or%3A-Generate-industrialised-filter-%26-sort-report-from-existing-results) and then ...

In [None]:
print(', '.join(ddfIndusRep.keys()))

In [None]:
dfActRes = ddfIndusRep['Détails']

In [None]:
dfActRes.head()

## 2. Truncation groups

In [None]:
# Check strict equality
indexCols = ['Analyse']
compCols = ['Echant', 'Groupe Tronc Gche', 'Groupe Tronc Drte']

In [None]:
assert dfRefRes.set_index(indexCols).sort_index()[compCols].compare(dfActRes.set_index(indexCols).sort_index()[compCols]).empty

## 3. Filter and sort keys

In [None]:
dfActRes.columns.to_list()

In [None]:
# Check strict equality
indexCols = ['Analyse']
compCols = [col for col in dfActRes.columns if col.startswith('Ordre ')]

', '.join(compCols)

In [None]:
assert dfRefRes.set_index(indexCols).sort_index()[compCols].compare(dfActRes.set_index(indexCols).sort_index()[compCols]).empty

# MCDSTruncOptAnalysisResultsSet quality indicators progress

General target: Have the industrialised automated filter and sort report produce the same "winner result per sample", or a better one, when compared to the "final automated [prototype below](#D%C3%A9veloppement-%3A-Filtrage-et-tri-automatis%C3%A9-des-r%C3%A9sultats-d'optanalyses) + manual selection" reports donnees/*/ACDC2019-[Nat|Pap|NatPap]-OptAnalyses-raptousech.ods

Here, we try and compare filter and sort results from :
* reference = historical "[prototype below](#D%C3%A9veloppement-%3A-Filtrage-et-tri-automatis%C3%A9-des-r%C3%A9sultats-d'optanalyses) + manual selection" reports donnees/*/ACDC2019-[Nat|Pap|NatPap]-OptAnalyses-raptousech.ods
* actual = a newer auto-generated report through the industrialised filter and sort report system.

WARNING: No longer really useful, as the filtered and sorted results now mostly select different analyses than in the historical case.

## 1. Select target filter and sort method for prototype report

WARNING: First run [1. Or: Load early 2021 prototype report](#1.-Or%3A-Load-early-2021-prototype-report) to load the full report.

In [None]:
# Select target filter & sort method sub-report
#refExCMeth = 'codexec'
refTgtMeth = 'ckcvqual925d12'

In [None]:
dfRefRep = ddfProtoRep[protoSheetPrefix + refTgtMeth]

logger.info('Référence (proto) : {}x{} lignes x colonnes.'.format(len(dfRefRep), len(dfRefRep.columns)))

In [None]:
# Extract useful columns in the right order
dfRefRep = dfRefRep[['Echant', 'Espèce', 'Passage', 'Adulte', 'Durée', 'Analyse',
                     'FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod',
                     'NbTot Pars', 'NObs', 'NTot Obs', 'Taux Obs', 'CodEx', 
                     'Delta AIC', 'Chi2 P', 'KS P', 'CvM Uw P', 'CvM Cw P',
                     'CoefVar Densité', 'Sélection finale',
                     'Pré-sélection Qual Equi 1', 'Qual Equi 1',
                     'Qual Chi2+', 'Qual KS+', 'Qual DCv+',
                     'Densité', 'Min Densité', 'Max Densité']].copy()

# Add source column
dfRefRep['Ref'] = True

# Prepare comparison of quality indicators specific columns
dfRefRep.rename(columns={col: col + ' REF' for col in ['Pré-sélection Qual Equi 1', 'Qual Equi 1',
                                                       'Qual Chi2+', 'Qual DCv+', 'Qual KS+']},
                inplace=True)

In [None]:
# Some checks before going on.
assert dfRefRep.Analyse.nunique() == len(dfRefRep)

In [None]:
dfRefRep

## 2. Select target filter and sort method for industrialised report

WARNING: First run [2. Or: Load existing industrialised report](#2.-Or%3A-Load-existing-industrialised-report) or
[3. Or: Generate industrialised filter & sort report from existing results](#3.-Or%3A-Generate-industrialised-filter-%26-sort-report-from-existing-results) to load the full report.

In [None]:
', '.join(ddfIndusRep.keys())

In [None]:
# Select target filter & sort method sub-report
#actTgtMeth = 'ExCode'

#actTgtMeth = 'AicCKCvQua-r925d12' # Old industrialised naming before 2021-10-15

#actTgtMeth = 'ExAicMQua-r925d12'  # Intermediate industrialised naming between 2021-10-15 and 2021-11-05

actTgtMeth = 'ExAicMQua-r925m6q3d12' # New industrialised naming for same method/scheme after 2021-11-05
#actTgtMeth = 'ExAicMQua-r925m6q2d12'

In [None]:
dfActRep = ddfIndusRep[indusSheetPrefix + actTgtMeth]

logger.info('Cible (indus) : {}x{} lignes x colonnes.'.format(len(dfActRep), len(dfActRep.columns)))

In [None]:
dfActRep.columns

In [None]:
# Extract useful columns in the right order (some of them depend on the report version).
nParsCol = 'NbTot Pars' if 'NbTot Pars' in dfActRep.columns else 'NbPars SérAjust'
codExCol = ['CodEx'] if 'CodEx' in dfActRep.columns else []

# a. Sample and analysis identification, then observation numbers
actHeadCols = ['Echant', 'Espèce', 'Passage', 'Adulte', 'Durée', 'Analyse',
               'FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod',
               nParsCol, 'NObs', 'NTot Obs', 'Taux Obs']

# b. Fit statistics
actFitCols = ['Delta AIC', 'Chi2 P', 'KS P', 'CvM Uw P', 'CvM Cw P', 'CoefVar Densité']

# c. Quality indicators specific columns (the ones to compare to the reference ones)
actQualCols = ['Pré-sélection Qual Equi 1', 'Qual Equi 1',
               'Pré-sélection Qual Equi 2', 'Qual Equi 2',
               'Pré-sélection Qual Equi 3', 'Qual Equi 3',
               'Qual Chi2+', 'Qual KS+', 'Qual DCv+']

# d. Density estimation
actDensCols = ['Densité', 'Min Densité', 'Max Densité']

dfActRep = dfActRep[actHeadCols + codExCol + actFitCols + actQualCols + actDensCols].copy()

# Add source column
dfActRep['Act'] = True

# Prepare comparison of quality indicators specific columns
dfActRep = dfActRep.rename(columns={col: f'{col} ACT' for col in actQualCols})

In [None]:
# Some checks before going on.
assert dfActRep.Analyse.nunique() == len(dfActRep)

In [None]:
dfActRep

## 3. Compare

### a. Visual comparison

In [None]:
len(dfRefRep), len(dfActRep)

In [None]:
# Merge reference and actual reports, through the analysis Id.
actJoinCols = [f'{col} ACT' for col in ['Pré-sélection Qual Equi 1', 'Qual Equi 1',
                                        'Pré-sélection Qual Equi 2', 'Qual Equi 2',
                                        'Pré-sélection Qual Equi 3', 'Qual Equi 3',
                                        'Qual Chi2+', 'Qual DCv+', 'Qual KS+']] + ['Act']

dfRefByAnlys = dfRefRep.set_index('Analyse')
dfActByAnlys = dfActRep.set_index('Analyse')

# a. Reference analyses, completed with the actual quality indicators when also in the actual report
dfComp = dfRefByAnlys.join(dfActByAnlys[actJoinCols])

# b. Append the analyses found only in the actual report (with all their columns)
dfActOnly = dfActByAnlys[~dfActByAnlys.index.isin(dfComp.index)]

# c. Back to a column for the analysis Id, and sort
dfComp = pd.concat([dfComp, dfActOnly]) \
           .reset_index() \
           .rename(columns=dict(index='Analyse')) \
           .sort_values(by=['Echant', 'TrGche', 'TrDrte', 'Qual Equi 1 REF'],
                        ascending=True, na_position='first')

# Save to the temporary folder for visual checks (1 sheet, named after the target method).
compFileName = f'{nomEtude}{sousEtude}-OptAnalyses{varEtude}-quality-progress.xlsx'
dfComp.to_excel(tmpDir / compFileName, sheet_name=actTgtMeth, index=False)

dfComp

In [None]:
# Some checks before going on.
assert dfComp.Analyse.nunique() == len(dfComp)

### b. Simple results quality progress measurement

(as development goes on)

Warning: Only useful when most filtered and sorted results come from the same analyses ... BUT, this is less the case after 2021-10-10

In [None]:
# Compare reference final = manual selection, reference auto pre-selection, and actual auto pre-selection

# Result returned for the groups where one of the 3 selections to compare is missing.
nulComp = pd.Series({'goodRefSel1': np.nan, 'goodActSel': np.nan,
                     'betterActSel': np.nan, 'sharedRefAct': np.nan})

def compare2Ref(dfGrp, column='Pré-sélection Qual Equi 3 ACT'):
    """Compare, for 1 group of analyses, the reference final selection to the reference
    and to the actual pre-selections.

    Only the first row is kept for each of these 3 selections:
    * reference final selection: rows with a not null 'Sélection finale',
    * reference pre-selection: rows where 'Pré-sélection Qual Equi 1 REF' is 1,
    * actual pre-selection: rows where `column` is 1.

    Parameters:
    :param dfGrp: the analyses of the group, with at least the columns 'Analyse', 'Sélection finale',
                  'Pré-sélection Qual Equi 1 REF', `column`, 'Ref' and 'Act'
    :param column: name of the actual pre-selection column to use

    :return: nulComp (all NaN) if any of the 3 selections is empty, otherwise a pd.Series with:
             * goodRefSel1: 1 if the reference pre-selection is the final selection, else 0,
             * goodActSel: 1 if the actual pre-selection is the final selection, else 0,
             * betterActSel: 1 if, for the final selection, the 'Pré-sélection Qual Equi 1 REF' value
               is >= the `column` value, else 0
               (presumably ranks, the lower the better - TODO confirm),
             * sharedRefAct: number of rows of the group with both 'Ref' and 'Act' set.
    """
    refPSelCol = 'Pré-sélection Qual Equi 1 REF'

    # Reference final = manual selection
    dfFinal = dfGrp[dfGrp['Sélection finale'].notnull()]
    if dfFinal.empty:
        return nulComp
    sFinal = dfFinal.iloc[0]

    # Reference auto pre-selection (on Qual Equi 1)
    dfRefPSel = dfGrp[dfGrp[refPSelCol] == 1]
    if dfRefPSel.empty:
        return nulComp
    sRefPSel = dfRefPSel.iloc[0]

    # Actual auto pre-selection (on the specified column)
    dfActPSel = dfGrp[dfGrp[column] == 1]
    if dfActPSel.empty:
        return nulComp
    sActPSel = dfActPSel.iloc[0]

    # Report
    isGoodRef = sFinal['Analyse'] == sRefPSel['Analyse']
    isGoodAct = sFinal['Analyse'] == sActPSel['Analyse']
    isBetterAct = sFinal[refPSelCol] >= sFinal[column]
    nShared = (dfGrp['Ref'] & dfGrp['Act']).sum()

    return pd.Series({'goodRefSel1': 1 if isGoodRef else 0,
                      'goodActSel': 1 if isGoodAct else 0,
                      'betterActSel': 1 if isBetterAct else 0,
                      'sharedRefAct': nShared})

In [None]:
dfDiag2 = dfComp.groupby('Echant').apply(compare2Ref, column='Pré-sélection Qual Equi 2 ACT')
dfDiag2

In [None]:
print(len(dfComp), 'analyses:', len(dfDiag2), 'samples:', dfDiag2.sum().to_dict())

#### 2021-10-10

* Actual Fil/Sor params:
  - whichFinalQua = R.CLGrpOrdClTrQuaBal1
  - whichBestQua = R.CLGrpOrdClTrChi2KSDCv, R.CLGrpOrdClTrDCv, R.CLGrpOrdClTrQuaChi2, R.CLGrpOrdClTrQuaKS, R.CLGrpOrdClTrQuaDCv, R.CLGrpOrdClTrQuaBal1

* Progress history of len(dfDiag2), dfDiag2.sum():
  * 12:00 => 60 samples: {goodRefSel1: 31, goodActSel: 34, betterActSel: 48}
  * 18:00 => 60 samples: {goodRefSel1: 31, goodActSel: 33, betterActSel: 47}
  * 19:20 => 60 samples: {goodRefSel1: 31, goodActSel: 32, betterActSel: 46}

* Visual checks OK on a 9 sample subset of ACDC 2019 Nat:
  4 samples with > 120/200 sightings, 4 samples with 35-80 sightings, and 1 sample with 25-30 sightings

#### 2021-11-01 CLGrpOrdClTrQuaBal1 after MCDSAnalyserResultsSet fixes of the day

* Actual Fil/Sor params:
  - whichFinalQua = R.CLGrpOrdClTrQuaBal1
  - whichBestQua = R.CLGrpOrdClTrChi2KSDCv, R.CLGrpOrdClTrDCv, R.CLGrpOrdClTrQuaChi2, R.CLGrpOrdClTrQuaKS, R.CLGrpOrdClTrQuaDCv, R.CLGrpOrdClTrQuaBal1

* len(dfComp), len(dfDiag2), dfDiag2.sum().to_dict():
  925 analyses: {sharedRefAct: 454}, 60 samples, {goodRefSel1: 31, goodActSel: 30, betterActSel: 39}
  
#### 2021-11-02 09:00 Mix of CLGrpOrdClTrQuaBal1, 2 & 3 ... not very interesting

After fixing buggy MCDSAnalyser.filterDichotScheme (now _indexOfWorstOneCriterion)

* Actual Fil/Sor params:
  - whichFinalQua = CLGrpOrdClTrQuaBal2, then CLGrpOrdClTrQuaBal3
  - whichBestQua = R.CLGrpOrdClTrChi2KSDCv, R.CLGrpOrdClTrDCv, R.CLGrpOrdClTrQuaChi2, R.CLGrpOrdClTrQuaKS, R.CLGrpOrdClTrQuaDCv, R.CLGrpOrdClTrQuaBal1 <= QuaBal1 and not 2/3 :-(

* len(dfComp), len(dfDiag2), dfDiag2.sum().to_dict():
  - CLGrpOrdClTrQuaBal3 => 969 analyses: {sharedRefAct: 410}, 60 samples: {goodRefSel1: 31, goodActSel: 28, betterActSel: 39}
  - CLGrpOrdClTrQuaBal2 => 965 analyses: {sharedRefAct: 414}, 60 samples: {goodRefSel1: 31, goodActSel: 28, betterActSel: 39}
 
#### 2021-11-02 10:20 CLGrpOrdClTrQuaBal2 only

* Actual Fil/Sor params:
  - whichFinalQua = R.CLGrpOrdClTrQuaBal2
  - whichBestQua = R.CLGrpOrdClTrChi2KSDCv, R.CLGrpOrdClTrDCv, R.CLGrpOrdClTrQuaChi2, R.CLGrpOrdClTrQuaKS, R.CLGrpOrdClTrQuaDCv, R.CLGrpOrdClTrQuaBal2 <= QuaBal2 :-)

* len(dfComp), len(dfDiag2), dfDiag2.sum().to_dict():
  966 analyses: {sharedRefAct: 409}, 60 samples: {goodRefSel1: 31, goodActSel: 27, betterActSel: 39}

In [None]:
dfDiag3 = dfComp.groupby('Echant').apply(compare2Ref, column='Pré-sélection Qual Equi 3 ACT')
dfDiag3

In [None]:
print(len(dfComp), 'analyses:', len(dfDiag3), 'samples:', dfDiag3.sum().to_dict())

#### 2021-10-10

* Actual Fil/Sor params:
  - whichFinalQua = R.CLGrpOrdClTrQuaBal1
  - whichBestQua = R.CLGrpOrdClTrChi2KSDCv, R.CLGrpOrdClTrDCv, R.CLGrpOrdClTrQuaChi2, R.CLGrpOrdClTrQuaKS, R.CLGrpOrdClTrQuaDCv, R.CLGrpOrdClTrQuaBal1

* Progress history of len(dfDiag3), dfDiag3.sum() :
  - 12:00 => (60, {rePSel: 31, sel: 33, better: 47})
  - 18:00 => (60, {rePSel: 31, sel: 30, better: 41})
  - 19:20 => (60, {rePSel: 31, sel: 30, better: 40})

* Visual checks OK on a 9 sample subset of ACDC 2019 Nat:
  4 samples with > 120/200 sightings, 4 samples with 35-80 sightings, and 1 sample with 25-30 sightings
 

#### 2021-11-01 CLGrpOrdClTrQuaBal1 after MCDSAnalyserResultsSet fixes of the day

* Actual Fil/Sor params:
  - whichFinalQua = R.CLGrpOrdClTrQuaBal1
  - whichBestQua = R.CLGrpOrdClTrChi2KSDCv, R.CLGrpOrdClTrDCv, R.CLGrpOrdClTrQuaChi2, R.CLGrpOrdClTrQuaKS, R.CLGrpOrdClTrQuaDCv, R.CLGrpOrdClTrQuaBal1

* len(dfComp), len(dfDiag3), dfDiag3.sum().to_dict():
  925 analyses: {sharedRefAct: 454}, 60 samples, {goodRefSel1: 31, goodActSel: 28, betterActSel: 33}

#### 2021-11-02 09:00 Mix of CLGrpOrdClTrQuaBal1, 2 & 3 ... not very interesting

After fixing buggy MCDSAnalyser.filterDichotScheme (now _indexOfWorstOneCriterion)

* Actual Fil/Sor params:
  - whichFinalQua = R.CLGrpOrdClTrQuaBal2, then R.CLGrpOrdClTrQuaBal3
  - whichBestQua = R.CLGrpOrdClTrChi2KSDCv, R.CLGrpOrdClTrDCv, R.CLGrpOrdClTrQuaChi2, R.CLGrpOrdClTrQuaKS, R.CLGrpOrdClTrQuaDCv, R.CLGrpOrdClTrQuaBal1 <= QuaBal1 and not 2/3 :-(

* len(dfComp), len(dfDiag3), dfDiag3.sum().to_dict():
  - CLGrpOrdClTrQuaBal3 => 969 analyses: {sharedRefAct: 410}, 60 samples, {goodRefSel1: 31, goodActSel: 26, betterActSel: 33}
  - CLGrpOrdClTrQuaBal2 => 965 analyses: {sharedRefAct: 414}, 60 samples: {goodRefSel1: 31, goodActSel: 26, betterActSel: 33}
  
#### 2021-11-02 10:20 CLGrpOrdClTrQuaBal3 only

* Actual Fil/Sor params:
  - whichFinalQua = R.CLGrpOrdClTrQuaBal3
  - whichBestQua = R.CLGrpOrdClTrChi2KSDCv, R.CLGrpOrdClTrDCv, R.CLGrpOrdClTrQuaChi2, R.CLGrpOrdClTrQuaKS, R.CLGrpOrdClTrQuaDCv, R.CLGrpOrdClTrQuaBal3 <= QuaBal3 :-)

* len(dfComp), len(dfDiag3), dfDiag3.sum().to_dict():
  970 analyses: {sharedRefAct: 405}, 60 samples: {goodRefSel1: 31, goodActSel: 25, betterActSel: 32}
  
#### 2021-11-03 19:00 CLGrpOrdClTrQuaBal3 only

After fixing final sort column (QuaOrd => Qua !) in MCDSAnalysisResultsSet.filterSortOnExCAicMulQua/filterSortOnExCode
=> better !

* Actual Fil/Sor params:
  - whichFinalQua = R.CLGrpOrdClTrQuaBal3
  - whichBestQua = R.CLGrpOrdClTrChi2KSDCv, R.CLGrpOrdClTrDCv, R.CLGrpOrdClTrQuaChi2, R.CLGrpOrdClTrQuaKS, R.CLGrpOrdClTrQuaDCv, R.CLGrpOrdClTrQuaBal3 <= QuaBal3 :-)

* len(dfComp), len(dfDiag3), dfDiag3.sum().to_dict():
  859 analyses: {sharedRefAct: 516}, 60 samples: {goodRefSel1: 31, goodActSel: 30, betterActSel: 40}

## N. Development of base functions for quality indicators computation

History (on ACDC 2019 data):
* March 2021
  * Balanced quality 1 : normNTotPars(a=0.2, b=0.6, c=2) & normCVDens(a=12) => ACDC2019-Resultats.2103.ods
* August 2021
  * Balanced quality 2 : normNTotPars(a=0.2, b=0.8, c=1) & normCVDens(a=16) => same final filtering on AicCKCvQua-r925d12
  * Balanced quality 3 : normNTotPars(a=0.3, b=0.7, c=1) & normCVDens(a=20) => ~same final filtering on AicCKCvQua-r925d12
* October 2021
  * Balanced quality 2&3 : same as August 2021 but normNTotPars => normNKeyPars + normKeyFn

In [None]:
#import plotly as ply
import plotly.graph_objs as plygo

In [None]:
def normNTotPars(value, a=0.2, b=0.6, c=2):
    """Score a total number of parameters as 1 / (a * max(c, value) + b).

    Any value below c is scored as if it were c (floor).

    Parameter sets tried so far: a=0.2, b=0.6, c=2 and a=0.2, b=0.8, c=1.
    Discarded variants:
    * 1 / (a * value + b), without floor: too penalising (a=0.2, b=1),
    * an extra exponent d on max(c, value): no difference when d=1.
    """
    flooredValue = max(c, value)
    denominator = a * flooredValue + b
    return 1 / denominator

In [None]:
# Plot normNTotPars for each historical parameter scheme, at integer abscissas 0..10.
ax = np.linspace(0, 10, num=11)

# Parameter schemes to compare (keyword arguments for normNTotPars).
ySchemes = dict(bq1mar21=dict(a=0.2, b=0.6, c=2), # Balanced quality 1 March 2021
                bq2aug21=dict(a=0.2, b=0.8, c=1), # Balanced quality 2 August 2021 : no change in final filsorting
                bq3aug21=dict(a=0.3, b=0.7, c=1)) # Balanced quality 3 August 2021 : ~same

# One curve per scheme; each legend label shows the scheme name followed by its 'key=value' parameters.
plygo.Figure(data=[plygo.Scatter(x=ax, name=name + ': ' + ', '.join('{}={}'.format(k, v) for k, v in sch.items()),
                                 y=[normNTotPars(x, **sch) for x in ax])
                   for name, sch in ySchemes.items()],
             layout=dict(title='normNTotPars', height=320, width=768,
                         margin=plygo.layout.Margin(l=40, r=40, b=40, t=40, pad=0)))

In [None]:
def normNAdjPars(value, a=0.1):
    """Score a number of adjustment parameters as exp(-a * value^2).

    Discarded variant: 1 / (a * value + b), which penalises too much at low values
    and not enough beyond.
    """
    exponent = -a * value ** 2
    return math.exp(exponent)

In [None]:
# Plot normNAdjPars for each candidate parameter scheme, at integer abscissas 0..7.
ax = np.linspace(0, 7, num=8)

# Parameter schemes to compare (keyword arguments for normNAdjPars); 't' prefix = test scheme.
ySchemes = dict(tbq2oct21=dict(a=0.10), # Test Balanced quality 2 October 2021
                bq2oct21=dict(a=0.15), # Balanced quality 2 October 2021
                bq3oct21=dict(a=0.20), # Balanced quality 3 October 2021
                tbq3oct21=dict(a=0.25)) # Test Balanced quality 3 October 2021

# One curve per scheme; each legend label shows the scheme name followed by its 'key=value' parameters.
plygo.Figure(data=[plygo.Scatter(x=ax, name=name + ': ' + ', '.join('{}={}'.format(k, v) for k, v in sch.items()),
                                 y=[normNAdjPars(x, **sch) for x in ax])
                   for name, sch in ySchemes.items()],
             layout=dict(title='normNAdjPars', height=320, width=768,
                         margin=plygo.layout.Margin(l=40, r=40, b=40, t=40, pad=0)))

In [None]:
def normCVDens(value, a=12, b=2):
    """Score a density coefficient of variation as exp(-a * value^b).

    With a=12 and b=2, the score is already down to ~0.33 for value=0.3 (30%).
    Discarded variant: max(0, 1 - a * value), not very penalising (a=1).
    """
    exponent = -a * value ** b
    return math.exp(exponent)

In [None]:
# Plot normCVDens for each candidate parameter scheme, at abscissas 0.0, 0.1, ..., 1.0.
ax = np.linspace(0.0, 1.0, num=11)
# Parameter schemes to compare (keyword arguments for normCVDens).
ySchemes = dict(bq1mar21=dict(a=12, b=2), # Balanced quality 1 March 2021
                tbq2oct21=dict(a=16, b=2), # Test Balanced quality 2 October 2021
                bq2oct21=dict(a=20, b=2), # Balanced quality 2 October 2021
                #tbq3oct21=dict(a=25, b=2), # Test Balanced quality 3 October 2021
                tbq3oct21b=dict(a=55, b=2.6), # Test Balanced quality 3 October 2021
                bq3oct21c=dict(a=63, b=2.8)) # Test Balanced quality 3 October 2021
# One curve per scheme; each legend label shows the scheme name followed by its 'key=value' parameters.
plygo.Figure(data=[plygo.Scatter(x=ax, name=name + ': ' + ', '.join('{}={}'.format(k, v) for k, v in sch.items()),
                                 y=[normCVDens(x, **sch) for x in ax])
                   for name, sch in ySchemes.items()],
             layout=dict(title='normCVDens', height=320, width=768,
                         margin=plygo.layout.Margin(l=40, r=40, b=40, t=40, pad=0)))

# Non-regression: MCDS(TruncOpt)AnalysisResultsSet

* Non-regression tests between [prototype below](#D%C3%A9veloppement-%3A-Filtrage-et-tri-automatis%C3%A9-des-r%C3%A9sultats-d'optanalyses) and "industrialised" version
  (Ehrrr ... well ... between the buggy notebook prototype and the industrialised-derived version with same bugs :-( )
* Quality tests for "industrialised" version (TODO)

In [None]:
#ads.logger('ads.dat', level=ads.INFO, reset=True)
_ = ads.logger('ads.anr', level=ads.DEBUG3, reset=True)
_ = ads.logger('ads.onr', level=ads.DEBUG3, reset=True)

In [None]:
# The industrialised-derived optanalyser results set for reproducing the buggy prototype version
class PrototypeConformResultsSet(ads.MCDSTruncOptanalysisResultsSet):

    """Optanalysis results set reproducing the quality indicators of the (buggy) notebook prototype.

    Compared to its base class, it redefines:
    * the Chi2+, KS+ and DCv+ combined quality indicators, computed the prototype way
      (prototype bug kept on purpose, see _combinedQualityMoreChi2),
    * the post-computation of all the quality indicator columns.

    It is meant for non-regression comparisons against the prototype reference reports only.
    """

    def __init__(self, miCustomCols=None, dfCustomColTrans=None, miSampleCols=None, sampleIndCol=None,
                       sortCols=[], sortAscend=[], distanceUnit='Meter', areaUnit='Hectare',
                       surveyType='Point', distanceType='Radial', clustering=False,
                       ldTruncIntrvSpecs=[dict(col='left', minDist=5.0, maxLen=5.0),
                                          dict(col='right', minDist=25.0, maxLen=25.0)],
                       truncIntrvEpsilon=1e-6):

        """Constructor: every parameter is forwarded, unchanged, to the base class constructor.

        Note: the list defaults are mutable objects shared between calls;
        this method only passes them on, it does not modify them.
        """
        super().__init__(miCustomCols=miCustomCols, dfCustomColTrans=dfCustomColTrans,
                         miSampleCols=miSampleCols, sampleIndCol=sampleIndCol,
                         sortCols=sortCols, sortAscend=sortAscend, distanceUnit=distanceUnit, areaUnit=areaUnit,
                         surveyType=surveyType, distanceType=distanceType, clustering=clustering,
                         ldTruncIntrvSpecs=ldTruncIntrvSpecs,
                         truncIntrvEpsilon=truncIntrvEpsilon)

    @classmethod
    def _combinedQualityMoreChi2(cls, sRes):
        """Combined quality indicator with a double weight on Chi2, the prototype way.

        Prototype bug, reproduced on purpose: as ** binds tighter than *, the 1/8 exponent
        only applies to the last factor, i.e. x * y * ... * z^(1/8) is computed,
        whereas a mean of the 8 factors would be (x * y * ... * z)^(1/8).

        :param sRes: one row of results (Series indexed by the results columns)
        """
        return sRes[[cls.CLChi2, cls.CLChi2, cls.CLKS, cls.CLCvMUw, cls.CLCvMCw]].prod() \
               * cls._normNObs(sRes) * cls._normNTotPars(sRes, a=0.2, b=0.6) \
               * cls._normCVDens(sRes, a=12) ** (1.0/8)

    @classmethod
    def _combinedQualityMoreKS(cls, sRes):
        """Combined quality indicator with a double weight on KS, the prototype way.

        Same prototype bug as in _combinedQualityMoreChi2, reproduced on purpose
        (1/8 exponent applied to the last factor only).

        :param sRes: one row of results (Series indexed by the results columns)
        """
        return sRes[[cls.CLChi2, cls.CLKS, cls.CLKS, cls.CLCvMUw, cls.CLCvMCw]].prod() \
               * cls._normNObs(sRes) * cls._normNTotPars(sRes, a=0.2, b=0.6) \
               * cls._normCVDens(sRes, a=12) ** (1.0/8)

    @classmethod
    def _combinedQualityMoreDCv(cls, sRes):
        """Combined quality indicator with a double weight on the normalised density CV, the prototype way.

        Same prototype bug as in _combinedQualityMoreChi2, reproduced on purpose
        (1/8 exponent applied to the last, squared, factor only).

        :param sRes: one row of results (Series indexed by the results columns)
        """
        return sRes[[cls.CLChi2, cls.CLKS, cls.CLCvMUw, cls.CLCvMCw]].prod() \
               * cls._normNObs(sRes) * cls._normNTotPars(sRes, a=0.2, b=0.6) \
               * (cls._normCVDens(sRes, a=12) ** 2) ** (1.0/8)

    def _postComputeQualityIndicators(self):

        """Compute the sighting rate and all the combined quality indicator columns of self._dfData.

        Presumably overrides the base class hook of the same name (TODO confirm).
        Relies on a `logger` object (with debug and debug1 methods) defined at notebook level.
        """

        logger.debug('Post-computing Quality Indicators')

        self._dfData[self.CLSightRate] = 100 * self._dfData.apply(self._normNObs, axis='columns') # [0,1] => %

        # Prepare data for computations
        # (fix: the column labels were read through `cls`, which is not defined in this instance method)
        miCompCols = [self.CLNObs, self.CLNTotObs, self.CLNTotPars,
                      self.CLChi2, self.CLKS, self.CLCvMUw, self.CLCvMCw, self.CLDCv]
        dfCompData = self._dfData[miCompCols].copy()

        logger.debug1('* Balanced quality 1')
        self._dfData[self.CLCmbQuaBal1] = dfCompData.apply(self._combinedQualityBalanced1, axis='columns')

        logger.debug1('* Balanced quality 2')
        self._dfData[self.CLCmbQuaBal2] = dfCompData.apply(self._combinedQualityBalanced2, axis='columns')

        logger.debug1('* Balanced quality 3')
        self._dfData[self.CLCmbQuaBal3] = dfCompData.apply(self._combinedQualityBalanced3, axis='columns')

        logger.debug1('* Balanced quality Chi2+')
        self._dfData[self.CLCmbQuaChi2] = dfCompData.apply(self._combinedQualityMoreChi2, axis='columns')

        logger.debug1('* Balanced quality KS+')
        self._dfData[self.CLCmbQuaKS]   = dfCompData.apply(self._combinedQualityMoreKS, axis='columns')

        logger.debug1('* Balanced quality DCv+')
        self._dfData[self.CLCmbQuaDCv]  = dfCompData.apply(self._combinedQualityMoreDCv, axis='columns')

# And the industrialised-derived optanalyser for instanciating it easily
class PrototypeConformOptanalyser(ads.MCDSTruncationOptanalyser):

    """Truncation optanalyser whose results objects are PrototypeConformResultsSet instances.

    Only setupResults is specialised; the constructor simply forwards its parameters to the base class.
    """

    def __init__(self, dfMonoCatObs, dfTransects=None, effortConstVal=1, dSurveyArea=dict(),
                 transectPlaceCols=['Transect'], passIdCol='Pass', effortCol='Effort',
                 sampleSelCols=['Species', 'Pass', 'Adult', 'Duration'],
                 sampleDecCols=['Effort', 'Distance'], sampleDistCol='Distance', anlysSpecCustCols=[],
                 abbrevCol='AnlysAbbrev', abbrevBuilder=None, anlysIndCol='AnlysNum', sampleIndCol='SampleNum',
                 distanceUnit='Meter', areaUnit='Hectare',
                 surveyType='Point', distanceType='Radial', clustering=False,
                 resultsHeadCols=dict(before=['AnlysNum', 'SampleNum'], after=['AnlysAbbrev'],
                                      sample=['Species', 'Pass', 'Adult', 'Duration']),
                 ldTruncIntrvSpecs=[dict(col='left', minDist=5.0, maxLen=5.0),
                                    dict(col='right', minDist=25.0, maxLen=25.0)], truncIntrvEpsilon=1e-6,
                 workDir='.', runMethod='subprocess.run', runTimeOut=300, logData=False,
                 logAnlysProgressEvery=50, logOptimProgressEvery=5, backupOptimEvery=50, autoClean=True,
                 defEstimKeyFn=ads.MCDSEngine.EstKeyFnDef, defEstimAdjustFn=ads.MCDSEngine.EstAdjustFnDef,
                 defEstimCriterion=ads.MCDSEngine.EstCriterionDef, defCVInterval=ads.MCDSEngine.EstCVIntervalDef,
                 defMinDist=ads.MCDSEngine.DistMinDef, defMaxDist=ads.MCDSEngine.DistMaxDef,
                 defFitDistCuts=ads.MCDSEngine.DistFitCutsDef, defDiscrDistCuts=ads.MCDSEngine.DistDiscrCutsDef,
                 defExpr2Optimise='chi2', defMinimiseExpr=False,
                 defOutliersMethod='tucquant', defOutliersQuantCutPct=5,
                 defFitDistCutsFctr=dict(min=2/3, max=3/2),
                 defDiscrDistCutsFctr=dict(min=1/3, max=1),
                 defSubmitTimes=1, defSubmitOnlyBest=None, dDefSubmitOtherParams=dict(),
                 dDefOptimCoreParams=dict(core='zoopt', maxIters=100, termExprValue=None,
                                          algorithm='racos', maxRetries=0)):

        """Constructor: every parameter is forwarded, unchanged, to the base class constructor.

        Note: the list and dict defaults are mutable objects shared between calls;
        this method only passes them on, it does not modify them.
        """

        super().__init__(dfMonoCatObs, dfTransects=dfTransects, effortConstVal=effortConstVal, dSurveyArea=dSurveyArea,
                         transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                         sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols,
                         sampleDistCol=sampleDistCol, anlysSpecCustCols=anlysSpecCustCols,
                         abbrevCol=abbrevCol, abbrevBuilder=abbrevBuilder,
                         anlysIndCol=anlysIndCol, sampleIndCol=sampleIndCol,
                         distanceUnit=distanceUnit, areaUnit=areaUnit,
                         surveyType=surveyType, distanceType=distanceType, clustering=clustering,
                         resultsHeadCols=resultsHeadCols,
                         ldTruncIntrvSpecs=ldTruncIntrvSpecs, truncIntrvEpsilon=truncIntrvEpsilon,
                         workDir=workDir, runMethod=runMethod, runTimeOut=runTimeOut, logData=logData,
                         logAnlysProgressEvery=logAnlysProgressEvery, logOptimProgressEvery=logOptimProgressEvery,
                         backupOptimEvery=backupOptimEvery, autoClean=autoClean,
                         defEstimKeyFn=defEstimKeyFn, defEstimAdjustFn=defEstimAdjustFn,
                         defEstimCriterion=defEstimCriterion, defCVInterval=defCVInterval,
                         defMinDist=defMinDist, defMaxDist=defMaxDist,
                         defFitDistCuts=defFitDistCuts, defDiscrDistCuts=defDiscrDistCuts,
                         defExpr2Optimise=defExpr2Optimise, defMinimiseExpr=defMinimiseExpr,
                         defOutliersMethod=defOutliersMethod, defOutliersQuantCutPct=defOutliersQuantCutPct,
                         defFitDistCutsFctr=defFitDistCutsFctr, defDiscrDistCutsFctr=defDiscrDistCutsFctr,
                         defSubmitTimes=defSubmitTimes, defSubmitOnlyBest=defSubmitOnlyBest,
                         dDefSubmitOtherParams=dDefSubmitOtherParams, dDefOptimCoreParams=dDefOptimCoreParams)

    def setupResults(self):

        """Build an empty results object, as a PrototypeConformResultsSet instance.

        The columns specifications come from self.prepareResultsColumns();
        units, survey / distance types, clustering and truncation grouping parameters
        are taken from this optanalyser's attributes of the same name.

        :return: the new empty PrototypeConformResultsSet
        """

        miCustCols, dfCustColTrans, miSampCols, sampIndMCol, sortCols, sortAscend = \
            self.prepareResultsColumns()

        return PrototypeConformResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans,
                                          miSampleCols=miSampCols, sampleIndCol=sampIndMCol,
                                          sortCols=sortCols, sortAscend=sortAscend,
                                          distanceUnit=self.distanceUnit, areaUnit=self.areaUnit,
                                          surveyType=self.surveyType, distanceType=self.distanceType,
                                          clustering=self.clustering,
                                          ldTruncIntrvSpecs=self.ldTruncIntrvSpecs,
                                          truncIntrvEpsilon=self.truncIntrvEpsilon)

## 1. Load results to postCompute from file

In [None]:
dossier = pl.Path('../../perso/donnees/acdc')

nomEtude = 'ACDC2019'
sousEtude = '-Nat'

varEtude = ''

In [None]:
# Names of the species, pass and distance columns.
colEspece = 'Espèce'
colPassage = 'Passage'
colDistance = 'Distance'

groupage = False  # Passed as the 'clustering' parameter of the optanalyser below.
effortConst = 1 # Constant effort value = 1 per pass on each point.

In [None]:
# Load the data: the 'Donnees' and 'Inventaires' sheets of the study workbook.
fpn = dossier / f'{nomEtude}{sousEtude}-ObsIndivDist.xlsx'
with pd.ExcelFile(fpn) as xlsFile:
    dfObsCatIndiv = pd.read_excel(xlsFile, sheet_name='Donnees')
    dfTransects = pd.read_excel(xlsFile, sheet_name='Inventaires')

# Quick summary: study name and number of rows loaded from each sheet.
print(dict(etude=nomEtude+sousEtude, donnees=len(dfObsCatIndiv), inventaires=len(dfTransects)))

In [None]:
# 1. Data description
transectPlaceCols = ['Point']
passIdCol = colPassage

assert 'effortCol' not in dir() or effortCol == 'Effort'  # In rare cases, needs to be defined before here, but the same way !
effortCol = 'Effort'

colsSpeSelEchant = ['Adulte', 'Durée']  # Sample selection columns: in addition to species and pass ones.
sampleDistCol = colDistance
sampleDecCols = [effortCol, sampleDistCol]

sampleNumCol = 'Echant'
sampleSelCols = [colEspece, passIdCol] + colsSpeSelEchant

#sampleAbbrevCol = 'Abrev. Echant'

In [None]:
# 2. Additional settings for the analyses.
distanceUnit = 'Meter'
areaUnit = 'Sq. Kilometer'
surveyType = 'Point'
distanceType = 'Radial'

# Passed as the 'dSurveyArea' parameter of the optanalyser below.
dZoneEtude = dict(Zone='ACDC', Surface=24) # km2

In [None]:
# 3. Additional settings for the optanalyses.
anlysIndCol = 'Analyse'
anlysAbbrevCol = 'Abrev. Analyse'
anlysParamCols = ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod']

In [None]:
# Parameters for grouping truncation distances (quality indicators)
ldTruncIntrvSpecs = [dict(col='left', minDist=5.0, maxLen=5.0),  dict(col='right', minDist=25.0, maxLen=25.0)]
truncIntrvEpsilon = 1e-6

In [None]:
# An optanalyser object knows how to build an empty results object ...
# NOTE(review): analysisAbbrev is not defined in the cells shown here; presumably defined earlier in the notebook.
optanlr = \
    PrototypeConformOptanalyser(dfObsCatIndiv, dfTransects=dfTransects,
                                effortConstVal=effortConst, dSurveyArea=dZoneEtude,
                                transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                                sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols, sampleDistCol=sampleDistCol,
                                abbrevCol=anlysAbbrevCol, abbrevBuilder=analysisAbbrev,
                                anlysIndCol=anlysIndCol, sampleIndCol=sampleNumCol,
                                distanceUnit=distanceUnit, areaUnit=areaUnit,
                                surveyType=surveyType, distanceType=distanceType, clustering=groupage,
                                ldTruncIntrvSpecs=ldTruncIntrvSpecs, truncIntrvEpsilon=truncIntrvEpsilon,
                                resultsHeadCols=dict(before=[anlysIndCol, sampleNumCol], sample=sampleSelCols,
                                                     after=anlysParamCols + [anlysAbbrevCol]))

# ... so let's use it for that (no analysis is run here).
results = optanlr.setupResults()

In [None]:
# Name of the results workbook to look for.
resFileName = f'{nomEtude}{sousEtude}-OptAnalyses{varEtude}-resultats.xlsx'

# Historical results folders: sub-folders of 'dossier' named with 6 digits, a dash, then 4 digits,
# and containing the results workbook.
resFolders = [fn.name for fn in dossier.glob('[0-9]'*6+'-'+'[0-9]'*4) if (fn / resFileName).is_file()]

print('Résultats historiques disponibles:', ', '.join(resFolders))

# Fails with an IndexError if no such folder was found.
workDir = dossier / resFolders[0]  # <=== Choose the results folder here.

# If an updated results file already exists in the 'tmp' folder, use it rather than the historical one.
updatedResFileNameExists = (pl.Path('tmp') / resFileName).is_file()
if not updatedResFileNameExists:

    resFileName = workDir / resFileName

    print(f'Fichier choisi : {resFileName.as_posix()}')
    
else:
    
    resFileName = pl.Path('tmp') / resFileName

    print(f'... mais résultats mis à jour aussi: {resFileName.as_posix()}')

In [None]:
%%time

# Load results from file
print('Lecture du fichier choisi:', resFileName.as_posix(), '...')

results.fromFile(resFileName, postComputed=updatedResFileNameExists)

print('... terminé.')

In [None]:
if not updatedResFileNameExists:
    
    # Add sample stats a posteriori (these stats had not been implemented when the historical results were saved to disk)
    dfSampleStats = pd.read_excel(dossier / f'{nomEtude}{sousEtude}-StatsEchantillons.xlsx')
    dfSampleStats.rename(columns={'NTot Obs': 'NTot Obs0'}, inplace=True)
    dfSampleStats.insert(dfSampleStats.columns.to_list().index('Distance Min'), 'NTot Obs', dfSampleStats['NTot Obs0'])
    dfSampleStats.drop(columns=['NTot Obs0'], inplace=True)

    miSampleCols = pd.MultiIndex.from_tuples([('header (sample)', colEspece, 'Value'),
                                              ('header (sample)', colPassage, 'Value'),
                                              ('header (sample)', colsSpeSelEchant[0], 'Value'),
                                              ('header (sample)', colsSpeSelEchant[1], 'Value')])
    dfSampleStats.columns = miSampleCols.append(ads.MCDSEngine.MIStatSampCols)

    results.dfData = results._dfData.join(dfSampleStats.set_index(miSampleCols.to_list()), on=miSampleCols.to_list())
    
    print(len(results._dfData))
    
    print('Ecriture du fichier mis à jour:', (pl.Path('tmp') / resFileName.name).as_posix(), '...')
    results.toExcel(pl.Path('tmp') / resFileName.name)
    print('... terminé.')

In [None]:
results._dfData

In [None]:
%%time

# Trigger pos-computations now !
dfRes = results.dfTransData('fr')
dfRes

In [None]:
%%time

fpn = tmpDir / resFileName.name.replace('resultats', 'resultats-postcalc')

print('Ecriture du tableau de résultats post-calculé :',fpn.as_posix(), '...')

dfRes.to_excel(fpn)

print('... terminé.')

## 2. Load prototype enriched simple results report 

In [None]:
repFileName = workDir / f'{nomEtude}{sousEtude}-OptAnalyses{varEtude}-rapenrich.xlsx'

print(f'Fichier choisi : {repFileName.as_posix()}')

In [None]:
%%time

dfRefRep = pd.read_excel(repFileName, index_col=0)

DRefRep2ResCols = {'Distance Min': 'Min Dist',
                   'Distance Max': 'Max Dist',
                   
                   'Qual Equi': 'Qual Equi 1',
                   'Qual Chi2': 'Qual Chi2+',
                   'Qual DCV': 'Qual DCv+',
                   'Qual KS': 'Qual KS+',
                   
                   'Grp Dist Tronc Gche': 'Groupe Tronc Gche',
                   'Grp Dist Tronc Drte': 'Groupe Tronc Drte',
                   
                   'Meil AIC Tronc Id': 'Ordre Tronc Ident AIC',
                   
                   'Meil CKCv Tronc Proch'     : 'Ordre Tronc Proch Chi2 KS DCv',
                   'Meil CVDens Tronc Proch'   : 'Ordre Tronc Proch DCv',
                   'Meil Qual Equi Tronc Proch': 'Ordre Tronc Proch Qual Equi 1',
                   'Meil Qual Chi2 Tronc Proch': 'Ordre Tronc Proch Qual Equi Chi2+',
                   'Meil Qual KS Tronc Proch'  : 'Ordre Tronc Proch Qual Equi KS+',
                   'Meil Qual DCV Tronc Proch' : 'Ordre Tronc Proch Qual Equi DCv+',
                   
                   'Ord CKCv'     : 'Ordre Global Chi2 KS DCv',
                   'Ord Qual Equi': 'Ordre Global Qual Equi 1',
                   'Ord Qual Chi2': 'Ordre Global Qual Equi Chi2+',
                   'Ord Qual KS'  : 'Ordre Global Qual Equi KS+',
                   'Ord Qual DCV' : 'Ordre Global Qual Equi DCv+',
                   'Ord Simpl Tronc': 'Ordre Global DeltaAIC Chi2 KS DCv'}
dfRefRep.rename(columns=DRefRep2ResCols, inplace=True)

assert all(col in dfRefRep.columns for col in DRefRep2ResCols.values())

dfRefRep

## 3. Compare loaded and post-computed to reference prototype intermediate report

In [None]:
# Results columns
resFrCols = dfRes.columns
', '.join(resFrCols)

In [None]:
# Columns in results, but not in reference intermediate report
diffCols = set(resFrCols) - set(dfRefRep.columns)
assert diffCols == {'Ordre Global Qual Equi 2', 'Ordre Global Qual Equi 3',
                    'Ordre Tronc Proch Qual Equi 2', 'Ordre Tronc Proch Qual Equi 3',
                    'Qual Equi 2', 'Qual Equi 3'}
diffCols

In [None]:
', '.join(sorted(resFrCols))

In [None]:
', '.join(sorted(dfRefRep.columns))

In [None]:
# Columns in reference intermediate report, but not in results
diffCols = set(dfRefRep.columns) - set(resFrCols)
assert not diffCols
diffCols

In [None]:
# Index columns for comparison
indexCols = [sampleNumCol] + sampleSelCols + [anlysIndCol, anlysAbbrevCol] + anlysParamCols
', '.join(indexCols)

In [None]:
# Round truncation distance parameters in order to be able to use them as part of the index columns
# (Excel I/O changed some least significant after dot figures)
dfRes['TrGche'] = dfRes['TrGche'].round(5)
dfRes['TrDrte'] = dfRes['TrDrte'].round(5)

dfRefRep['TrGche'] = dfRefRep['TrGche'].round(5)
dfRefRep['TrDrte'] = dfRefRep['TrDrte'].round(5)

In [None]:
# Columns to compare: let's ignore ...
# * DeltaDCV and DeltaAIC: they depend on the actual sets of analyses run at once, which may differ from ref to actual results,
# * other string columns (comparison not implemented)
# * other negligible (run time, ... etc) or newly implemented (not in ref) columns
subsetCols = [col for col in dfRefRep.columns \
              if col not in indexCols + ['HeureExec', 'DuréeExec', 'DossierExec',
                                         'Fn Clé Mod', 'Sér Ajust Mod', 'Crit Chx Mod', 'Interv Conf',
                                         'Fn Clé', 'Sér Ajust',
                                         'Delta AIC', 'Delta CoefVar Densité',
                                         'Max Dist', 'Min Dist',
                                         'Qual Equi 2', 'Qual Equi 3',
                                         'Ordre Tronc Proch Qual Equi 2', 'Ordre Tronc Proch Qual Equi 3',
                                         'Ordre Global Qual Equi 2', 'Ordre Global Qual Equi 3']]

In [None]:
%%time

# Comparison 1: DataSet.compareDataFrames
# => 21% analyses differing, and with only mostly-slightly different order indicators :-)
#    (see histogram of order differences below)
dfRelDiff = ads.DataSet.compareDataFrames(dfRes, dfRefRep, dropCloser=13, dropNans=True, dropCloserCols=True,
                                          subsetCols=subsetCols, indexCols=indexCols)
dict(refRows=len(dfRefRep), resRows=len(dfRes), diffRows=len(dfRelDiff), diffCols=len(dfRelDiff.columns))

In [None]:
dfRelDiff

In [None]:
# Comparison 2: DataFrame.compare
# Same diagnosis
ordDiffCols = ['Ordre Tronc Proch DCv', 'Ordre Tronc Proch Qual Equi 1',
               'Ordre Tronc Proch Qual Equi Chi2+', 'Ordre Tronc Proch Qual Equi KS+', 'Ordre Tronc Proch Qual Equi DCv+',
               'Ordre Global Qual Equi Chi2+',
               'Ordre Global Qual Equi KS+', 'Ordre Global Qual Equi DCv+', 'Ordre Global DeltaAIC Chi2 KS DCv']
assert set(dfRelDiff.columns) == set(ordDiffCols)

absCompCols = ['Analyse'] + ordDiffCols
dfAbsDiff = dfRes[absCompCols].set_index('Analyse').sort_index() \
              .compare(dfRefRep[absCompCols].set_index('Analyse').sort_index())
dfAbsDiff

In [None]:
# Check 1 order column (example)
ordr = 'Ordre Tronc Proch DCv'
dfAbsDiff.loc[(dfAbsDiff[(ordr, 'self')] - dfAbsDiff[(ordr, 'other')]).notnull(), [(ordr, 'self'), (ordr, 'other')]]

In [None]:
# Compute order column cell-by-cell differences
dfAbsDeltaDiff = pd.DataFrame(index=dfAbsDiff.index)
for ordr in ordDiffCols:
    dfAbsDeltaDiff[ordr] = dfAbsDiff[(ordr, 'self')] - dfAbsDiff[(ordr, 'other')]
dfAbsDeltaDiff

In [None]:
dfAbsDeltaDiff.max().max(), dfAbsDeltaDiff.min().min(), dfAbsDeltaDiff.notnull().sum().sum()

In [None]:
# Histogram for all order columns => mostly +/-1 differences
hist, bins = np.histogram(dfAbsDeltaDiff.values, bins=int(dfAbsDeltaDiff.max().max() - dfAbsDeltaDiff.min().min()),
                          range=(dfAbsDeltaDiff.min().min(), dfAbsDeltaDiff.max().max()))
dfHist = pd.DataFrame(data=hist, index=bins[:-1])
hist, bins

In [None]:
dict(nAnlys=len(dfRes), nDiffAnlys=len(dfAbsDiff), pctDiffAnlys=round(100*len(dfAbsDiff)/len(dfRes), 1),
     pctDiffAnlys1=round(100 * dfHist.loc[-1:1].values.sum() / dfHist.loc[:, 0].sum(), 1),
     pctDiffAnlys2=round(100 * dfHist.loc[-2:2].values.sum() / dfHist.loc[:, 0].sum(), 1),
     pctDiffAnlys3=round(100 * dfHist.loc[-3:3].values.sum() / dfHist.loc[:, 0].sum(), 1),
     pctDiffAnlys4=round(100 * dfHist.loc[-4:4].values.sum() / dfHist.loc[:, 0].sum(), 1),
     pctDiffAnlys5=round(100 * dfHist.loc[-5:5].values.sum() / dfHist.loc[:, 0].sum(), 1))

In [None]:
fpn = dossier / f'{nomEtude}{sousEtude}-autofilsor-indicators-diffs.xlsx'
with pd.ExcelWriter(fpn) as xlsWrtr:
    dfRelDiff.to_excel(xlsWrtr, sheet_name='rel-diff')
    dfAbsDiff.to_excel(xlsWrtr, sheet_name='abs-diff')
    dfHist.to_excel(xlsWrtr, sheet_name='hist-diff')
    
fpn.as_posix()

## 4. Quality and non-regression of filterSort*

(prerequisite: run a.)

In [None]:
# Deduplication algorithm params.
R = results

miDupSubsetDef = pd.MultiIndex.from_tuples([R.CLNObs, R.CLEffort, R.CLDeltaAic,
                                            R.CLChi2, R.CLKS, R.CLCvMUw, R.CLCvMCw, R.CLDCv, 
                                            R.CLPDetec, R.CLPDetecMin, R.CLPDetecMax,
                                            R.CLDensity, R.CLDensityMin, R.CLDensityMax])
dDupRoundsDef = {R.CLDeltaAic: 1, R.CLChi2: 2, R.CLKS: 2, R.CLCvMUw: 2, R.CLCvMCw: 2, R.CLDCv: 2, 
                 R.CLPDetec: 3, R.CLPDetecMin: 3, R.CLPDetecMax: 3, R.CLDensity: 2, R.CLDensityMin: 2, R.CLDensityMax: 2}

In [None]:
# Index columns for comparison
anlysParamCols = ['Fn Clé Mod', 'Sér Ajust Mod', 'Dist Tronc Gche', 'Dist Tronc Drte', 'Tranch Dist Mod']
indexCols = [sampleNumCol] + sampleSelCols + [anlysIndCol] + anlysParamCols
', '.join(indexCols)

### a. Load reference prototype final report

In [None]:
repFileName = workDir / f'{nomEtude}{sousEtude}-OptAnalyses{varEtude}-raptousech.ods'

print(f'Fichier choisi : {repFileName.as_posix()}')

In [None]:
%%time

ddfRefFinRep = pd.read_excel(repFileName, sheet_name=None)
', '.join(ddfRefFinRep.keys())

In [None]:
# Check that all sub-reports (1 per method) display the same columns
assert all(ddfRefFinRep[meth].columns.to_list() == ddfRefFinRep['codexec'].columns.to_list()
           for meth in ddfRefFinRep.keys() if meth.startswith('c') and meth != 'codexec')

In [None]:
# Prepare ref. sub-reports for comparison :
# * Drop pre-selection columns (added later in report module)
# * Rename columns to "industrialised" names
for meth in ddfRefFinRep:
    if meth.startswith('c'):
        ddfRefFinRep[meth].drop(columns=['Sélection finale', 'Sélection Qual Equi'], inplace=True)
        ddfRefFinRep[meth].rename(columns=DRefRep2ResCols, inplace=True)

In [None]:
dBadAnalyses = dict()

### b. Check reference prototype final report

In [None]:
# Now some few checks ...
#meth = 'ckcvqual975d8'
#meth = 'ckcvqual950d10'
meth = 'ckcvqual925d12'
#meth = 'ckcvqual900d15'
#meth = 'ckcvqual900d20'
#meth = 'codexec'

df = ddfRefFinRep[meth].copy()
len(df)

In [None]:
mainIndicCols = ['Delta AIC', 'Chi2 P', 'KS P', 'CvM Uw P', 'CvM Cw P', 'NObs', 'NTot Obs', 'CoefVar Densité', 'NbTot Pars']
qualIndicCols = [col for col in df.columns if col.startswith('Qual')]
resultCols = ['Densité', 'EDR/ESW', 'PDetec']

In [None]:
# There should be no analysis with NaN values for main MCDS goodness params
# (should be filtered out at first, just as ExecCode > 2 ones)
df = df.loc[df[mainIndicCols].isnull().any(axis='columns'),
            ['Analyse'] + mainIndicCols + qualIndicCols + resultCols]
df

In [None]:
# But there are, and seems they are all due to NaN Chi2 ...
assert df['Chi2 P'].isnull().all()

In [None]:
dBadAnalyses[meth] = df['Analyse'].to_list()

### c. Generate report (apply methods to test)

In [None]:
mtoars = PrototypeConformResultsSet
#mtoars = ads.MCDSTruncOptanalysisResultsSet

whichBestQua = [mtoars.CLGrpOrdClTrChi2KSDCv, mtoars.CLGrpOrdClTrDCv, mtoars.CLGrpOrdClTrQuaBal1,
                mtoars.CLGrpOrdClTrQuaChi2, mtoars.CLGrpOrdClTrQuaKS, mtoars.CLGrpOrdClTrQuaDCv]
whichFinalQua = mtoars.CLCmbQuaBal1  # Was CLGrpOrdClTrQuaBal1, bad !!!

filterSortReportSpecs = \
[dict(name='ExCode', 
      method=mtoars.filterSortOnExecCode,
      deduplicate=dict(dupSubset=miDupSubsetDef, dDupRounds=dDupRoundsDef),
      filterSort=dict(whichFinalQua=whichFinalQua, ascFinalQua=False)),
 dict(name='ExAicMQua-r{sightRate:.1f}d{nFinalRes}', 
      method=mtoars.filterSortOnExCAicMulQua,
      deduplicate=dict(dupSubset=miDupSubsetDef, dDupRounds=dDupRoundsDef),
      filterSort=dict(sightRate=92.5, nBestAIC=3, nBestQua=1, whichBestQua=whichBestQua,
                      nFinalRes=12, whichFinalQua=whichFinalQua, ascFinalQua=False)),
 # ... etc.
 ]

In [None]:
ddfFinRep = dict()
repLog = list()
for spec in filterSortReportSpecs:

    subRepName = spec['name'].format_map(spec['filterSort']).replace('.', '')
    
    iSubRep, subSteps = spec['method'](results, **spec['filterSort'], **spec['deduplicate'])
    
    ddfFinRep[subRepName] = results.dfTransData(lang='fr', index=iSubRep)
    repLog += subSteps

', '.join(ddfFinRep.keys())

In [None]:
# Export to Excel.
fpn = pl.Path('tmp') / f'{nomEtude}{sousEtude}-autofilsor-raptousech.xlsx'
with pd.ExcelWriter(fpn) as xlsWrtr:
    for subRepName, dfSubRepData in ddfFinRep.items():
        dfSubRepData.to_excel(xlsWrtr, sheet_name=subRepName, index=True)            

fpn.as_posix()

### d. filterSortOn* checks

In [None]:
# Switch methods here ...
# (must be a key of ddfFinRep, i.e. the formatted 'name' of a filterSortReportSpecs item, dots removed;
#  fix: former values 'ExecCode' and 'AicMQua-r925d12' matched none of these keys => KeyError in next cell)
#methIndus = 'ExCode'
methIndus = 'ExAicMQua-r925d12'

In [None]:
dfRes = ddfFinRep[methIndus].copy()
dfRes

In [None]:
# Results columns
resFrCols = dfRes.columns
', '.join(resFrCols)

In [None]:
# Round truncation distance parameters in order to be able to use them as part of the index columns
# (Excel I/O changed some least significant after dot figures)
dfRes['Dist Tronc Gche'] = dfRes['Dist Tronc Gche'].round(5)
dfRes['Dist Tronc Drte'] = dfRes['Dist Tronc Drte'].round(5)

In [None]:
# There should be no analysis with NaN values for main MCDS goodness params
# (should be filtered out at first, just as ExecCode > 2 ones)
# Only keep the columns actually present in the "industrialised" sub-report
# (fix: _resultCols was filtered on the columns of the stale 'df' of section b. instead of dfRes ones).
_mainIndicCols = [col for col in mainIndicCols if col in dfRes.columns]
_qualIndicCols = [col for col in qualIndicCols if col in dfRes.columns]
_resultCols = [col for col in resultCols if col in dfRes.columns]

# Select the analyses with at least 1 NaN main indicator.
df = dfRes.loc[dfRes[_mainIndicCols].isnull().any(axis='columns'), ['Analyse'] + _mainIndicCols + _qualIndicCols + _resultCols]
df

In [None]:
# But there are, and seems they are all due to NaN Chi2 ...
assert df['Chi2 P'].isnull().all()

In [None]:
# Are there all the same ones as in Ref report ?
# * ExecCode: no (46 not in common)
# * AicCKCvQua-r925d12: yes !
len(set(df['Analyse']) - set(dBadAnalyses[meth])), len(set(dBadAnalyses[meth]) - set(df['Analyse']))

In [None]:
# Get reference sub-report and check that all its columns are also in the "industrialised" sub-report
dfRefRep = ddfRefFinRep[meth]
assert all(col in resFrCols for col in dfRefRep.columns)

', '.join(dfRefRep.columns)

In [None]:
# Round truncation distance parameters in order to be able to use them as part of the index columns
# (Excel I/O changed some least significant after dot figures)
dfRefRep['Dist Tronc Gche'] = dfRefRep['Dist Tronc Gche'].round(5)
dfRefRep['Dist Tronc Drte'] = dfRefRep['Dist Tronc Drte'].round(5)

In [None]:
# Columns to compare: let's ignore ...
# * DeltaAIC: depends on actual analyses sets done at once, may be differing from ref to actual results,
# * other duplicate columns (analysis params)
subsetCols = [col for col in dfRefRep.columns
              if col not in indexCols + ['Delta AIC', 'FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod']]

In [None]:
%%time

# Comparison => outcome: 28% of the analyses are not present in both reports or have different values.
# This is fairly consistent with the 21% showing differences in the indicator-order rankings ...
dfRelDiff = ads.DataSet.compareDataFrames(dfRefRep, dfRes, indexCols=indexCols, subsetCols=subsetCols,
                                          dropCloser=6, dropNans=True, dropCloserCols=True)
#assert len(dfRelDiff) == 0
# Summary: row counts of both reports, size of the differences table, and % of differing rows relative to dfRes.
dict(refRows=len(dfRefRep), resRows=len(dfRes), diffRows=len(dfRelDiff), diffCols=len(dfRelDiff.columns),
     pctDiff=round(100 * len(dfRelDiff) / len(dfRes), 1))

In [None]:
dfRelDiff

In [None]:
# Save the relative differences table of the current method to the 'tmp' folder, for offline inspection.
dfRelDiff.to_excel('tmp/{}-reldiff.xlsx'.format(meth))

In [None]:
# List diffing analyses that are not in the 2 reports
def allNullOrZero(s):
    """Tell whether every value of a series is either missing (NaN / None) or equal to 0

    Parameters:
    :param s: the pandas Series to check (used below on each row of the relative differences table)

    :return: True if all the values are null or 0 (and also for an empty series), False otherwise
    """
    # Vectorised test, rather than counting the matches through the (slow) Python built-in sum.
    return bool((s.isnull() | (s == 0)).all())

# Rows where every relative difference is null or 0
df = dfRelDiff[dfRelDiff.apply(allNullOrZero, axis='columns')]
print('pctNotBoth:', round(100 * len(df) / len(dfRes), 1))
df

In [None]:
# Drop these rows => only the analyses present in the 2 reports remain, but with some differences
df = dfRelDiff.drop(index=dfRelDiff[dfRelDiff.apply(allNullOrZero, axis='columns')].index)
print('pctBothButDiffs:', round(100 * len(df) / len(dfRes), 1))
df

In [None]:
# Drop the bad analyses: those with a NaN in any of the main MCDS result columns still present
# (rows are dropped as soon as one such value is missing, which is dropna's default behaviour)
df = df.dropna(subset=[col for col in _mainIndicCols if col in df.columns])
df

In [None]:
# Keep only the columns where at least one value is not above 6
# (i.e. drop those with rel. diffs lower than 1e-6 all along ; NaN never counts as "above 6")
df = df.loc[:, ~df.gt(6).all(axis='index')]
df

In [None]:
# And now, nothing remains but the 'Ordre *' columns
assert not any(not col.startswith('Ordre') for col in df.columns)

In [None]:
# At the end good news : differences are only due to :
# * see above non-regression tests on enriched results/reports :
#    (only) partially explained order differences (but mostly small) => different lists of analyses (20-25%)
# * prototype bug that keeps in the race ... bad analyses with NaN as main MCDS goodness indicator values (Chi2 ... etc)

In [None]:
# Save the remaining differences (analyses present in both reports) to the 'tmp' folder.
df.to_excel('tmp/{}-reldiff-both-but-nan-or-diff6.xlsx'.format(meth))

## 5. Quality tests of (industrialised) filterSort*

(prerequisite: run 1. above)

1. load results with the real industrialised MCDSTruncationOptanalysisResultsSet
2. apply filter-sort methods
3. check output quality
    * TODO: define what to check

In [None]:
ddfSubRep['ExecCode']

In [None]:
ddfSubRep['AicCKCvQua-r925d12']

In [None]:
repLog

# Development : Optimise MCDSAnalyser._postComputeQualityIndicators

## 1. Prepare stuff for creating MCDSAnalysisResultsSet objects

In [None]:
# Source / Results data
# Column names of the individualised observations / transects tables.
transectPlaceCols = ['Point']
passIdCol = 'Passage'
effortCol = 'Effort'

sampleDistCol = 'Distance'
sampleDecCols = [effortCol, sampleDistCol]

# Columns identifying / selecting a sample.
sampleNumCol = 'NumEchant'
sampleSelCols = ['Espèce', passIdCol, 'Adulte', 'Durée']

sampleAbbrevCol = 'AbrevEchant'

# NOTE(review): the surface is given as a string here - confirm this is what MCDSAnalyser expects.
dSurveyArea = dict(Zone='ACDC', Surface='2400')

# General DS analysis parameters
varIndCol = 'NumAnlys'
anlysAbbrevCol = 'AbrevAnlys'
anlysParamCols = ['FonctionClé', 'SérieAjust', 'TrGche', 'TrDrte', 'NbTrchMod']

distanceUnit = 'Meter'
areaUnit = 'Hectare'
surveyType = 'Point'
distanceType = 'Radial'
clustering = False

# Results post-computation parameters
# (one specification per truncation side ; same minDist / maxLen values as used in the
#  "truncation distance groups" cells further down this notebook)
ldTruncIntrvSpecs = [dict(col='left', minDist=5.0, maxLen=5.0), dict(col='right', minDist=25.0, maxLen=25.0)]
truncIntrvEpsilon = 1e-6

# Load individualised observations and actual transects
# (2 sheets of the same workbook)
indivObsFile = 'refin/ACDC2019-Naturalist-ExtraitObsIndiv.ods'

dfObsIndiv = ads.DataSet(indivObsFile, sheet='DonnéesIndiv').dfData

dfTransects = ads.DataSet(indivObsFile, sheet='Inventaires').dfData

# Number of rows loaded from each sheet
dict(indivObs=len(dfObsIndiv), transects=len(dfTransects))

In [None]:
# What's better for creating an MCDSAnalysisResultsSet object than an MCDSAnalyser instance ?
# (the analyser is only used below as a results set factory, through setupResults ;
#  analysisAbbrev is not defined in this section: it must have been defined by an earlier cell)
anlr = \
    ads.MCDSAnalyser(dfObsIndiv, dfTransects=dfTransects, dSurveyArea=dSurveyArea,
                     transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                     sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols,
                     abbrevCol=anlysAbbrevCol, abbrevBuilder=analysisAbbrev,
                     anlysIndCol=varIndCol, sampleIndCol=sampleNumCol,
                     distanceUnit=distanceUnit, areaUnit=areaUnit,
                     surveyType=surveyType, distanceType=distanceType, clustering=clustering,
                     resultsHeadCols=dict(before=[varIndCol, sampleNumCol], sample=sampleSelCols,
                                          after=anlysParamCols + [anlysAbbrevCol]),
                     ldTruncIntrvSpecs=ldTruncIntrvSpecs, truncIntrvEpsilon=truncIntrvEpsilon)

## 2. _postComputeQualityIndicators

In [None]:
# Load results to play with ...
# Note: Okay, it's actually an MCDSTruncOptAnalysisResultsSet file ... but we'll ignore the extra columns, promised :-)
resFileName = 'refin/ACDC2019-Naturalist-UnitestOptResultats.ods'
print('Loading results from {} ...'.format(resFileName))

# Empty results set, configured by the analyser created above
results = anlr.setupResults()

results.fromOpenDoc(resFileName, postComputed=True)  # Prevent re-post-computation : not a problem here, but longer

In [None]:
class ResultsOpt1(ads.MCDSAnalysisResultsSet):

    """Optimisation attempt #1 of the quality indicators post-computation:
    the 5 newer indicators are computed all at once, through one single row-wise
    DataFrame.apply(..., raw=True) pass over the source columns.

    Archived for reference only: it does NOT work with pandas 1.1+ (see _combinedQualityAll).
    """

    Super = ads.MCDSAnalysisResultsSet

    def __init__(self, results):

        """Borrow the data of an already loaded results set (the base class constructor is not called)

        Parameters:
        :param results: results set object, whose _dfData table gets shared (not copied)
        """

        self._dfData = results._dfData

    # Post computations : Quality indicators.
    # Normalised "value" of each key function (0.0 is used for any other one, see _combinedQualityAll)
    DNormKeyFn = dict(HNORMAL=1.0, UNIFORM=0.9, HAZARD=0.6, NEXPON=0.1)
    # DNormKeyFn = dict(HNORMAL=1.00, UNIFORM=0.75, HAZARD=0.5, NEXPON=0.1)  # Not better

    # Source columns of the quality indicators, in the order assumed by the CIxxx indexes below.
    CLsQuaIndicSources = [Super.CLKeyFn, Super.CLNAdjPars, Super.CLNTotPars, Super.CLNObs, Super.CLNTotObs,
                          Super.CLChi2, Super.CLKS, Super.CLCvMUw, Super.CLCvMCw, Super.CLDCv]

    # Index of each source column in CLsQuaIndicSources (= position in the row arrays passed to the methods below).
    CIKeyFn = CLsQuaIndicSources.index(Super.CLKeyFn)
    CINAdjPars = CLsQuaIndicSources.index(Super.CLNAdjPars)
    CINTotPars = CLsQuaIndicSources.index(Super.CLNTotPars)
    CINObs = CLsQuaIndicSources.index(Super.CLNObs)
    CINTotObs = CLsQuaIndicSources.index(Super.CLNTotObs)
    CIChi2 = CLsQuaIndicSources.index(Super.CLChi2)
    CIKS = CLsQuaIndicSources.index(Super.CLKS)
    CICvMUw = CLsQuaIndicSources.index(Super.CLCvMUw)
    CICvMCw = CLsQuaIndicSources.index(Super.CLCvMCw)
    CIDCv = CLsQuaIndicSources.index(Super.CLDCv)

    @classmethod
    def _combinedQualityBalanced1(cls, aRes):  # The one used for ACDC 2019 filtering & sorting in jan/feb 2021

        """Historical balanced quality indicator, for 1 analysis

        Parameters:
        :param aRes: values of the CLsQuaIndicSources columns for this analysis, in this order

        :return: 7th root of the product of the 4 test probabilities (Chi2, KS, CvM Uw, CvM Cw)
                 and of the normalised numbers of observations, of parameters, and density variation coef.
        """

        # MCDS results (or so) : Chi2, KS, CvM Uw and CvM Cw are contiguous in CLsQuaIndicSources.
        chi2KsCvMs = aRes[cls.CIChi2:cls.CICvMCw + 1].prod()
        normNObs = aRes[cls.CINObs] / aRes[cls.CINTotObs]
        normNTotPars = 1 / (0.2 * max(2, aRes[cls.CINTotPars]) + 0.6)
        normCVDens = math.exp(-12 * aRes[cls.CIDCv] * aRes[cls.CIDCv])

        return (chi2KsCvMs * normNObs * normNTotPars * normCVDens) ** (1.0/7)

    @classmethod
    def _combinedQualityAll(cls, aRes):

        r"""Compute the 5 newer quality indicators for 1 analysis

        Parameters:
        :param aRes: values of the CLsQuaIndicSources columns for this analysis, in this order

        :return: tuple (quaBal2, quaBal3, moreChi2, moreKS, moreDCv), same order as CLsNewQuaIndics

        Does NOT work, because of a pandas 1.1+ regression on DataFrame.apply(..., raw=True, ...),
        apparently not fixed yet (see https://github.com/pandas-dev/pandas/issues/34822) :

        ValueError                                Traceback (most recent call last)
        C:\PortableApps\MiniConda3\envs\py38\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_blocks(blocks, axes)
           1674                 blocks = [
        -> 1675                     make_block(
           1676                         values=blocks[0], placement=slice(0, len(axes[0])), ndim=2

        C:\PortableApps\MiniConda3\envs\py38\lib\site-packages\pandas\core\internals\blocks.py in make_block(values, placement, klass, ndim, dtype)
           2750
        -> 2751     return klass(values, ndim=ndim, placement=placement)
           2752

        C:\PortableApps\MiniConda3\envs\py38\lib\site-packages\pandas\core\internals\blocks.py in __init__(self, values, placement, ndim)
            141         if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values):
        --> 142             raise ValueError(
            143                 f"Wrong number of items passed {len(self.values)}, "

        ValueError: Wrong number of items passed 5, placement implies 10

        See also https://stackoverflow.com/questions/67678210/raw-true-causes-valueerror-in-pandas-dataframe-apply
        for description and code for reproducing.
        """

        # MCDS results (or so).
        chi2 = aRes[cls.CIChi2]
        ks = aRes[cls.CIKS]
        chi2KsCvMs = aRes[cls.CIChi2:cls.CICvMCw + 1].prod()
        normNObs = aRes[cls.CINObs] / aRes[cls.CINTotObs]

        # October 2021 (unknown key functions get a 0 score, which zeroes all the indicators below)
        normKeyFn = cls.DNormKeyFn.get(aRes[cls.CIKeyFn], 0.0)

        # A more devaluating version for NAdjPars, CVDens, also using KeyFn
        normNAdjPars = math.exp(-0.15 * aRes[cls.CINAdjPars] * aRes[cls.CINAdjPars])
        normCVDens = math.exp(-20 * aRes[cls.CIDCv] * aRes[cls.CIDCv])
        prodAll8NormSrcIndics = chi2KsCvMs * normNObs * normNAdjPars * normCVDens * normKeyFn
        quaBal2 = prodAll8NormSrcIndics ** 0.125

        # An even more devaluating version for NAdjPars, CVDens, also using KeyFn
        normNAdjPars = math.exp(-0.17 * aRes[cls.CINAdjPars] * aRes[cls.CINAdjPars])
        normCVDens = math.exp(-63 * aRes[cls.CIDCv] ** 2.8)
        prodAll8NormSrcIndics = chi2KsCvMs * normNObs * normNAdjPars * normCVDens * normKeyFn
        quaBal3 = prodAll8NormSrcIndics ** 0.125

        # Follow _combinedQualityBalanced3 update (were based on _combinedQualityBalanced1)
        # (each one gives 1 more weight to 1 of the source indicators, hence the 9th root)
        moreChi2 = (prodAll8NormSrcIndics * chi2) ** (1.0/9)
        moreKS = (prodAll8NormSrcIndics * ks) ** (1.0/9)
        moreDCv = (prodAll8NormSrcIndics * normCVDens) ** (1.0/9)

        return quaBal2, quaBal3, moreChi2, moreKS, moreDCv  # Must be same order as in CLsNewQuaIndics !

    # Target columns for the tuple returned by _combinedQualityAll (same order).
    CLsNewQuaIndics = [Super.CLCmbQuaBal2, Super.CLCmbQuaBal3, Super.CLCmbQuaChi2, Super.CLCmbQuaKS, Super.CLCmbQuaDCv]

    def _postComputeQualityIndicators(self):

        """Post-compute the sighting rate and the combined quality indicators columns of self._dfData

        Does not work because of a pandas 1.1+ bug, see above _combinedQualityAll
        """

        # The instance is simply used as a namespace for the class-level constants.
        cls = self

        logger.debug('Post-computing Quality Indicators (opt1)')

        self._dfData[cls.CLSightRate] = 100 * self._dfData[cls.CLNObs] / self._dfData[cls.CLNTotObs]  # [0,1] => %

        # Prepare data for computations
        logger.debug1('* Pre-processing source data')

        # a. extract the useful columns, after adding them if not present
        #    (NaN value, except for CLKeyFn, that MUST be there anyway)
        for miCol in cls.CLsQuaIndicSources:
            if miCol not in self._dfData.columns and miCol != cls.CLKeyFn:
                self._dfData[miCol] = np.nan
        dfCompData = self._dfData[cls.CLsQuaIndicSources].copy()

        # b. historical bal qua 1 (computed BEFORE NaN replacement below)
        logger.debug1('* Balanced quality 1')
        self._dfData[cls.CLCmbQuaBal1] = dfCompData.apply(cls._combinedQualityBalanced1, axis='columns')

        # c. newer quality indicators
        #    (NaN value MUST kill down these indicators to compute => we have to enforce this)
        dfCompData.fillna({cls.CLNObs: cls.KilrNObs,
                           cls.CLChi2: cls.KilrStaTest, cls.CLKS: cls.KilrStaTest,
                           cls.CLCvMUw: cls.KilrStaTest, cls.CLCvMCw: cls.KilrStaTest,
                           cls.CLDCv: cls.KilrDensCv,  # Usually considered good under 0.3
                           cls.CLNTotObs: cls.KilrNTotObs,  # Should slap down _normObs whatever NObs
                           cls.CLNAdjPars: cls.KilrNPars,  # Should slap down _normNAdjPars whatever NObs
                           cls.CLNTotPars: cls.KilrNPars},
                          inplace=True)

        logger.debug1('* Balanced quality 2, 3, Chi2+, KS+, DCv+')
        self._dfData.drop(columns=cls.CLsNewQuaIndics, inplace=True, errors='ignore')  # Cleanup
        aNewQuaIndics = dfCompData.apply(cls._combinedQualityAll, axis='columns', raw=True, result_type='reduce').values
        self._dfData = self._dfData.join(pd.DataFrame(aNewQuaIndics, index=self._dfData.index,
                                                      columns=pd.MultiIndex.from_tuples(cls.CLsNewQuaIndics)))
        

In [None]:
class ResultsOpt2(ResultsOpt1):
    
    """Optimisation attempt #2 of the quality indicators post-computation:
    the indicators are computed for all the analyses at once, through numpy operations
    on the whole 2D array of source values (no row-wise DataFrame.apply).

    The class-level constants (DNormKeyFn, CLsQuaIndicSources, CIxxx indexes, CLsNewQuaIndics)
    are inherited from ResultsOpt1.
    """
    
    Super = ResultsOpt1
    
    def __init__(self, results):
        
        """Borrow the data of an already loaded results set, as ResultsOpt1 does

        Parameters:
        :param results: results set object passed as is to ResultsOpt1 constructor
        """
        
        super().__init__(results)
    
    @classmethod
    def _combinedQualityBalanced1(cls, aRes):  # The one used for ACDC 2019 filtering & sorting in jan/feb 2021

        """Historical balanced quality indicator, for all the analyses at once

        Parameters:
        :param aRes: 2D array, 1 row per analysis, 1 column per CLsQuaIndicSources item (same order) ;
                     presumably of object dtype (key function names + numbers), hence the float conversions
                     - TODO confirm

        :return: 1D array, 1 value per row of aRes
        """

        # Chi2, KS, CvM Uw and CvM Cw are contiguous columns.
        chi2KsCvMs = aRes[:, cls.CIChi2:cls.CICvMCw + 1].prod(axis=1)
        normNObs = aRes[:, cls.CINObs].astype(float) / aRes[:, cls.CINTotObs].astype(float)
        normNTotPars = 1 / (0.2 * np.maximum(2, aRes[:, cls.CINTotPars].astype(float)) + 0.6)
        normCVDens = np.exp(-12 * np.square(aRes[:, cls.CIDCv].astype(float)))

        return np.power(chi2KsCvMs * normNObs * normNTotPars * normCVDens, 1 / 7.0) # shape: aRes rows, 1 column

    # Element-wise lookup of the normalised value of key functions (0.0 for any unknown one).
    ufnNormKeyFn = np.frompyfunc(lambda keyFn: ResultsOpt2.DNormKeyFn.get(keyFn, 0.0), 1, 1)
    
    @classmethod
    def _combinedQualityAll(cls, aRes):
        
        """Compute the 5 newer quality indicators, for all the analyses at once

        Parameters:
        :param aRes: 2D array, 1 row per analysis, 1 column per CLsQuaIndicSources item (same order)

        :return: tuple of 5 1D arrays (quaBal2, quaBal3, moreChi2, moreKS, moreDCv),
                 each with 1 value per row of aRes, in the order of CLsNewQuaIndics
        """
        
        chi2 = aRes[:, cls.CIChi2].astype(float)
        ks = aRes[:, cls.CIKS].astype(float)
        dcv = aRes[:, cls.CIDCv].astype(float)
        chi2KsCvMs = aRes[:, cls.CIChi2:cls.CICvMCw + 1].astype(float).prod(axis=1)
        normNObs = aRes[:, cls.CINObs].astype(float) / aRes[:, cls.CINTotObs].astype(float)

        # October 2021
        # (part of the product that is shared by all the indicators below)
        nAdjPars2 = np.square(aRes[:, cls.CINAdjPars]).astype(float)
        normKeyFn = cls.ufnNormKeyFn(aRes[:, cls.CIKeyFn])
        normChi2KsCvMsNObsKFn = chi2KsCvMs * normNObs * normKeyFn

        # QualBal2 : A more devaluating version for NAdjPars, CVDens, also using KeyFn
        normNAdjPars2 = np.exp(-0.15 * nAdjPars2)
        normCVDens2 = np.exp(-20 * np.square(dcv))
        quaBal2 = np.power(normChi2KsCvMsNObsKFn * normNAdjPars2 * normCVDens2, 1 / 8.0)

        # QualBal3 : An even more devaluating version for NAdjPars, CVDens, also using KeyFn
        normNAdjPars3 = np.exp(-0.17 * nAdjPars2)
        normCVDens3 = np.exp(-63 * np.power(dcv, 2.8))
        normChi2KsCvMsNObsKFnAdjPDcv3 = normChi2KsCvMsNObsKFn * normNAdjPars3 * normCVDens3
        quaBal3 = np.power(normChi2KsCvMsNObsKFnAdjPDcv3, 1 / 8.0)

        # QualMoreX : Follow _combinedQualityBalanced3 update (were based on _combinedQualityBalanced1)
        # (each one gives 1 more weight to 1 of the source indicators, hence the 9th root)
        moreChi2 = np.power(normChi2KsCvMsNObsKFnAdjPDcv3 * chi2, 1 / 9.0)
        moreKS = np.power(normChi2KsCvMsNObsKFnAdjPDcv3 * ks, 1 / 9.0)
        moreDCv = np.power(normChi2KsCvMsNObsKFnAdjPDcv3 * normCVDens3, 1 / 9.0)
        
        return quaBal2, quaBal3, moreChi2, moreKS, moreDCv  # shape: aRes rows, 1 column each + order of CLsNewQuaIndics !
        
    def _postComputeQualityIndicators(self):
               
        """Post-compute the sighting rate and the combined quality indicators columns of self._dfData,
        through the vectorised class methods above
        """
               
        # The instance is simply used as a namespace for the class-level constants.
        cls = self

        logger.debug('Post-computing Quality Indicators (opt2)')

        self._dfData[cls.CLSightRate] = 100 * self._dfData[cls.CLNObs] / self._dfData[cls.CLNTotObs]  # [0,1] => %

        # Prepare data for computations
        logger.debug1('* Pre-processing source data')

        # a. extract the useful columns, after adding them if not present
        #    (NaN value, except for CLKeyFn, that MUST be there anyway)
        for miCol in cls.CLsQuaIndicSources:
            if miCol not in self._dfData.columns and miCol != cls.CLKeyFn:
                self._dfData[miCol] = np.nan
        dfCompData = self._dfData[cls.CLsQuaIndicSources].copy()

        # b. historical bal qua 1 (computed BEFORE NaN replacement below)
        logger.debug1('* Balanced quality 1')
        self._dfData[cls.CLCmbQuaBal1] = cls._combinedQualityBalanced1(dfCompData.values)

        # c. newer quality indicators
        #    (NaN value MUST kill down these indicators to compute => we have to enforce this)
        dfCompData.fillna({cls.CLNObs: cls.KilrNObs,
                           cls.CLChi2: cls.KilrStaTest, cls.CLKS: cls.KilrStaTest,
                           cls.CLCvMUw: cls.KilrStaTest, cls.CLCvMCw: cls.KilrStaTest,
                           cls.CLDCv: cls.KilrDensCv,  # Usually considered good under 0.3
                           cls.CLNTotObs: cls.KilrNTotObs,  # Should slap down _normObs whatever NObs
                           cls.CLNAdjPars: cls.KilrNPars,  # Should slap down _normNAdjPars whatever NObs
                           cls.CLNTotPars: cls.KilrNPars},
                          inplace=True)

        # The 5 returned 1D arrays are stacked as the 5 columns of a 2D one, in the order of CLsNewQuaIndics.
        logger.debug1('* Balanced quality 2, 3, Chi2+, KS+, DCv+')
        self._dfData[cls.CLsNewQuaIndics] = np.stack(cls._combinedQualityAll(dfCompData.values), axis=1)

In [None]:
%%timeit

# Initial version => 975 +/- 11 ms on a 6-HT-core i7-10850H
results._postComputeQualityIndicators()

In [None]:
# #1 optimised version (does not work)
#opt1Res = ResultsOpt1(results.copy())

In [None]:
%%time

#opt1Res._postComputeQualityIndicators()

In [None]:
# #2 optimised version
opt2Res = ResultsOpt2(results.copy())

In [None]:
%%timeit

opt2Res._postComputeQualityIndicators()

# => 9.5 +/- 0.2 ms on a 6-HT-core i7-10850H => a x100 boost :-)

In [None]:
# Compare to original method results
# (all the post-computed quality indicator columns: the historical one + the 5 newer ones)
quaIndCols = [results.CLCmbQuaBal1] + opt2Res.CLsNewQuaIndics

# Cell by cell differences between the original and the optimised computations
results._dfData[quaIndCols].compare(opt2Res._dfData[quaIndCols])

In [None]:
# Non-regression check: no difference must remain between the original and optimised indicators,
# once the "closer than dropCloser=15" ones have been dropped.
assert ads.DataSet.compareDataFrames(results._dfData, opt2Res._dfData,
                                     indexCols=[('header (head)', 'NumAnlys', 'Value')],
                                     subsetCols=quaIndCols, dropCloser=15, dropCloserCols=True).empty

# Success 2021-11-28 PM

In [None]:
results._dfData

# Development : Automated filtering and sorting of optanalysis results

En entrée :
* soit : export Excel des résultats d'opt-analyses (via [Visionature-ds-point / XVI. Analyses automatiques / 2a ou 2b](../Visionature-ds-point.ipynb#XVI.-Analyses-automatiques)),
* soit : rapport Excel 'full' des résultats d'opt-analyses généré une autre fois (via b. ci-dessous).

N.B. Code historique de développement et d'essais, maintenant industrialisé et enrichi via MCDSTruncationOptanalysisResultsSet et MCDSResultsFilterSortReport

## 1. Chargement / Génération des données d'entrée

### a. Choix étude / sous-étude / variante

(Cf. [Visionature-ds-point / I. Paramètres de l'étude : import / filtrage des données](../Visionature-ds-point.ipynb#I.-Param%C3%A8tres-de-l'%C3%A9tude-%3A-import-%2F-filtrage-des-donn%C3%A9es%2C-...) et [Visionature-ds-point / XVI. 1. c. Optanalyses à faire : variante d'études](../Visionature-ds-point.ipynb#c.-Analyses-%C3%A0-faire-%3A-variante-d'%C3%A9tudes))

In [None]:
nomEtude = 'ACDC2019'

In [None]:
sousEtude = '-Nat'
#sousEtude = '-Pap'

In [None]:
varEtude = ''

### b. Si pas déjà disponible, génération du rapport Excel 'Full' des optanalyses à partir d'un export Excel des résultats

(à partir des résultats exportés en Excel via [Visionature-ds-point / XVI. Analyses automatiques / 2a ou 2b](../Visionature-ds-point.ipynb#XVI.-Analyses-automatiques))

N.B. Si ces résultats ont été générés via pyaudisam >= 12/08/2021, la suite n'a pas grand intérêt, puisqu'elle produit les informations et la mise en forme qui sont maintenant auto-calculées / faites par MCDSTruncationOptanalysisResultsSet et MCDSResultsFilterSortReport.

In [None]:
# List of the results folders available for the study / sub-study / variant:
# sub-folders of the study folder, named like 6 digits, a dash, then 4 digits
# (presumably date-time stamps), and actually holding the expected results workbook.
resFolders = [dn.name for dn in dossier.glob('-'.join(['[0-9]' * 6, '[0-9]' * 4]))
              if dn.joinpath(f'{nomEtude}{sousEtude}-OptAnalyses{varEtude}-resultats.xlsx').is_file()]

print('Résultats disponibles:', ', '.join(resFolders))

In [None]:
# Choose the results to process
workDir = dossier / resFolders[0]  # <=== Choose the results folder here.

# Results workbook inside the chosen folder
resFileName = workDir / f'{nomEtude}{sousEtude}-OptAnalyses{varEtude}-resultats.xlsx'

print(f'Fichier choisi : {resFileName.as_posix()}')

In [None]:
# Load the results
# (the opt-analyser is only used as a results set factory here ; several of its arguments
#  - dfObsCatIndiv, effortConst, dZoneEtude, anlysIndCol, groupage, analysisAbbrev - are not defined
#  in this section: they must have been defined by earlier cells)
optanlr = \
    ads.MCDSTruncationOptanalyser(dfObsCatIndiv, dfTransects=dfTransects,
                                  effortConstVal=effortConst, dSurveyArea=dZoneEtude,
                                  transectPlaceCols=transectPlaceCols, passIdCol=passIdCol, effortCol=effortCol,
                                  sampleSelCols=sampleSelCols, sampleDecCols=sampleDecCols, sampleDistCol=sampleDistCol,
                                  abbrevCol=anlysAbbrevCol, abbrevBuilder=analysisAbbrev,
                                  anlysIndCol=anlysIndCol, sampleIndCol=sampleNumCol,
                                  distanceUnit=distanceUnit, areaUnit=areaUnit,
                                  surveyType=surveyType, distanceType=distanceType, clustering=groupage,
                                  resultsHeadCols=dict(before=[anlysIndCol, sampleNumCol], sample=sampleSelCols,
                                                       #after=anlysParamCols + [optimTruncCol, anlysAbbrevCol]))
                                                       after=anlysParamCols + [anlysAbbrevCol])) # TODO: test !

results = optanlr.setupResults()

results.fromExcel(resFileName)

In [None]:
# Selection of the columns for the synthesis tables of the report
# (3-level column labels ; samplingCols is not defined in this section: it must come from an earlier cell)
synthRepCols = \
[('header (head)', col, 'Value') for col in [anlysIndCol, sampleNumCol]] \
+ [('header (sample)', col, 'Value') for col in samplingCols] \
+ [('parameters', 'estimator key function', 'Value'),
   ('parameters', 'estimator adjustment series', 'Value'),
   ('parameters', 'left truncation distance', 'Value'),
   ('parameters', 'right truncation distance', 'Value'),
   ('parameters', 'model fitting distance cut points', 'Value'),

   ('run output', 'run status', 'Value'),

   ('encounter rate', 'number of observations (n)', 'Value'),
   ('encounter rate', 'right truncation distance (w)', 'Value'),
   ('encounter rate', 'effort (L or K or T)', 'Value'),

   ('detection probability', 'Delta AIC', 'Value'),
   ('detection probability', 'AIC value', 'Value'),
   ('detection probability', 'chi-square test probability determined', 'Value'),
   ('detection probability', 'Kolmogorov-Smirnov test probability', 'Value'),
   ('detection probability', 'Cramér-von Mises (uniform weighting) test probability', 'Value'),
   ('detection probability', 'Cramér-von Mises (cosine weighting) test probability', 'Value'),
   ('density/abundance', 'density of animals', 'Cv'),

   ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Value'),
   ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Lcl'),
   ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Ucl'),

   ('density/abundance', 'density of animals', 'Value'),
   ('density/abundance', 'density of animals', 'Lcl'),
   ('density/abundance', 'density of animals', 'Ucl'),
   ('density/abundance', 'density of animals', 'Delta Cv'),

   ('detection probability', 'probability of detection (Pw)', 'Value'),
   ('detection probability', 'probability of detection (Pw)', 'Lcl'),
   ('detection probability', 'probability of detection (Pw)', 'Ucl'),
   ('detection probability', 'probability of detection (Pw)', 'Df'),

   ('density/abundance', 'number of animals, if survey area is specified', 'Value'),
   ('density/abundance', 'number of animals, if survey area is specified', 'Lcl'),
   ('density/abundance', 'number of animals, if survey area is specified', 'Ucl'),
   ('density/abundance', 'number of animals, if survey area is specified', 'Df'),

   ('run output', 'run folder', 'Value')
]

# Columns to sort the report rows by, in decreasing priority order.
sortRepCols = \
[('header (head)', sampleNumCol, 'Value')] \
+ [('header (sample)', col, 'Value') for col in samplingCols] \
+ [('parameters', 'left truncation distance', 'Value'),
   ('parameters', 'right truncation distance', 'Value'),
   ('detection probability', 'Delta AIC', 'Value'),
   ('detection probability', 'chi-square test probability determined', 'Value'), # For same AIC !
   ('detection probability', 'Kolmogorov-Smirnov test probability', 'Value'), # For same Chi2 !
   ('run output', 'run status', 'Value'), # For same KS !
]
#   ('density/abundance', 'density of animals', 'Delta Cv')]

# Sorting order for each column of sortRepCols: ascending for the sample number, the sample columns,
# the 2 truncation distances and Delta AIC, descending for the Chi2 and KS tests, ascending for the run status.
sortRepAscend = [True]*(1+len(samplingCols)+3) + [False]*2 + [True]

assert len(sortRepCols) == len(sortRepAscend)

In [None]:
# Full report builder for the loaded results (titreEtude, descrEtude and motsClesEtude
# are not defined in this section: they must have been defined by earlier cells).
# NOTE(review): pySources names 'Visionature-ds-points.ipynb', whereas the links in this notebook
# target 'Visionature-ds-point.ipynb' (no trailing 's') - confirm the actual file name.
report = ads.MCDSResultsFullReport(resultsSet=results, synthCols=synthRepCols,
                                   sortCols=sortRepCols, sortAscend=sortRepAscend,
                                   title=titreEtude, subTitle='Rapport d\'analyse',
                                   anlysSubTitle='Détail des analyses', description=descrEtude,
                                   keywords=motsClesEtude, pySources=['Visionature-ds-points.ipynb'],
                                   lang='fr', plotImgSize=(768, 384),
                                   #plotImgQuality=80, plotImgFormat='jpg', # Same final size as raw PNG :-(
                                   tgtFolder=workDir, tgtPrefix=f'{nomEtude}{sousEtude}-OptAnalyses{varEtude}-rapport')

In [None]:
%%time

# Generate the Excel report, and show a link to the value returned by toExcel.
xlsxRep = report.toExcel()

HTML(f'Rapport Excel : <a href="{xlsxRep}" target="blank">{xlsxRep}</a>')

In [None]:
#%%time
#
#htmlRep = report.toHtml()
#
#HTML(f'Rapport HTML : <a href="{htmlRep}" target="blank">{htmlRep}</a>')

### b. Sélection et chargement du rapport 'Full' Excel à traiter

[XVI.1.c. Analyses à faire : variante d'études](#c.-Analyses-%C3%A0-faire-%3A-variante-d'%C3%A9tudes)

In [None]:
# Name of the 'Full' Excel report to look for, for the study / sub-study / variant
fn = f'{nomEtude}{sousEtude}-OptAnalyses{varEtude}-rapport.xlsx'
print(fn)

# Sub-folders of the study folder, named like 6 digits, a dash, then 4 digits, that hold such a report
repFolders = [dn.name for dn in dossier.glob('-'.join(['[0-9]' * 6, '[0-9]' * 4]))
              if dn.joinpath(fn).is_file()]

print('Rapports disponibles:', ', '.join(repFolders))

In [None]:
repFolder = dossier / repFolders[0]  # <=== Choose the folder of the report to use here.

# Full path of the chosen report workbook
xlsxRep = repFolder / fn

print(f'Fichier choisi : {xlsxRep.as_posix()}')

In [None]:
# Load the 'Détails' sheet of the chosen report (1st column used as the index).
dfRep = pd.read_excel(xlsxRep, sheet_name='Détails', index_col=0)
dfRep.head()

In [None]:
dfRep.columns, len(dfRep)

In [None]:
#dfExplOptAnlysSpecs = ads.DSAnalyser.explicitVariantSpecs(optAnlysSpecs) 
#dfExplOptAnlysSpecs

## 2. Enrichissement

* données absentes des résultats : nb total d'individus contactés par échantillon, distance max par échantillon,
* indicateurs qualité supplémentaires

In [None]:
dfSampleStats

### a. Données absentes du rapport en entrée

(TODO: à ajouter automatiquement)

In [None]:
indexCols

In [None]:
# Statistics about the samples (number of individuals, max. distances):
# 1 row per sample of the report, joined to the sample statistics through the index columns
dfStatsEch = (dfRep[['Echant'] + indexCols]
              .drop_duplicates()
              .set_index('Echant')
              .join(dfSampleStats.set_index(indexCols), on=indexCols))

# Abbreviated name of each sample, as the 1st column
dfStatsEch.insert(0, 'Abréviation', dfStatsEch[indexCols].apply(sampleAbbrev, axis='columns'))
dfStatsEch

In [None]:
#dfStatsEch.reset_index().to_excel('tmp/stats-ech.xlsx', index=False)

In [None]:
# Add the min / max distances and the total number of individuals of its sample to each report row
# (joined through the index columns ; dfRep is overwritten, so this cell is not meant to be run twice).
dfRep = dfRep.join(dfStatsEch[indexCols + ['Distance Min', 'Distance Max', 'NTot Obs']].set_index(indexCols), on=indexCols)

In [None]:
# Rate (%) of individuals kept after applying the truncations
# (plain column arithmetic: same formula as a row by row computation, without the Python-level loop)
dfRep['Taux Obs'] = 100 * dfRep['NObs'] / dfRep['NTot Obs']

### b. Obsolète : détection des résultats avec troncatures optimisées

Maintenant (27/12/2020), une colonne pour ça existe en sortie de l'optanalyseur.

In [None]:
optimTruncCol = ads.MCDSTruncationOptanalyser.OptimTruncFlagCol

if optimTruncCol in dfRep.columns and not dfRep[optimTruncCol].isnull().all():

    # The flag column is already there, and filled-in: nothing to do.
    print(f'Colonne "{optimTruncCol}" déja présente dans le rapport, rien à faire de plus')

else:

    def isTruncationOptimised(sRes):

        """Guess whether the truncation parameters of 1 result row were optimised

        :param sRes: report row (Series)
        :return: 1 if any truncation distance has a non-null decimal part,
                 or if both are missing while the modelling cut points are specified ; 0 otherwise
        """

        sTruncDists = sRes[['Dist Tronc Gche', 'Dist Tronc Drte']]

        # mod(1) => decimal part of each distance (missing ones counted as 0)
        if sTruncDists.fillna(0).mod(1).sum() > 0:
            return 1

        if sTruncDists.isnull().all() and not pd.isnull(sRes['Tranch Dist Mod']):
            return 1

        return 0

    dfRep[optimTruncCol] = dfRep.apply(isTruncationOptimised, axis='columns')

len(dfRep), dfRep[optimTruncCol].value_counts()

### c. Des indicateurs qualité composés

(différentes recettes privilégiant ou pas tel ou tel indicateur de base)

In [None]:
def normNObs(sRes):
    """Normalised number of observations: share of the sample individuals kept by the truncations

    :param sRes: result row, giving access to 'NObs' and 'NTot Obs'
    """
    nKept, nTotal = sRes['NObs'], sRes['NTot Obs']
    return nKept / nTotal

def normNTotPars(sRes, a=0.2, b=0.6):
    """Normalised total number of model parameters: 1 / (a * max(2, 'NbTot Pars') + b)

    :param sRes: result row, giving access to 'NbTot Pars'
    :param a: slope of the penalty (a=0.2 with b=1 was too penalising)
    :param b: offset of the penalty (better: a=0.2, b=0.6)
    """
    nPars = max(2, sRes['NbTot Pars'])  # No bonus under 2 parameters
    return 1 / (a * nPars + b)

def normCVDens(sRes, a=12):
    """Normalised density variation coefficient: exp(-a * 'CoefVar Densité'^2)

    :param sRes: result row, giving access to 'CoefVar Densité'
    :param a: penalty factor (a linear max(0, 1 - a * cv) with a=1 was not penalising enough ;
              with a=12, the result is already ~0.33 for a 30% variation coefficient)
    """
    densCv = sRes['CoefVar Densité']
    return math.exp(-a * densCv ** 2)

In [None]:
def combinedGoodnessBalanced(sRes):
    """Balanced combined quality indicator of 1 result row: 7th root of the product
    of the 4 test probabilities and of the 3 normalised indicators (nb of observations,
    nb of parameters, density variation coefficient)

    :param sRes: result row (Series)
    """
    testsProd = sRes[['Chi2 P', 'KS P', 'CvM Uw P', 'CvM Cw P']].prod()
    allProd = testsProd * normNObs(sRes) * normNTotPars(sRes, a=0.2, b=0.6) * normCVDens(sRes, a=12)
    return allProd ** (1.0/7)

dfRep['Qual Equi'] = dfRep.apply(combinedGoodnessBalanced, axis='columns')

In [None]:
def combinedGoodnessMoreChi2(sRes):
    """Combined quality indicator of 1 result row, giving more weight to the Chi2 test:
    8th root of the product of the 8 factors (Chi2 twice, KS, CvM Uw, CvM Cw,
    and the 3 normalised indicators)

    :param sRes: result row (Series)
    """
    allProd = sRes[['Chi2 P', 'Chi2 P', 'KS P', 'CvM Uw P', 'CvM Cw P']].prod() \
              * normNObs(sRes) * normNTotPars(sRes, a=0.2, b=0.6) * normCVDens(sRes, a=12)
    # The root must apply to the whole product: ** binds tighter than *,
    # so without an explicit grouping, only the last factor would get rooted.
    return allProd ** (1.0/8)

dfRep['Qual Chi2'] = dfRep.apply(combinedGoodnessMoreChi2, axis='columns')

In [None]:
def combinedGoodnessMoreKS(sRes):
    """Combined quality indicator of 1 result row, giving more weight to the KS test:
    8th root of the product of the 8 factors (Chi2, KS twice, CvM Uw, CvM Cw,
    and the 3 normalised indicators)

    :param sRes: result row (Series)
    """
    allProd = sRes[['Chi2 P', 'KS P', 'KS P', 'CvM Uw P', 'CvM Cw P']].prod() \
              * normNObs(sRes) * normNTotPars(sRes, a=0.2, b=0.6) * normCVDens(sRes, a=12)
    # The root must apply to the whole product: ** binds tighter than *,
    # so without an explicit grouping, only the last factor would get rooted.
    return allProd ** (1.0/8)

dfRep['Qual KS'] = dfRep.apply(combinedGoodnessMoreKS, axis='columns')

In [None]:
def combinedGoodnessMoreDCV(sRes):
    """Combined quality indicator of one result row, with a double weight on the density CV score.

    8th root of the product of 8 factors: 'Chi2 P', 'KS P', 'CvM Uw P', 'CvM Cw P',
    normNObs, normNTotPars and normCVDens (squared, i.e. counted twice).
    """
    # The whole product must be parenthesised: '**' binds tighter than '*',
    # so without the parentheses the root would only apply to the squared CV score.
    return (sRes[['Chi2 P', 'KS P', 'CvM Uw P', 'CvM Cw P']].prod()
            * normNObs(sRes) * normNTotPars(sRes, a=0.2, b=0.6)
            * normCVDens(sRes, a=12) ** 2) ** (1.0/8)

dfRep['Qual DCV'] = dfRep.apply(combinedGoodnessMoreDCV, axis='columns')

### d. Groupes de valeurs de distances de troncature

Pour pouvoir regrouper les troncatures proches, et ne garder qu'un résultat par groupe

In [None]:
# Small margin added to the largest distance of an interval to get its (open) upper bound.
epsDist = 1e-6

# Grouping specs, one per truncation distance column:
# * col: name of the truncation distance column to group,
# * minDist: a gap between 2 consecutive sorted distances must exceed this to start a new interval,
# * maxLen: intervals wider than this get split into equal-length sub-intervals.
ldTruncIntrvSpecs = [dict(col='Dist Tronc Gche', minDist=5.0, maxLen=5.0),
                     dict(col='Dist Tronc Drte', minDist=25.0, maxLen=25.0)]

In [None]:
# For each sample,
for lblEch, sEch in dfStatsEch.iterrows():
    
    print(f'#{lblEch}', sEch['Abréviation'], end=': ')

    # For each kind of truncation (optimised or not),
    for isOpt in sorted(dfRep.loc[dfRep.Echant == lblEch, optimTruncCol].unique()):
        
        print('{}optim'.format('' if isOpt else 'non ').title(), end=' : ')

        # Select the associated results, and only these ones
        dfSelRep = dfRep[(dfRep.Echant == lblEch) & (dfRep[optimTruncCol] == isOpt)]

        for dTrunc in ldTruncIntrvSpecs:

            truncCol = dTrunc['col']
            minIntrvDist = dTrunc['minDist']
            maxIntrvLen = dTrunc['maxLen']

            print(truncCol, end=', ')

            dfIntrv = dfSelRep[[truncCol]].dropna().sort_values(by=truncCol).copy()

            # Non-null gaps between consecutive sorted distances
            # (the smallest distance gets an infinite gap, so that it always starts an interval).
            # NOTE(review): if all the distances are null here, dfIntrv is empty and idxmin()
            # presumably raises - confirm whether this case can occur.
            dfIntrv['deltaDist'] = dfIntrv[truncCol].diff()
            dfIntrv.loc[dfIntrv[truncCol].idxmin(), 'deltaDist'] = np.inf
            dfIntrv.dropna(inplace=True)
            dfIntrv = dfIntrv[dfIntrv.deltaDist > 0].copy()

            # Start and end of each interval (closed on the left = dMin, open on the right = dSup):
            # an interval starts at each distance preceded by a gap larger than minIntrvDist,
            # and provisionally ends where the next one starts (infinity for the last one).
            dfIntrv['dMin'] = dfIntrv.loc[dfIntrv.deltaDist > minIntrvDist, truncCol]
            dfIntrv['dSup'] = dfIntrv.loc[dfIntrv.deltaDist > minIntrvDist, truncCol].shift(-1).dropna()
            dfIntrv.loc[dfIntrv['dMin'].idxmax(), 'dSup'] = np.inf
            dfIntrv.dropna(inplace=True)

            # Tighten each upper bound to the largest actual distance below it, plus a small margin.
            sSelDist = dfSelRep[truncCol]
            dfIntrv['dSup'] = dfIntrv['dSup'].apply(lambda supV: sSelDist[sSelDist < supV].max() + epsDist)

            dfIntrv = dfIntrv[['dMin', 'dSup']].reset_index(drop=True)

            # If the intervals detected this way are too wide, split them into equal slices
            # (number of slices = width / maxIntrvLen, rounded to the nearest integer).
            lsNewIntrvs = list()
            for _, sIntrv in dfIntrv.iterrows():

                if sIntrv.dSup - sIntrv.dMin > maxIntrvLen:
                    nSubIntrvs = (sIntrv.dSup - sIntrv.dMin) / maxIntrvLen
                    nSubIntrvs = int(nSubIntrvs) if nSubIntrvs - int(nSubIntrvs) < 0.5 else int(nSubIntrvs) + 1
                    subIntrvLen = (sIntrv.dSup - sIntrv.dMin) / nSubIntrvs
                    lsNewIntrvs += [pd.Series(dict(dMin=sIntrv.dMin + nInd * subIntrvLen, 
                                                   dSup=min(sIntrv.dMin + (nInd + 1) * subIntrvLen, sIntrv.dSup)))
                                    for nInd in range(nSubIntrvs)]
                else:
                    lsNewIntrvs.append(sIntrv)

            dfIntrv = pd.DataFrame(lsNewIntrvs).reset_index(drop=True)
            dfIntrv.sort_values(by='dMin', inplace=True)

            # Assign its truncation group number to each measured distance (0 = no truncation):
            # 1 + index label of the first interval containing the distance.
            dfRep.loc[(dfRep.Echant == lblEch) & (dfRep[optimTruncCol] == isOpt), 'Grp ' + truncCol] = \
                dfSelRep[truncCol].apply(lambda d: 0 if pd.isnull(d) \
                                                   else 1 + dfIntrv[(dfIntrv.dMin <= d) & (dfIntrv.dSup > d)].index[0])

        print(len(dfSelRep), end=' ; ')
        
    print()

len(dfRep)

### e. Correction résultats "Nombre *" ACDC2019-Nat d'avant le 29/12/2020 (erreur surface zone => facteur 100)

In [None]:
# Reports whose folder name sorts before '201229' have their "Nombre" columns divided by 100
# (ha => km2 zone area unit error); more recent ones are left untouched.
# Beware: not idempotent, each run of this cell divides again.
if repFolder.stem >= '201229':

    print('Rien à corriger, suffisamment récent')

else:

    print("Correction des nombres d'individus, suite erreur unité surface zone ACDC 2019.")
    for colNb in ('Nombre', 'Min Nombre', 'Max Nombre'):
        dfRep[colNb] = dfRep[colNb] / 100  # ha => km2

    print(dfRep['Nombre'].describe())

## 3. Ajout colonnes de filtrage et tri

* Ici, on ne supprime pas les lignes, on leur attribue un indice suivant un ordre de tri, global ou par/dans groupe spécifié, 
* ce qui permet ensuite de filtrer si besoin en ne gardant que les N meilleures lignes (via pandas ou dans un tableur).

In [None]:
# Specification of the filtering / sorting schemes: each scheme has
# * name: name of the rank column to add to the report,
# * sort / ascend: columns to sort by, and the matching sort directions,
# * group (optional): columns to group by; when present, the rank is computed inside each group,
# * napos (optional): position of the null values when sorting (default: 'last').
filSorSchemes = \
[  # Rank inside group.
   dict(name='Meil AIC Tronc Id',  # Best AIC, for identical left and right truncations (with number-of-bins variants)
         sort=['Dist Tronc Gche', 'Dist Tronc Drte',
               'Delta AIC', 'Chi2 P', 'KS P', 'CoefVar Densité', 'NObs', 'CodEx'],
         ascend=[True, True, True, False, False, True, False, True],
         group=['Dist Tronc Gche', 'Dist Tronc Drte', 'Tranch Dist Mod']),
    
   dict(name='Meil CKCv Tronc Proch',  # Best Chi2 & KS & density CV per group of close truncations
        sort=[optimTruncCol, 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte',
              'Chi2 P', 'KS P', 'CoefVar Densité', 'NObs', 'CodEx'],
        ascend=[True, True, True, False, False, True, False, True],
        group=[optimTruncCol, 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte']),
#   dict(name='Meil Chi2 Tronc Proch',  # Best Chi2 per group of close truncations
#        sort=[optimTruncCol, 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte',
#              'Chi2 P'],
#        ascend=[True, True, True, False],
#        group=[optimTruncCol, 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte']),
   dict(name='Meil CVDens Tronc Proch',  # Best density CV per group of close truncations
        sort=[optimTruncCol, 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte',
              'CoefVar Densité'],
        ascend=[True, True, True, True],
        group=[optimTruncCol, 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte']),
    
   dict(name='Meil Qual Equi Tronc Proch',  # Best balanced combined quality per group of close truncations
        sort=[optimTruncCol, 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte',
              'Qual Equi'],
        ascend=[True, True, True, False],
        group=[optimTruncCol, 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte']),
   dict(name='Meil Qual Chi2 Tronc Proch',  # Best Chi2+ combined quality per group of close truncations
        sort=[optimTruncCol, 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte',
              'Qual Chi2'],
        ascend=[True, True, True, False],
        group=[optimTruncCol, 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte']),
   dict(name='Meil Qual KS Tronc Proch',  # Best KS+ combined quality per group of close truncations
        sort=[optimTruncCol, 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte',
              'Qual KS'],
        ascend=[True, True, True, False],
        group=[optimTruncCol, 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte']),
   dict(name='Meil Qual DCV Tronc Proch',  # Best DCV+ combined quality per group of close truncations
        sort=[optimTruncCol, 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte',
              'Qual DCV'],
        ascend=[True, True, True, False],
        group=[optimTruncCol, 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte']),
    
   # Global rank (no grouping)
   dict(name='Ord CKCv',
        sort=['Chi2 P', 'KS P', 'CoefVar Densité', 'NObs', 'CodEx'],
        ascend=[False, False, True, False, True]),
   dict(name='Ord Qual Equi',
        sort='Qual Equi', ascend=False),
   dict(name='Ord Qual Chi2',
        sort='Qual Chi2', ascend=False),
   dict(name='Ord Qual KS',
        sort='Qual KS', ascend=False),
   dict(name='Ord Qual DCV',
        sort='Qual DCV', ascend=False),
   dict(name='Ord Simpl Tronc',
        sort=['Dist Tronc Gche', 'Dist Tronc Drte', 'Tranch Dist Mod',
              'Delta AIC', 'Chi2 P', 'KS P', 'CoefVar Densité', 'NObs', 'CodEx'],
        ascend=[True, True, True, True, False, False, True, False, True], napos='first'),
]

# N.B. The row order left in the end is the one of the last scheme applied ...

In [None]:
# Add the rank columns that will later allow applying the filtering / sorting schemes
ldfEchReps = []

for lblEch, sEch in dfStatsEch.iterrows():

    # Select the results of the sample
    print(f'#{lblEch}', sEch['Abréviation'], end=': ')
    dfEchRep = dfRep[dfRep.Echant == lblEch].copy()

    # Apply each scheme in turn: sort, then number the rows (inside each group, or globally)
    for scheme in filSorSchemes:

        dfEchRep = dfEchRep.sort_values(by=scheme['sort'], ascending=scheme['ascend'],
                                        na_position=scheme.get('napos', 'last'))

        if 'group' in scheme:
            dfEchRep[scheme['name']] = dfEchRep.groupby(scheme['group'], dropna=False).cumcount()
        else:
            dfEchRep[scheme['name']] = range(len(dfEchRep))

    print(len(dfEchRep))

    # Store the resulting table of the sample
    ldfEchReps.append(dfEchRep)

# Result: all the samples back together, keeping the original report index (row #)
dfRep = pd.concat(ldfEchReps)

len(dfRep)

In [None]:
# Export of the enriched report, before any filtering:
# same folder as the source report, file name with 'rapport' replaced by 'rapenrich'
# (an alternate .with_suffix('.ods') target was left disabled by the author).
fpn = pl.Path(xlsxRep)
fpn = fpn.with_name(fpn.name.replace('rapport', 'rapenrich'))

print('=>', fpn.as_posix())

dfRep.to_excel(fpn)

## 4. Filtrage et tri grâce aux colonnes ad hoc

In [None]:
def filterDuplicates(dfRes, keep='first', subset=None, round2decs=None):

    """Index of the rows of a results table that duplicate another row,
    optionally after rounding some columns.

    :param dfRes: results table (never modified: a copy is rounded when needed)
    :param keep: occurrence NOT to be reported as a duplicate ('first', 'last' or False),
                 as for pandas.DataFrame.duplicated
    :param subset: columns to compare; None or empty => all the columns
    :param round2decs: {column: number of decimals} to round to before comparing;
                       only the columns actually compared get rounded; None or empty => no rounding
    :return: index of the duplicated rows (can be passed to DataFrame.drop)
    """

    # pandas expects None (not an empty list) for "compare all the columns".
    cmpCols = subset if subset is not None and len(subset) > 0 else None

    if round2decs:

        dfRes = dfRes.copy()
        for col, dec in round2decs.items():

            # No useless work: only round the columns that will be compared.
            if cmpCols is None:
                if col not in dfRes.columns:
                    continue
            elif col not in cmpCols:
                continue

            # Value-by-value rounding on purpose: the original author found DataFrame.round
            # unreliable (columns lost) with pandas 1.0.x up to 1.1.2.
            dfRes[col] = dfRes[col].apply(lambda x: x if pd.isnull(x) else round(x, ndigits=dec))

    return dfRes[dfRes.duplicated(keep=keep, subset=cmpCols)].index

# Columns compared to detect duplicated results (passed as 'subset' to filterDuplicates).
dupSubset = ['Echant', 'NObs', 'Effort', 'Delta AIC',
             'Chi2 P', 'KS P', 'CvM Uw P', 'CvM Cw P', 'CoefVar Densité', 
             'PDetec', 'Min PDetec', 'Max PDetec', 'Densité', 'Min Densité', 'Max Densité']
# Number of decimals each column is rounded to before comparison (passed as 'round2decs' to filterDuplicates).
dupRounds = {'Delta AIC': 1, 'Chi2 P': 2, 'KS P': 2, 'CvM Uw P': 2, 'CvM Cw P': 2, 'CoefVar Densité': 2, 
             'PDetec': 3, 'Min PDetec': 3, 'Max PDetec': 3, 'Densité': 2, 'Min Densité': 2, 'Max Densité': 2}

In [None]:
def filterFixScheme(dfRes, sampleIds, sampleIdCol, nMinRes, sbIsToBeDroppped, dfDropThresholds):

    """Generic filtering function, controlling the number of results kept
    through a predefined scheme of thresholds on a criterion.

    For each sample, the rows of dfDropThresholds are tried in order, each one being passed
    as keyword arguments to sbIsToBeDroppped(dfSampRes, nMinRes, **threshold), which must return
    a boolean mask of the results to drop; the first threshold keeping at least nMinRes results
    is retained (or else the last one tried).

    :return: the index of the results to drop, all samples together ([] if no sample at all)
    """

    i2Drop = []
    for sampId in sampleIds:

        dfSampRes = dfRes[dfRes[sampleIdCol] == sampId]
        nSampRes = len(dfSampRes)

        # Try the thresholds in order, until enough results are kept.
        for _, sThreshold in dfDropThresholds.iterrows():
            sbDrop = sbIsToBeDroppped(dfSampRes, nMinRes, **sThreshold)
            i2DropSamp = dfSampRes[sbDrop].index
            if nSampRes - len(i2DropSamp) >= nMinRes:
                break

        # Accumulate the index to drop for this sample.
        if len(i2Drop):
            i2Drop = i2Drop.append(i2DropSamp)
        else:
            i2Drop = i2DropSamp

    return i2Drop

In [None]:
def isToBeDroppedOnChi2(dfRes, nMinRes, chi2):

    """Boolean mask of the results whose 'Chi2 P' is strictly below the chi2 threshold
    (nMinRes is not used)."""

    sChi2 = dfRes['Chi2 P']
    return sChi2 < chi2

In [None]:
def isToBeDroppedOnCombinedQuality(dfRes, nMinRes, quality):

    """Boolean mask of the results whose 'Qualité' is strictly below the quality threshold
    (nMinRes is not used)."""

    sQuality = dfRes['Qualité']
    return sQuality < quality

In [None]:
def filterDichotScheme(dfRes, sampleIds, sampleIdCol, critCol, ascendCrit=True,
                       minCritStep=0.001, nMinRes=10, verbose=False):
    
    """Generic filtering function, controlling the number of results kept
    through an adaptive dichotomic scheme of thresholds on 1 criterion
    (computed from its actual value domain).

    :param dfRes: results table
    :param sampleIds: identifiers of the samples to process
    :param sampleIdCol: name of the column of dfRes holding the sample identifier
    :param critCol: name of the criterion column
    :param ascendCrit: if True, results with criterion < threshold get dropped;
                       otherwise, results with criterion > threshold get dropped
    :param minCritStep: the search stops once the explored interval is narrower than this
                        (and the number of results to drop no longer changes)
    :param nMinRes: target number of results to keep per sample; samples with no more
                    results than this are left untouched
    :param verbose: if True, print the progress of the search
    :return: the index of the results to drop, all samples together
             ([] when nothing was accumulated)

    NOTE(review): when the search stops on "no more change", the number of results kept is
    not checked against nMinRes; with tied criterion values, it looks like it can end up
    below nMinRes - to be confirmed.
    NOTE(review): if all the criterion values of a sample are null, the stop conditions
    presumably never hold (NaN interval) - to be confirmed.
    """
    
    # For each sample ...
    i2Drop = []
    for sampId in sampleIds:

        # Extract results.
        dfSampRes = dfRes[dfRes[sampleIdCol] == sampId]
        if verbose: print('#{}: {} results'.format(sampId, len(dfSampRes)), end=' => ')

        # Compute criteria threshold variation scheme from actual value domain
        start = dfSampRes[critCol].max() if ascendCrit else dfSampRes[critCol].min()
        stop = dfSampRes[critCol].min() if ascendCrit else dfSampRes[critCol].max()
        if verbose: print(f'{critCol} [{start:.3f},{stop:.3f}]', end=': ')

        # No need for tweaking criteria thresholds, we won't get more results.
        if len(dfSampRes) <= nMinRes:
            if verbose: print('t={:.3f}/k={}'.format(stop, len(dfSampRes)), end=', ')
            if verbose: print('done, no more possible.')
            continue
        
        # For each step of the scheme ... (thresh is saved along, but never read afterwards)
        i2DropSamp, thresh = [], start
        while True:
            
            # Next try : middle of the interval to explore.
            threshTry = (start + stop) / 2

            # Try and apply the threshold step : number of dropped results if ...
            if ascendCrit:
                i2DropSampTry = dfSampRes[dfSampRes[critCol] < threshTry].index
            else:
                i2DropSampTry = dfSampRes[dfSampRes[critCol] > threshTry].index

            if verbose: print('t={:.3f}/k={}'.format(threshTry, len(dfSampRes) - len(i2DropSampTry)), end=', ')

            # Stop here if the min number expected of results would be reached
            if len(dfSampRes) - len(i2DropSampTry) == nMinRes:
                i2DropSamp, thresh = i2DropSampTry, threshTry
                if verbose: print('done, target reached.')
                break
                
            # Stop when no change in the number of results to drop and the interval got narrow enough
            # (the previously saved try is then the one retained).
            elif len(i2DropSampTry) == len(i2DropSamp) and abs(start - stop) < minCritStep:
                if verbose: print('done, no more change.')
                break
                            
            # Update criteria interval to explore according to whether we would be
            #  below or above the min number expected of results if ...
            if len(dfSampRes) - len(i2DropSampTry) > nMinRes:
                if ascendCrit:
                    stop = threshTry
                else:
                    start = threshTry
            else:
                if ascendCrit:
                    start = threshTry
                else:
                    stop = threshTry
                    
            # Or else, save current try, and go on.
            i2DropSamp, thresh = i2DropSampTry, threshTry

        # Append index to drop for sample to the final one
        i2Drop = i2DropSamp if not len(i2Drop) else i2Drop.append(i2DropSamp)
 
    return i2Drop

In [None]:
indexCols

In [None]:
# Columns displayed when previewing the filtered and sorted report at the end of each method cell.
# N.B. 'Sélection Qual Equi' only exists once a method cell has added it (with preSelCol = 'Qual Equi').
previewCols = ['Echant'] + indexCols \
              + ['Dist Tronc Gche', 'Dist Tronc Drte', 'Tranch Dist Mod', 'CodEx', 'NObs',
                 'Delta AIC', 'Chi2 P', 'KS P', 'CoefVar Densité',
                 'Sélection Qual Equi',
                 'Qual Equi', 'Qual Chi2', 'Qual KS', 'Qual DCV',
                 'Densité', 'Min Densité', 'Max Densité']

In [None]:
ddfFilSorRep = dict()  # { method: dfFilSorRep}
ldFilSorSteps = list()  # [(method, step, param, value)]

### Méthode "codexec"

Filtrage minimal et tri :
1. Eliminer CodEx 3 et +

In [None]:
nPreSel = 5
preSelCol = 'Qual Equi'

In [None]:
# Method "codexec": filter on the execution code only, remove duplicates, pre-select and sort.
# Each step is traced in ldFilSorSteps as (method, step, variable, value).
method = 'codexec'

print(f'Méthode "{method}"')

dfFilSorRep = dfRep.copy()
ldFilSorSteps.append((method, 'Avant', 'Résultats', len(dfFilSorRep)))
print('* Avant :', len(dfFilSorRep))

# Drop the results with CodEx above 2.
dfFilSorRep.drop(dfFilSorRep[dfFilSorRep.CodEx > 2].index,
                 inplace=True)
ldFilSorSteps.append((method, 'CodEx', 'Max', 2))
ldFilSorSteps.append((method, 'CodEx', 'Résultats', len(dfFilSorRep)))
print('* CodEx :', len(dfFilSorRep))

# Drop the duplicates (after rounding, see filterDuplicates), keeping the first one
# in sample, truncation distances and CodEx order.
dfFilSorRep.sort_values(by=['Echant', 'Dist Tronc Gche', 'Dist Tronc Drte', 'CodEx'],
                        ascending=True, na_position='first', inplace=True)
# (a former plain drop_duplicates on the same columns, without rounding, was replaced by the call below)
dfFilSorRep.drop(filterDuplicates(dfFilSorRep, keep='first', subset=dupSubset, round2decs=dupRounds),
                 inplace=True)
ldFilSorSteps.append((method, 'Doublons', 'Résultats', len(dfFilSorRep)))
print('* Doublons :', len(dfFilSorRep))

# Automatic pre-selection: dense rank (1 = highest preSelCol value) inside each
# sample + indexCols group, only kept for the nPreSel first ranks (null otherwise).
selPreSelCol = 'Sélection ' + preSelCol
dfFilSorRep[selPreSelCol] = dfFilSorRep.groupby(['Echant'] + indexCols)[preSelCol] \
                                       .transform(lambda s: s.rank(ascending=False, method='dense'))
dfFilSorRep.loc[dfFilSorRep[selPreSelCol] > nPreSel, selPreSelCol] = np.nan

ldFilSorSteps.append((method, 'Pré-sélection auto', 'NbPréSélections', nPreSel))
ldFilSorSteps.append((method, 'Pré-sélection auto', 'ColonnePréSélection', preSelCol))
print('* Pré-sélection auto: {}{}'.format(nPreSel, preSelCol.replace(' ', '')))

# Final sort: sample, truncation distances, then balanced combined quality rank.
dfFilSorRep.sort_values(by=['Echant', 'Dist Tronc Gche', 'Dist Tronc Drte', 'Ord Qual Equi'],
                        ascending=True, na_position='first', inplace=True)
ldFilSorSteps.append((method, 'Tri', 'Colonnes', 'TroncGche, TroncDrte, QualEqui'))
print('* Tri : TroncGche, TroncDrte, QualEqui')

# Store the result of the method.
ddfFilSorRep[method] = dfFilSorRep

dfFilSorRep[previewCols].head(15)

### Méthodes "ckcvN-fN" (obsolètes) : filtrage sur Chi2&KS&DCV ... adaptatif à seuils

Filtrage et tri :
1. Eliminer CodEx 3 et +,
2. Par groupe de troncatures Gche et Drte et nb tranches fitting identiques,
   garder le meilleur AIC (et Chi2 & KS & DCV & CodEx & NObs),
3. Par groupe de troncatures optimisées Gche et Drte proches (algo. de groupage à seuils, + gdrs à droite),
   garder le meilleur Chi2 & KS & DCV & CodEx & NObs,
4. Garder les Taux d'obs conservés >= 95%,
5. Garder les Chi2 >= 0.8 (sauf si moins de 5 résultats : baisser le seuil jusqu'à ...),
6. Trier par absence / simplicité des troncatures (sans < sans gche < sans drte < avec gche et dte) et CKCv.

In [None]:
nPreSel = 3
preSelCol = 'Qual Equi'

In [None]:
sightRate = 95
nBestAIC = 1
nBestCKCv = 1
nResults = 5
startChi2 = 0.8
stopChi2 = 0.1
nChi2Steps = 8

In [None]:
# Method "ckcvN-fN" (obsolete): multi-step filtering, ending with a fixed scheme of Chi2 thresholds.
# The method name is built from the parameters set in the cells above (sightRate, nResults).
# Each step is traced in ldFilSorSteps as (method, step, variable, value).
method = f'ckcv{int(sightRate*10)}f{nResults}'

print(f'Méthode "{method}"')

dfFilSorRep = dfRep.copy()
ldFilSorSteps.append((method, 'Avant', 'Résultats', len(dfFilSorRep)))
print('* Avant :', len(dfFilSorRep))

# Drop the results with CodEx above 2.
dfFilSorRep.drop(dfFilSorRep[dfFilSorRep.CodEx > 2].index,
                 inplace=True)
ldFilSorSteps.append((method, 'CodEx', 'Max', 2))
ldFilSorSteps.append((method, 'CodEx', 'Résultats', len(dfFilSorRep)))
print('* CodEx :', len(dfFilSorRep))

# Drop the duplicates (after rounding, see filterDuplicates), keeping the first one
# in sample, truncation distances and CodEx order.
dfFilSorRep.sort_values(by=['Echant', 'Dist Tronc Gche', 'Dist Tronc Drte', 'CodEx'],
                        ascending=True, na_position='first', inplace=True)
dfFilSorRep.drop(filterDuplicates(dfFilSorRep, keep='first', subset=dupSubset, round2decs=dupRounds),
                 inplace=True)
ldFilSorSteps.append((method, 'Doublons', 'Résultats', len(dfFilSorRep)))
print('* Doublons :', len(dfFilSorRep))

# Keep the nBestAIC first ranks of the 'Meil AIC Tronc Id' scheme only.
dfFilSorRep.drop(dfFilSorRep[dfFilSorRep['Meil AIC Tronc Id'] >= nBestAIC].index,
                 inplace=True)
ldFilSorSteps.append((method, 'AIC TroncId', 'NbMeilleurs', nBestAIC))
ldFilSorSteps.append((method, 'AIC TroncId', 'Résultats', len(dfFilSorRep)))
print(f'* Meil{nBestAIC}AIC TroncId:', len(dfFilSorRep))

# Among the results flagged as optimised truncations, keep the nBestCKCv first ranks
# of the 'Meil CKCv Tronc Proch' scheme only.
dfFilSorRep.drop(dfFilSorRep[(dfFilSorRep[optimTruncCol] == 1)
                             & (dfFilSorRep['Meil CKCv Tronc Proch'] >= nBestCKCv)].index,
                 inplace=True)
ldFilSorSteps.append((method, 'TroncOpt & MeilleursCKCv TroncProch', 'NbMeilleurs', nBestCKCv))
ldFilSorSteps.append((method, 'TroncOpt & MeilleursCKCv TroncProch', 'Résultats', len(dfFilSorRep)))
print(f'* TrOpt & {nBestCKCv}CKCv:', len(dfFilSorRep))

# Drop the results with 'Taux Obs' below sightRate.
dfFilSorRep.drop(dfFilSorRep[dfFilSorRep['Taux Obs'] < sightRate].index,
                 inplace=True)
ldFilSorSteps.append((method, 'Taux Obs conservées', 'Min', sightRate))
ldFilSorSteps.append((method, 'Taux Obs conservées', 'Résultats', len(dfFilSorRep)))
print(f'* TauxObs{sightRate} :', len(dfFilSorRep))

# Per sample, drop the results below a Chi2 threshold, tried from startChi2 down to stopChi2
# in nChi2Steps evenly spaced values, until at least nResults results are kept (see filterFixScheme).
dfFilSorRep.drop(filterFixScheme(dfFilSorRep, sampleIds=dfFilSorRep.Echant.unique(), sampleIdCol='Echant',
                                 nMinRes=nResults, sbIsToBeDroppped=isToBeDroppedOnChi2,
                                 dfDropThresholds=pd.DataFrame(dict(chi2=np.linspace(start=startChi2,
                                                                                     stop=stopChi2, num=nChi2Steps)))),
                 inplace=True)
ldFilSorSteps.append((method, 'Meilleurs Chi2', 'Début', startChi2))
ldFilSorSteps.append((method, 'Meilleurs Chi2', 'Fin', stopChi2))
ldFilSorSteps.append((method, 'Meilleurs Chi2', 'NbPas', nChi2Steps))
ldFilSorSteps.append((method, 'Meilleurs Chi2', 'NbCible', nResults))
ldFilSorSteps.append((method, 'Meilleurs Chi2', 'Résultats', len(dfFilSorRep)))
print(f'* {nResults}fChi2 :', len(dfFilSorRep))

# Automatic pre-selection: dense rank (1 = highest preSelCol value) inside each
# sample + indexCols group, only kept for the nPreSel first ranks (null otherwise).
selPreSelCol = 'Sélection ' + preSelCol
dfFilSorRep[selPreSelCol] = dfFilSorRep.groupby(['Echant'] + indexCols)[preSelCol] \
                                       .transform(lambda s: s.rank(ascending=False, method='dense'))
dfFilSorRep.loc[dfFilSorRep[selPreSelCol] > nPreSel, selPreSelCol] = np.nan

ldFilSorSteps.append((method, 'Pré-sélection auto', 'NbPréSélections', nPreSel))
ldFilSorSteps.append((method, 'Pré-sélection auto', 'ColonnePréSélection', preSelCol))
print('* Pré-sélection auto: {}{}'.format(nPreSel, preSelCol.replace(' ', '')))

# Final sort: sample, truncation distances, then global Chi2 & KS & density CV rank.
dfFilSorRep.sort_values(by=['Echant', 'Dist Tronc Gche', 'Dist Tronc Drte', 'Ord CKCv'],
                        ascending=True, na_position='first', inplace=True)
ldFilSorSteps.append((method, 'Tri', 'Colonnes', 'TroncGche, TroncDrte, Chi2&KS&...DCV'))
print('* Tri : TroncGche, TroncDrte, Chi2&KS...DCV')
                     
# Store the result of the method.
ddfFilSorRep[method] = dfFilSorRep

dfFilSorRep[previewCols].head(15)

### Méthodes "ckcvqualN-dN" : Filtrage sur plusieurs critères, et adaptatif dichotomique

Filtrage et tri proche de 1 (?) mais moins méchant, pour actions manuelles de filtrage a posteriori
1. Eliminer CodEx 3 et +,
2. Par groupe de troncatures Gche et Drte et nb tranches fitting identiques,
   garder les N1 meilleurs AIC & Chi2 & KS & DCV & NObs & CodEx,
3. Par groupe de troncatures Gche et Drte proches (algo. de groupage à seuils, analyses optim / non optim séparées), garder :
    * les N2 meilleurs Chi2 & KS & DCV & NObs & CodEx,
    * les N2 meilleurs DCV & Chi2 & KS & NObs & CodEx,
    * les N2 meilleurs indicateurQualitéCombiné(Chi2, KS, DCV, NObs, CodEx),
4. Garder les Taux d'obs conservés >= N3%,
5. Garder les N4 meilleurs résultats selon indicateurQualitéCombiné(Chi2, KS, DCV, NObs, CodEx),
6. Trier par absence / simplicité des troncatures (sans < sans gche < sans drte < avec gche et dte) et ce même indicateur.

In [None]:
preSelCol = 'Qual Equi'

In [None]:
nPreSel = 3
sightRate = 97.5
nBestAIC = 2
nBestQua = 1
nResults = 8

In [None]:
nPreSel = 3
sightRate = 95
nBestAIC = 2
nBestQua = 1
nResults = 10

In [None]:
nPreSel = 4
sightRate = 92.5
nBestAIC = 3
nBestQua = 1
nResults = 12

In [None]:
nPreSel = 4
sightRate = 90
nBestAIC = 3
nBestQua = 1
nResults = 15

In [None]:
nPreSel = 5
sightRate = 90
nBestAIC = 4
nBestQua = 1
nResults = 20

In [None]:
# To be run once for each of the retained parameter cells above: this cell uses whichever
# nPreSel / sightRate / nBestAIC / nBestQua / nResults values were set last, and stores its
# result under a method name built from sightRate and nResults.
# Each step is traced in ldFilSorSteps as (method, step, variable, value).
method = f'ckcvqual{int(sightRate*10)}d{nResults}'

print(f'Méthode "{method}"')

dfFilSorRep = dfRep.copy()
ldFilSorSteps.append((method, 'Avant', 'Résultats', len(dfFilSorRep)))
print('* Avant :', len(dfFilSorRep))

# Drop the results with CodEx above 2.
dfFilSorRep.drop(dfFilSorRep[dfFilSorRep.CodEx > 2].index,
                 inplace=True)
ldFilSorSteps.append((method, 'CodEx', 'Max', 2))
ldFilSorSteps.append((method, 'CodEx', 'Résultats', len(dfFilSorRep)))
print('* CodEx :', len(dfFilSorRep))

# Drop the duplicates (after rounding, see filterDuplicates), keeping the first one
# in sample, truncation distances and CodEx order.
dfFilSorRep.sort_values(by=['Echant', 'Dist Tronc Gche', 'Dist Tronc Drte', 'CodEx'],
                        ascending=True, na_position='first', inplace=True)
dfFilSorRep.drop(filterDuplicates(dfFilSorRep, keep='first', subset=dupSubset, round2decs=dupRounds),
                 inplace=True)
ldFilSorSteps.append((method, 'Doublons', 'Résultats', len(dfFilSorRep)))
print('* Doublons :', len(dfFilSorRep))

# Keep the nBestAIC first ranks of the 'Meil AIC Tronc Id' scheme only.
dfFilSorRep.drop(dfFilSorRep[dfFilSorRep['Meil AIC Tronc Id'] >= nBestAIC].index,
                 inplace=True)
ldFilSorSteps.append((method, 'AIC TroncId', 'NbMeilleurs', nBestAIC))
ldFilSorSteps.append((method, 'AIC TroncId', 'Résultats', len(dfFilSorRep)))
print(f'* {nBestAIC}AIC TroncId:', len(dfFilSorRep))

# Drop the results that are not in the nBestQua first ranks of ANY of the 6 "close truncations" schemes
# (a result is kept as soon as it ranks well enough for at least one of them).
dfFilSorRep.drop(dfFilSorRep[(dfFilSorRep['Meil CKCv Tronc Proch'] >= nBestQua)
                             & (dfFilSorRep['Meil CVDens Tronc Proch'] >= nBestQua)
                             & (dfFilSorRep['Meil Qual Equi Tronc Proch'] >= nBestQua)
                             & (dfFilSorRep['Meil Qual Chi2 Tronc Proch'] >= nBestQua)
                             & (dfFilSorRep['Meil Qual KS Tronc Proch'] >= nBestQua)
                             & (dfFilSorRep['Meil Qual DCV Tronc Proch'] >= nBestQua)].index,
                           # & (dfFilSorRep['Meil AIC Tronc Id'] > 0)].index,
                 inplace=True)
ldFilSorSteps.append((method, 'MeilleursCKCv+CVDens+QualEqui+Chi2+KS+DCV TroncProch', 'NbMeilleurs', nBestQua))
ldFilSorSteps.append((method, 'MeilleursCKCv+CVDens+QualEqui+Chi2+KS+DCV TroncProch', 'Résultats', len(dfFilSorRep)))
print(f'* {nBestQua}CKCv+CVDens+QualEqui+Chi2+KS+DCV TroncProc:', len(dfFilSorRep))

# Drop the results with 'Taux Obs' below sightRate.
dfFilSorRep.drop(dfFilSorRep[dfFilSorRep['Taux Obs'] < sightRate].index,
                 inplace=True)
ldFilSorSteps.append((method, 'Taux Obs conservées', 'Min', sightRate))
ldFilSorSteps.append((method, 'Taux Obs conservées', 'Résultats', len(dfFilSorRep)))
print(f'* TauxObs{sightRate} :', len(dfFilSorRep))

# Per sample, drop the results with the lowest 'Qual Equi', targeting nResults results kept
# (dichotomic threshold search, see filterDichotScheme).
dfFilSorRep.drop(filterDichotScheme(dfFilSorRep, sampleIds=dfFilSorRep.Echant.unique(), sampleIdCol='Echant',
                                    critCol='Qual Equi', ascendCrit=True, nMinRes=nResults),
                 inplace=True)
ldFilSorSteps.append((method, 'Meilleurs QualEqui', 'NbCible', nResults))
ldFilSorSteps.append((method, 'Meilleurs QualEqui', 'Résultats', len(dfFilSorRep)))

print(f'* {nResults}dQual :', len(dfFilSorRep))

# Automatic pre-selection: dense rank (1 = highest preSelCol value) inside each
# sample + indexCols group, only kept for the nPreSel first ranks (null otherwise).
selPreSelCol = 'Sélection ' + preSelCol
dfFilSorRep[selPreSelCol] = dfFilSorRep.groupby(['Echant'] + indexCols)[preSelCol] \
                                       .transform(lambda s: s.rank(ascending=False, method='dense'))
dfFilSorRep.loc[dfFilSorRep[selPreSelCol] > nPreSel, selPreSelCol] = np.nan

ldFilSorSteps.append((method, 'Pré-sélection auto', 'NbPréSélections', nPreSel))
ldFilSorSteps.append((method, 'Pré-sélection auto', 'ColonnePréSélection', preSelCol))
print('* Pré-sélection auto: {}{}'.format(nPreSel, preSelCol.replace(' ', '')))

# Final sort: sample, truncation distances, then balanced combined quality rank.
dfFilSorRep.sort_values(by=['Echant', 'Dist Tronc Gche', 'Dist Tronc Drte', 'Ord Qual Equi'],
                        ascending=True, na_position='first', inplace=True)
ldFilSorSteps.append((method, 'Tri', 'Colonnes', 'TroncGche, TroncDrte, QualEqui'))
print('* Tri : TroncGche, TroncDrte, QualEqui')

# Store the result of the method.
ddfFilSorRep[method] = dfFilSorRep

dfFilSorRep[previewCols].head(15)

## 5. Exports

In [None]:
#expCols1 = ['Analyse', 'Fn Clé Mod', 'Sér Ajust Mod', optimTruncCol,
#            'Dist Tronc Gche', 'Dist Tronc Drte', 'Tranch Dist Mod', 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte',
#            'CodEx', 'NObs', 'NTot Obs', 'Taux Obs'] \
#           + [scheme['name'] for scheme in filSorSchemes] \
#           + ['Delta AIC', 'Chi2 P', 'KS P', 'CvM Uw P', 'CvM Cw P', 'Qualité', 'NbTot Pars',
#              'CoefVar Densité', 'Densité', 'Min Densité', 'Max Densité',
#              'PDetec', 'Min PDetec', 'Max PDetec']

In [None]:
# Pour valider les méthodes de filtrage
#expCols2 = ['Analyse', 'Fn Clé Mod', 'Sér Ajust Mod', 'Dist Tronc Gche', 'Dist Tronc Drte', 'Tranch Dist Mod'] \
#           + ['NObs', 'Delta AIC', 'Chi2 P', 'KS P', 'CvM Uw P', 'CvM Cw P', 'Taux Obs',
#              'CoefVar Densité', 'NbTot Pars', 'Qual Equi', 'Qual Chi2', 'Qual KS', 'Qual DCV'] \
#           + ['Densité', 'Min Densité', 'Max Densité'] \
#           + [scheme['name'] for scheme in filSorSchemes] \
#           + [optimTruncCol, 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte'] \
#           + ['CodEx', 'NTot Obs'] \
#           + ['PDetec', 'Min PDetec', 'Max PDetec']

In [None]:
# Export columns giving the equivalent of the report (in terms of columns).
expCols3 = [
    'Analyse', 'Fn Clé Mod', 'Sér Ajust Mod', 'Dist Tronc Gche', 'Dist Tronc Drte', 'Tranch Dist Mod',
    'Delta AIC', 'Chi2 P', 'KS P', 'CvM Uw P', 'CvM Cw P', 'NObs', 'Taux Obs', 'CoefVar Densité', 'NbTot Pars',
    # Selection columns ('Sélection finale' is added to the exported frames right before export).
    'Sélection finale', 'Sélection Qual Equi',
    'Qual Equi', 'Qual Chi2', 'Qual KS', 'Qual DCV',
    'Densité', 'Min Densité', 'Max Densité', 'EDR/ESW', 'Min EDR/ESW', 'Max EDR/ESW',
    'Nombre', 'Min Nombre', 'Max Nombre', 'PDetec', 'Min PDetec', 'Max PDetec',
    'CodEx', 'NTot Obs',
    # One column per filter-and-sort scheme, named after the scheme.
    *(scheme['name'] for scheme in filSorSchemes),
    optimTruncCol, 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte',
]

In [None]:
# Column list actually used by the exports below.
expCols = expCols3

# Sanity check: every exported column must be listed only once.
assert len(set(expCols)) == len(expCols), 'Ho, ho ... some duplicated columns ?'

In [None]:
# Show the keys of ddfFilSorRep, i.e. the methods for which a filtered and sorted table was stored.
ddfFilSorRep.keys()

In [None]:
# Filter-and-sort methods to export (each must have been computed and stored in ddfFilSorRep).
methods = ['ckcvqual975d8', 'ckcvqual950d10', 'ckcvqual925d12', 'ckcvqual900d15', 'ckcvqual900d20', 'codexec']

# Fail early, naming the methods for which no result table is available.
missingMethods = [meth for meth in methods if meth not in ddfFilSorRep]
assert not missingMethods, ','.join(missingMethods) + ' not computed'

In [None]:
# History of the steps, parameters and results of the various filterings / sortings (traceability):
# restricted to the exported methods (in the order of `methods`), then indexed by method and step.
dfFilSorHist = (pd.DataFrame(ldFilSorSteps, columns=['Méthode', 'Etape', 'Variable', 'Valeur'])
                  .set_index(['Méthode'])
                  .loc[methods]
                  .reset_index()
                  .set_index(['Méthode', 'Etape']))
dfFilSorHist

### a. 1 seul onglet par méthode, tous les échantillons à chaque fois

In [None]:
# Column from which the 'Sélection finale' column is derived.
selectCol = 'Sélection Qual Equi'

# Target workbook: same folder as xlsxRep, with 'rapport' replaced by 'raptousech' in the file name.
fpn = pl.Path(xlsxRep)
fpn = fpn.with_name(fpn.name.replace('rapport', 'raptousech')) #.with_suffix('.ods')

print('All {} samples:'.format(len(dfStatsEch)))
with pd.ExcelWriter(fpn) as xlsWrtr:

    # One sheet per method, each holding the results of all the samples.
    for meth in methods:
        dfFilSorRep = ddfFilSorRep[meth].copy()

        # 'Sélection finale' keeps the values of selectCol equal to 1 only (NaN elsewhere),
        # and is placed right before selectCol.
        sFinalSel = dfFilSorRep[selectCol].where(dfFilSorRep[selectCol] == 1)
        dfFilSorRep.insert(dfFilSorRep.columns.get_loc(selectCol), 'Sélection finale', sFinalSel)

        dfFilSorRep[['Echant'] + indexCols + expCols].to_excel(xlsWrtr, sheet_name=meth, index=False)
        print('* {}: {} results'.format(meth, len(dfFilSorRep)))

    # Traceability sheets: filtering / sorting history, and the samples table.
    dfFilSorHist.to_excel(xlsWrtr, sheet_name='paramètres', index=True)
    dfStatsEch.reset_index().to_excel(xlsWrtr, sheet_name='échantillons', index=False)

print('=>', fpn.as_posix())

### b. 1 classeur par méthode, 1 onglet par échantillon

In [None]:
# One workbook per method, one sheet per sample, plus 2 traceability sheets.
selectCol = 'Sélection Qual Equi'
for meth in methods:

    dfFilSorRep = ddfFilSorRep[meth].copy()

    # 'Sélection finale' keeps the values of selectCol equal to 1 only (NaN elsewhere),
    # and is placed right before selectCol.
    dfFilSorRep.insert(dfFilSorRep.columns.get_loc(selectCol), 'Sélection finale',
                       dfFilSorRep[selectCol].where(dfFilSorRep[selectCol] == 1))

    # Target workbook: same folder as xlsxRep, with 'rapport' replaced by 'raparech-<method>' in the file name.
    fpn = pl.Path(xlsxRep)
    fpn = fpn.with_name(fpn.name.replace('rapport', f'raparech-{meth}')) #.with_suffix('.ods')

    with pd.ExcelWriter(fpn) as xlsWrtr:

        # dfStatsEch is iterated as (sample label, sample description row).
        for lblEch, sEch in dfStatsEch.iterrows():
            print('* #{} {}'.format(lblEch, sEch['Abréviation']), end=': ')
            dfFSEchRep = dfFilSorRep[dfFilSorRep.Echant == lblEch]
            dfFSEchRep[expCols].to_excel(xlsWrtr, index=False,
                                         sheet_name='{} {}'.format(sEch['Abréviation'], lblEch)) #, engine='odf')
            print(len(dfFSEchRep), 'results')

        # History of this method only: after .loc[meth], the step names ('Etape') are the index,
        # so the index has to be written, otherwise the sheet no longer tells which step
        # each variable / value belongs to.
        dfFilSorHist.loc[meth].to_excel(xlsWrtr, sheet_name='histoire', index=True)
        dfStatsEch.reset_index().to_excel(xlsWrtr, sheet_name='échantillons', index=False)

    print('=>', fpn.as_posix())

## 6. Exports avec résultats manuels intercalés

In [None]:
# Workbook of the comparison export: same folder as xlsxRep, 'rapport' replaced by 'raptousech-comp'.
fpn = pl.Path(xlsxRep)
fpn = fpn.with_name(fpn.name.replace('rapport', 'raptousech-comp')) #.with_suffix('.ods')
fpn.as_posix()

### a. Chargement et mise en forme résultats manuels

#### i. ZPS Crêtes Cantal 2020

In [None]:
# Load the manual results, rename their columns to the names used here,
# and remove the columns that are useless for the comparison.
dfManuRep = (pd.read_excel(dossier / f'{nomEtude}{sousEtude}-AnalysesD73Mathis-resultats.xlsx')
               .rename(columns={'Id EchTronc': 'AbrevEch', 'Nb données': 'NObs', 'Modèle': 'Fn Clé Mod',
                                'GOF Chi-p': 'Chi2 P', 'D CV': 'CoefVar Densité',
                                'D': 'Densité', 'D LCL': 'Min Densité', 'D UCL': 'Max Densité'})
               .drop(columns=['N', 'N LCL', 'N UCL', 'SURF HAB FAVORABLE', 'D / 10 ha', 'Remarques']))

# Remove the rows of no interest (comments): they carry no 'AbrevEch' value.
dfManuRep.dropna(subset=['AbrevEch'], inplace=True)

# Miscellaneous conversions: the density columns are divided by 100
# (presumably a unit conversion to match the automated results -- TODO confirm).
densCols = ['Densité', 'Min Densité', 'Max Densité']
dfManuRep[densCols] = dfManuRep[densCols] / 100

# Sample identification, from the analysis id.: keep its first 2 '-'-separated fields.
dfManuRep['AbrevEch'] = dfManuRep['AbrevEch'].apply(lambda s: '-'.join(s.split('-')[:2]))

# Leading columns: 'Ordre' to be able to restore the initial sort order,
# then 'Source' to identify where the results come from (0 here).
dfManuRep.insert(0, 'Ordre', range(len(dfManuRep)))
dfManuRep.insert(1, 'Source', 0)

# Sample number column: looked up in dfStatsEch through the sample abbreviation.
dfAbbrev2Samp = dfStatsEch[['Abréviation']].reset_index().set_index('Abréviation')
dfManuRep = dfManuRep.join(dfAbbrev2Samp, on='AbrevEch')

dfManuRep

### b. 1 seul onglet par méthode, tous les échantillons à chaque fois

In [None]:
# Target workbook: same folder as xlsxRep, with 'rapport' replaced by 'raptousech-comp' in the file name.
fpn = pl.Path(xlsxRep)
fpn = fpn.with_name(fpn.name.replace('rapport', 'raptousech-comp')) #.with_suffix('.ods')

# Column from which the 'Sélection finale' column is derived when needed (see below).
selectCol = 'Sélection Qual Equi'

print('All {} samples ({} manual results):'.format(len(dfStatsEch), len(dfManuRep)))

with pd.ExcelWriter(fpn) as xlsWrtr:

    # One sheet per method, with the manual results interleaved among the automated ones.
    for meth in methods:

        dfFilSorRep = ddfFilSorRep[meth].copy()

        # Same leading columns as in dfManuRep: sample abbreviation, source (1 here, 0 for manual results),
        # and current row order (to keep it inside each sample and source after sorting).
        dfFilSorRep.insert(1, 'AbrevEch', dfFilSorRep[['Espèce', 'Adulte']].apply(sampleAbbrev, axis='columns'))
        dfFilSorRep.insert(0, 'Source', 1)
        dfFilSorRep.insert(0, 'Ordre', range(len(dfFilSorRep)))
        nAutoRes = len(dfFilSorRep)

        # Stack the manual results below the automated ones
        # (pd.concat rather than DataFrame.append, deprecated in pandas 1.4 and removed in 2.0).
        dfFilSorRep = pd.concat([dfFilSorRep, dfManuRep], ignore_index=True)

        # expCols lists 'Sélection finale', that the stored tables do not hold: derive it as in
        # the other exports (values of selectCol equal to 1, NaN elsewhere), unless it is already there
        # (e.g. brought by the manual results), to avoid a KeyError when selecting the columns.
        if 'Sélection finale' not in dfFilSorRep.columns:
            dfFilSorRep.insert(dfFilSorRep.columns.get_loc(selectCol), 'Sélection finale',
                               dfFilSorRep[selectCol].where(dfFilSorRep[selectCol] == 1))

        # Per sample: manual results first (Source 0), then automated ones (Source 1), in their initial order.
        dfFilSorRep.sort_values(by=['Echant', 'Source', 'Ordre'], ascending=True, inplace=True)

        dfFilSorRep[['Source', 'Echant'] + indexCols + expCols].to_excel(xlsWrtr, sheet_name=meth, index=False) #, engine='odf')

        print('* {}: {} results (auto: {})'.format(meth, len(dfFilSorRep), nAutoRes))

    # Traceability sheet: filtering / sorting history.
    dfFilSorHist.to_excel(xlsWrtr, sheet_name='histoire', index=True)

print('=>', fpn.as_posix())

## 7. Rapports Excel et HTML

Pré-requis :
* export filtré prêt pour chargement (1 à 5 ci-dessus) => fichier `<etude>-raptousech.ods`
* résultats d'optanalyses produits ou chargés (XVI.2a/b) => variable results

In [None]:
# Reload the filtered export: 1 table per sheet (sheet_name=None => dict of DataFrames keyed by sheet name).
fpnAsExported = pl.Path(xlsxRep)
fpnAsExported = fpnAsExported.with_name(fpnAsExported.name.replace('rapport', 'raptousech'))

# The .ods version is preferred, but the export cell above writes the file with the suffix of xlsxRep
# (its own .with_suffix('.ods') is commented out): fall back to that file when there's no .ods one.
fpn = fpnAsExported.with_suffix('.ods')
if not fpn.is_file() and fpnAsExported.is_file():
    fpn = fpnAsExported
print(fpn.as_posix())

ddfFilSorExp = pd.read_excel(fpn, sheet_name=None)
ddfFilSorExp.keys()

In [None]:
# From now on, work with the 'ckcvqual925d12' method.
selectMeth = 'ckcvqual925d12'
selectCol = 'Sélection Qual Equi'

# Targeted analyses = those with a non-null value in the selection column.
dfFilSorExp = ddfFilSorExp[selectMeth]
selAnlysIds = dfFilSorExp.loc[dfFilSorExp[selectCol].notnull(), 'Analyse'].tolist()
print(dict(ciblees=len(selAnlysIds)))

In [None]:
# Results for the report = opt-analysis results, after filtering: remove the analyses
# that are not part of the 'Sélection Qual Equi' selection.
filSorRes = results.copy()
sResAnalysIds = filSorRes.dfData[('header (head)', 'Analyse', 'Value')]

# True for the analyses to keep (the targeted ones).
sbTargeted = sResAnalysIds.isin(selAnlysIds)
print(dict(optanalyses=len(sResAnalysIds), ciblees=sbTargeted.sum()))

filSorRes.dropRows(~sbTargeted)
print(dict(filtrees=len(filSorRes)))

In [None]:
# Complete the results for the report with the 'Qual Equi' column of the filtered export.
analysIdCol = ('header (head)', 'Analyse', 'Value')
qualEquiCol = ('filtering', 'Qual Equi', 'Value')

# Table to join: 3-level column labels as in the results, and the analysis id. as the index.
dfFilSorRes2Join = dfFilSorExp[['Analyse', 'Qual Equi']].copy()
dfFilSorRes2Join.columns = pd.MultiIndex.from_tuples([('header', 'Analyse', 'Value'), qualEquiCol])
dfFilSorRes2Join = dfFilSorRes2Join.set_index(('header', 'Analyse', 'Value'))

# Force the computation of the ... computed columns, if not already done.
_ = filSorRes.dfData

# Hack: the data (private attribute patched directly, joining on the analysis id. column).
filSorRes._dfData = filSorRes._dfData.join(dfFilSorRes2Join, on=[analysIdCol])

# Hack: the translations (column names) for the added column.
filSorRes.dfCustomColTrans.loc[qualEquiCol] = pd.Series(dict(en='Bal. Quality', fr='Qualité Equi.'))

In [None]:
# Display the completed results for the 'fr' language
# (presumably with translated column names -- TODO confirm against dfTransData).
filSorRes.dfTransData('fr')

In [None]:
# Selection of the columns for the tables of the pre-report
# a. Main page: Column 1 (top), describing the sample
filSorRepSampleCols = [
    ('header (head)', 'Echant', 'Value'),
    *(('header (sample)', col, 'Value') for col in samplingCols),
    ('header (head)', 'Analyse', 'Value'),
]

# b. Main page: Column 1 (bottom), for the parameters of the analysis model
filSorRepParamCols = [
    ('parameters', 'estimator key function', 'Value'),
    ('parameters', 'estimator adjustment series', 'Value'),
    #('parameters', 'CV interval', 'Value'),
    ('parameters', 'left truncation distance', 'Value'),
    ('parameters', 'right truncation distance', 'Value'),
    ('parameters', 'model fitting distance cut points', 'Value'),
]

# c. Main page: Columns 2 and 3, for the results (right before columns 4, 5 and 6 with the plots)
filSorRepResultCols = [
    ('run output', 'run status', 'Value'),

    ('encounter rate', 'number of observations (n)', 'Value'),
    ('encounter rate', 'right truncation distance (w)', 'Value'),
    ('encounter rate', 'effort (L or K or T)', 'Value'),

    #('detection probability', 'AIC value', 'Value'),
    ('detection probability', 'chi-square test probability determined', 'Value'),
    ('detection probability', 'Kolmogorov-Smirnov test probability', 'Value'),
    ('detection probability', 'number of adjustment term parameters (NAP)', 'Value'),
    ('filtering', 'Qual Equi', 'Value'),
    ('detection probability', 'probability of detection (Pw)', 'Value'),
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Value'),

    *(('density/abundance', 'density of animals', stat) for stat in ('Cv', 'Value', 'Lcl', 'Ucl')),

    *(('density/abundance', 'number of animals, if survey area is specified', stat)
      for stat in ('Value', 'Lcl', 'Ucl')),
]

# d. Main and details pages: Synthesis table (sample and parameter columns first, then the results).
filSorRepSynthCols = [
    *filSorRepSampleCols,
    *filSorRepParamCols,

    ('run output', 'run status', 'Value'),

    ('encounter rate', 'number of observations (n)', 'Value'),
    ('encounter rate', 'right truncation distance (w)', 'Value'),
    ('encounter rate', 'effort (L or K or T)', 'Value'),
    ('encounter rate', 'encounter rate (n/L or n/K or n/T)', 'Df'),

    ('detection probability', 'AIC value', 'Value'),
    ('detection probability', 'chi-square test probability determined', 'Value'),
    ('detection probability', 'Kolmogorov-Smirnov test probability', 'Value'),
    ('density/abundance', 'density of animals', 'Cv'),

    *(('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', stat)
      for stat in ('Value', 'Lcl', 'Ucl')),

    *(('density/abundance', 'density of animals', stat) for stat in ('Value', 'Lcl', 'Ucl')),

    *(('detection probability', 'probability of detection (Pw)', stat)
      for stat in ('Value', 'Lcl', 'Ucl', 'Df')),

    *(('density/abundance', 'number of animals, if survey area is specified', stat)
      for stat in ('Value', 'Lcl', 'Ucl', 'Df')),

    ('run output', 'run folder', 'Value'),
]

In [None]:
# Sort columns for the report tables, with their respective sort directions just below.
filSorRepSortCols = [
    ('header (head)', sampleNumCol, 'Value'),
    ('filtering', 'Qual Equi', 'Value'),
    ('detection probability', 'chi-square test probability determined', 'Value'),
    ('detection probability', 'Kolmogorov-Smirnov test probability', 'Value'),
]

# Ascending for the 1st column (sample number), descending for the 3 others.
filSorRepAscend = [True, False, False, False]

In [None]:
# Report object for the filtered results, using the column selections and sort order defined above.
filSorRep = ads.MCDSResultsFullReport(
    resultsSet=filSorRes,
    title=titreEtude,
    subTitle=f"Rapport d'analyse (après filtrage / sélection '{selectMeth}')",
    anlysSubTitle='Détail des analyses filtrées',
    description=descrEtude,
    keywords=motsClesEtude,
    pySources=['Visionature-ds-points.ipynb'],
    lang='fr',
    superSynthPlotsHeight=288,
    plotImgSize=(640, 400),
    #plotImgQuality=80, plotImgFormat='jpg', # Same final size as raw PNG :-(
    sampleCols=filSorRepSampleCols,
    paramCols=filSorRepParamCols,
    resultCols=filSorRepResultCols,
    synthCols=filSorRepSynthCols,
    sortCols=filSorRepSortCols,
    sortAscend=filSorRepAscend,
    tgtFolder=workDir,
    tgtPrefix=f'{nomEtude}{sousEtude}-AnalysesFiltrees-{selectMeth}-rapport',
)

In [None]:
# Generate the Excel version of the report; the returned value is used as the link target below
# (presumably the path of the generated workbook -- TODO confirm against toExcel).
xlsxFilSorRep = filSorRep.toExcel()

# Cell output: a clickable link to the generated report.
HTML(f'Rapport Excel : <a href="{xlsxFilSorRep}" target="blank">{xlsxFilSorRep}</a>')

In [None]:
%%time

# Measured run times of this cell (machine, number of generators, data set, date, number of runs):
# 4-HT-core i5-8365U PCI-e SSD: 6 generators Naturalist (2021-02-14): 3mn15-20s (n=3)
# 4-HT-core i5-8365U PCI-e SSD: 6 generators Naturalist+Papyrus (2021-02-27): s (n=1)
# NOTE(review): the duration of the 2nd measure was never filled in.
# Generate the HTML version of the report; the returned value is used as the link target below
# (presumably the path of the generated main page; 'generators' looks like a number
#  of parallel workers -- TODO confirm against toHtml).
htmlFilSorRep = filSorRep.toHtml(generators=6)

# Cell output: a clickable link to the generated report.
HTML(f'Pré-rapport HTML : <a href="{htmlFilSorRep}" target="blank">{htmlFilSorRep}</a>')

In [None]:
# Short alias for the results set class, whose CL* column label constants are used in the cells below.
cls = ads.MCDSTruncOptanalysisResultsSet

In [None]:
# 3 independent copies of the raw results table, 1 for each of the filtering variants tried below.
df1, df2, df3 = (results._dfData.copy() for _ in range(3))
len(df1)

In [None]:
# Threshold applied to each of the order indicator columns in the cells below:
# rows are candidates for removal when the indicator is >= this value.
nBestQua = 1

In [None]:
# Variant 1: remove the rows for which ALL the order indicators are >= nBestQua
# (a row is kept as soon as 1 of them is below the threshold).
sbAllBeyond = (df1[cls.CLGrpOrdClTrChi2KSDCv] >= nBestQua) \
              & (df1[cls.CLGrpOrdClTrDCv] >= nBestQua) \
              & (df1[cls.CLGrpOrdClTrQuaBal1] >= nBestQua) \
              & (df1[cls.CLGrpOrdClTrQuaChi2] >= nBestQua) \
              & (df1[cls.CLGrpOrdClTrQuaKS] >= nBestQua) \
              & (df1[cls.CLGrpOrdClTrQuaDCv] >= nBestQua)
            # & (df1[cls.CLGrpOrdClTrChi2] > 0)
df1.drop(df1[sbAllBeyond].index, inplace=True)
len(df1)

In [None]:
# Same order indicator columns as in the filter above, as a list, for the loop-based variants below.
R = cls
whichBestQua = [R.CLGrpOrdClTrChi2KSDCv, R.CLGrpOrdClTrDCv, R.CLGrpOrdClTrQuaBal1,
                R.CLGrpOrdClTrQuaChi2, R.CLGrpOrdClTrQuaKS, R.CLGrpOrdClTrQuaDCv]

In [None]:
# Variant 2: successive removals, 1 indicator after the other; a row is removed as soon as
# 1 indicator is >= nBestQua (so this is NOT equivalent to variant 1). Remaining row count after each step.
for clQuaIndic in whichBestQua:
    sbBeyond = df2[clQuaIndic] >= nBestQua
    df2.drop(df2.index[sbBeyond], inplace=True)
    print(len(df2))

In [None]:
# Variant 3: same filter as variant 1, but with the 'all indicators >= nBestQua' mask built in a loop.
sb2drop = pd.Series(data=True, index=df3.index)
for clQuaIndic in whichBestQua:
    sb2drop &= (df3[clQuaIndic] >= nBestQua)
df3.drop(df3[sb2drop].index, inplace=True)
print(len(df3))

In [None]:
# Variant 4: complementary formulation, keeping the rows with at least 1 indicator < nBestQua.
df4 = results._dfData.copy()
sb2keep = pd.Series(data=False, index=df4.index)
for clQuaIndic in whichBestQua:
    sb2keep = sb2keep | (df4[clQuaIndic] < nBestQua)
df4 = df4.loc[sb2keep]
print(len(df4))

In [None]:
# Show the differences, if any, between the results of the 'keep' (df4) and 'drop' (df3) formulations.
df4.compare(df3)

In [None]:
# Then remove the rows with a value below 92.5 in the cls.CLSightRate column.
sbLowSightRate = df4[cls.CLSightRate] < 92.5
df4.drop(df4.index[sbLowSightRate], inplace=True)
len(df4)

In [None]:
# Toy table holding the 4 combinations of 2 booleans, to check the row-dropping idiom below.
df1 = pd.DataFrame(dict(a=[True, False, True, False],
                        b=[True, True, False, False]))
df1

In [None]:
# Display the toy table without the rows where both a and b are True (df1 itself is left unchanged).
sbBothTrue = (df1['a'] == True) & (df1['b'] == True)
df1.drop(df1.index[sbBothTrue])

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Toy data: a 2-row, 1-column table and a 2x2 array, to try multi-column assignment below.
df = pd.DataFrame(dict(a=[1, 2]))
a = np.array([2, 3, 4, 5]).reshape(2, 2)

In [None]:
# Display the toy table.
df

In [None]:
# Display the toy array.
a

In [None]:
# Assign the array to 2 columns at once: 'b' gets its 1st column, 'c' its 2nd one.
df[['b', 'c']] = a
df