<!-- Auto table of contents -->
<h1 class='tocIgnore'>AutoDS : Mise au point et tests unitaires</h1>
<p>(module <b>autods</b> d'interface python à MCDS.exe)</p>
<div style="overflow-y: auto">
  <h2 class='tocIgnore'>Table des matières</h2>
  <div id="toc"></div>
</div>

In [None]:
%%javascript
$.getScript('ipython_notebook_toc.js')

In [None]:
%matplotlib inline

In [None]:
import sys
import os
import pathlib as pl

import re

from collections import OrderedDict as odict, namedtuple as ntuple

import math
import numpy as np
import pandas as pd

from tqdm import tqdm

from IPython.display import HTML

import matplotlib.pyplot as plt

import plotly as ply
import plotly.graph_objs as plygo

In [None]:
# Activate Warnings as Exception
#import warnings
#warnings.filterwarnings('error')

# Tests unitaires

## 0. Détection de Distance

In [None]:
sys.path.insert(0, '..')

In [None]:
import autods as ads

## 1. Classe DataSet

In [None]:
# Excel source (path as simple string)
ds = ads.SampleDataSet(source='refin/ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.xlsx',
                       decimalFields=['EFFORT', 'DISTANCE', 'NOMBRE'])
ds.dfData.head()

In [None]:
# CSV source with ',' as decimal point (path as pl.Path)
ds = ads.SampleDataSet(source=pl.Path('refin/ACDC2019-Papyrus-TURMER-AB-5mn-1dec-dist.txt'),
                       decimalFields=['Point transect*Survey effort', 'Observation*Radial distance'])

assert not any(ds.dfData[col].dropna().apply(lambda v: isinstance(v, str)).any() for col in ds.decimalFields), \
       'Error: Some strings found in declared decimal fields ... any decimal format issue ?'

ds.dfData.head()

In [None]:
# CSV source with '.' as decimal point
ds = ads.SampleDataSet(source=pl.Path('refin/ACDC2019-Papyrus-ALAARV-AB-10mn-1dotdec-dist.txt'),
                       decimalFields=['Point transect*Survey effort', 'Observation*Radial distance'])

assert not any(ds.dfData[col].dropna().apply(lambda v: isinstance(v, str)).any() for col in ds.decimalFields), \
       'Error: Some strings found in declared decimal fields ... any decimal format issue ?'

ds.dfData.head()

In [None]:
# DataFrame source.
dfData = pd.DataFrame(columns=['Date', 'TrucDec', 'Espece', 'Point', 'Effort', 'Distance'],
                      data=[('2019-05-13', 3.5, 'TURMER', 23, 2,   83),
                            ('2019-05-15', np.nan, 'TURMER', 23, 2,   27.355),
                            ('2019-05-13', 0, 'ALAARV', 29, 2,   56.85),
                            ('2019-04-03', 1.325, 'PRUMOD', 53, 1.3,  7.2),
                            ('2019-06-01', 2, 'PHICOL', 12, 1,  np.nan),
                            ('2019-06-19', np.nan, 'PHICOL', 17, 0.5, np.nan),
                           ])
dfData['Region'] = 'ACDC'
dfData['Surface'] = '2400'
dfData

In [None]:
ds = ads.SampleDataSet(source=dfData, decimalFields=['Effort', 'Distance', 'TrucDec'])
ds.dfData

## 2. Classes XXEngine

### a. Instanciation et chargement des spécifs sur les stats en sortie

In [None]:
try:
    eng = ads.MCDSEngine(workDir='tmp/test out') # Simple string path
    print('Error: Should have raised an AssertionError !')
except AssertionError as exc:
    print('Good forbidden chars detection:', exc)

In [None]:
try:
    eng = ads.MCDSEngine(workDir=pl.Path('tmp', 'test out')) # pl.Path path
    print('Error: Should have raised an AssertionError !')
except AssertionError as exc:
    print('Good forbidden chars detection:', exc)

In [None]:
eng = ads.MCDSEngine(workDir=pl.Path('tmp', 'mcds-out'))

In [None]:
_ = eng.setupRunFolder(runPrefix='uni') # Unit tests

### b. Génération fichier de données en entrée de MCDS

In [None]:
dataFileName = eng.buildDataFile(dataSet=ds)

### c. Génération fichier de "commandes"

In [None]:
cmdFileName = eng.buildCmdFile(estimKeyFn='HNORMAL', estimAdjustFn='COSINE', estimCriterion='AIC', cvInterval=95)

### d. Execution en mode "debug"

(génération des fichiers cmd et data, mais pas d'appel à l'exécutable)

In [None]:
runCode, runTime, runDir = eng.run(ds, realRun=False, runPrefix='int',
                                   estimKeyFn='UNIFORM', estimAdjustFn='POLY',
                                   estimCriterion='AIC', cvInterval=95)
assert runCode == 0, 'Should have NOT run (run code = 0)'
dict(runCode=runCode, runDir=runDir, runTime=runTime)

### e. Exécution réelle

In [None]:
runCode, runTime, runDir = eng.run(ds, realRun=True, runPrefix='int',
                                   estimKeyFn='UNIFORM', estimAdjustFn='POLY',
                                   estimCriterion='AIC', cvInterval=95)
assert runCode == 2, 'Should have run with warnings (run code = 2)'
dict(runCode=runCode, runDir=runDir, runTime=runTime)

### f. Génération fichier de données en entrée pour Distance

(mode 'point transect' uniquement pour le moment)

In [None]:
pl.Path(eng.workDir, 'distance-in').mkdir(exist_ok=True)

In [None]:
distDataFileName = \
    eng.buildDistanceDataFile(ds, tgtFilePathName=os.path.join(eng.workDir, 'distance-in', 'import-data-noextra.txt'))

In [None]:
distDataFileName = \
    eng.buildDistanceDataFile(ds, tgtFilePathName=pl.Path(eng.workDir, 'distance-in', 'import-data-withextra.txt'),
                              withExtraFields=True)

## 3. Classes XXResultsSet

In [None]:
from autods.data import ResultsSet

# A specialized results set for the tests = with an extra post-computed column : Delta AIC
class TestResultsSet(ResultsSet):

    """Results set specialised for these tests : a ResultsSet with 1 extra post-computed column,
    the Delta AIC (see postComputeColumns)."""

    def __init__(self, miCustomCols=None, dfCustomColTrans=None,
                       dComputedCols=None, dfComputedColTrans=None):

        """Forward the 4 parameters as is to the base class constructor,
        after ads.MCDSAnalysis as its first positional argument.

        NOTE(review): the exact meaning of each parameter is defined by ResultsSet.__init__,
        which is not visible from here.
        """

        # Initialise base.
        super().__init__(ads.MCDSAnalysis, miCustomCols, dfCustomColTrans, dComputedCols, dfComputedColTrans)

    # Post-computations.
    def postComputeColumns(self):

        """Add the Delta AIC column to self._dfData : the AIC value of each row,
        minus the minimum AIC value of the group the row belongs to.

        NOTE(review): presumably called by the base class when the data is requested - TODO confirm.
        """

        # Compute Delta AIC (AIC - min(group)) per { species, periods, duration, precision } group.
        # a. Minimum AIC per group
        aicColInd = ('detection probability', 'AIC value', 'Value')
        aicGroupColInds = [('sample', 'species', 'Value'), ('sample', 'periods', 'Value'),
                           ('sample', 'duration', 'Value'), ('variant', 'precision', 'Value')]
        df2Join = self._dfData.groupby(aicGroupColInds)[[aicColInd]].min()

        # b. Rename computed columns to target
        #    (so that the group minimum does not clash with the AIC column when joining)
        deltaAicColInd = ('detection probability', 'Delta AIC', 'Value')
        df2Join.columns = pd.MultiIndex.from_tuples([deltaAicColInd])

        # c. Join the column to the target data-frame
        #    (at this stage, the 'Delta AIC' column holds the minimum AIC of the group of each row)
        self._dfData = self._dfData.join(df2Join, on=aicGroupColInds)

        # d. Compute delta-AIC in-place
        self._dfData[deltaAicColInd] = self._dfData[aicColInd] - self._dfData[deltaAicColInd]

# Results object construction
miCustCols = pd.MultiIndex.from_tuples([('id', 'index', 'Value'),
                                        ('sample', 'species', 'Value'),
                                        ('sample', 'periods', 'Value'),
                                        ('sample', 'duration', 'Value'),
                                        ('variant', 'precision', 'Value')])
dfCustColTrans = \
    pd.DataFrame(index=miCustCols,
                 data=dict(en=['index', 'species', 'periods', 'duration', 'precision'],
                           fr=['numéro', 'espèce', 'périodes', 'durée', 'précision']))

dCompCols = { ('detection probability', 'Delta AIC', 'Value'): 18 } # Right before AIC
dfCompColTrans = \
    pd.DataFrame(index=dCompCols.keys(),
                 data=dict(en=['Delta AIC'], fr=['Delta AIC']))

rs = TestResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans,
                    dComputedCols=dCompCols, dfComputedColTrans=dfCompColTrans)

In [None]:
assert rs.dfData.empty

In [None]:
sHead = pd.Series(index=miCustCols, data=list(range(len(miCustCols))))
miResCols = ads.MCDSAnalysis.MIRunColumns.append(ads.MCDSEngine.statModCols())
sResult = pd.Series(index=miResCols, data=list(range(len(miResCols))))
rs.append(sResult, sCustomHead=sHead)

In [None]:
dfRaw = rs.dfData
dfRaw

In [None]:
dfTrans = rs.dfTransData('fr')
dfTrans

In [None]:
assert len(dfRaw.columns) == len(dfTrans.columns)

In [None]:
dfRaw.columns.to_list()

## 4. AutoDS data tools

### a. Load data sample

In [None]:
dfObs = pd.read_csv('refin/ACDC2019-Naturalist-ExtraitObsBrutesAvecDist.txt', sep='\t', decimal=',')
dfObs.head()

In [None]:
countCols =  ['nMalAd10', 'nAutAd10', 'nMalAd5', 'nAutAd5']
sCounts = dfObs[countCols].sum()

In [None]:
len(dfObs), sCounts.to_dict()

In [None]:
assert len(dfObs) == 724
assert not any(sCounts - pd.Series({'nMalAd10': 613, 'nAutAd10': 192, 'nMalAd5': 326, 'nAutAd5': 102}))

### b. separateMultiCategoricalCounts

In [None]:
%%time

dfObsMonoCat = ads.separateMultiCategoricalCounts(dfObs, countCols)
len(dfObsMonoCat), dfObsMonoCat[countCols].sum()

In [None]:
assert len(dfObsMonoCat) == 1125
assert not any(dfObsMonoCat[countCols].sum() - sCounts)

In [None]:
dfObsMonoCat[countCols].head()

In [None]:
dfObsMonoCat

### c. individualiseMonoCategoricalCounts

In [None]:
%%time

dfObsIndiv = ads.individualiseMonoCategoricalCounts(dfObsMonoCat, countCols)
len(dfObsIndiv), dfObsIndiv[countCols].sum()

In [None]:
assert len(dfObsIndiv) == 1233
assert not any(dfObsIndiv[countCols].sum() - sCounts)

In [None]:
dfObsIndiv.head()

### d. Categorise sightings

Needed for adding absence data below

(no more counts - by the way, all 0 or 1 - => only categories)

In [None]:
# Should not see any sightings with all null counts
assert dfObsIndiv[~dfObsIndiv[countCols].any(axis='columns')].empty

In [None]:
dfObsIndiv['Adulte'] = \
  dfObsIndiv[countCols].apply(lambda sNb: 'm' if 'Mal' in sNb[sNb > 0].index[0] else 'a', axis='columns')
dfObsIndiv['Duree'] = \
  dfObsIndiv[countCols].apply(lambda sNb: '5' if '5' in sNb[sNb > 0].index[0] else '10', axis='columns')

dfObsIndiv.tail()

In [None]:
dfObsIndiv.drop(columns=countCols, inplace=True)
dfObsIndiv.tail()

### e. addTransectEffort

In [None]:
dfObsIndiv = ads.addTransectEffort(dfObsIndiv, transectCol='Point', passesCol='Passage')
dfObsIndiv.tail()

In [None]:
assert all(dfObsIndiv[dfObsIndiv.Effort != 2].Point.unique() == [42])

In [None]:
# Juste pour voir ...
#dfObsIndiv.to_excel('AutoDS/tmp/tools-unitests-obs-indiv.xlsx', index=False)

### e. Extract transect info

In [None]:
transectIdCol = 'Point'
transectCols = [transectIdCol, 'Effort']

In [None]:
dfTransects = dfObsIndiv[transectCols].drop_duplicates(subset=transectIdCol)
dfTransects.set_index(transectIdCol, inplace=True)
dfTransects

In [None]:
assert len(dfTransects) == 21

### f. Add absence sightings

In [None]:
# Define sample columns
sampleCols = ['Passage', 'Adulte', 'Duree']

In [None]:
dfObsIndiv.head()

In [None]:
# Select 1 random sample
espece = 'Fauvette à tête noire'
passage = 'a'
adulte = 'm'
duree = '10'
dfObsIndivSmpl = dfObsIndiv[(dfObsIndiv.Passage == passage) & (dfObsIndiv.Adulte == adulte) & (dfObsIndiv.Duree == duree)
                            & (dfObsIndiv.Espece == espece)]

assert len(dfObsIndivSmpl) == 36 and dfObsIndivSmpl[transectIdCol].nunique() == 18

In [None]:
%%time

dfObsIndivAbscSmpl = ads.addAbsenceSightings(dfObsIndivSmpl, sampleCols, dfTransects)
len(dfObsIndivAbscSmpl)

In [None]:
# Check for no change in sample columns
assert list(dfObsIndivAbscSmpl.columns) == list(dfObsIndivSmpl.columns)

# Check for number of added rows
assert len(dfObsIndivAbscSmpl) == 39 # 36 sightings + 3 missing transects

# Check for final number of transects
assert dfObsIndivAbscSmpl[dfTransects.index.name].nunique() == 21

# Check for no change in sample identification
assert list(dfObsIndivAbscSmpl.Espece.unique()) == [espece, None] # None for absence sightings !
assert list(dfObsIndivAbscSmpl.Passage.unique()) == [passage]
assert list(dfObsIndivAbscSmpl.Adulte.unique()) == [adulte]
assert list(dfObsIndivAbscSmpl.Duree.unique()) == [duree]

In [None]:
dfObsIndivSmpl.sort_values(by=['Passage', 'Observateur', 'Point', 'Espece', 'distMem']).head()

In [None]:
dfObsIndivAbscSmpl.sort_values(by=['Passage', 'Observateur', 'Point', 'Espece', 'distMem'])

In [None]:
%%time

# Performance test
print('Espece      Passage  Adulte Duree NbDonnees')

for espece in ['Fauvette à tête noire', 'Alouette des champs', 'Fauvette grisette', 'Pouillot véloce']: 
    
    for passage in ['a', 'b', 'a+b']: 

        for adulte in ['m', 'a', 'm+a']:

            for duree in ['5', '10']:

                passages = passage.split('+')
                adultes = adulte.split('+')
                dfObsIndivSmpl = dfObsIndiv[dfObsIndiv.Passage.isin(passages) & dfObsIndiv.Adulte.isin(adultes) \
                                            & (dfObsIndiv.Duree == duree) & (dfObsIndiv.Espece == espece)]

                try:
                    print(espece, passage, adulte, duree, ':', len(dfObsIndivSmpl), '=> ', end='')
                    dfObsIndivAbscSmpl = ads.addAbsenceSightings(dfObsIndivSmpl, sampleCols, dfTransects)
                    print(len(dfObsIndivAbscSmpl))
                except Exception as e:
                    print(e)

### g. Generate implicit variant combination table

In [None]:
# Nombre d'individus par espèce, pour voir quelles espèces on va analyser
dfIndivCounts = dfObsIndiv.loc[dfObsIndiv.Adulte == 'm', ['Espece', 'Adulte']].groupby('Espece').count()

dfIndivCounts.rename(columns=dict(Adulte='Males'), inplace=True)
dfIndivCounts.sort_values(by='Males', ascending=False, inplace=True)

dfIndivCounts[dfIndivCounts.Males >= 20]

In [None]:
nMaxMal10 = 30
varEspeces = list(dfIndivCounts[dfIndivCounts.Males >= nMaxMal10].index) # 1 variante par espèce

varPassages = ['a+b'] # Passage a ou b => 1 seule variante
varAdultes = ['m', 'm+a'] # Les mâles, et ensuite les mâles et autres adultes (=> 2 variantes)
varDurees = ['5', '10'] # 5 1ères mn, ou toutes les 10 => 2 variantes

dfImplSampSpecs = ads.implicitPartialVariantSpecs(dict(Especes=varEspeces, Passages=varPassages,
                                                       Adultes=varAdultes, Durees=varDurees))
dfImplSampSpecs

### h. Explicit variant combination generation

In [None]:
dfExplSampSpecs = ads.explicitPartialVariantSpecs(dfImplSampSpecs)
dfExplSampSpecs

### i. Final explicitation of all variants

from user specs (implicit and explicit)

In [None]:
oddfUserVariantSpecs = pd.read_excel('refin/ACDC2019-Naturalist-SpecsAnalyses.xlsx', sheet_name=None)
print('sheets:', ', '.join(oddfUserVariantSpecs.keys()))

In [None]:
dfFinalExplSpecs = ads.explicitVariantSpecs(oddfUserVariantSpecs)
dfFinalExplSpecs

In [None]:
# Just to see by eye
dfFinalExplSpecs.to_excel('tmp/tools-unitests-final-expl-specs.xlsx', index=False)

In [None]:
# Computational checks : the number of final explicit variants must be the one
# that can be computed straight from the user specs sheets.

def _nImplicitVariants(dfImplSpecs):
    """Product of the numbers of non-null values of each column of the given table
    (1 if it has no column at all).

    Presumably the number of combinations an "implicit" specs sheet expands to,
    given how it is used in the expected count below - TODO confirm against ads.explicitVariantSpecs.
    """
    nVariants = 1
    for col in dfImplSpecs.columns:
        nVariants *= len(dfImplSpecs[col].dropna())
    return nVariants

# Combination counts of each "implicit" sheet.
nEch1Vars = _nImplicitVariants(oddfUserVariantSpecs['Echant1_impl'])
nEch2Vars = _nImplicitVariants(oddfUserVariantSpecs['Echant2_impl'])
nModVars = _nImplicitVariants(oddfUserVariantSpecs['Modl_impl'])

# Row counts of each "explicit" params sheet : distinct over the columns of the matching implicit sheet, and total.
nEch1ParWithVars = \
  len(oddfUserVariantSpecs['Params1_expl'].drop_duplicates(subset=oddfUserVariantSpecs['Echant1_impl'].columns))

nEch1Pars = len(oddfUserVariantSpecs['Params1_expl'])

nEch2ParWithVars = \
  len(oddfUserVariantSpecs['Params2_expl'].drop_duplicates(subset=oddfUserVariantSpecs['Echant2_impl'].columns))

nEch2Pars = len(oddfUserVariantSpecs['Params2_expl'])

# Expected number of final variants, to be compared to the actual one.
nExpdVars = nModVars * (nEch1Pars + nEch1Vars - nEch1ParWithVars + nEch2Pars + nEch2Vars - nEch2ParWithVars)
assert len(dfFinalExplSpecs) == nExpdVars

nModVars, nEch1Pars, nEch1Vars, nEch1ParWithVars, nEch2Pars, nEch2Vars, nEch2ParWithVars, nExpdVars

# Tests d'intégration module autods

## 1. MCDSEngine : Génération de fichiers d'entrée pour Distance

* via un jeu de fichiers d'entrée bruts Excel, et leur export de référence, éprouvé dans Distance,
* et comparaison du produit de XXEngine.buildDistanceDataFile à cette référence.

Nécessite 2.f. ci-dessus (création du dossier 'distance-in' dans le dossier de travail).

In [None]:
dfDistCases = pd.DataFrame([dict(inFileName='ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.xlsx',
                                 decimalFields=['EFFORT', 'DISTANCE', 'NOMBRE'],
                                 refOutFileName='ACDC2019-Papyrus-ALAARV-saisie-5-cols.txt', withExtraFields=False),
                            dict(inFileName='ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.xlsx',
                                 decimalFields=['EFFORT', 'DISTANCE', 'NOMBRE'],
                                 refOutFileName='ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.txt', withExtraFields=True)])
dfDistCases

In [None]:
eng = ads.MCDSEngine(workDir=pl.Path('tmp/mcds-out'))

In [None]:
fails = 0
for ind, sCase in dfDistCases.iterrows():
    
    print('#', ind, ':', sCase.inFileName)

    # Create data set
    ds = ads.SampleDataSet(source=pl.Path('refin', sCase.inFileName),
                           decimalFields=sCase.decimalFields)
    
    # Build distance import data file
    ofn = pl.Path(eng.workDir, 'distance-in', sCase.refOutFileName)
    ofn = eng.buildDistanceDataFile(dataSet=ds, tgtFilePathName=ofn, withExtraFields=sCase.withExtraFields)
    
    # Compare generated file to reference
    rfn = pl.Path('refout', sCase.refOutFileName)
    with open(ofn, 'r') as fOut, open(rfn, 'r') as fRef:
        if fOut.read() == fRef.read():
            print('Success : Conform to reference.')
        else:
            print('Error: Generated file differs from reference', rfn)
            fails += 1
            
    print()
    
print('All test cases succeeded !' if fails == 0 else 'Error: {} test case(s) failed.'.format(fails))

## 2. MCDSEngine : Exécution avec de vraies données

In [None]:
ds = ads.SampleDataSet(source=pl.Path('refin', 'ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.xlsx'),
                       decimalFields=['EFFORT', 'DISTANCE', 'NOMBRE'])

eng = ads.MCDSEngine(workDir='tmp/mcds-out')

runCode, runTime, runDir = eng.run(ds, realRun=True, runPrefix='int',
                                   estimKeyFn='UNIFORM', estimAdjustFn='POLY',
                                   estimCriterion='AIC', cvInterval=95)
assert runCode == 2, 'Should have run with warnings (run code = 2)'
dict(runCode=runCode, runDir=runDir, runTime=runTime)

In [None]:
# Useful for later plot file decoding development
realRunWorkDir = runDir

# Mise au point du code du module autods

## Détection de Distance

In [None]:
# Where and what to look for, when detecting an installed Distance software.
DistanceSuppVers = [7, 6] # Supported major versions, latest first.
DistancePossInstPaths = [pl.Path().resolve(), pl.Path('C:\\Program files (x86)'), pl.Path('C:\\Program files')]

def findExecutable(exeFileName):

    """Look for an executable file of the Distance software, and return its path.

    Each folder of DistancePossInstPaths is searched, in order, for a 'Distance <version>' sub-folder
    holding the file, trying the versions of DistanceSuppVers in order ; the first match wins.
    The search progress is printed.

    Parameters:
    :param exeFileName: name of the file to look for (ex: 'MCDS.exe')

    :return: path (pl.Path) of the file found

    :raise Exception: when the file was found nowhere
    """

    print('Looking for {} ...'.format(exeFileName))

    for instRootDir in DistancePossInstPaths:
        for version in DistanceSuppVers:

            candidateDir = instRootDir / 'Distance {}'.format(version)
            print(' - checking {} : '.format(candidateDir), end='')

            candidate = candidateDir / exeFileName
            if candidate.exists():
                print('yes !')
                print('{} found in {}'.format(exeFileName, candidateDir))
                return candidate

            print('no.')

    raise Exception('Could not find {} ; please install Distance software (V6 or later)'.format(exeFileName))

In [None]:
findExecutable('MCDS.exe')

## Results reports styling

(to stress interesting and/or important things)

In [None]:
dfTrSynRes = results.dfTransData('fr', subset=synthCols)
dfTrSynRes

In [None]:
# Colour palette of the styled report : character colour, cell backgrounds, and 3-level colour scale.
cChrGray = '#869074'
cBckGreen = '#e0ef8c'
cBckGray = '#dae3cb'
cSclGreen = '#cbef8c'
cSclOrange = '#f9da56'
cSclRed = '#fe835a' # Alternative : '#f25e2d'
scaledColors = [cSclGreen, cSclOrange, cSclRed]
scaledColorsRvd = scaledColors[::-1]

# Background colour of each known execution code.
dExCodeColors = {1: cSclGreen, 2: cSclOrange, 3: cSclRed}

def colorExecCodes(sCodes):
    """CSS 'background-color' property for each execution code of sCodes
    (the colour of code 3 being used for any unknown code)."""
    fallback = dExCodeColors[3]
    return ['background-color: ' + dExCodeColors.get(code, fallback) for code in sCodes]

def scaledColorV(v, thresholds, colors): # len(thresholds) == len(colors) - 1
    """Colour of value v on a scale : the colour at the rank of the first threshold
    that v is strictly above, the last colour if there is none ; cBckGray for a null v."""
    if pd.isnull(v):
        return cBckGray
    # -1 = no threshold passed => last colour.
    rank = next((ind for ind, thresh in enumerate(thresholds) if v > thresh), -1)
    return colors[rank]

def scaledColorS(sValues, thresholds, colors):
    """CSS 'background-color' property for each value of sValues, as coloured by scaledColorV."""
    styles = list()
    for value in sValues:
        styles.append('background-color: ' + scaledColorV(value, thresholds, colors))
    return styles

# Thresholds of the colour scale for the density variation coefficient, in decreasing order.
densCVThresholds = [0.4, 0.1]

# Styled synthesis table :
# * rows sorted by sample and variant, and then by increasing Delta AIC,
# * 3 decimals,
# * green background for the rows with a null Delta AIC,
# * execution codes coloured through colorExecCodes,
# * density variation coefficient coloured through scaledColorS : as the lower, the better,
#   the REVERSED scale must be used with the decreasing thresholds
#   (above 0.4 => red, between 0.1 and 0.4 => orange, otherwise => green),
# * gray characters for the rows with an execution code other than 1 or 2,
# * null cells made invisible.
dfs = dfTrSynRes \
        .sort_values(by=['Espèce', 'Echantillon', 'Précision', 'Durée', 'Delta AIC']) \
        .style \
        .set_precision(3) \
        .set_properties(subset=pd.IndexSlice[dfTrSynRes[dfTrSynRes['Delta AIC'] == 0].index, :],
                        **{'background-color': cBckGreen}) \
        .apply(colorExecCodes, subset=['CodEx'], axis='columns') \
        .apply(scaledColorS, subset=['CoefVar Densité'], axis='columns',
               thresholds=densCVThresholds, colors=scaledColorsRvd) \
        .set_properties(subset=pd.IndexSlice[dfTrSynRes[~dfTrSynRes.CodEx.isin([1, 2])].index, :],
                         **{'color': cChrGray}) \
        .where(pd.isnull, 'color: transparent')

    #.format(lambda v: v if not pd.isnull(v) else '') # Breaks part of the rounding, increases the precision ???

    #.set_precision(3) # Not really usable, as only for the whole frame

    #.apply(lambda s: ['color: grey']*len(s), subset=pd.IndexSlice[dfTrSynRes[~dfTrSynRes.CodEx.isin([1, 2])].index, :],
    #       axis='index') # OK
    
    #.apply(lambda s: ['color: grey']*len(s), subset=dfTrSynRes[~dfTrSynRes.CodEx.isin([1, 2])].index,
    #       axis='index') # KO
    
dfs.to_excel('tmp/styled-results.xlsx')

dfs

## Decode MCDS plots file

In [None]:
srcFileName = pl.Path(realRunWorkDir, 'plots.txt')

In [None]:
# Load all the lines of the plots file, stripped of their leading and trailing white space
# (through a context manager, for the file to get closed as soon as read).
with open(srcFileName, 'r') as srcFile:
    lines = [line.strip() for line in srcFile]
len(lines)

In [None]:
lines[:10]

In [None]:
# Decode the lines as a sequence of "chapters", each made of, in this order :
# a title line, a sub-title line, the X axis label, the Y axis label, a line with the 4 axis bounds
# (X min and max, then Y min and max), the number of data rows, and finally the data rows themselves,
# each one a white-space separated list of numbers.
itLines = iter(lines)
chapters = []
for title in itLines:
    subTitle, xLabel, yLabel = next(itLines), next(itLines), next(itLines)
    xMin, xMax, yMin, yMax = map(float, next(itLines).split())
    nDataRows = int(next(itLines))
    dataRows = [[float(s) for s in next(itLines).split()] for _ in range(nDataRows)]
    chapters.append(dict(title=title, subTitle=subTitle, dataRows=dataRows,
                         xLabel=xLabel, yLabel=yLabel, xMin=xMin, xMax=xMax, yMin=yMin, yMax=yMax))
len(chapters), chapters[0]

In [None]:
## QQ-plot
chapter = chapters[0]
chapter

In [None]:
n = len(chapter['dataRows'])
dfQqData = pd.DataFrame(data=chapter['dataRows'], columns=['If the fit was perfect ...', 'Real observations'],
                        index=np.linspace(0.5/n, 1.0-0.5/n, n))
dfQqData

In [None]:
# Option 1 : OK
#fig = plt.figure(figsize=(16, 6))
#axes = fig.subplots()
#_ = dfQqData.plot(ax=axes, color=['blue', 'red'], grid=True,
#                  xlim=(chapter['xMin'], chapter['xMax']), ylim=(chapter['yMin'], chapter['yMax']))

# Option 2 : OK
axes = dfQqData.plot(figsize=(16, 6), color=['blue', 'red'], grid=True,
                     
                     xlim=(chapter['xMin'], chapter['xMax']), ylim=(chapter['yMin'], chapter['yMax']))
fig = axes.figure

axes.legend(['If the fit was perfect ...', 'Real observations'], fontsize=12)
axes.set_facecolor('#f9fbf3')
axes.figure.patch.set_facecolor('#f9fbf3')
axes.set_title(label=chapter['title'] + ' : ' + chapter['subTitle'], fontdict=dict(fontsize=16), pad=20)
axes.set_xlabel(chapter['xLabel'], fontsize=12)
_ = axes.set_ylabel(chapter['yLabel'], fontsize=12)

In [None]:
# Save the figure as JPEG and as transparent PNG, cropped to its actual content :
# the savefig option for this is 'bbox_inches' ('box_inches' is not a savefig option).
axes.figure.savefig('tmp/mlb-qqplot.jpg', bbox_inches='tight')
axes.figure.savefig('tmp/mlb-qqplot.png', bbox_inches='tight', transparent=True)

In [None]:
plt.close(fig)

In [None]:
# Plotly 4
fig = plygo.Figure()

fig.add_trace(plygo.Scatter(x=dfQqData.index, y=dfQqData['If the fit was perfect ...'],
                            name='If the fit was perfect ...', line=dict(color='blue', width=2), opacity=0.7))
fig.add_trace(plygo.Scatter(x=dfQqData.index, y=dfQqData['Real observations'],
                            name='Real observations', line=dict(color='red', width=2)))

fig.update_layout(title=chapter['title'] + ' : ' + chapter['subTitle'],
                  xaxis=dict(title=chapter['xLabel'], range=(chapter['xMin'], chapter['xMax']),
                             zeroline=True, linewidth=1, linecolor='black'),
                  yaxis=dict(title=chapter['yLabel'], range=(chapter['yMin'], chapter['yMax']),
                             zeroline=True, linewidth=1, linecolor='black'),
                  legend=plygo.layout.Legend(x=0.09, y=0.90, bordercolor='black', borderwidth=1),
                  shapes=[plygo.layout.Shape(type='line', x0=chapter['xMax'], y0=chapter['yMin'],
                                                          x1=chapter['xMax'], y1=chapter['yMax']),
                          plygo.layout.Shape(type='line', x0=chapter['xMin'], y0=chapter['yMax'],
                                                          x1=chapter['xMax'], y1=chapter['yMax'])],
                  template='none')

fig.show()

In [None]:
# Wow ... VERY slooooooooow !
fig.write_image("tmp/ply-qqplot.svg")
fig.write_image("tmp/ply-qqplot.png")

In [None]:
# Detection probability
chapter = chapters[1]
chapter

In [None]:
dfDetProbData = pd.DataFrame(data=chapter['dataRows'], 
                             columns=[chapter['xLabel'], chapter['yLabel'] + ' (sampled)', chapter['yLabel'] + ' (fitted)'])
dfDetProbData.set_index(chapter['xLabel'], inplace=True)
dfDetProbData

In [None]:
axes = dfDetProbData.plot(figsize=(16, 6), color=['blue', 'red'], grid=True,
                          xlim=(chapter['xMin'], chapter['xMax']), ylim=(chapter['yMin'], chapter['yMax']))

axes.set_title(label=chapter['title'] + ' : ' + chapter['subTitle'], fontdict=dict(fontsize=16), pad=20)
axes.legend(dfDetProbData.columns, fontsize=12)
axes.set_xlabel(chapter['xLabel'], fontsize=12)
_ = axes.set_ylabel(chapter['yLabel'], fontsize=12)

In [None]:
# Plotly 4
fig = plygo.Figure()

fig.add_trace(plygo.Scatter(x=dfDetProbData.index, y=dfDetProbData[chapter['yLabel'] + ' (sampled)'],
                            name=chapter['yLabel'] + ' (sampled)', line=dict(color='blue', width=2), opacity=0.7))
fig.add_trace(plygo.Scatter(x=dfDetProbData.index, y=dfDetProbData[chapter['yLabel'] + ' (fitted)'],
                            name=chapter['yLabel'] + ' (fitted)', line=dict(color='red', width=2)))

fig.update_layout(title=chapter['title'] + ' : ' + chapter['subTitle'],
                  xaxis=dict(title=chapter['xLabel'], range=(chapter['xMin'], chapter['xMax']),
                             zeroline=True, linewidth=1, linecolor='black'),
                  yaxis=dict(title=chapter['yLabel'], range=(chapter['yMin'], chapter['yMax']),
                             zeroline=True, linewidth=1, linecolor='black'),
                  legend=plygo.layout.Legend(x=0.65, y=0.85*chapter['yMax'], bordercolor='black', borderwidth=1),
                  shapes=[plygo.layout.Shape(type='line', x0=chapter['xMax'], y0=chapter['yMin'],
                                                          x1=chapter['xMax'], y1=chapter['yMax']),
                          plygo.layout.Shape(type='line', x0=chapter['xMin'], y0=chapter['yMax'],
                                                          x1=chapter['xMax'], y1=chapter['yMax'])],
                  template='none')

fig.show()

In [None]:
# Third chapter : presumably the probability density one, given the name of the data-frame
# built from it right after (the first two being the QQ-plot and detection probability ones) - TODO confirm.
chapter = chapters[2]
chapter

In [None]:
dfProdDensData = pd.DataFrame(data=chapter['dataRows'], 
                              columns=[chapter['xLabel'], chapter['yLabel'] + ' (sampled)', chapter['yLabel'] + ' (fitted)'])
dfProdDensData.set_index(chapter['xLabel'], inplace=True)
dfProdDensData

In [None]:
axes = dfProdDensData.plot(figsize=(16, 6), color=['blue', 'red'],
                           xlim=(chapter['xMin'], chapter['xMax']), ylim=(chapter['yMin'], chapter['yMax']))
axes.set_title(label=chapter['title'] + ' : ' + chapter['subTitle'], fontdict=dict(fontsize=16), pad=20)
axes.legend(dfProdDensData.columns, fontsize=12)
axes.set_xlabel(chapter['xLabel'], fontsize=12)
_ = axes.set_ylabel(chapter['yLabel'], fontsize=12)

In [None]:
# Plotly 4
fig = plygo.Figure()

fig.add_trace(plygo.Scatter(x=dfProdDensData.index, y=dfProdDensData[chapter['yLabel'] + ' (sampled)'],
                            name=chapter['yLabel'] + ' (sampled)', line=dict(color='blue', width=2), opacity=0.7))
fig.add_trace(plygo.Scatter(x=dfProdDensData.index, y=dfProdDensData[chapter['yLabel'] + ' (fitted)'],
                            name=chapter['yLabel'] + ' (fitted)', line=dict(color='red', width=2)))

fig.update_layout(title=chapter['title'] + ' : ' + chapter['subTitle'],
                  xaxis=dict(title=chapter['xLabel'], range=(chapter['xMin'], chapter['xMax']),
                             zeroline=True, linewidth=1, linecolor='black'),
                  yaxis=dict(title=chapter['yLabel'], range=(chapter['yMin'], chapter['yMax']),
                             zeroline=True, linewidth=1, linecolor='black'),
                  legend=plygo.layout.Legend(xanchor='right', yanchor='top', bordercolor='black', borderwidth=1),
                  #margin=plygo.layout.Margin(l=40, r=40, b=40, t=40, pad=0),
                  shapes=[plygo.layout.Shape(type='line', x0=chapter['xMax'], y0=chapter['yMin'],
                                                          x1=chapter['xMax'], y1=chapter['yMax']),
                          plygo.layout.Shape(type='line', x0=chapter['xMin'], y0=chapter['yMax'],
                                                          x1=chapter['xMax'], y1=chapter['yMax'])],
                  template='none')

fig.show()

## Decode stats (actual results) from MCDS work folders

In [None]:
#_ = implib.reload(ads)

In [None]:
# Results set to store results into.
miCustCols = pd.MultiIndex.from_tuples([('id', 'ExecCase', 'Value')])
dfCustColTrans = \
    pd.DataFrame(index=miCustCols, data=dict(en=['ExecCase'], fr=['CasExec']))

results = ads.MCDSResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans)

In [None]:
# Analysis engine
mcds = ads.MCDSEngine(workDir='refout/dist-order-sens-min',
                      distanceUnit='Meter', areaUnit='Hectare',
                      surveyType='Point', distanceType='Radial')

In [None]:
# Process folders in engine work folder.
for folder in os.listdir(mcds.workDir):
    
    # Skip folders that are not MCDS run ones
    # (not a folder, or a name with an extension, or no stats.txt file inside).
    folderPath = os.path.join(mcds.workDir, folder)
    if not os.path.isdir(folderPath):
        continue
    if os.path.splitext(folder)[1] or 'stats.txt' not in os.listdir(folderPath):
        print(f'Skipping {folderPath}, not an MCDS.exe run folder with a stats.txt file')
        continue
        
    # Tell the engine where it has run (even if it does not remember it ;-)
    _ = mcds.setupRunFolder(forceSubFolder=folder)
    
    # Decode results.
    sRes = mcds.decodeStats()
    print()
    
    # Store them for later, with the folder name as the custom head column.
    sHead = pd.Series(data=[folder], index=miCustCols)
    results.append(sRes, sCustomHead=sHead)

# Tadaaaaaaa !
results.dfTransData('fr')

In [None]:
results.dfTransData('en').to_excel(pl.Path('tmp', 'dist-order-sens-auto-results.xlsx'), index=False)

## Unit tests for reference / actual results comparison

In [None]:
values = [np.nan, -np.inf, -1.0e12, -1.0e5, -1.0-1e-5, -1.0, -1.0+1e-5, -1.0e-8, 0.0, 1.0e-8, 1.0, 1.0e5, 1.0e12, np.inf]

In [None]:
# Reference / actual closeness measure :
#   round(-log10(abs(actual - reference) / max(abs(actual), abs(reference))), 1)
# = order of magnitude that separates the difference of the two values from the greatest of their absolute values :
#   the greater it is, the lower the relative difference.
#    Ex: 3 = 10**3 ratio between the greatest absolute value and the difference,
#        +inf = NO difference at all,
#        0 = bad : for instance, one of the two is 0, and the other is not.
# See the unit tests that follow.
def closeness(sRefAct):

    """Closeness of the 2 values of a series (reference, then actual).

    Parameters:
    :param sRefAct: series of exactly 2 numbers

    :return: np.inf if the 2 values are equal ;
             0 if exactly one of them is NaN, or if at least one of them is infinite ;
             NaN if both are NaN ;
             otherwise, minus the decimal logarithm of their relative difference, rounded to 1 decimal
    """

    ref, act = sRefAct.to_list()

    # Special cases : exactly 1 NaN, or at least 1 infinite value => all different.
    if np.isnan(ref) != np.isnan(act):
        return 0
    if np.isinf(ref) or np.isinf(act):
        return 0

    # Normal case : absolute difference, made relative when not null (and not NaN, i.e. not 2 NaNs).
    gap = abs(ref - act)
    if gap != 0 and not np.isnan(gap):
        gap = gap / max(abs(ref), abs(act))

    if gap == 0:
        return np.inf

    return round(-np.log10(gap), 1)

In [None]:
# Closeness matrix of every value against every other one (rows and columns in the order of 'values').
# The matrix is pre-filled with NaN, so that an entry whose computation raised (see the printed report)
# holds a well-defined value instead of whatever was lying in uninitialised memory.
aClose = np.full(shape=(len(values), len(values)), fill_value=np.nan)
for r, rowValue in enumerate(values):
    for c, colValue in enumerate(values):
        try:
            aClose[r, c] = closeness(pd.Series([rowValue, colValue]))
        except Exception as exc:
            # Best effort : report the failing pair and go on with the other ones.
            print(exc, r, c, rowValue, colValue)
pd.DataFrame(data=aClose, index=values, columns=values)

In [None]:
# Infinite closeness on the diagonal (except for NaN and +/-inf)
assert all(np.isnan(value) or np.isinf(value) or np.isinf(aClose[i, i]) for i, value in enumerate(values)), \
       'Error: Inequality on the diagonal'

# No infinite closeness anywhere else
assert all(r == c or not np.isinf(aClose[r, c]) for r in range(len(values)) for c in range(len(values))), \
       'Error: No equality should be found outside the diagonal'

# Good closeness only around -1
# Note: The neighbours of -1 are meant to be 1e-5 away from it, so they are selected with some margin :
#       comparing on the exact 1e-5 boundary is sensitive to floating point rounding,
#       and may leave -1 alone in the selection, making the check below pointless.
whereClose = [i for i, value in enumerate(values) if abs(value + 1) <= 1.5e-5]
assert len(whereClose) > 1, 'Error: No neighbour of -1 found to check closeness with'
assert all(aClose[r, c] > 4 for r in whereClose for c in whereClose), 'Error: Unexpectedly bad closeness around -1'

In [None]:
# Ancienne méthode qui ne marche pas.
# Comparaison actual / reference : -round(log10((actual - reference) / max(abs(actual), abs(reference))), 1)
# => Plus c'est grand, plus petite est la différence relative entre les 2
#    Ex: 3 = facteur 10**3 entre différence et valeurs absolues ; +inf = AUCUNE différence
#        0 = pas bon, l'un des 2 est nul n'autre pas du tout
# Cf. tests unitaires plus bas.
#dfRelDif = pd.DataFrame(index=dfRefRes4c.index)
#for col in dfRefRes4c.columns:
#    dfRelDif['NormalCases'] = ~((dfActRes4c[col].isnull() & dfRefRes4c.notnull()) \
#                                | (dfActRes4c[col].notnull() & dfRefRes4c.isnull()) \
#                                | dfActRes4c[col].notnull() | dfRefRes4c.isnull())
#    dfRelDif[col] = abs(dfActRes4c[col] - dfRefRes4c[col])
#    dfRelDif[col].where(dfRelDif[col].isnull() | dfRelDif[col] == 0,
#                        dfRelDif[col] / pd.DataFrame(dict(act=dfActRes4c[col], ref=dfRefRes4c[col])).abs().max(axis='columns'),
#                        inplace=True)
#    dfRelDif[col].where(dfRelDif['NormalCases'], 1, inplace=True) # Force special case to "all different"
#    dfRelDif.drop(columns=['NormalCases'], inplace=True)
#    dfRelDif[col] = np.round(-np.log10(dfRelDif[col]), 1)
#    
#dfRelDif

## Generate stats columns translation file

(from documentation stats & modules specs)

In [None]:
tgtTransFileName = 'tmp/stat-mod-trans.auto.txt'

In [None]:
class Translator(object):
    
    """Callable translating strings through a per-language dictionary of translations.
    
    Look-up order for a string s : the current language translations, then the 'en' ones,
    and finally s itself when no translation is available.
    """
    
    def __init__(self, dTrans, lang='en'):
        
        """Ctor
        
        :param dTrans: dict of translation dicts, keyed by language code ; must hold an 'en' entry
        :param lang: initial language code (see setLang)
        """
        
        assert 'en' in dTrans, 'At least "en" translation must be defined'
        self.dTrans = dTrans
        self.setLang(lang)
        
    def setLang(self, lang):
        
        """Change the current language
        
        :param lang: language code, 'en' or 'fr' (case insensitive)
        :raises AssertionError: if the language is not supported ; the current language is then left unchanged
        """
        
        # Check first, assign after : a refused language must not corrupt the translator state.
        newLang = lang.lower()
        assert newLang in ['en', 'fr'], 'No support for "{}" language'.format(lang)
        self.lang = newLang
        
    def __call__(self, s):
        
        """Translate s to the current language, falling back to 'en', then to s itself"""
        
        dEnglish = self.dTrans['en']
        return self.dTrans.get(self.lang, dEnglish).get(s, dEnglish.get(s, s))

In [None]:
DFigureTrans = \
    dict(en=dict(Value='', Cv='CoefVar', Lcl='Min', Ucl='Max', Df='DoF'),
         fr=dict(Value='', Cv='CoefVar', Lcl='Min', Ucl='Max', Df='DegLib'))

figtr = Translator(DFigureTrans, lang='en')

In [None]:
DStatisticTrans = \
    dict(en={ 'number of observations (n)': 'NObs',
              'number of samples (k)': 'NSamp',
              'effort (L or K or T)': 'Effort',
              'encounter rate (n/L or n/K or n/T)': 'EncRate',
              'left truncation distance': 'LeftTruncDist',
              'right truncation distance (w)': 'RightTruncDist',
              'total number of parameters (m)': 'TotNumPars',
              'AIC value': 'AIC',
              'chi-square test probability (distance set 1)': 'Chi2 P 1',
              'chi-square test probability (distance set 2)': 'Chi2 P 2',
              'chi-square test probability (distance set 3)': 'Chi2 P 3',
              'f(0) or h(0)': 'f/h(0)',
              'probability of detection (Pw)': 'PDetec',
              'effective strip width (ESW) or effective detection radius (EDR)': 'EDR/ESW',
              'AICc': 'AICc',
              'BIC': 'BIC',
              'Log likelihood': 'LogLhood',
              'Kolmogorov-Smirnov test probability': 'KS P',
              'Cramér-von Mises (uniform weighting) test probability': 'CvM Uw P',
              'Cramér-von Mises (cosine weighting) test probability': 'CvM Cw P',
              'key function type': 'KeyFn',
              'adjustment series type': 'AdjSer',
              'number of key function parameters (NKP)': 'NumKFnPars',
              'number of adjustment term parameters (NAP)': 'NumASerPars',
              'number of covariate parameters (NCP)': 'NumCovars',
              'estimated value of A(1) adjustment term parameter': 'EstA(1)',
              'estimated value of A(2) adjustment term parameter': 'EstA(2)',
              'estimated value of A(3) adjustment term parameter': 'EstA(3)',
              'estimated value of A(4) adjustment term parameter': 'EstA(4)',
              'estimated value of A(5) adjustment term parameter': 'EstA(5)',
              'estimated value of A(6) adjustment term parameter': 'EstA(6)',
              'estimated value of A(7) adjustment term parameter': 'EstA(7)',
              'estimated value of A(8) adjustment term parameter': 'EstA(8)',
              'estimated value of A(9) adjustment term parameter': 'EstA(9)',
              'estimated value of A(10) adjustment term parameter': 'EstA(10)',
              'average cluster size': 'AvgClustSz',
              'size-bias regression correlation (r)': 'SzBias RegCorr',
              'p-value for correlation significance (r-p)': 'CorSignPVal',
              'estimate of expected cluster size corrected for size bias': 'EstExpFixedCluSz',
              'density of clusters (or animal density if non-clustered)': 'DensClu',
              'density of animals': 'Density',
              'number of animals, if survey area is specified': 'Number',
              'bootstrap density of clusters': 'BootsDensClu',
              'bootstrap density of animals': 'BootDensity',
              'bootstrap number of animals': 'BootNumber' },
         fr={ 'number of samples (k)': 'NEchant',
              'encounter rate (n/L or n/K or n/T)': 'TxContact',
              'left truncation distance': 'DistTroncGche',
              'right truncation distance (w)': 'DistTroncDte',
              'total number of parameters (m)': 'NbTotPars',
              'Log likelihood': 'LogProba',
              'key function type': 'FnClé',
              'adjustment series type': 'SérAjust',
              'number of key function parameters (NKP)': 'NbParsFnClé',
              'number of adjustment term parameters (NAP)': 'NbParsSérAjust',
              'number of covariate parameters (NCP)': 'NbCovars',
              'average cluster size': 'TailMoyClust',
              'size-bias regression correlation (r)': 'CorrReg BiaisTail',
              'p-value for correlation significance (r-p)': 'PVal SignifCorr',
              'estimate of expected cluster size corrected for size bias': 'TailCorrCluAttEst',
              'density of animals': 'Densité',
              'number of animals, if survey area is specified': 'Nombre',
              'bootstrap density of clusters': 'BootsDensClu',
              'bootstrap density of animals': 'DensitéBoot',
              'bootstrap number of animals': 'NombreBoot' })

statr = Translator(DStatisticTrans, lang='en')

In [None]:
dfStatModTrans = ads.MCDSEngine.statModCols().to_frame()
dfStatModTrans.reset_index(drop=True, inplace=True)
dfStatModTrans.rename(columns={ 0: 'Module', 1: 'Statistic', 2: 'Figure' }, inplace=True)
for lang in ['en', 'fr']:
    figtr.setLang(lang)
    statr.setLang(lang)
    dfStatModTrans[lang] = \
        dfStatModTrans.apply(lambda sRow: '{} {}'.format(figtr(sRow.Figure), statr(sRow.Statistic)).strip(),
                             axis='columns')

In [None]:
dfStatModTrans

In [None]:
dfStatModTrans.to_csv(tgtTransFileName, sep='\t', index=False)
tgtTransFileName

In [None]:
pd.DataFrame(index=ads.MCDSAnalysis.MIRunColumns,
             data=dict(en=['ModKeyFn', 'ModAdjSer', 'ModChcCrit', 'ConfInterv', 'LeftTrunc', 'RightTrunc',
                           'FitDistCuts', 'DiscrDistCuts', 'RunCode', 'RunTime', 'RunFolder'],
                       fr=['FnCléMod', 'SérAjustMod', 'CritChxMod', 'IntervConf', 'TroncGauche', 'TroncDroite',
                           'TranchDistFit', 'TranchDistDiscr', 'CodeExec', 'HeureExec', 'DossierExec']))


In [None]:
dfStatModTransExt = pd.read_csv(tgtTransFileName, sep='\t')
dfStatModTransExt.set_index(['Module', 'Statistic', 'Figure'])

In [None]:
lang = 'fr'
dTrans = dfStatModTransExt.set_index(['Module', 'Statistic', 'Figure'])[lang].to_dict()
results.dfData.columns = [dTrans.get(col, col) for col in results.dfData.columns]
results.dfData

In [None]:
dfStatModTransExt.set_index(['Module', 'Statistic', 'Figure'])[lang].to_dict()

## Test case class

(no use actually : pd.DataFrame already does the job !)

In [None]:
# Super-class for test cases : a simple bag of attributes, given as keyword arguments.
# The 1st instance ever built freezes the set of attribute names (stored as the 'AttributeNames'
# class attribute) ; any later instance must then be built with exactly the same set of names.
class TestCase(object):

    def __init__(self, **attrs):

        cls = self.__class__
        givenNames = set(attrs)

        if hasattr(cls, 'AttributeNames'):
            # Set of names already frozen : check the given ones against it.
            assert givenNames == self.AttributeNames, \
                   'Some attribute name not in frozen set {{{}}}'.format(','.join(self.AttributeNames))
        else:
            # 1st instance : freeze the set of names.
            cls.AttributeNames = givenNames

        for name, value in attrs.items():
            setattr(self, name, value)

    def __repr__(self):

        shownAttrs = ','.join('{}:{}'.format(name, value) for name, value in self.__dict__.items())

        return '{}({})'.format(self.__class__.__name__, shownAttrs)

In [None]:
# Test this super-class.
class TCTest(TestCase):
    pass

tstTestCases = list()
tstTestCases.append(TCTest(x=1, y='a')) # Define attributes
tstTestCases.append(TCTest(x=2, y='b')) # Check attributes
try:
    tstTestCases.append(TCTest(x=2, z=None)) # Refuse new attributes
except AssertionError as exc:
    print('Good refuse of new attributes:', exc)
else:
    # Raised outside of the 'try' body : otherwise, the handler above would catch this failure too,
    # and report it as a success.
    raise AssertionError('Error: New attributes should be refused')
    
[str(tc) for tc in tstTestCases]

## Mise au point décodage sorties de MCDS : fichier de stats

TODO: Add French translations of variable / parameter names and descriptions

### 1. Nom et description des colonnes du tableau de stats

In [None]:
fileName = 'mcds/stat-row-specs.txt'

fStatRowSpecs = open(fileName, mode='r', encoding='utf8')

In [None]:
statRowSpecLines = [line.rstrip('\n') for line in fStatRowSpecs.readlines() if not line.startswith('#')]
statRowSpecs =  [(statRowSpecLines[i].strip(), statRowSpecLines[i+1].strip()) \
                 for i in range(0, len(statRowSpecLines)-2, 3)]
dfStatRowSpecs = pd.DataFrame(columns=['Name', 'Description'], data=statRowSpecs).set_index('Name')

dfStatRowSpecs

In [None]:
dfStatRowSpecs.index

### 2. Numéro et description des modules et statistiques associées

(colonnes Module et Statistic du tableau)

In [None]:
fileName = 'mcds/stat-mod-specs.txt'

fStatModSpecs = open(fileName, mode='r', encoding='utf8')

In [None]:
nMaxAdjParams = 10

# Read the module / statistic spec. lines, ignoring the comment ones (starting with '#').
statModSpecLines = [line.rstrip('\n') for line in fStatModSpecs.readlines() if not line.startswith('#')]

# '<number> – <description>' lines : used for modules as well as for statistics.
reModSpecNumName = re.compile('(.+) – (.+)')

# Characters of the note numbers list that ends a statistic description : digits, spaces and commas
# (raw string, as '\d' is an invalid escape sequence in a normal one ; and compiled once, not for each character).
reNoteChar = re.compile(r'[\d ,]')

statModSpecs = list()
moModule = None
for line in statModSpecLines:
    
    # Ignore empty lines.
    if not line:
        continue
    
    # Outside of any module section : this line should be the header of the next one.
    if moModule is None:
        moModule = reModSpecNumName.match(line.strip())
        continue
    
    # A line holding 1 single space ends the current module section.
    if line == ' ':
        moModule = None
        continue
    
    # Otherwise, this is a statistic line of the current module.
    moStatistic = reModSpecNumName.match(line.strip())
    modNum, modDesc, statNum, statDescNotes = \
        moModule.group(1), moModule.group(2), moStatistic.group(1), moStatistic.group(2)
    
    # Split the description from the note numbers that follow it :
    # search backwards for the last character that is not a digit, a space or a comma.
    for i in range(len(statDescNotes)-1, -1, -1):
        if not reNoteChar.match(statDescNotes[i]):
            statDesc = statDescNotes[:i+1]
            statNotes = statDescNotes[i+1:].replace(' ', '')
            break
    
    modNum = int(modNum)
    if statNum.startswith('101 '):
        for num in range(nMaxAdjParams): # Assume no more than that ... a bit hacky !
            statModSpecs.append((modNum, modDesc, 101+num, # Make statDesc unique for later indexing
                                 statDesc.replace('each', 'A({})'.format(num+1)), statNotes))
    else:
        statNum = int(statNum)
        if modNum == 2 and statNum == 3: # Actually, there are 0 or 3 of these ...
            for num in range(3):
                statModSpecs.append((modNum, modDesc, num+201,
                                     # Change statNum & Make statDesc unique for later indexing
                                     statDesc+' (distance set {})'.format(num+1), statNotes))
        else:
            statModSpecs.append((modNum, modDesc, statNum, statDesc, statNotes))

# 1 row per (module, statistic), indexed by their numbers.
dfStatModSpecs = pd.DataFrame(columns=['modNum', 'modDesc', 'statNum', 'statDesc', 'statNotes'],
                              data=statModSpecs).set_index(['modNum', 'statNum'])

dfStatModSpecs

In [None]:
# Modules
dfStatModSpecs.modDesc.unique()

### 3. Notes sur les statistiques des modules

(infos supplémentaire indiquant comment utiliser ou pas les 5 dernières colonnes Value, Cv, Lcl, Ucl, Df)

In [None]:
fileName = 'mcds/stat-mod-notes.txt'

fStatModNotes = open(fileName, mode='r', encoding='utf8')

In [None]:
statModNoteLines = [line.rstrip('\n') for line in fStatModNotes.readlines() if not line.startswith('#')]
statModNotes =  [(int(line[:2]), line[2:].strip()) for line in statModNoteLines if line]

dfStatModNotes = pd.DataFrame(data=statModNotes, columns=['Note', 'Text']).set_index('Note')

dfStatModNotes

### 4. Lecture du tableau

In [None]:
eng = mcds

In [None]:
eng.statsFileName

In [None]:
dfStatRows = pd.read_csv(eng.statsFileName, sep=' +', engine='python', names=dfStatRowSpecs.index)
dfStatRows

### 5. Décodage du tableau

Attention: On suppose 1 seule strate '0' (Stratum), 1 seul échantillon '0' (Sample) et 1 seul estimateur '1' (Estimator).

#### a. Suppression des colonnes Stratum, Sample et Estimator

(puisqu'on se limite ici aux cas où il n'y a qu'1 de chaque)

In [None]:
dfStatRows.drop(columns=['Stratum', 'Sample', 'Estimator'], inplace=True)
dfStatRows

#### b. Nettoyage des données sans objets

(selon les notes descriptives des statistiques)

In [None]:
# Empilage des "chiffres" (Figures) Value, Cv, Lcl, Ucl, Df pour chaque statistique / module
dfStats = dfStatRows.set_index(['Module', 'Statistic'], append=True).stack() \
                    .reset_index().rename(columns={'level_0': 'id', 'level_3': 'Figure', 0: 'Value'})
dfStats.head(10)

In [None]:
# 4. Fix multiple Module=2 & Statistic=3 rows (before joining with self.DfStatModSpecs)
newStatNum = 200
for lbl, sRow in dfStats[(dfStats.Module == 2) & (dfStats.Statistic == 3)].iterrows():
    if dfStats.loc[lbl, 'Figure'] == 'Value':
        newStatNum += 1
    dfStats.loc[lbl, 'Statistic'] = newStatNum
dfStats[(dfStats.Module == 2)]

In [None]:
# Ajout des colonnes de description/nommage des modules et statistiques
dfStats = dfStats.join(dfStatModSpecs, on=['Module', 'Statistic'])
dfStats.tail(10)

In [None]:
#dfStats[(dfStats.Module == 2) & (dfStats.Statistic > 200)]

In [None]:
# Check that the figures declared as irrelevant really are (all of them equal to 0.0 ?)
# Warning: There must be a bug in MCDS with Module 2 / Statistic 10x : some Cv figures are not null ...
#          (this is why these statistics are excluded from the check below)
sKeepOnlyValueFig = ~dfStats.statNotes.str.contains('1')
sFigs2Drop = (dfStats.Figure != 'Value') & sKeepOnlyValueFig

# Note: 'not' rather than '~' here, because .any() yields 1 scalar : '~' is a logical negation only
#       for numpy booleans, whereas with a plain Python bool, it gives -1 or -2, that are both true.
assert not dfStats[sFigs2Drop & ((dfStats.Module != 2) | (dfStats.Statistic < 100))].Value.any(), \
       'Attention: Des chiffres supposés "sans objet" on des valeurs non nulles !'

In [None]:
# 2nd check, visual this time : list the figures to drop that nevertheless hold a non-null value.
# Note: The parentheses are needed, as '&' binds tighter than '!=' :
#       without them, the expression means (sFigs2Drop & dfStats.Value) != 0.
dfStats[sFigs2Drop & (dfStats.Value != 0)].sort_values(by='Value', ascending=False)

In [None]:
# Suppression des lignes / chiffres sans objet.
dfStats.drop(dfStats[sFigs2Drop].index, inplace=True)
dfStats

In [None]:
dfStats.head()

In [None]:
dfStats = dfStats.reindex(columns=['modDesc', 'statDesc', 'Figure', 'Value'])
dfStats.set_index(['modDesc', 'statDesc', 'Figure'], inplace=True)
dfStats

In [None]:
dfStats.T.iloc[0]

## Distance cut specs for MCDS

* Mise au point
* tests unitaires

In [None]:
def distanceCutSpecs(minDist, maxDist, distCuts):
    
    """Build the distance truncation and cut options string, from the given parameters.
    
    :param minDist: left truncation distance, or None
    :param maxDist: right truncation distance, or None
    :param distCuts: None, or a number of classes (int), or the list of the inner cut distances
    :return: the options string, each option preceded by 1 space (so, possibly empty) :
             * distCuts as a list => ' /Int=<minDist>,<cuts ...>,<maxDist>' when both minDist and maxDist
               are specified, and nothing at all otherwise,
             * distCuts as an int => ' /NClass=<distCuts>', followed by the truncation options below,
             * whenever distCuts is not a list => ' /Left=<minDist>' and / or ' /Width=<maxDist>',
               for those of the 2 distances that are specified.
    """
    
    # Explicit cut distances : the truncation distances are the 2 ends of the interval list.
    if isinstance(distCuts, list):
        if minDist is None or maxDist is None:
            return ''
        return ' /Int=' + ','.join(map(str, [minDist] + distCuts + [maxDist]))
    
    # Otherwise : optional number of classes, then optional truncation distances.
    specParts = []
    if isinstance(distCuts, int):
        specParts.append(' /NClass=' + str(distCuts))
    if minDist is not None:
        specParts.append(' /Left=' + str(minDist))
    if maxDist is not None:
        specParts.append(' /Width=' + str(maxDist))
    
    return ''.join(specParts)

In [None]:
assert distanceCutSpecs(minDist=None, maxDist=None, distCuts=None) == ''

assert distanceCutSpecs(minDist=5, maxDist=None, distCuts=None) == ' /Left=5'
assert distanceCutSpecs(minDist=None, maxDist=100, distCuts=None) == ' /Width=100'
assert distanceCutSpecs(minDist=25.2, maxDist=100.5, distCuts=None) == ' /Left=25.2 /Width=100.5'

assert distanceCutSpecs(minDist=None, maxDist=None, distCuts=3) == ' /NClass=3'
assert distanceCutSpecs(minDist=None, maxDist=300, distCuts=8) == ' /NClass=8 /Width=300'
assert distanceCutSpecs(minDist=20, maxDist=None, distCuts=8) == ' /NClass=8 /Left=20'
assert distanceCutSpecs(minDist=20, maxDist=300, distCuts=8) == ' /NClass=8 /Left=20 /Width=300'

assert distanceCutSpecs(minDist=20, maxDist=300, distCuts=[100, 200, 230, 290]) == ' /Int=20,100,200,230,290,300'
assert distanceCutSpecs(minDist=None, maxDist=None, distCuts=[1, 2, 3]) == '' # min & maxDist have to be both defined
assert distanceCutSpecs(minDist=0, maxDist=None, distCuts=[1, 2, 3]) == '' # min & maxDist have to be both defined
assert distanceCutSpecs(minDist=None, maxDist=4, distCuts=[1, 2, 3]) == '' # min & maxDist have to be both defined

## Data tools development

### addAbsenceSightings

In [None]:
dfInSightings = dfObsIndiv

In [None]:
transectCol = 'Point'
taxonCol = 'Espece'
sampleCols = ['Passage', 'Adulte', 'Duree']

# The set of expected taxa ... whose absence we'll look for at every location
expectedTaxa = list(dfObsIndiv[taxonCol].unique())
', '.join(expectedTaxa), len(expectedTaxa)

In [None]:
# Add "absence" sightings to field data collected on transects for a given sample
#def addAbsenceSightings(dfInSightings, transectCol, taxonCol, expectedTaxa, sampleCols):
    
def absenceSightings(taxonCol, taxon, dAbscSightTmpl):
    # Build 1 absence sighting for the given taxon : a copy of the template (left untouched),
    # where the taxon column is set to this taxon.
    dSighting = dAbscSightTmpl.copy()
    dSighting[taxonCol] = taxon
    return dSighting

assert not dfInSightings.empty, 'Error : Empty sightings data to complete !'

ldfAbscSightings = list()

# Use 1st sightings of the sample to build the absence sightings prototype
# (all null columns except for the sample identification ones)
dAbscSightTmpl = dfInSightings.iloc[0].to_dict()
dAbscSightTmpl.update({ k: None for k in dAbscSightTmpl.keys() if k not in sampleCols })

# For each transect
for transect in dfInSightings[transectCol].unique():

    # Update absence sightings template with transect id
    dAbscSightTmpl.update({ transectCol: transect })
    
    # Generate the absence sightings from it : 1 per lacking taxon
    lackingTaxa = set(expectedTaxa) - set(dfInSightings.loc[dfInSightings[transectCol] == transect, taxonCol].unique())
    dfAbscSights = pd.DataFrame([absenceSightings(taxonCol, txn, dAbscSightTmpl) for txn in lackingTaxa])
    
    ldfAbscSightings.append(dfAbscSights)

In [None]:
# Concat all data frames into one.
dfOutSightings = pd.concat([dfInSightings] + ldfAbscSightings)

# Reset index (for unique labels).
dfOutSightings.reset_index(inplace=True, drop=True)

In [None]:
len(dfOutSightings), len(dfInSightings)

## Distance truncations : auto-generation of variants

(at least a try ...)

### Data for playing : Individualised ones ...

(output from "4. AutoDS data tools" above)

In [None]:
dfObsIndiv.head()

In [None]:
dfObsIndiv.groupby('Espece').size().sort_values(ascending=False)[:5]

In [None]:
dfObsIndSpc = dfObsIndiv[dfObsIndiv.Espece == 'Merle noir'].copy()

### Histograms

In [None]:
# Histogramme uniforme
_ = dfObsIndSpc.distMem.hist(figsize=(16, 3), bins=40)

### Empirical distribution

In [None]:
import statsmodels.distributions.empirical_distribution as sted

In [None]:
ecdf = sted.ECDF(dfObsIndSpc.distMem)

In [None]:
sEcdf = pd.Series({ x : ecdf(x) for x in dfObsIndSpc.distMem.unique() }).sort_index()
_ = sEcdf.plot(figsize=(16, 3))

### Quantiles : 2.5, 5 et 10%, left and right sides

In [None]:
aqLims = np.array([0.025, 0.05, 0.1, 0.9, 0.95, 0.975])
aqLims * len(dfObsIndSpc)

In [None]:
np.quantile(a=dfObsIndSpc.distMem, q=aqLims)

In [None]:
dfObsIndSpc[dfObsIndSpc.distMem <= 11.61]

### Brute force combination algorithm

In [None]:
lParams = list() # of dict(ltr=<left trunc dist or None>, rtr=<right trunc dist or None>, nc=<nb of cuts>)

In [None]:
aqtlLims = np.array([1.25, 2.5, 3.75, 5, 7.5, 10])

In [None]:
sLeftTruncs = pd.Series(index=aqtlLims, data=np.percentile(a=dfObsIndSpc.distMem, q=aqtlLims))
for leftPct, leftTrunc in sLeftTruncs.items():
    nRetSights = len(dfObsIndSpc[dfObsIndSpc.distMem >= leftTrunc])
    sqrNRetSights = math.sqrt(nRetSights)
    for nCuts in [2*sqrNRetSights/3, sqrNRetSights, 3*sqrNRetSights/2]:
        lParams.append(dict(ltr=leftTrunc, rtr=None, nc=round(nCuts), nrs=nRetSights, pct=100-leftPct))

In [None]:
sRightTruncs = pd.Series(index=100-aqtlLims, data=np.percentile(a=dfObsIndSpc.distMem, q=100-aqtlLims)).sort_index()
for rightPct, rightTrunc in sRightTruncs.items():
    nRetSights = len(dfObsIndSpc[dfObsIndSpc.distMem <= rightTrunc])
    sqrNRetSights = math.sqrt(nRetSights)
    for nCuts in [2*sqrNRetSights/3, sqrNRetSights, 3*sqrNRetSights/2]:
        lParams.append(dict(ltr=None, rtr=rightTrunc, nc=round(nCuts), nrs=nRetSights, pct=rightPct))

In [None]:
# ... etc ... but, why not use an optimisation engine then ?

In [None]:
pd.DataFrame(lParams)

### Optimising algorithm

In [None]:
dfObsIndSpc.head()

In [None]:
mcds = ads.MCDSEngine(workDir=os.path.join('ACDC', '2019-nat-opt'),
                      distanceUnit='Meter', areaUnit='Hectare',
                      surveyType='Point', distanceType='Radial')

In [None]:
sampleDecFields = ['Effort', 'Distance']

dAreaInfo = dict(Zone='ACDC', Surface=2400) # ha
dfObsIndSpc = ads.addSurveyAreaInfo(dfObsIndSpc, dAreaInfo=dAreaInfo)

dfObsIndSpc.rename(columns=dict(distMem='Distance'), inplace=True)
dfObsIndSpc.sort_values(by='Point', inplace=True)

dataSet = ads.DataSet(dfObsIndSpc, decimalFields=sampleDecFields)

In [None]:
sSamp = dfObsIndSpc.iloc[0]
abrvSpe = ''.join(word[:4].title() for word in sSamp['Espece'].split(' '))
sampAbbrev = '{}-{}-{}-{}'.format(abrvSpe, sSamp.Passage.replace('+', ''),
                                  sSamp.Adulte.replace('+', ''), sSamp['Duree'])

In [None]:
KPreEstimCrit = 'AICC'
KPreCVInterval = 95

def dsAnalyser3(aParams, mcdsEngine, dataSet, sampAbbrev, keyFn, adjSer):
    
    """Function to minimise, with 3 variable parameters : run 1 MCDS analysis and return its AIC value
    
    :param aParams: the variable parameters, in this order : minDist, maxDist, fitDistCuts (this last one gets rounded)
    :param mcdsEngine: the engine to run the analysis with
    :param dataSet: the data set to analyse
    :param sampAbbrev: sample abbreviation, 1st part of the analysis name
    :param keyFn: key function name of the model (1st 3 chars, lower-cased, used in the analysis name)
    :param adjSer: adjustment series name of the model (1st 3 chars, lower-cased, used in the analysis name)
    :return: the ('detection probability', 'AIC value', 'Value') figure of the analysis results
    
    NOTE(review): the analysis runs with the KPreEstimCrit criterion, whereas the returned figure is
                  'AIC value' here, and 'AICc' in dsAnalyser2 ... to be confirmed as intended.
    """

    # Extract the variable parameters (and trace them).
    minDist, maxDist = aParams[0], aParams[1]
    fitDistCuts = round(aParams[2])
    print(minDist, maxDist, fitDistCuts)
    
    # Analysis name : <sample abbrev.>-<key fn abbrev.>-<adjust. series abbrev.>
    analysisName = '-'.join([sampAbbrev, keyFn[:3].lower(), adjSer[:3].lower()])

    # Run the analysis.
    sResult = ads.MCDSAnalysis(engine=mcdsEngine, dataSet=dataSet,
                               name=analysisName, logData=False,
                               estimKeyFn=keyFn, estimAdjustFn=adjSer,
                               estimCriterion=KPreEstimCrit, cvInterval=KPreCVInterval,
                               minDist=minDist, maxDist=maxDist, fitDistCuts=fitDistCuts).run()

    return sResult[('detection probability', 'AIC value', 'Value')]

def dsAnalyser2(aParams, mcdsEngine, dataSet, sampAbbrev, keyFn, adjSer, fitDistCuts):
    
    """Function to minimise, with 2 variable parameters : run 1 MCDS analysis and return its AICc value
    
    :param aParams: the variable parameters, in this order : minDist, maxDist
    :param mcdsEngine: the engine to run the analysis with
    :param dataSet: the data set to analyse
    :param sampAbbrev: sample abbreviation, 1st part of the analysis name
    :param keyFn: key function name of the model (1st 3 chars, lower-cased, used in the analysis name)
    :param adjSer: adjustment series name of the model (1st 3 chars, lower-cased, used in the analysis name)
    :param fitDistCuts: fixed fitDistCuts parameter of the analysis (passed as is)
    :return: the ('detection probability', 'AICc', 'Value') figure of the analysis results
    """

    # Extract the variable parameters (and trace them).
    minDist, maxDist = aParams[0], aParams[1]
    print(minDist, maxDist)
    
    # Analysis name : <sample abbrev.>-<key fn abbrev.>-<adjust. series abbrev.>
    analysisName = '-'.join([sampAbbrev, keyFn[:3].lower(), adjSer[:3].lower()])

    # Run the analysis.
    sResult = ads.MCDSAnalysis(engine=mcdsEngine, dataSet=dataSet,
                               name=analysisName, logData=False,
                               estimKeyFn=keyFn, estimAdjustFn=adjSer,
                               estimCriterion=KPreEstimCrit, cvInterval=KPreCVInterval,
                               minDist=minDist, maxDist=maxDist, fitDistCuts=fitDistCuts).run()

    # Note: AICc here, not the plain ('detection probability', 'AIC value', 'Value') figure.
    return sResult[('detection probability', 'AICc', 'Value')]

In [None]:
adjSer = 'COSINE'
keyFn = 'HNORMAL'
#keyFn = 'HAZARD'
#keyFn = 'UNIFORM'
#keyFn = 'NEXPON'

In [None]:
# Juste une analyse pour tester la fonction à minimiser (AIC)
#              minDist, maxDist, fitDistCuts
aParams = np.array([0, 250, round(math.sqrt(len(dataSet.dfData)))])
dsAnalyser3(aParams, mcds, dataSet, sampAbbrev, keyFn, adjSer)

In [None]:
# And now, the bounds of the parameters to optimise, in this order :
#              minDist, maxDist, fitDistCuts
# * minDist from 0 to a low percentile of the distances, maxDist from a high one to the max. distance
#   (other percentile pairs tried before are kept below, commented out),
# * fitDistCuts from 2/3 to 3/2 of the square root of the number of sightings.
#maxMinDist, minMaxDist = np.percentile(a=dfObsIndSpc.Distance, q=[20, 80])
#maxMinDist, minMaxDist = np.percentile(a=dfObsIndSpc.Distance, q=[40, 60])
maxMinDist, minMaxDist = np.percentile(a=dfObsIndSpc.Distance, q=[49, 51])
sqrNRetSights = math.sqrt(len(dfObsIndSpc))
# Fix: the lower bound used an undefined 'sqrNSights' name, in place of sqrNRetSights.
minFitDistCuts, maxFitDistCuts = round(2*sqrNRetSights/3), round(3*sqrNRetSights/2)
paramBounds = [(0, maxMinDist), (minMaxDist, dfObsIndSpc.Distance.max()), (minFitDistCuts, maxFitDistCuts)]
paramBounds

In [None]:
from scipy import optimize

In [None]:
dOptRes = dict()

In [None]:
fitDistCuts = 12
dOptRes['shgo'] = optimize.shgo(func=dsAnalyser2, bounds=paramBounds[:2], iters=2,
                                args=(mcds, dataSet, sampAbbrev, keyFn, adjSer, fitDistCuts))
dOptRes['shgo']

In [None]:
dOptRes['shgo'].keys()

In [None]:
dOptRes['shgo'] = optimize.shgo(func=dsAnalyser3, bounds=paramBounds, iters=2,
                                args=(mcds, dataSet, sampAbbrev, keyFn, adjSer))
dOptRes['shgo']

In [None]:
dOptRes['da'] = optimize.dual_annealing(func=dsAnalyser3, bounds=paramBounds,
                                        args=(mcds, dataSet, sampAbbrev, keyFn, adjSer))
dOptRes['da']

In [None]:
dOptRes['de'] = optimize.differential_evolution(func=dsAnalyser3, bounds=paramBounds,
                                                args=(mcds, dataSet, sampAbbrev, keyFn, adjSer))
dOptRes['de']

In [None]:
dOptRes['bh'] = optimize.basinhopping(func=dsAnalyser3, x0=[(mx+mn)/2 for mx, mn in paramBounds], stepsize=2,
                                      minimizer_kwargs=dict(args=(mcds, dataSet, sampAbbrev, keyFn, adjSer)))
dOptRes['bh']

## Named tuple from dictionary

In [None]:
d = dict(a=1, b=[3, 2], c='xxx')

In [None]:
NT = ntuple('NT', d.keys())

In [None]:
nt = NT(**d)

In [None]:
nt

## Appending series to series ... index order

In [None]:
s = pd.Series(index=pd.MultiIndex.from_tuples([('B', 'b'), ('B', 'a'), ('A', 'c')]), data=[1, 2, 3], name=0)
s

In [None]:
s.append(pd.Series(index=[('A', 'b'), ('A', 'a'), ('B', 'c')], data=[1, 2, 3], name=0))

## Appending series to DataFrame ... columns order

### a. Append

In [None]:
df = pd.DataFrame()

In [None]:
s = pd.Series(index=pd.MultiIndex.from_tuples([('B', 'b'), ('B', 'a'), ('A', 'c')]), data=[1, 2, 3], name=0)
#df = df.append(s, ignore_index=False) # => df.columns pas MultiIndex !
df = df.append([s], ignore_index=False)
df

In [None]:
s = pd.Series(index=[('A', 'c'), ('B', 'b'), ('B', 'a')], data=[4, 5, 6], name=1)  # Mêmes colonnes : append ne retrie pas
#s = pd.Series(index=[('A', 'a'), ('A', 'b'), ('B', 'c')], data=[4, 5, 6], name=1)  # Nouvelle colonne : append retrie
df = df.append([s], ignore_index=True)
df

In [None]:
s = pd.Series(index=[('A', 'a'), ('B', 'c')], data=[7, 8])
df = df.append(s, ignore_index=True)
df

In [None]:
s = pd.Series(index=[], data=[])
df = df.append([s], ignore_index=True)
df

In [None]:
s = pd.Series(index=[('C', 'd')], data=[9])
df = df.append([s], ignore_index=True)
df

In [None]:
s = pd.Series(index=[('d',)], data=[10])
df = df.append(s, ignore_index=True)
df

In [None]:
df

### b. Concat

In [None]:
df = pd.DataFrame()

In [None]:
s = pd.Series(index=pd.MultiIndex.from_tuples([('B', 'b'), ('B', 'a'), ('A', 'c')]), data=[1, 2, 3], name=0)
df = pd.concat([df, s], axis='columns')
df

In [None]:
s = pd.Series(index=[('B', 'b'), ('B', 'a'), ('A', 'c')], data=[4, 5, 6], name=1) # Mêmes colonnes : concat ne retrie pas
#s = pd.Series(index=[('A', 'a'), ('A', 'b'), ('B', 'c')], data=[4, 5, 6], name=1) # Nouvelle colonne : concat retrie
df = pd.concat([df, s], axis='columns')
df

### c. Restore desired columns

* desired order,
* desired list of columns : new ones, and / or ignored ones.

In [None]:
df

In [None]:
# Add new A/b, D/a and remove B/c and C/d
i = pd.MultiIndex.from_tuples([('A', 'c'), ('A', 'b'), ('A', 'a'), ('B', 'b'), ('B', 'a'), ('D', 'a')])
i

In [None]:
# Keep added columns (with no data inside)
df2 = df.reindex(i, axis='columns')
df2

In [None]:
# Remove added columns (with no data inside)
df2 .dropna(how='all', axis='columns')

# Code attic

In [None]:
# Transform a multi-categorical sightings set into an equivalent mono-categorical sightings set,
# that is one where no sighting has more than one category with a positive count (keeping the same total counts).
# Ex: A sightings set with 2 categorical count columns nMales and nFemales
#     * in the input set, you may have 1 sighting with nMales = 5 and nFemales = 2
#     * in the output set, this sighting has been separated into 2 distinct ones (all other properties left untouched) :
#       the 1st with nMales = 5 and nFemales = 0, the 2nd with nMales = 0 and nFemales = 2.
# Note: Sightings with no positive count at all produce no output sighting ;
#       and the output set gets a brand new 0-based integer index.

# A slower version of ads.separateMultiCategoricalCounts :
#  from 9.5s to 0.1s with countColumns = [nMalAd1,nAutAd10,nJuv10,nDetTot10,nMalAd5,nAutAd5,nJuv5,nDetTot5,nTotAd5,nTotAd10]
#  on the "ACDC 2019 Naturalist" multi-categorical data set (~4000 rows)
def separateMultiCategoricalCounts_slow_version(dfInSightings, countColumns):

    monoCatRows = []

    for _, sRow in dfInSightings.iterrows():

        # The categories with a positive count for this sighting.
        sPosCounts = sRow[countColumns]
        sPosCounts = sPosCounts[sPosCounts > 0]

        if len(sPosCounts) == 1:

            # [a little optimisation ?] Already a mono-categorical sighting : keep it as is.
            monoCatRows.append(sRow)

        else:

            # Multi-categorical sighting : 1 output sighting per positive category,
            # with all the counts reset to 0, except for the one of this category.
            for catCol in sPosCounts.index:

                sMonoCatRow = sRow.copy()
                sMonoCatRow[countColumns] = 0
                sMonoCatRow[catCol] = sRow[catCol]

                monoCatRows.append(sMonoCatRow)

    return pd.DataFrame(data=monoCatRows, index=np.arange(len(monoCatRows)))

In [None]:
len(dfObs)

In [None]:
%%time

dfObsMonoCat_slow = separateMultiCategoricalCounts_slow_version(dfObs, countCols)
len(dfObsMonoCat_slow), dfObsMonoCat_slow[countCols].sum()

In [None]:
# Transform a multi-individual mono-categorical sightings set into an equivalent mono-individual mono-categorical sightings set
# that is where no sightings has more that one individual per category (keeping the same total counts).
# Ex: A sightings set with 2 mono-categorical count columns nMales and nFemales
#     In input set, you may have 1 sightings with nMales = 3 and nFemales = 0 (but none with nMales and nFemales > 0)
#     In out set, no : this sightings have been separated in 3 distinct ones (all other properties left untouched) :
#                      all with nMales = 1 and nFemales = 0.

# A slower version or ads.individualiseMonoCategoricalCounts :
#  from 15.5s to 0.06s with countColumns = [nMalAd1,nAutAd10,nJuv10,nDetTot10,nMalAd5,nAutAd5,nJuv5,nDetTot5,nTotAd5,nTotAd10]
#  on the "ACDC 2019 Naturalist" mono-categorical data set (~20000 rows)
def individualiseMonoCategoricalCounts_slow(dfInSightings, countColumns):
    """Split multi-individual mono-categorical sightings into mono-individual ones (slow, row by row).

    Each input row must have exactly one strictly positive count among `countColumns`.
    A row whose positive count is 1 is kept as is; a row whose positive count is n > 1
    is replaced by copies of itself with that count set to 1, one copy per individual
    (all other columns left untouched).

    :param dfInSightings: input mono-categorical sightings table (pandas DataFrame)
    :param countColumns: list of the names of the categorical count columns
    :return: a new DataFrame of the resulting rows, indexed 0 .. N-1
    :raises AssertionError: if a row does not have exactly one strictly positive count
    """

    outSightings = list()

    for lbl, sInSight in dfInSightings.iterrows():

        # [a little check] Multi-categorical sightings not supported here.
        sCounts = sInSight[countColumns]
        sCounts = sCounts[sCounts > 0]
        # Build the message from a single tuple : str(a, b) would be read as str(object, encoding)
        # and raise a TypeError, hiding the actual assertion failure.
        assert len(sCounts) == 1, \
               'Error: Multi-categorical sightings not supported ' + str((lbl, sInSight.to_dict()))

        # Get the positive count column and its value
        posCol = sCounts.index[0]
        count = sCounts[posCol]

        # [a little optimisation ?] If this is a mono-individual sightings, simply append it as is.
        if count == 1:

            outSightings.append(sInSight)

            continue

        # If it is a multi-individual sightings, we need to split it down :
        # 1 output row per individual, each with a count of 1.
        while count > 0:

            sOutSight = sInSight.copy()
            sOutSight[posCol] = 1

            outSightings.append(sOutSight)

            count -= 1

    return pd.DataFrame(data=outSightings, index=np.arange(len(outSightings)))

In [None]:
%%time

# Run the slow reference implementation on the mono-categorical set computed above, then display
# the number of output rows and the per-column totals of the count columns
# (elapsed time is reported by the cell's %%time magic).
dfObsIndiv_slow = individualiseMonoCategoricalCounts_slow(dfObsMonoCat_slow, countCols)
len(dfObsIndiv_slow), dfObsIndiv_slow[countCols].sum()

In [None]:
# Transform a multi-individual multi-categorical sightings set into an equivalent mono-individual multi-categorical sightings set,
# that is, one where no sighting has more than one individual per category (keeping the same total counts).
# Ex: A sightings set with 2 categorical count columns nMales and nFemales
#     In the input set, you may have 1 sighting with nMales = 5 and nFemales = 2
#     In the output set, no : this sighting has been separated into 5 distinct ones (all other properties left untouched) :
#                      the first 2 with nMales = 1 and nFemales = 1, the last 3 with nMales = 1 and nFemales = 0.

# In the end, of no use : simply chain ads.separateMultiCategoricalCounts and ads.individualiseMonoCategoricalCounts instead.
# And by far much slower !
def individualiseMultiCategoricalCounts(dfInSightings, countColumns):
    """Split multi-individual multi-categorical sightings into mono-individual ones (row by row).

    A row whose maximum count over `countColumns` is exactly 1 is kept as is.
    Any other row is replaced by as many copies as its maximum count : in the k-th copy,
    each count column holds 1 if the original count was at least k, and 0 otherwise.
    A row whose maximum count is not strictly positive produces no output row.

    :param dfInSightings: input sightings table (pandas DataFrame)
    :param countColumns: list of the names of the categorical count columns
    :return: a new DataFrame of the resulting rows, indexed 0 .. N-1
    """

    def presenceFlag(n):
        return 1 if n > 0 else 0

    def oneLess(n):
        return n - 1 if n > 0 else 0

    indivSightings = []

    for _, sSighting in dfInSightings.iterrows():

        sRemaining = sSighting[countColumns]

        if sRemaining.max() == 1:
            # Already mono-individual : pass the row through untouched.
            indivSightings.append(sSighting)
        else:
            # Peel off 1 individual per still-populated category at each turn,
            # until every count is exhausted.
            sRemaining = sRemaining.copy()
            while sRemaining.max() > 0:
                sLayer = sSighting.copy()
                sLayer[countColumns] = sRemaining.apply(presenceFlag)
                indivSightings.append(sLayer)
                sRemaining = sRemaining.apply(oneLess)

    return pd.DataFrame(data=indivSightings, index=list(range(len(indivSightings))))

In [None]:
# Add "absence" sightings to field data collected on transects for a given sample
# Warning: A special version for an all-taxon data set
# * dfInSights : input data table
# * transectCol : the name of the transect id column
# * taxonSampleCol : the name of the taxon id column
# * otherSampleCols : the names of the other sample id columns (taxon id not included)
# * expectedTaxa : the expected taxon ids : absence sightings are there to make sure
#                  all of the taxa are found at least once in the output data table
def addAbsenceSightings(dfInSights, transectCol, taxonSampleCol, otherSampleCols, expectedTaxa):
    """Append "absence" sightings so that every expected taxon appears on every transect.

    For each distinct transect id found in `dfInSights`, one absence row is added per
    expected taxon that has no sighting on that transect. An absence row carries
    the transect id, the taxon id and the values of `otherSampleCols` taken from
    the first input row; every other column is set to None.

    Absence rows are appended after the input rows, transect by transect (in order of
    first appearance), and within a transect in the order of `expectedTaxa`
    (duplicates ignored), so that the output is reproducible from one run to another.

    :param dfInSights: input sightings table (pandas DataFrame), must not be empty
    :param transectCol: name of the transect id column
    :param taxonSampleCol: name of the taxon id column
    :param otherSampleCols: names of the other sample id columns (taxon id not included)
    :param expectedTaxa: the expected taxon ids
    :return: a new DataFrame (input rows, then absence rows), indexed 0 .. N-1
    :raises AssertionError: if `dfInSights` is empty
    """

    assert not dfInSights.empty, 'Error : Empty sightings data to complete !'

    # Use 1st sightings of the sample to build the absence sightings prototype
    # (all null columns except for the sample identification ones)
    dAbscSightTmpl = {col: (val if col in otherSampleCols else None)
                      for col, val in dfInSights.iloc[0].to_dict().items()}

    # Expected taxa, de-duplicated but kept in caller order : iterating a set here
    # would make the order of the added rows depend on string hashing.
    orderedTaxa = list(dict.fromkeys(expectedTaxa))

    # Generate the absence sightings : for each transect, 1 per lacking taxon.
    absenceRows = list()
    for transect in dfInSights[transectCol].unique():

        seenTaxa = set(dfInSights.loc[dfInSights[transectCol] == transect, taxonSampleCol].unique())

        absenceRows.extend({**dAbscSightTmpl, transectCol: transect, taxonSampleCol: taxon}
                           for taxon in orderedTaxa if taxon not in seenTaxa)

    # Nothing lacking : only renumber the rows (and avoid concatenating an empty frame).
    if not absenceRows:
        return dfInSights.reset_index(drop=True)

    # Concat input and absence sightings, with a fresh index (for unique labels).
    return pd.concat([dfInSights, pd.DataFrame(absenceRows)], ignore_index=True)



In [None]:
# Tests unitaires de addAbsenceSightings

In [None]:
# Define transect, taxon and sample columns (names used by the addAbsenceSightings tests below)
transectCol = 'Point'  # transect id column
taxonCol = 'Espece'  # taxon id column
sampleCols = ['Passage', 'Adulte', 'Duree']  # the other sample id columns (taxon id not included)

In [None]:
# The set of expected taxa ... of which we'll look for absence on every location :
# here, every taxon present at least once in dfObsIndiv.
expectedTaxa = list(dfObsIndiv[taxonCol].unique())

# Regression check on the number of distinct taxa
# (presumably specific to the data set loaded in dfObsIndiv -- to be updated if it changes).
assert len(expectedTaxa) == 58

# Display the taxa list and its size.
', '.join(expectedTaxa), len(expectedTaxa)

In [None]:
# Select 1 sample (a fixed one, arbitrarily chosen) through its Passage, Adulte and Duree ids
passage = 'a'
adulte = 'm'
duree = '10'
dfObsIndivSmpl = dfObsIndiv[(dfObsIndiv.Passage == passage) & (dfObsIndiv.Adulte == adulte) & (dfObsIndiv.Duree == duree)]

# Regression check on the selected sample : number of sightings and number of distinct transects.
assert len(dfObsIndivSmpl) == 322 and dfObsIndivSmpl[transectCol].nunique() == 21

In [None]:
%%time

# Add the absence sightings to the selected sample, and display the resulting number of rows.
# NOTE(review): expectedTaxa is passed before sampleCols here, whereas the notebook-local addAbsenceSightings
#               takes otherSampleCols before expectedTaxa -- confirm against the ads.addAbsenceSightings signature.
dfObsIndivAbscSmpl = ads.addAbsenceSightings(dfObsIndivSmpl, transectCol, taxonCol, expectedTaxa, sampleCols)
len(dfObsIndivAbscSmpl)

In [None]:
# Check for no change in the columns (same names, same order)
assert list(dfObsIndivAbscSmpl.columns) == list(dfObsIndivSmpl.columns)

# Check for the total number of rows after completion (input sightings + added absence ones)
assert len(dfObsIndivAbscSmpl) == 1333

# Check for the number of distinct transects (same as in the input sample)
# and of distinct taxa (58, the number of expected taxa checked above)
assert dfObsIndivAbscSmpl[transectCol].nunique() == 21 and dfObsIndivAbscSmpl[taxonCol].nunique() == 58

# Check for no change in sample identification : a single value in each column, the selected one
assert list(dfObsIndivAbscSmpl.Passage.unique()) == [passage]
assert list(dfObsIndivAbscSmpl.Adulte.unique()) == [adulte]
assert list(dfObsIndivAbscSmpl.Duree.unique()) == [duree]

In [None]:
# Display the first 20 rows of the input sample, sorted, for visual inspection.
dfObsIndivSmpl.sort_values(by=['Passage', 'Observateur', 'Point', 'Espece', 'distMem']).head(20)

In [None]:
# Display the first 30 rows of the completed sample (absence sightings included), sorted the same way.
dfObsIndivAbscSmpl.sort_values(by=['Passage', 'Observateur', 'Point', 'Espece', 'distMem']).head(30)

In [None]:
%%time

# Performance test : run ads.addAbsenceSightings on every sample of the grid below
# and print the size of each completed sample (total elapsed time is reported by the cell's %%time magic).
print('Passage  Adulte Duree NbDonnees')

# A '+' in a passage or adulte id denotes a combination of several values (ex: 'a+b' selects both 'a' and 'b').
for passage in ['a', 'b', 'a+b']: 
    
    for adulte in ['m', 'a', 'm+a']:
    
        for duree in ['5', '10']:
            
            # Select the sample : rows matching any of the listed passages and adult categories, and the duration.
            passages = passage.split('+')
            adultes = adulte.split('+')
            dfObsIndivSmpl = dfObsIndiv[dfObsIndiv.Passage.isin(passages) & dfObsIndiv.Adulte.isin(adultes) \
                                        & (dfObsIndiv.Duree == duree)]
            
            # Complete it with the absence sightings.
            # NOTE(review): same argument order as in the unit test cell above -- confirm against the ads signature.
            dfObsIndivAbscSmpl = ads.addAbsenceSightings(dfObsIndivSmpl, transectCol, taxonCol, expectedTaxa, sampleCols)
            
            print(passage, adulte, duree, ':', len(dfObsIndivAbscSmpl))

# Sandbox