In [None]:
import sys
import os
import importlib as implib
from packaging import version

import re

from collections import OrderedDict as odict

import numpy as np
import pandas as pd

from tqdm import tqdm

In [None]:
# Activate Warnings as Exception
#import warnings
#warnings.filterwarnings('error')

# Tests unitaires et d'intégration module *autods*.

## 0. Détection de Distance

In [None]:
import autods as ads

## 1. Classe DataSet

In [None]:
# Excel source
ds = ads.DataSet(source=os.path.join('AutoDS', 'refin', 'ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.xlsx'),
                 decimalFields=['EFFORT', 'DISTANCE', 'NOMBRE'])
ds.dfData.head()

In [None]:
# CSV source with ',' as decimal point
ds = ads.DataSet(decimalFields=['Point transect*Survey effort', 'Observation*Radial distance'],
                 source=os.path.join('AutoDS', 'refin', 'ACDC2019-Papyrus-TURMER-AB-5mn-1dec-dist.txt'))

# Once loaded, no non-null value of a declared decimal field may still be a string.
assert not any(isinstance(value, str) for col in ds.decimalFields for value in ds.dfData[col].dropna()), \
       'Error: Some strings found in declared decimal fields ... any decimal format issue ?'

ds.dfData.head()

In [None]:
# CSV source with '.' as decimal point
ds = ads.DataSet(decimalFields=['Point transect*Survey effort', 'Observation*Radial distance'],
                 source=os.path.join('AutoDS', 'refin', 'ACDC2019-Papyrus-ALAARV-AB-10mn-1dotdec-dist.txt'))

# Once loaded, no non-null value of a declared decimal field may still be a string.
assert not any(isinstance(value, str) for col in ds.decimalFields for value in ds.dfData[col].dropna()), \
       'Error: Some strings found in declared decimal fields ... any decimal format issue ?'

ds.dfData.head()

In [None]:
# DataFrame source: a small hand-made observation table, with 2 constant columns appended.
dfData = pd.DataFrame(data=[('2019-05-13', 3.5, 'TURMER', 23, 2,   83),
                            ('2019-05-15', np.nan, 'TURMER', 23, 2,   27.355),
                            ('2019-05-13', 0, 'ALAARV', 29, 2,   56.85),
                            ('2019-04-03', 1.325, 'PRUMOD', 53, 1.3,  7.2),
                            ('2019-06-01', 2, 'PHICOL', 12, 1,  np.nan),
                            ('2019-06-19', np.nan, 'PHICOL', 17, 0.5, np.nan),
                           ],
                      columns=['Date', 'TrucDec', 'Espece', 'Point', 'Effort', 'Distance']) \
           .assign(Region='ACDC', Surface='2400')
dfData

In [None]:
ds = ads.DataSet(source=dfData, decimalFields=['Effort', 'Distance', 'TrucDec'])
ds.dfData

## 2. Classes XXEngine

### a. Instanciation et chargement des spécifs sur les stats en sortie

In [None]:
# A work folder name containing a space is expected to be refused through an AssertionError.
try:
    eng = ads.MCDSEngine(workDir=os.path.join('AutoDS', 'test out'))
except AssertionError as exc:
    print('Good forbidden chars detection:', exc)
else:
    print('Error: Should have raised an AssertionError !')

In [None]:
eng = ads.MCDSEngine(workDir=os.path.join('AutoDS', 'mcds-out'))

In [None]:
_ = eng.setupRunFolder(runPrefix='uni') # Unit tests

### b. Génération fichier de données en entrée de MCDS

In [None]:
_ = implib.reload(ads)

In [None]:
dataFileName = eng.buildDataFile(dataSet=ds)

### c. Génération fichier de "commandes"

In [None]:
cmdFileName = eng.buildCmdFile(estimKeyFn='HNORMAL', estimAdjustFn='COSINE',
                               estimCriterion='AIC', cvInterval=95)

### d. Execution en mode "debug"

(génération des fichiers cmd et data, mais pas d'appel à l'exécutable)

In [None]:
runCode, runTime, runDir = eng.run(ds, realRun=False, runPrefix='int',
                                   estimKeyFn='UNIFORM', estimAdjustFn='POLY',
                                   estimCriterion='AIC', cvInterval=95)
assert runCode == 0, 'Should have NOT run (run code = 0)'
dict(runCode=runCode, runDir=runDir, runTime=runTime)

### e. Exécution réelle

In [None]:
runCode, runTime, runDir = eng.run(ds, realRun=True, runPrefix='int',
                                   estimKeyFn='UNIFORM', estimAdjustFn='POLY',
                                   estimCriterion='AIC', cvInterval=95)
assert runCode == 2, 'Should have run with warnings (run code = 2)'
dict(runCode=runCode, runDir=runDir, runTime=runTime)

### f. Génération fichier de données en entrée pour Distance

(mode 'point transect' uniquement pour le moment)

In [None]:
os.makedirs(os.path.join(eng.workDir, 'distance-in'), exist_ok=True)

In [None]:
distDataFileName = \
    eng.buildDistanceDataFile(ds, tgtFilePathName=os.path.join(eng.workDir, 'distance-in', 'import-data-noextra.txt'))

In [None]:
distDataFileName = \
    eng.buildDistanceDataFile(ds, tgtFilePathName=os.path.join(eng.workDir, 'distance-in', 'import-data-withextra.txt'),
                              withExtraFields=True)

### g. classe ResultsSet

In [None]:
# Custom (header) columns of the results table: (level 0, level 1) pairs, all with 'Value' as 3rd level.
miCustCols = pd.MultiIndex.from_tuples([(lvl0, lvl1, 'Value')
                                        for lvl0, lvl1 in [('id', 'index'),
                                                           ('sample', 'species'),
                                                           ('sample', 'periods'),
                                                           ('sample', 'duration'),
                                                           ('variant', 'precision')]])

# Translated names of these columns, one column of translations per language.
dfCustColTrans = pd.DataFrame(dict(en=['index', 'species', 'periods', 'duration', 'precision'],
                                   fr=['numéro', 'espèce', 'périodes', 'durée', 'précision']),
                              index=miCustCols)

rs = ads.ResultsSet(analysisClass=ads.MCDSAnalysis, dfCustomColTrans=dfCustColTrans, miCustomCols=miCustCols)

In [None]:
assert rs.dfData.empty

In [None]:
sCustom = pd.Series(index=miCustCols, data=list(range(len(miCustCols))))
miResCols = ads.MCDSAnalysis.MIRunColumns.append(ads.MCDSEngine.statModCols())
sResult = pd.Series(index=miResCols, data=list(range(len(miResCols))))
rs.append(sCustom, sResult)

In [None]:
dfRaw = rs.dfData
dfRaw

In [None]:
dfTrans = rs.dfTransData('fr')
dfTrans

In [None]:
assert len(dfRaw.columns) == len(dfTrans.columns)

# Tests de validation module autods

## 1. MCDSEngine : Génération de fichiers d'entrée pour Distance

* via un jeu de fichiers d'entrée bruts Excel, et leur export de référence, éprouvé dans Distance,
* et comparaison du produit de XXEngine.buildDistanceDataFile à cette référence.

In [None]:
# Test cases: same Excel input and decimal fields, one reference output file per 'extra fields' mode.
dfDistCases = pd.DataFrame([dict(inFileName='ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.xlsx',
                                 decimalFields=['EFFORT', 'DISTANCE', 'NOMBRE'],
                                 refOutFileName=refOutFileName, withExtraFields=withExtraFields)
                            for refOutFileName, withExtraFields
                            in [('ACDC2019-Papyrus-ALAARV-saisie-5-cols.txt', False),
                                ('ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.txt', True)]])
dfDistCases

In [None]:
eng = ads.MCDSEngine(workDir=os.path.join('AutoDS', 'mcds-out'))

In [None]:
# Build the Distance import file of each test case, and compare it to its reference file.
fails = 0
for ind, sCase in dfDistCases.iterrows():

    print('#', ind, ':', sCase.inFileName)

    # Data set from the raw input file.
    ds = ads.DataSet(decimalFields=sCase.decimalFields,
                     source=os.path.join('AutoDS', 'refin', sCase.inFileName))

    # Distance import data file (same file name as the reference, but in the engine work folder).
    ofn = eng.buildDistanceDataFile(dataSet=ds, withExtraFields=sCase.withExtraFields,
                                    tgtFilePathName=os.path.join(eng.workDir, 'distance-in', sCase.refOutFileName))

    # Full-text comparison to the reference.
    rfn = os.path.join('AutoDS', 'refout', sCase.refOutFileName)
    with open(ofn, 'r') as fOut, open(rfn, 'r') as fRef:
        if fOut.read() != fRef.read():
            print('Error: Generated file differs from reference', rfn)
            fails += 1
        else:
            print('Success : Conform to reference.')

    print()

print('Error: {} test case(s) failed.'.format(fails) if fails != 0 else 'All test cases succeeded !')

# 2. MCDSEngine : Exécution avec de vraies données

## A virer : tests MCDSAnalysis ci-dessous englobants 

In [None]:
ds = ads.DataSet(source=os.path.join('AutoDS', 'refin', 'ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.xlsx'),
                 decimalFields=['EFFORT', 'DISTANCE', 'NOMBRE'])

eng = ads.MCDSEngine(workDir=os.path.join('AutoDS', 'mcds-out'))

runCode, runTime, runDir = eng.run(ds, realRun=True, runPrefix='int',
                                   estimKeyFn='UNIFORM', estimAdjustFn='POLY',
                                   estimCriterion='AIC', cvInterval=95)
assert runCode == 2, 'Should have run with warnings (run code = 2)'
dict(runCode=runCode, runDir=runDir, runTime=runTime)

# 3. MCDSAnalysis : Analyse avec de vraies données

(et comparaison à des analyses faites à la main avec Distance 7.3)

In [None]:
_ = implib.reload(ads)

## a. Construction des cas tests

In [None]:
# Load refout results table
dfRefRes = pd.read_excel(os.path.join('AutoDS', 'refout', 'ACDC2019-Papyrus-ALAARV-TURMER-resultats-distance-73.xlsx'))

In [None]:
# Generate test cases definition code from refout results file (don't cheat : only input columns :-)
# Test cases are derived from the input columns only of the reference results table.
dfAnlysCases = dfRefRes[['Species', 'Sample', 'Precision', 'Duration', 'Name']].copy()

#dfAnlysCases['Status'] = \
#    dfAnlysCases.Status.apply(lambda s: 1 if s == 'OK' else 2 if s == 'Warnings' else 3)

# Key function, deduced from the start of the analysis name.
dfAnlysCases['KeyFn'] = \
    dfAnlysCases.Name.apply(lambda s: 'UNIFORM' if s.startswith('Unif') \
                                       else 'HNORMAL' if s.startswith('Half') else 'HAZARD')

# Adjustment series, deduced from the end of the analysis name.
dfAnlysCases['AdjSer'] = \
    dfAnlysCases.Name.apply(lambda s: 'COSINE' if s.endswith('Cos') \
                                       else 'POLY' if s.endswith('SimPoly') else 'HERMITE')

# Input data file name, deduced from species, sample, duration and precision.
# The duration test used to be written "'5' in sRow.Duration == '5 mn'" : Python chains this as
# "('5' in Duration) and (Duration == '5 mn')", which is exactly the plain equality used below.
dfAnlysCases['InFileName'] = \
    dfAnlysCases.apply(lambda sRow: 'ACDC2019-Papyrus-{}-{}-{}mn-{}dec-dist.txt' \
                                    .format(sRow.Species,
                                            'AB' if 'A+B' in sRow.Sample else 'A' if 'A' in sRow.Sample else 'B',
                                            5 if sRow.Duration == '5 mn' else 10,
                                            6 if sRow.Precision.startswith('6 déc') else 1),
                       axis='columns')

dfAnlysCases

## b. Exécution des analyses

In [None]:
decimalFields = ['Point transect*Survey effort', 'Observation*Radial distance']

In [None]:
# Analysis engine
mcds = ads.MCDSEngine(workDir=os.path.join('AutoDS', 'mcds-out'),
                      distanceUnit='Meter', areaUnit='Hectare',
                      surveyType='Point', distanceType='Radial')

In [None]:
# Frozen analysis parameters (a choice here)
KEstimCriterion = 'AIC'
KCVInterval = 95

In [None]:
# Run all analyses
# Results set: its custom (header) columns are the 5 first columns of the test cases table.
miCustCols = pd.MultiIndex.from_tuples([('sample', col, 'Value') for col in dfAnlysCases.columns[:5]])
dfCustColTrans = pd.DataFrame(dict(en=['Species', 'Sample', 'Precision', 'Duration', 'Model'],
                                   fr=['Espèce', 'Echantillon', 'Précision', 'Durée', 'Modèle']),
                              index=miCustCols)

results = ads.ResultsSet(analysisClass=ads.MCDSAnalysis, dfCustomColTrans=dfCustColTrans, miCustomCols=miCustCols)

# Run the analysis of each test case, in table order.
lastInFileName = ''
for ind, sCase in dfAnlysCases.iterrows():

    # Analysis name prefix = input file name without its constant head and tail.
    prefix = sCase.InFileName[len('ACDC2019-Papyrus-'):-len('-dist.txt')]
    print('#{:3d}'.format(ind+1), prefix, sCase.KeyFn, sCase.AdjSer, end='\n\n')

    # The data set is only reloaded when the input file differs from the previous case's one.
    if sCase.InFileName != lastInFileName:
        ds = ads.DataSet(os.path.join('AutoDS', 'refin', sCase.InFileName), decimalFields=decimalFields)
        lastInFileName = sCase.InFileName

    # Run the analysis.
    analysis = ads.MCDSAnalysis(engine=mcds, dataSet=ds, namePrefix=prefix,
                                estimKeyFn=sCase.KeyFn, estimAdjustFn=sCase.AdjSer,
                                cvInterval=KCVInterval, estimCriterion=KEstimCriterion)
    sResult = analysis.run()

    # Store the results, headed by the 5 first fields of the test case.
    sHead = pd.Series(list(sCase.iloc[:5]), index=miCustCols)
    results.append(sCustom=sHead, sResult=sResult)

In [None]:
dfActRes = results.dfData

actResFileName = os.path.join(mcds.workDir, 'ACDC2019-Papyrus-ALAARV-TURMER-auto-results.xlsx')
dfActRes.to_excel(actResFileName, index=True)

dfActRes.head()

In [None]:
dfActTrRes = results.dfTransData('fr')
dfActTrRes.head()

## c. Comparaison des résultats à la référence

(référence = analyses faites "à la main" avec distance)

In [None]:
# Columns to compare: column of the actual results table => column name on the reference side.
compCols = \
{
    ('sample', 'Species', 'Value'):   'Species',
    ('sample', 'Sample', 'Value'):    'Sample',
    ('sample', 'Precision', 'Value'): 'Precision',
    ('sample', 'Duration', 'Value'):  'Duration',
    # The custom columns of the results are built from the test cases columns, the 5th of which
    # is 'Name' ('Model' is only its translation) : a ('sample', 'Model', 'Value') key never matches.
    # NOTE(review): the reference table is read through its 'Name' column elsewhere in this notebook ;
    # confirm that it also has the 'Model' column targeted here.
    ('sample', 'Name', 'Value'):      'Model',
    
    ('run output', 'run status', 'Value') : 'Status',
    
    ('encounter rate', 'number of observations (n)', 'Value'): '# obs',
    ('detection probability', 'total number of parameters (m)', 'Value'): '# params',
    
    ('detection probability', 'AIC value', 'Value'): 'AIC',
    ('detection probability', 'chi-square test probability (distance set 3)', 'Value')         : 'GOF Chi-p',
    ('detection probability', 'Kolmogorov-Smirnov test probability', 'Value')                  : 'GOF K-S p',
    ('detection probability', 'Cramér-von Mises (uniform weighting) test probability', 'Value'): 'GOF CvM (unif) p',
    ('detection probability', 'Cramér-von Mises (cosine weighting) test probability', 'Value') : 'GOF CvM (cos) p',
    
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Value'): 'ESW/EDR',
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Lcl')  : 'ESW/EDR LCL',
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Ucl')  : 'ESW/EDR UCL',
    ('detection probability', 'effective strip width (ESW) or effective detection radius (EDR)', 'Cv')   : 'ESW/EDR CV',
    
    ('density/abundance', 'density of animals', 'Value'): 'D',
    ('density/abundance', 'density of animals', 'Lcl')  : 'D LCL',
    ('density/abundance', 'density of animals', 'Ucl')  : 'D UCL',
    ('density/abundance', 'density of animals', 'Cv')   : 'D CV',
    
    ('detection probability', 'probability of detection (Pw)', 'Value'): 'P',
    ('detection probability', 'probability of detection (Pw)', 'Lcl')  : 'P LCL',
    ('detection probability', 'probability of detection (Pw)', 'Ucl')  : 'P UCL',
    ('detection probability', 'probability of detection (Pw)', 'Cv')   : 'P CV',
    ('detection probability', 'probability of detection (Pw)', 'Df')   : 'P DF',
}
len(compCols)

In [None]:
# TODO: Actually compare dfActRes to dfRefRes, column by column as listed in compCols (not implemented yet).

In [None]:
dfActRes.reindex(columns=pd.MultiIndex.from_tuples(compCols.keys()))

In [None]:
#dfActRes.columns.to_list()

In [None]:
dfActRes.iloc[5:10].T.iloc[20:40]

In [None]:
dfRefRes.iloc[5:10].T

In [None]:
# NOTE(review): in the visible part of this notebook, `fails` is only updated by the Distance
# data file comparison loop (validation section 1) ; confirm that reusing it here is intended.
print('All test cases succeeded !' if fails == 0 else 'Error: {} test case(s) failed.'.format(fails))

In [None]:
# Load the data set to analyse.
# NOTE(review): the other cells visible in this notebook load
# 'ACDC2019-Papyrus-ALAARV-saisie-ttes-cols.xlsx' from the same folder ;
# confirm that the shorter file name used below actually exists.
ds = ads.DataSet(source=os.path.join('AutoDS', 'refin', 'ALAARV-saisie-ttes-cols.xlsx'),
decimalFields=['EFFORT', 'DISTANCE', 'NOMBRE'])

# Engine instance, working in the 'mcds-out' folder.
eng = ads.MCDSEngine(workDir=os.path.join('AutoDS', 'mcds-out'))

# One analysis : half-normal key function, cosine adjustment series, AIC criterion, 95 as cvInterval.
anlys = ads.MCDSAnalysis(engine=eng, dataSet=ds, namePrefix='mcds',
                         estimKeyFn='HNORMAL', estimAdjustFn='COSINE', estimCriterion='AIC', cvInterval=95)

# Run it ; the returned object is indexed below by (module, statistic, figure) tuples.
sRes = anlys.run()

# The run status is expected to be 2 (run with warnings, according to the assertion message).
assert sRes[('run output', 'run status', 'Value')] == 2, 'Should have run with warnings (run code = 2)'
sRes[('run output', 'run folder', 'Value')]

In [None]:
sRes[('run output',)]

## 2. Analyses massives ACDC Papier 2019

In [None]:
def extraireJeuDonnees(dfTout, espece, passages=('A', 'B'), duree='10mn'):
    """Extract the individualised observations of one species, for given passages and survey duration.

    Parameters:
    :param dfTout: table of all the observations ; the columns read here are
                   ESPECE, PASSAGE, PER5MN and PER10MN (any other column is kept as is)
    :param espece: the species to select (must be present in dfTout.ESPECE)
    :param passages: iterable of the passages to select, each among 'A' and 'B'
    :param duree: '5mn' to keep only the observations of the PER5MN period,
                  '10mn' to keep the observations of both PER5MN and PER10MN periods

    :return: a new table (dfTout is left untouched), restricted to the selected rows, where
             PER5MN and PER10MN are replaced by NOMBRE (always 1) and EFFORT (number of passages)

    :raise AssertionError: on invalid arguments, or if some selected row does not count for
                           exactly 1 individual
    """

    passages = list(passages)  # Accept any iterable (tuple, list, string of letters, ...)

    assert all(p in ['A', 'B'] for p in passages)
    assert duree in ['5mn', '10mn']
    assert espece in dfTout.ESPECE.unique()

    # Species and passages.
    dfJeu = dfTout[(dfTout.ESPECE == espece) & (dfTout.PASSAGE.isin(passages))].copy()

    # Duration.
    if duree == '10mn':
        # Missing periods are skipped by sum() ; a row with both periods missing sums to 0,
        # and is thus reported by the assertion below rather than silently dropped.
        dfJeu['NOMBRE'] = dfJeu[['PER5MN', 'PER10MN']].sum(axis='columns')
    else:
        dfJeu['NOMBRE'] = dfJeu['PER5MN']

    # Remove the rows with no individual in the selected period(s) ; this is done through
    # a boolean mask, which works whatever the index (even with duplicate labels).
    dfJeu = dfJeu[dfJeu.NOMBRE.notnull()]
    assert all(dfJeu.NOMBRE == 1)

    # Effort, and clean-up of the period columns.
    return dfJeu.assign(EFFORT=len(passages)).drop(columns=['PER5MN', 'PER10MN'])

In [None]:
def ajouterAbsences(dfJeu, effort, pointsPapier):
    """Add one 'absence' row for each surveyed point where nothing was observed.

    Parameters:
    :param dfJeu: non-empty table of observations ; ZONE, HA and ESPECE are read from its first row,
                  and POINT gives the points with at least one observation
    :param effort: value of the EFFORT column for the added absence rows
    :param pointsPapier: iterable of all the surveyed points

    :return: tuple (new table = observations + absence rows, sorted by POINT ; number of added rows) ;
             dfJeu itself is never modified

    :raise AssertionError: if dfJeu is empty
    """

    assert not dfJeu.empty, 'Erreur : Il n\'y aurait que des absences !'

    # Template of an absence row : no distance, no number, no date, ... only the constant fields.
    zone, surface, espece = dfJeu.iloc[0][['ZONE', 'HA', 'ESPECE']]
    dAbsence = { 'ZONE': zone, 'HA': surface, 'POINT': None, 'ESPECE': espece,
                 'DISTANCE': np.nan, 'EFFORT': effort, 'MALE': None,
                 'NOMBRE': np.nan, 'DATE': pd.NaT, 'OBSERVATEUR': None, 'PASSAGE': None }

    # Points with no observation (the observed ones are computed only once).
    pointsObserves = set(dfJeu.POINT.unique())
    pointsManquants = [p for p in pointsPapier if p not in pointsObserves]

    # All the absence rows are added at once : DataFrame.append no longer exists in pandas 2,
    # and growing a table row by row is quadratic anyway.
    if pointsManquants:
        dfAbsences = pd.DataFrame([dict(dAbsence, POINT=p) for p in pointsManquants])
        dfJeu = pd.concat([dfJeu, dfAbsences], ignore_index=True, sort=False)

    # Not in-place : the caller's table must not be reordered when there is no absence to add.
    return dfJeu.sort_values(by=['POINT']), len(pointsManquants)

In [None]:
# Paramètres généraux.
workDir = os.path.join('AutoDS', 'acdc-auto')
runEngine = True # Pas d'appel à l'exe si False, juste pour les fichiers d'entrée.

In [None]:
# Tous les points effectués (pour absences).
# Numbers of all the surveyed points (needed to add the absence rows).
pointsPapier = [int(numPoint) for numPoint in
                """23,39,40,41,42,55,56,57,58,59,60,72,73,74,75,76,88,89,90,91,
                     105,106,109,110,112,113,122,123,125,126,127,128,129,130,141,142,143,144,145,146,
                     147,148,157,158,159,160,161,162,163,164,165,166,174,175,176,177,178,179,180,181,
                     182,183,184,185,192,193,194,195,196,197,198,199,200,201,202,210,211,212,213,214,
                     215,216,218,219,228,229,232,233,245,246,247,250,262,263,265,266,280,281,282,283,
                     284,299,300,301""".split(',')]

# Raw data as entered by the observers, already individualised, males only.
ficDonnees = os.path.join('AutoDS', 'refin', 'ACDC2019-Papyrus-DonneesBrutesPourAutoDS.xlsx')

# Load the sheet, and normalise the names of the columns used later.
dfMales = pd.read_excel(ficDonnees, sheet_name='ResultIndivMales') \
            .rename(columns={ 'ha': 'HA', 'Distance en m': 'DISTANCE', 'Mâle\xa0?': 'MALE', 'Date': 'DATE',
                              'Période': 'PASSAGE', '0-5mn': 'PER5MN', '5-10 mn': 'PER10MN' })

# Every row must be flagged as a male.
assert (dfMales.MALE.str.lower() == 'oui').all()

# Species and passages to process, sorted by decreasing MALES value.
toDoCols = ['ESPECE', 'MALES', 'PERIODE']
dfToDo = pd.read_excel(ficDonnees, sheet_name='AFaire')
assert set(toDoCols).issubset(dfToDo.columns)
dfToDo = dfToDo.reindex(toDoCols, axis='columns').sort_values(by='MALES', ascending=False)

dfToDoAll = dfToDo # Keep a reference to the whole table

# Parameters of all the analyses to run for each data set.
dfParams = pd.read_excel(ficDonnees, sheet_name='ParamsAnalyses')
paramCols = ['KeyFn', 'AdjustFn', 'Criterion', 'CVInterval']

# Every expected column must be there (same check as for the species to process) : otherwise,
# the reindex below would silently create the missing ones, full of NaN.
assert all(col in dfParams.columns for col in paramCols), \
       'Missing column(s) in sheet ParamsAnalyses (expected: {})'.format(', '.join(paramCols))
dfParams = dfParams.reindex(paramCols, axis='columns')

In [None]:
print('Mâles au total       :', len(dfMales))
print('Espèces au total     :', len(dfMales.ESPECE.unique()))
print('Espèces à traiter    :', len(dfToDo))
print('Mâles à traiter      :', len(dfMales[dfMales.ESPECE.isin(dfToDo.ESPECE)]))
print('Variantes d\'analyses :', len(dfParams))

In [None]:
# Réduire éventuellement la masse de travail ...
dfToDo = dfToDo[:2] # Réduction
#dfToDo = dfToDoAll # Restauration du tout

In [None]:
pd.DataFrame([ ('Mâles au total', len(dfMales)),
               ('Espèces au total', len(dfMales.ESPECE.unique())),
               ('Espèces à traiter', len(dfToDo)),
               ('Mâles à traiter', len(dfMales[dfMales.ESPECE.isin(dfToDo.ESPECE)])),
               ('Variantes d\'analyses', len(dfParams)) ], columns=['', 'Nombre']).set_index('')

In [None]:
_ = implib.reload(ads)

In [None]:
# Le moteur
# The analysis engine.
mcds = ads.MCDSEngine(workDir=workDir,
                      distanceUnit='Meter', areaUnit='Hectare',
                      surveyType='Point', distanceType='Radial')

# The analysis results set, with its custom (header) columns and their translations.
# NOTE(review): ResultsSet is called here with the same keyword arguments as in the other cells
# of this notebook (miCustomCols, dfCustomColTrans) ; confirm against the current autods API.
miCustColumns = pd.MultiIndex.from_tuples([('id', 'index', 'Value'),
                                           ('sample', 'species', 'Value'),
                                           ('sample', 'periods', 'Value'),
                                           ('sample', 'duration', 'Value'),
                                           ('variant', 'precision', 'Value')])
dfCustColumnsTrans = \
    pd.DataFrame(index=miCustColumns,
                 data=dict(en=['index', 'species', 'periods', 'duration', 'precision'],
                           fr=['numéro', 'espèce', 'périodes', 'durée', 'précision']))
resultats = ads.ResultsSet(analysisClass=ads.MCDSAnalysis,
                           miCustomCols=miCustColumns, dfCustomColTrans=dfCustColumnsTrans)

# For each species to process (dfToDo is used as is : any reduction of the work load
# has to be done on dfToDo itself, not here).
for indToDo, sToDo in dfToDo.iterrows():

    espece, nbIndivs, passage = sToDo
    passages = list(passage) # One letter per passage

    # For the 2 survey durations (on each point)
    for duree in ['5mn', '10mn']:

        # Data selection
        dfJeu = extraireJeuDonnees(dfMales, espece, passages, duree)
        nMales = len(dfJeu)

        # Addition of the absence rows
        dfJeu, nAbsences = ajouterAbsences(dfJeu, effort=len(passages), pointsPapier=pointsPapier)

        # For each numerical precision of the distance (decreasing : the rounding below is cumulative)
        for precDist in [None, 1]:
            
            print(espece, passage, duree, ':', nMales, 'mâles,', nAbsences, 'absences')

            # Rounding to the precision.
            if precDist is not None:
                dfJeu['DISTANCE'] = dfJeu.DISTANCE.apply(round, ndigits=precDist)
            precision = ('tt' if precDist is None else str(precDist)) + 'dec'
            
            # Here is the data set
            jeu = ads.DataSet(source=dfJeu, decimalFields=['EFFORT', 'DISTANCE', 'NOMBRE'])
            
            # For each set of analysis parameters (own loop variable : does not shadow the species one)
            for indParams, sParams in dfParams.iterrows():

                prfxAnalyse = '{}-{}-{}-{}-{}'.format(espece, duree, passage, precision, indParams)
                analyse = ads.MCDSAnalysis(engine=mcds, dataSet=jeu, namePrefix=prfxAnalyse,
                                           estimKeyFn=sParams['KeyFn'], estimAdjustFn=sParams['AdjustFn'],
                                           estimCriterion=sParams['Criterion'], cvInterval=sParams['CVInterval'])

                sEntete = pd.Series(data=[indParams, espece, passage, duree, precision], index=miCustColumns)
                
                sResultat = analyse.run(realRun=runEngine)
                
                resultats.append(sCustom=sEntete, sResult=sResultat)
                
            print()

# Save the results
ficRes = os.path.join(workDir, 'ACDC2019-Papyrus-ResultatsAutoAnalyses.xlsx')
print('Sauvegarde résultats dans', ficRes)
resultats.dfData.to_excel(ficRes, index=True)

In [None]:
sResultat

In [None]:
resultats.dfData.head()

# Generate stats columns translation file

(from documentation stats & modules specs)

In [None]:
tgtTransFileName = os.path.join('AutoDS', 'mcds-stat-mod-trans.txt')

In [None]:
class Translator(object):
    
    """Callable translating strings through per-language dictionaries, with fallback to english,
    then to the string itself."""
    
    def __init__(self, dTrans, lang='en'):
        """Constructor
        
        Parameters:
        :param dTrans: dict of translation dicts, one per language code ('en' is mandatory)
        :param lang: code of the target language (see setLang)
        
        :raise AssertionError: if dTrans has no 'en' key, or if lang is not supported
        """
        assert 'en' in dTrans, 'At least "en" translation must be defined'
        self.dTrans = dTrans
        self.setLang(lang)
        
    def setLang(self, lang):
        """Change the target language
        
        Parameters:
        :param lang: 'en' or 'fr', whatever the case
        
        :raise AssertionError: if lang is not supported ; the current language is then left unchanged
        """
        newLang = lang.lower()
        # Check before storing, for the translator to remain usable after a refused language.
        assert newLang in ['en', 'fr'], 'No support for "{}" language'.format(lang)
        self.lang = newLang
        
    def __call__(self, s):
        """Translate s to the target language
        
        Fallbacks : the english dict if there is none for the target language, then the english
        translation if s is not in the selected dict, and finally s itself.
        """
        return self.dTrans.get(self.lang, self.dTrans['en']).get(s, self.dTrans['en'].get(s, s))

In [None]:
DFigureTrans = \
    dict(en=dict(Value='', Cv='ConfInd', Lcl='Min', Ucl='Max', Df='DoF'),
         fr=dict(Value='', Cv='IndConf', Lcl='Min', Ucl='Max', Df='DegLib'))

figtr = Translator(DFigureTrans, lang='en')

In [None]:
DStatisticTrans = \
    dict(en={ 'number of observations (n)': 'NObs',
              'number of samples (k)': 'NSamp',
              'effort (L or K or T)': 'Effort',
              'encounter rate (n/L or n/K or n/T)': 'EncRate',
              'left truncation distance': 'LeftTruncDist',
              'right truncation distance (w)': 'RightTruncDist',
              'total number of parameters (m)': 'TotNumPars',
              'AIC value': 'AIC',
              'chi-square test probability (distance set 1)': 'Chi2 P 1',
              'chi-square test probability (distance set 2)': 'Chi2 P 2',
              'chi-square test probability (distance set 3)': 'Chi2 P 3',
              'f(0) or h(0)': 'f/h(0)',
              'probability of detection (Pw)': 'PDetec',
              'effective strip width (ESW) or effective detection radius (EDR)': 'EDR/ESW',
              'AICc': 'AICc',
              'BIC': 'BIC',
              'Log likelihood': 'LogLhood',
              'Kolmogorov-Smirnov test probability': 'KS P',
              'Cramér-von Mises (uniform weighting) test probability': 'CvM Uw P',
              'Cramér-von Mises (cosine weighting) test probability': 'CvM Cw P',
              'key function type': 'KeyFn',
              'adjustment series type': 'AdjSer',
              'number of key function parameters (NKP)': 'NumKFnPars',
              'number of adjustment term parameters (NAP)': 'NumASerPars',
              'number of covariate parameters (NCP)': 'NumCovars',
              'estimated value of A(1) adjustment term parameter': 'EstA(1)',
              'estimated value of A(2) adjustment term parameter': 'EstA(2)',
              'estimated value of A(3) adjustment term parameter': 'EstA(3)',
              'estimated value of A(4) adjustment term parameter': 'EstA(4)',
              'estimated value of A(5) adjustment term parameter': 'EstA(5)',
              'estimated value of A(6) adjustment term parameter': 'EstA(6)',
              'estimated value of A(7) adjustment term parameter': 'EstA(7)',
              'estimated value of A(8) adjustment term parameter': 'EstA(8)',
              'estimated value of A(9) adjustment term parameter': 'EstA(9)',
              'estimated value of A(10) adjustment term parameter': 'EstA(10)',
              'average cluster size': 'AvgClustSz',
              'size-bias regression correlation (r)': 'SzBias RegCorr',
              'p-value for correlation significance (r-p)': 'CorSignPVal',
              'estimate of expected cluster size corrected for size bias': 'EstExpFixedCluSz',
              'density of clusters (or animal density if non-clustered)': 'DensClu',
              'density of animals': 'Density',
              'number of animals, if survey area is specified': 'Number',
              'bootstrap density of clusters': 'BootsDensClu',
              'bootstrap density of animals': 'BootDensity',
              'bootstrap number of animals': 'BootNumber' },
         fr={ 'number of samples (k)': 'NEchant',
              'encounter rate (n/L or n/K or n/T)': 'TxContact',
              'left truncation distance': 'DistTroncGche',
              'right truncation distance (w)': 'DistTroncDte',
              'total number of parameters (m)': 'NbTotPars',
              'Log likelihood': 'LogProba',
              'key function type': 'FnClé',
              'adjustment series type': 'SérAjust',
              'number of key function parameters (NKP)': 'NbParsFnClé',
              'number of adjustment term parameters (NAP)': 'NbParsSérAjust',
              'number of covariate parameters (NCP)': 'NbCovars',
              'average cluster size': 'TailMoyClust',
              'size-bias regression correlation (r)': 'CorrReg BiaisTail',
              'p-value for correlation significance (r-p)': 'PVal SignifCorr',
              'estimate of expected cluster size corrected for size bias': 'TailCorrCluAttEst',
              'density of animals': 'Densité',
              'number of animals, if survey area is specified': 'Nombre',
              'bootstrap density of clusters': 'BootsDensClu',
              'bootstrap density of animals': 'DensitéBoot',
              'bootstrap number of animals': 'NombreBoot' })

statr = Translator(DStatisticTrans, lang='en')

In [None]:
# One row per (module, statistic, figure) column of the stats table.
dfStatModTrans = ads.MCDSEngine.MIStatModColumns.to_frame() \
                    .reset_index(drop=True) \
                    .rename(columns={ 0: 'Module', 1: 'Statistic', 2: 'Figure' })

# One translation column per language : translated figure, a space, translated statistic, stripped.
for lang in ['en', 'fr']:
    statr.setLang(lang)
    figtr.setLang(lang)
    dfStatModTrans[lang] = \
        dfStatModTrans.apply(lambda sRow: '{} {}'.format(figtr(sRow.Figure), statr(sRow.Statistic)).strip(),
                             axis='columns')

In [None]:
dfStatModTrans

In [None]:
dfStatModTrans.to_csv(tgtTransFileName, sep='\t', index=False)
tgtTransFileName

In [None]:
# Translation of the run columns ; they are read from the MCDSAnalysis class itself, rather than
# from whatever `analysis` instance a previously run cell might have left in the kernel.
pd.DataFrame(index=ads.MCDSAnalysis.MIRunColumns,
             data=dict(en=['ModKeyFn', 'ModAdjSer', 'ModChcCrit', 'ConfInter', 'RunCode', 'RunFolder'],
                       fr=['FnCléMod', 'SérAjustMod', 'CritChxMod', 'InterConf', 'CodeExec', 'DossierExec']))


In [None]:
dfStatModTransExt = pd.read_csv(tgtTransFileName, sep='\t')
dfStatModTransExt

In [None]:
# Reload the translation file, then show it indexed by (module, statistic, figure).
# (these 2 statements were fused on a single line, which is a syntax error)
dfStatModTransExt = pd.read_csv(tgtTransFileName, sep='\t')
dfStatModTransExt.set_index(['Module', 'Statistic', 'Figure'])

In [None]:
# Translate the result columns to the target language ; columns without translation are kept as is.
lang = 'fr'
dTrans = dfStatModTransExt.set_index(['Module', 'Statistic', 'Figure'])[lang].to_dict()
resultats.dfData.columns = list(map(lambda col: dTrans.get(col, col), resultats.dfData.columns))
resultats.dfData

In [None]:
x = resultats.dfData.columns


In [None]:
x.to_list()

In [None]:
dfStatModTransExt.set_index(['Module', 'Statistic', 'Figure'])[lang].to_dict()

# Test case class

(actually of no use: pd.DataFrame already does the job!)

In [None]:
# Super-class for test cases
class TestCase(object):
    """Super-class for test cases : a bag of attributes, whose names get frozen,
    for each class, by the first instance created."""
    def __init__(self, **attrs):
        cls = self.__class__
        names = set(attrs.keys())
        if hasattr(cls, 'AttributeNames'):
            # Names already frozen : the new instance must have exactly the same ones.
            assert names == self.AttributeNames, \
                   'Some attribute name not in frozen set {{{}}}'.format(','.join(self.AttributeNames))
        else:
            # First instance : freeze the names.
            cls.AttributeNames = names
        for name in attrs:
            setattr(self, name, attrs[name])
    def __repr__(self):
        fields = ','.join('{}:{}'.format(name, value) for name, value in self.__dict__.items())
        return '{}({})'.format(self.__class__.__name__, fields)

In [None]:
# Test this super-class.
class TCTest(TestCase):
    """Concrete class only used to exercise TestCase."""

tstTestCases = list()
tstTestCases.append(TCTest(x=1, y='a')) # Define attributes
tstTestCases.append(TCTest(x=2, y='b')) # Check attributes
try:
    tstTestCases.append(TCTest(x=2, z=None)) # Refuse new attributes
except AssertionError as exc:
    print('Good refuse of new attributes:', exc)
else:
    # Raised outside of the try body : inside, it was caught by the except clause above,
    # and reported as a success, so this check could never fail.
    raise AssertionError('Error: New attributes should be refused')

[str(tc) for tc in tstTestCases]

# Mise au point décodage sorties de MCDS : fichier de stats

TODO: Add French translations of the variable / parameter names and descriptions.

## 1. Nom et description des colonnes du tableau de stats

In [None]:
# Specs file for the columns of the stats table (path relative to the current working directory).
fileName = 'mcds-stat-row-specs.txt'

# Left open on purpose here : the file is read in the next cell.
fStatRowSpecs = open(fileName, mode='r', encoding='utf8')

In [None]:
# Read all the lines but the comment ones ('#' as first char), then release the file handle
# opened in the previous cell (it was never closed ; it is fully consumed here anyway).
with fStatRowSpecs:
    statRowSpecLines = [line.rstrip('\n') for line in fStatRowSpecs.readlines() if not line.startswith('#')]

# Specs come by groups of 3 lines : name, description, and a 3rd line that is ignored.
# NOTE(review): with this len-2 bound, a last (name, description) pair followed by no 3rd line
# is silently ignored -- check against the actual file layout.
statRowSpecs =  [(statRowSpecLines[i].strip(), statRowSpecLines[i+1].strip()) \
                 for i in range(0, len(statRowSpecLines)-2, 3)]
dfStatRowSpecs = pd.DataFrame(columns=['Name', 'Description'], data=statRowSpecs).set_index('Name')

dfStatRowSpecs

In [None]:
# Names of the columns of the stats table (used below as column names when reading the stats file).
dfStatRowSpecs.index

## 2. Numéro et description des modules et statistiques associées

(colonnes Module et Statistic du tableau)

In [None]:
# Specs file for the modules and their statistics (path relative to the current working directory).
fileName = 'mcds-stat-mod-specs.txt'

# Left open on purpose here : the file is read in the next cell.
fStatModSpecs = open(fileName, mode='r', encoding='utf8')

In [None]:
# Number of rows generated for the '101 ...' statistic (1 per adjustment parameter A(1), A(2), ...).
nMaxAdjParams = 10

# Read all the lines but the comment ones ('#' as first char), then release the file handle
# opened in the previous cell (it was never closed ; it is fully consumed here anyway).
with fStatModSpecs:
    statModSpecLines = [line.rstrip('\n') for line in fStatModSpecs.readlines() if not line.startswith('#')]

# '<number> – <description>' lines (en-dash separator), for module headers as well as statistics.
reModSpecNumName = re.compile('(.+) – (.+)')
# Trailing run of digits, spaces and commas = the note numbers ending a statistic description
# (raw string : '\d' in a normal string literal is an invalid escape sequence).
reTrailingNotes = re.compile(r'[\d ,]*\Z')

statModSpecs = list()
moModule = None
for line in statModSpecLines:
    if not line:
        continue
    if moModule is None:
        # No current module : this line should be a module header (if not, try the next line).
        moModule = reModSpecNumName.match(line.strip())
        continue
    if line == ' ':
        # A line made of 1 single space ends the current module.
        moModule = None
        continue
    moStatistic = reModSpecNumName.match(line.strip())
    if moStatistic is None:
        # Fail with an explicit message rather than an obscure AttributeError on None below.
        raise ValueError('Unexpected statistic spec line {!r} (module {})'.format(line, moModule.group(1)))
    modNum, modDesc, statNum, statDescNotes = \
        moModule.group(1), moModule.group(2), moStatistic.group(1), moStatistic.group(2)
    # Split the description from its trailing note numbers ; always succeeds (possibly empty parts),
    # whereas the former char by char backward scan left statDesc and statNotes to their values
    # for the previous line (or undefined) when the whole text was made of digits, spaces and commas.
    notesStart = reTrailingNotes.search(statDescNotes).start()
    statDesc = statDescNotes[:notesStart]
    statNotes = statDescNotes[notesStart:].replace(' ', '')
    modNum = int(modNum)
    if statNum.startswith('101 '):
        for num in range(nMaxAdjParams): # Assume no more than that ... a bit hacky !
            statModSpecs.append((modNum, modDesc, 101+num, # Make statDesc unique for later indexing
                                 statDesc.replace('each', 'A({})'.format(num+1)), statNotes))
    else:
        statNum = int(statNum)
        if modNum == 2 and statNum == 3: # Actually, there are 0 or 3 of these ...
            for num in range(3):
                statModSpecs.append((modNum, modDesc, num+201,
                                     # Change statNum & Make statDesc unique for later indexing
                                     statDesc+' (distance set {})'.format(num+1), statNotes))
        else:
            statModSpecs.append((modNum, modDesc, statNum, statDesc, statNotes))
dfStatModSpecs = pd.DataFrame(columns=['modNum', 'modDesc', 'statNum', 'statDesc', 'statNotes'],
                              data=statModSpecs).set_index(['modNum', 'statNum'])

dfStatModSpecs

In [None]:
# Modules : list of the distinct module descriptions found in the specs.
dfStatModSpecs.modDesc.unique()

## 3. Notes sur les statistiques des modules

(informations supplémentaires indiquant comment utiliser, ou non, les 5 dernières colonnes Value, Cv, Lcl, Ucl, Df)

In [None]:
# Notes file about the module statistics (path relative to the current working directory).
fileName = 'mcds-stat-mod-notes.txt'

# Left open on purpose here : the file is read in the next cell.
fStatModNotes = open(fileName, mode='r', encoding='utf8')

In [None]:
# Read all the lines but the comment ones ('#' as first char), then release the file handle
# opened in the previous cell (it was never closed ; it is fully consumed here anyway).
with fStatModNotes:
    statModNoteLines = [line.rstrip('\n') for line in fStatModNotes.readlines() if not line.startswith('#')]

# 1 note per non-empty line : the note number in the 2 first chars, then the text of the note.
statModNotes =  [(int(line[:2]), line[2:].strip()) for line in statModNoteLines if line]

dfStatModNotes = pd.DataFrame(data=statModNotes, columns=['Note', 'Text']).set_index('Note')

dfStatModNotes

## 4. Lecture du tableau

In [None]:
# Shortcut to the object created in an earlier cell (presumably the MCDS engine, after a run).
eng = mcds

In [None]:
# Path of the stats file that is read in the next cell.
eng.statsFileName

In [None]:
# Load the stats table : fields separated by runs of spaces (regexp => python parsing engine),
# column names taken from the row specs (names= given, so the 1st line is read as data).
dfStatRows = pd.read_csv(eng.statsFileName, sep=' +', engine='python', names=dfStatRowSpecs.index)
dfStatRows

## 5. Décodage du tableau

Attention: On suppose 1 seule strate '0' (Stratum), 1 seul échantillon '0' (Sample) et 1 seul estimateur '1' (Estimator).

### a. Suppression des colonnes Stratum, Sample et Estimator

(puisqu'on se limite ici aux cas où il n'y a qu'1 de chaque)

In [None]:
# Drop the 3 columns assumed constant here (1 stratum, 1 sample, 1 estimator).
# Warning: in place => running this cell twice fails (columns already dropped).
dfStatRows.drop(columns=['Stratum', 'Sample', 'Estimator'], inplace=True)
dfStatRows

### b. Nettoyage des données sans objets

(selon les notes descriptives des statistiques)

In [None]:
# Stack the "figures" (Value, Cv, Lcl, Ucl, Df) for each statistic / module :
# 1 row per (id = original row label, Module, Statistic, Figure), with the figure in the 'Value' column.
dfStats = dfStatRows.set_index(['Module', 'Statistic'], append=True).stack() \
                    .reset_index().rename(columns={'level_0': 'id', 'level_3': 'Figure', 0: 'Value'})
dfStats.head(10)

In [None]:
# 4. Fix multiple Module=2 & Statistic=3 rows (before joining with self.DfStatModSpecs)
# Renumber them 201, 202, ... : the new number is incremented on each 'Value' figure row,
# so that the following figure rows (up to the next 'Value' one) get the same number.
newStatNum = 200
for lbl, sRow in dfStats[(dfStats.Module == 2) & (dfStats.Statistic == 3)].iterrows():
    if dfStats.loc[lbl, 'Figure'] == 'Value':
        newStatNum += 1
    dfStats.loc[lbl, 'Statistic'] = newStatNum
# Check the result on the rows of module 2.
dfStats[(dfStats.Module == 2)]

In [None]:
# Add the description / naming columns of the modules and statistics
# (rows with no matching (Module, Statistic) in the specs get NaN in these columns).
dfStats = dfStats.join(dfStatModSpecs, on=['Module', 'Statistic'])
dfStats.tail(10)

In [None]:
#dfStats[(dfStats.Module == 2) & (dfStats.Statistic > 200)]

In [None]:
# Check that the "not applicable" figures really are so (all equal to 0.0 ?)
# Warning: There must be a bug in MCDS with Module 2 / Statistic 10x : some Cv are not null ...
# Statistics whose notes contain no '1' : only their 'Value' figure is kept
# (NOTE(review): '1' is searched as a sub-string, so it also matches in '10', '11', ... -- confirm this is intended).
sKeepOnlyValueFig = ~dfStats.statNotes.str.contains('1')
sFigs2Drop = (dfStats.Figure != 'Value') & sKeepOnlyValueFig
# 'not' rather than '~' on the scalar result of any() : on a plain python bool, ~ gives -1 or -2,
# both truthy, which would make this assertion impossible to fail.
assert not dfStats[sFigs2Drop & ((dfStats.Module != 2) | (dfStats.Statistic < 100))].Value.any(), \
       'Attention: Des chiffres supposés "sans objet" on des valeurs non nulles !'

In [None]:
# 2nd check, a visual one : the "not applicable" figures with a non null value, biggest first.
# Parentheses are needed : & binds tighter than !=, so 'a & b != 0' actually is '(a & b) != 0'.
dfStats[sFigs2Drop & (dfStats.Value != 0)].sort_values(by='Value', ascending=False)

In [None]:
# Remove the "not applicable" rows / figures.
# Warning: in place, and sFigs2Drop was computed on the table before removal (do not run twice).
dfStats.drop(dfStats[sFigs2Drop].index, inplace=True)
dfStats

In [None]:
# Quick look at the first remaining rows.
dfStats.head()

In [None]:
# Only keep the descriptions, the figure name and its value ...
dfStats = dfStats.reindex(columns=['modDesc', 'statDesc', 'Figure', 'Value'])
# ... and index the values by (module description, statistic description, figure).
dfStats.set_index(['modDesc', 'statDesc', 'Figure'], inplace=True)
dfStats

In [None]:
# Same values as 1 series : the only column ('Value'), indexed by (modDesc, statDesc, Figure).
dfStats.T.iloc[0]

# Bac à sable

In [None]:
# Target fields whose matched source columns are also reported as decimal ones.
DecimalFields = ['SMP_EFFORT', 'DISTANCE']

# Target field => regexps (searched case-insensitive) identifying the source column to map to it.
# The order matters : target fields are matched in this order.
ImportFieldAliasREs = odict()
ImportFieldAliasREs['STR_LABEL'] = ['region', 'zone', 'strate', 'stratum']
ImportFieldAliasREs['STR_AREA'] = ['surface', 'area', 'ha', 'km2']
ImportFieldAliasREs['SMP_LABEL'] = ['point', 'lieu', 'location']
ImportFieldAliasREs['SMP_EFFORT'] = ['effort', 'passages', 'surveys', 'samplings']
ImportFieldAliasREs['DISTANCE'] = ['distance']

def matchDataFields(srcFields):

    """Map each target field of ImportFieldAliasREs to the 1st source field matching 1 of its aliases.

    Source fields are tried in the given order and, for each one, aliases in their declared order ;
    the whole search is traced on the standard output.

    Parameters:
        srcFields: list of source column names

    Returns: 3 lists :
        * the matched source fields, in the order of the target fields,
        * among them, those matched for a target field listed in DecimalFields,
        * the source fields that were not matched (in source order).

    Raises:
        Exception: when no source field matches some target field.
    """

    print('Matching required data columns:', end=' ')

    # Try and match required data columns.
    matFields, matDecFields = [], []
    for tgtField, aliasPats in ImportFieldAliasREs.items():
        print(tgtField, end='=')
        for candField in srcFields:
            print(candField, end=':')
            isMatch = False
            for aliasPat in aliasPats:
                print(aliasPat, end=';')
                isMatch = re.search(aliasPat, candField, flags=re.IGNORECASE) is not None
                if isMatch:
                    break
            if not isMatch:
                continue
            print(candField, end=', ')
            matFields.append(candField)
            if tgtField in DecimalFields:
                matDecFields.append(candField)
            break
        else:
            # Source fields exhausted (or none given) without any match.
            raise Exception('Error: Failed to find a match for expected {} in dataset columns {}' \
                            .format(tgtField, srcFields))

    # Extra fields.
    extFields = [candField for candField in srcFields if candField not in matFields]

    print('... success.')

    return matFields, matDecFields, extFields

In [None]:
# Try it on a sample list of column names : 1 matching column for each target field, no extra one.
matchDataFields(['Region*Label', 'Region*Area', 'Point transect*Label',
       'Point transect*Survey effort', 'Observation*Radial distance'])

In [None]:
# Check that a case-insensitive search finds the alias anywhere in the column name.
mo = re.search('area', 'Region*Area', flags=re.IGNORECASE)
mo

In [None]:
def safeFloat2Str(val, prec=None, decPt='.'):
    """Convert a number to a string, null values (as seen by pd.isnull) giving an empty string.

    Parameters:
        val: the value to convert
        prec: number of digits after the decimal point (fixed-point format) ;
              None => default str() conversion
        decPt: string to use as the decimal point in place of '.'

    Returns: the resulting string
    """
    if pd.isnull(val):
        return ''
    if prec is None:
        text = str(val)
    else:
        text = '{:.{prec}f}'.format(val, prec=prec)
    return text if decPt == '.' else text.replace('.', decPt)

In [None]:
# No precision given : default str() conversion.
safeFloat2Str(12.53, prec=None, decPt='.')

In [None]:
# Less decimals than in the value.
safeFloat2Str(12.53, prec=1, decPt='.')

In [None]:
# More decimals than in the value.
safeFloat2Str(12.53, prec=4, decPt='.')

In [None]:
# Comma as the decimal point.
safeFloat2Str(12.53, prec=None, decPt=',')

In [None]:
# Reload the autods module without restarting the kernel
# (the returned module is assigned to _, so that the cell displays nothing).
_ = implib.reload(ads)

In [None]:
# Fill in the ads.MCDSEngine.CmdTxt template with sample values, and show the resulting text.
cmdTxt = ads.MCDSEngine.CmdTxt.format(output='output.txt', log='log.txt',
                            stats='stats.txt', bootstrap='boot.txt',
                            survType='Point', distType='Radial',
                            distUnit='m', areaUnit='ha',
                            dataFields=', '.join(['a', 'b', 'c']), dataFileName='data.txt',
                            estKeyFn='HNORMAL', estAdjustFn='COSINE',
                            estCriterion='AIC', cvInterv=95)
cmdTxt

In [None]:
# The raw template, for comparison.
ads.MCDSEngine.CmdTxt

## Appending series to series ... index order

In [None]:
# Sample series with a 2-level index, deliberately not sorted.
s = pd.Series(index=pd.MultiIndex.from_tuples([('B', 'b'), ('B', 'a'), ('A', 'c')]), data=[1, 2, 3], name=0)
s

In [None]:
# Append a 2nd series with other labels, to see the resulting index order.
# pd.concat rather than Series.append (removed in pandas 2.0) : same result on all versions.
pd.concat([s, pd.Series(index=[('A', 'b'), ('A', 'a'), ('B', 'c')], data=[1, 2, 3], name=0)])

## Appending series to DataFrame ... columns order

### a. Append

In [None]:
# Start from an empty table.
df = pd.DataFrame()

In [None]:
# 1st series, appended as a list of 1 series.
# NOTE(review): DataFrame.append was removed in pandas 2.0 ; the cells of this "Append" section
# need an older pandas (kept as is, as they are meant to compare append to concat) -- confirm target version.
s = pd.Series(index=pd.MultiIndex.from_tuples([('B', 'b'), ('B', 'a'), ('A', 'c')]), data=[1, 2, 3], name=0)
#df = df.append(s, ignore_index=False) # => df.columns not a MultiIndex !
df = df.append([s], ignore_index=False)
df

In [None]:
# 2nd series : same labels as the 1st one, but in another order.
s = pd.Series(index=[('A', 'c'), ('B', 'b'), ('B', 'a')], data=[4, 5, 6], name=1)  # Same columns : append does not re-sort
#s = pd.Series(index=[('A', 'a'), ('A', 'b'), ('B', 'c')], data=[4, 5, 6], name=1)  # New column : append re-sorts
df = df.append([s], ignore_index=True)
df

In [None]:
# Unnamed series with labels not seen before, appended directly (not as a list).
s = pd.Series(index=[('A', 'a'), ('B', 'c')], data=[7, 8])
df = df.append(s, ignore_index=True)
df

In [None]:
# Empty series, appended as a list of 1 series.
s = pd.Series(index=[], data=[])
df = df.append([s], ignore_index=True)
df

In [None]:
# Series with 1 single label, not seen before.
s = pd.Series(index=[('C', 'd')], data=[9])
df = df.append([s], ignore_index=True)
df

In [None]:
# Series whose only label is a 1-item tuple, appended directly (not as a list).
s = pd.Series(index=[('d',)], data=[10])
df = df.append(s, ignore_index=True)
df

In [None]:
# Final table after all the appends.
df

### b. Concat

In [None]:
# Start again from an empty table.
df = pd.DataFrame()

In [None]:
# 1st series, concatenated along the columns axis.
s = pd.Series(index=pd.MultiIndex.from_tuples([('B', 'b'), ('B', 'a'), ('A', 'c')]), data=[1, 2, 3], name=0)
df = pd.concat([df, s], axis='columns')
df

In [None]:
# 2nd series, with the same labels as the 1st one.
s = pd.Series(index=[('B', 'b'), ('B', 'a'), ('A', 'c')], data=[4, 5, 6], name=1) # Same columns : concat does not re-sort
#s = pd.Series(index=[('A', 'a'), ('A', 'b'), ('B', 'c')], data=[4, 5, 6], name=1) # New column : concat re-sorts
df = pd.concat([df, s], axis='columns')
df

### c. Restore desired columns

* desired order,
* desired list of columns : new ones, and / or ignored ones.

In [None]:
# Table to restore the columns of.
df

In [None]:
# Desired columns, in the desired order.
# Add new A/b, D/a and remove B/c and C/d
i = pd.MultiIndex.from_tuples([('A', 'c'), ('A', 'b'), ('A', 'a'), ('B', 'b'), ('B', 'a'), ('D', 'a')])
i

In [None]:
# Select and order the columns as specified by i.
# Keep added columns (with no data inside)
df2 = df.reindex(i, axis='columns')
df2

In [None]:
# Get rid of the columns holding no data at all (the ones just added by the reindex).
df2.dropna(axis='columns', how='all')