<!-- Auto table of contents -->
<h1 class='tocIgnore'>Development archives (low level code)</h1>

**pyaudisam**: Automation of Distance Sampling analyses with [Distance software](http://distancesampling.org/)

Copyright (C) 2021 Jean-Philippe Meuret

This program is free software: you can redistribute it and/or modify it under the terms
of the GNU General Public License as published by the Free Software Foundation,
either version 3 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program.
If not, see https://www.gnu.org/licenses/.

<div style="overflow-y: auto">
  <h2 class='tocIgnore'>Table of contents</h2>
  <div id="toc"></div>
</div>

In [None]:
%%javascript
$.getScript('ipython_notebook_toc.js')

In [None]:
%matplotlib inline

In [None]:
import sys
import os
import pathlib as pl

import re

import concurrent.futures as cofu

import math
import numpy as np
import pandas as pd

from tqdm import tqdm

from IPython.display import HTML

In [None]:
# Activate Warnings as Exception
#import warnings
#warnings.filterwarnings('error')

In [None]:
sys.path.insert(0, '..')

In [None]:
import pyaudisam as ads

ads.runtime

In [None]:
# Create temporary directory if not yet done.
tmpDir = pl.Path('tmp')
tmpDir.mkdir(exist_ok=True)

In [None]:
# Logging configuration.
ads.log.configure(handlers=[sys.stdout, tmpDir / 'devarc1.log'], verbose=True, reset=True)

ads.logger('matplotlib', level=ads.WARNING, reset=True)

ads.logger('ads', level=ads.INFO, reset=True)
ads.logger('ads.dat', level=ads.INFO, reset=True)
ads.logger('ads.eng', level=ads.INFO2, reset=True)
ads.logger('ads.opn', level=ads.INFO1, reset=True)
ads.logger('ads.opr', level=ads.INFO1, reset=True)
ads.logger('ads.anr', level=ads.INFO1, reset=True)

logger = ads.logger('unintst', level=ads.DEBUG, reset=True)

# I. Development of pyaudisam

## Distance detection

In [None]:
# Distance software detection parameters.
DistanceSuppVers = [7, 6]  # Supported major versions, latest first.
DistancePossInstPaths = [pl.Path().resolve(), pl.Path('C:\\Program files (x86)'),
                         pl.Path('C:\\Program files'), pl.Path('C:\\PortableApps')]

def findExecutable(exeFileName):
    """Locate an executable of the Distance software.

    Every 'Distance <version>' sub-folder of each folder listed in DistancePossInstPaths
    is probed (folders first, then versions, in list order), the progress being printed.

    :param exeFileName: file name of the executable to look for (e.g. 'MCDS.exe')
    :return: path (pathlib.Path) of the first matching executable file
    :raise Exception: if the executable was found in none of the probed folders
    """
    print(f'Looking for {exeFileName} ...')

    # Flat, lazily evaluated list of the folders to probe, in priority order.
    candidateDirs = (instRoot / f'Distance {version}'
                     for instRoot in DistancePossInstPaths for version in DistanceSuppVers)

    for candidateDir in candidateDirs:
        print(f' - checking {candidateDir} : ', end='')
        candidate = candidateDir / exeFileName
        if candidate.exists():
            print('yes !')
            print(f'{exeFileName} found in {candidateDir}')
            return candidate
        print('no.')

    raise Exception(f'Could not find {exeFileName} ; please install Distance software (V6 or later)')

In [None]:
findExecutable('MCDS.exe')

## Results reports styling

(to stress interesting and/or important things)

In [None]:
dfTrSynRes = results.dfTransData('fr', columns=synthCols)
dfTrSynRes

In [None]:
# Color palette: text gray, row / cell backgrounds, and a 3-step green / orange / red scale.
cChrGray = '#869074'
cBckGreen, cBckGray = '#e0ef8c', '#dae3cb'
cSclGreen, cSclOrange, cSclRed = '#cbef8c', '#f9da56', '#fe835a' #'#f25e2d'
scaledColors = [cSclGreen, cSclOrange, cSclRed]
scaledColorsRvd = scaledColors[::-1]

# Background color for each known execution code.
dExCodeColors = {1: cSclGreen, 2: cSclOrange, 3: cSclRed}

def colorExecCodes(sCodes):
    """Return one 'background-color' CSS declaration per code in sCodes.

    Codes missing from dExCodeColors get the color of code 3.
    """
    fallbackColor = dExCodeColors[3]
    return [f'background-color: {dExCodeColors.get(code, fallbackColor)}' for code in sCodes]

def scaledColorV(v, thresholds, colors): # len(thresholds) == len(colors) - 1
    """Return the color matching value v on a thresholded scale.

    :param v: value to classify; a null value gives cBckGray
    :param thresholds: thresholds, scanned in list order
    :param colors: colors; colors[i] is returned for the first thresholds[i] that v
                   strictly exceeds, and the last color when v exceeds none
    """
    if pd.isnull(v):
        return cBckGray
    # -1 when no threshold is exceeded, which conveniently selects the last color.
    firstExceeded = next((rank for rank, limit in enumerate(thresholds) if v > limit), -1)
    return colors[firstExceeded]

def scaledColorS(sValues, thresholds, colors):
    """Return one 'background-color' CSS declaration per value in sValues (see scaledColorV)."""
    return [f'background-color: {scaledColorV(value, thresholds, colors)}' for value in sValues]

densCVThresholds = [0.4, 0.1]

# Build the styled report table from the translated synthesis results. In order:
# - sort the rows by species, sample, precision, duration and Delta AIC columns,
# - display numbers with a precision of 3,
# - give a cBckGreen background to the rows where 'Delta AIC' is 0,
# - color the 'CodEx' cells through colorExecCodes,
# - color the 'CoefVar Densité' cells through scaledColorS (densCVThresholds / scaledColors),
# - set the text color to cChrGray in the rows where CodEx is neither 1 nor 2,
# - give a transparent text color to the null cells.
# Order matters: when 2 steps set the same CSS property on the same cell, the later one is expected to win.
dfs = dfTrSynRes \
        .sort_values(by=['Espèce', 'Echantillon', 'Précision', 'Durée', 'Delta AIC']) \
        .style \
        .set_precision(3) \
        .set_properties(subset=pd.IndexSlice[dfTrSynRes[dfTrSynRes['Delta AIC'] == 0].index, :],
                        **{'background-color': cBckGreen}) \
        .apply(colorExecCodes, subset=['CodEx'], axis='columns') \
        .apply(scaledColorS, subset=['CoefVar Densité'], axis='columns',
               thresholds=densCVThresholds, colors=scaledColors) \
        .set_properties(subset=pd.IndexSlice[dfTrSynRes[~dfTrSynRes.CodEx.isin([1, 2])].index, :],
                         **{'color': cChrGray}) \
        .where(pd.isnull, 'color: transparent')

    # Alternatives tried for the chain above (kept for the record):

    #.format(lambda v: v if not pd.isnull(v) else '') # Destroys part of the rounding, increases the precision ???

    #.set_precision(3) # Not really usable, as only for the whole frame

    #.apply(lambda s: ['color: grey']*len(s), subset=pd.IndexSlice[dfTrSynRes[~dfTrSynRes.CodEx.isin([1, 2])].index, :],
    #       axis='index') # OK
    
    #.apply(lambda s: ['color: grey']*len(s), subset=dfTrSynRes[~dfTrSynRes.CodEx.isin([1, 2])].index,
    #       axis='index') # KO
    
# Export the styled table to a workbook, then display it.
dfs.to_excel('tmp/styled-results.xlsx')

dfs

## Decode MCDS plots file

In [None]:
import matplotlib.pyplot as plt
import plotly.graph_objs as plygo

In [None]:
realRunWorkDir = pl.Path('../donnees/acdc/210118-1904/SylvAtri-b-5mn-m-haz-cos-xk2syfzw')
[fpn.name for fpn in realRunWorkDir.iterdir()]

In [None]:
srcFileName = pl.Path(realRunWorkDir, 'plots.txt')

In [None]:
# Load the plots file as a list of lines, stripped of leading / trailing white space.
# The 'with' block closes the file handle as soon as reading is done
# (a bare open(...).readlines() leaves it open until garbage collection).
with open(srcFileName, 'r') as srcFile:
    lines = [line.strip() for line in srcFile]
len(lines)

In [None]:
lines[:10]

In [None]:
# Split the lines into "chapters" (one per plot). Each chapter is made of 6 header lines
# (title, sub-title, X label, Y label, the 4 X/Y bounds, number of data rows),
# followed by that number of data rows, each a white-space separated list of numbers.
chapters = []
itLines = iter(lines)
for title in itLines:
    # The 'for' consumed the title line; pull the other header lines from the same iterator.
    subTitle = next(itLines)
    xLabel = next(itLines)
    yLabel = next(itLines)
    xMin, xMax, yMin, yMax = [float(field) for field in next(itLines).split()]
    nDataRows = int(next(itLines))
    dataRows = [[float(field) for field in next(itLines).split()] for _ in range(nDataRows)]
    chapters.append({'title': title, 'subTitle': subTitle, 'dataRows': dataRows, #'nDataRows': nDataRows,
                     'xLabel': xLabel, 'yLabel': yLabel,
                     'xMin': xMin, 'xMax': xMax, 'yMin': yMin, 'yMax': yMax})
len(chapters), chapters[0]

In [None]:
## QQ-plot
chapter = chapters[0]
chapter

In [None]:
n = len(chapter['dataRows'])
dfQqData = pd.DataFrame(data=chapter['dataRows'], columns=['If the fit was perfect ...', 'Real observations'],
                        index=np.linspace(0.5/n, 1.0-0.5/n, n))
dfQqData

In [None]:
# Option 1 : OK
#fig = plt.figure(figsize=(16, 6))
#axes = fig.subplots()
#_ = dfQqData.plot(ax=axes, color=['blue', 'red'], grid=True,
#                  xlim=(chapter['xMin'], chapter['xMax']), ylim=(chapter['yMin'], chapter['yMax']))

# Option 2 : OK
axes = dfQqData.plot(figsize=(16, 6), color=['blue', 'red'], grid=True,
                     
                     xlim=(chapter['xMin'], chapter['xMax']), ylim=(chapter['yMin'], chapter['yMax']))
fig = axes.figure

axes.legend(['If the fit was perfect ...', 'Real observations'], fontsize=12)
axes.set_facecolor('#f9fbf3')
axes.figure.patch.set_facecolor('#f9fbf3')
axes.set_title(label=chapter['title'] + ' : ' + chapter['subTitle'], fontdict=dict(fontsize=16), pad=20)
axes.set_xlabel(chapter['xLabel'], fontsize=12)
_ = axes.set_ylabel(chapter['yLabel'], fontsize=12)

In [None]:
# Save the QQ-plot figure as JPEG and as transparent PNG.
# The matplotlib option for cropping the saved image to the drawn content is 'bbox_inches'
# ('box_inches' is not a savefig option).
axes.figure.savefig('tmp/mlb-qqplot.jpg', bbox_inches='tight')
axes.figure.savefig('tmp/mlb-qqplot.png', bbox_inches='tight', transparent=True)

In [None]:
plt.close(fig)

In [None]:
# Detection probability 1
chapter = chapters[1]
chapter

In [None]:
dfDetProbData = pd.DataFrame(data=chapter['dataRows'], 
                             columns=[chapter['xLabel'], chapter['yLabel'] + ' (sampled)', chapter['yLabel'] + ' (fitted)'])
dfDetProbData.set_index(chapter['xLabel'], inplace=True)
dfDetProbData

In [None]:
axes = dfDetProbData.plot(figsize=(16, 6), color=['blue', 'red'], grid=True,
                          xlim=(chapter['xMin'], chapter['xMax']), ylim=(chapter['yMin'], chapter['yMax']))

axes.set_title(label=chapter['title'] + ' : ' + chapter['subTitle'], fontdict=dict(fontsize=16), pad=20)
axes.legend(dfDetProbData.columns, fontsize=12)
axes.set_xlabel(chapter['xLabel'], fontsize=12)
_ = axes.set_ylabel(chapter['yLabel'], fontsize=12)

In [None]:
# Plotly 4
fig = plygo.Figure()

fig.add_trace(plygo.Scatter(x=dfQqData.index, y=dfQqData['If the fit was perfect ...'],
                            name='If the fit was perfect ...', line=dict(color='blue', width=2), opacity=0.7))
fig.add_trace(plygo.Scatter(x=dfQqData.index, y=dfQqData['Real observations'],
                            name='Real observations', line=dict(color='red', width=2)))

fig.update_layout(title=chapter['title'] + ' : ' + chapter['subTitle'],
                  xaxis=dict(title=chapter['xLabel'], range=(chapter['xMin'], chapter['xMax']),
                             zeroline=True, linewidth=1, linecolor='black'),
                  yaxis=dict(title=chapter['yLabel'], range=(chapter['yMin'], chapter['yMax']),
                             zeroline=True, linewidth=1, linecolor='black'),
                  legend=plygo.layout.Legend(x=0.09, y=0.90, bordercolor='black', borderwidth=1),
                  shapes=[plygo.layout.Shape(type='line', x0=chapter['xMax'], y0=chapter['yMin'],
                                                          x1=chapter['xMax'], y1=chapter['yMax']),
                          plygo.layout.Shape(type='line', x0=chapter['xMin'], y0=chapter['yMax'],
                                                          x1=chapter['xMax'], y1=chapter['yMax'])],
                  template='none')

fig

In [None]:
# Wow ... VERY slooooooooow !
fig.write_image("tmp/ply-qqplot.svg")
fig.write_image("tmp/ply-qqplot.png")

In [None]:
# Plotly 4
fig = plygo.Figure()

fig.add_trace(plygo.Scatter(x=dfDetProbData.index, y=dfDetProbData[chapter['yLabel'] + ' (sampled)'],
                            name=chapter['yLabel'] + ' (sampled)', line=dict(color='blue', width=2), opacity=0.7))
fig.add_trace(plygo.Scatter(x=dfDetProbData.index, y=dfDetProbData[chapter['yLabel'] + ' (fitted)'],
                            name=chapter['yLabel'] + ' (fitted)', line=dict(color='red', width=2)))

fig.update_layout(title=chapter['title'] + ' : ' + chapter['subTitle'],
                  xaxis=dict(title=chapter['xLabel'], range=(chapter['xMin'], chapter['xMax']),
                             zeroline=True, linewidth=1, linecolor='black'),
                  yaxis=dict(title=chapter['yLabel'], range=(chapter['yMin'], chapter['yMax']),
                             zeroline=True, linewidth=1, linecolor='black'),
                  legend=plygo.layout.Legend(x=0.65, y=0.85*chapter['yMax'], bordercolor='black', borderwidth=1),
                  shapes=[plygo.layout.Shape(type='line', x0=chapter['xMax'], y0=chapter['yMin'],
                                                          x1=chapter['xMax'], y1=chapter['yMax']),
                          plygo.layout.Shape(type='line', x0=chapter['xMin'], y0=chapter['yMax'],
                                                          x1=chapter['xMax'], y1=chapter['yMax'])],
                  template='none')

fig

In [None]:
# PDF 1
chapter = chapters[2]
chapter

In [None]:
dfProdDensData = pd.DataFrame(data=chapter['dataRows'], 
                              columns=[chapter['xLabel'], chapter['yLabel'] + ' (sampled)', chapter['yLabel'] + ' (fitted)'])
dfProdDensData.set_index(chapter['xLabel'], inplace=True)
dfProdDensData

In [None]:
axes = dfProdDensData.plot(figsize=(16, 6), color=['blue', 'red'],
                           xlim=(chapter['xMin'], chapter['xMax']), ylim=(chapter['yMin'], chapter['yMax']))
axes.set_title(label=chapter['title'] + ' : ' + chapter['subTitle'], fontdict=dict(fontsize=16), pad=20)
axes.legend(dfProdDensData.columns, fontsize=12)
axes.set_xlabel(chapter['xLabel'], fontsize=12)
_ = axes.set_ylabel(chapter['yLabel'], fontsize=12)

In [None]:
# Plotly 4
fig = plygo.Figure()

fig.add_trace(plygo.Scatter(x=dfProdDensData.index, y=dfProdDensData[chapter['yLabel'] + ' (sampled)'],
                            name=chapter['yLabel'] + ' (sampled)', line=dict(color='blue', width=2), opacity=0.7))
fig.add_trace(plygo.Scatter(x=dfProdDensData.index, y=dfProdDensData[chapter['yLabel'] + ' (fitted)'],
                            name=chapter['yLabel'] + ' (fitted)', line=dict(color='red', width=2)))

fig.update_layout(title=chapter['title'] + ' : ' + chapter['subTitle'],
                  xaxis=dict(title=chapter['xLabel'], range=(chapter['xMin'], chapter['xMax']),
                             zeroline=True, linewidth=1, linecolor='black'),
                  yaxis=dict(title=chapter['yLabel'], range=(chapter['yMin'], chapter['yMax']),
                             zeroline=True, linewidth=1, linecolor='black'),
                  legend=plygo.layout.Legend(xanchor='right', yanchor='top', bordercolor='black', borderwidth=1),
                  #margin=plygo.layout.Margin(l=40, r=40, b=40, t=40, pad=0),
                  shapes=[plygo.layout.Shape(type='line', x0=chapter['xMax'], y0=chapter['yMin'],
                                                          x1=chapter['xMax'], y1=chapter['yMax']),
                          plygo.layout.Shape(type='line', x0=chapter['xMin'], y0=chapter['yMax'],
                                                          x1=chapter['xMax'], y1=chapter['yMax'])],
                  template='none')

fig

In [None]:
# PDF 3, with stripplot
chapter = chapters[6]
chapter

In [None]:
dfProdDensData = pd.DataFrame(data=chapter['dataRows'], 
                              columns=[chapter['xLabel'], chapter['yLabel'] + ' (sampled)', chapter['yLabel'] + ' (fitted)'])
dfProdDensData.set_index(chapter['xLabel'], inplace=True)
dfProdDensData

In [None]:
with open(pl.Path(realRunWorkDir, 'cmd.txt'), 'r') as cmdFile:
    fieldsLine = next(line for line in cmdFile.readlines() if line.startswith('Fields='))
dataCols = fieldsLine.strip('\n;')[len('Fields='):].split(',')
print(dataCols)

dfData = pd.read_csv(pl.Path(realRunWorkDir, 'data.txt'), sep='\t', names=dataCols)
sDists = dfData.DISTANCE.dropna()
sDists

In [None]:
dfData

In [None]:
import seaborn as sb
import matplotlib.ticker as pltt

In [None]:
%%time

axes = dfProdDensData.plot(figsize=(16, 6), color=['blue', 'red'], grid=True, linewidth=1,
                    xlim=(chapter['xMin'], chapter['xMax']), ylim=(chapter['yMin'], chapter['yMax']))
axes.set_xlabel(chapter['xLabel'], fontsize=12)
axes.set_ylabel(chapter['yLabel'], fontsize=12)
axes.set_title(label=chapter['title'] + ' : ' + chapter['subTitle'], fontdict=dict(fontsize=16), pad=20)

axes2 = axes.twinx()
sb.stripplot(ax=axes2, x=sDists, color='green', size=8, alpha=0.4, jitter=0.3)

aMTicks = axes.get_xticks()
axes.xaxis.set_minor_locator(pltt.MultipleLocator((aMTicks[1]-aMTicks[0])/5))
axes.tick_params(which='minor', grid_linestyle='-.', grid_alpha=0.6)
axes.grid(True, which='minor')

axes.legend().remove()
_ = axes.figure.legend(labels=list(dfProdDensData.columns) + ['Individual observations'], fontsize=12,
                       bbox_to_anchor=(1, 1), bbox_transform=axes.transAxes)

In [None]:
# NOTE(review): only 'lines' is assigned in the visible part of this notebook ;
# 'labels', 'lines2' and 'labels2' are presumably left-overs of a removed legend-merging cell
# => this cell likely raises NameError on a fresh run -- TODO confirm and remove if so.
lines, labels, lines2, labels2

In [None]:
%%time

axes = dfProdDensData.plot(figsize=(16, 6), color=['blue', 'red'], grid=True,
                           xlim=(chapter['xMin'], chapter['xMax']), ylim=(chapter['yMin'], chapter['yMax']))
axes.set_xlabel(chapter['xLabel'], fontsize=12)
axes.set_ylabel(chapter['yLabel'], fontsize=12)
axes.set_title(label=chapter['title'] + ' : ' + chapter['subTitle'], fontdict=dict(fontsize=16), pad=20)
axes.legend(dfProdDensData.columns, fontsize=12)

axes2 = axes.twinx()
_ = sb.swarmplot(ax=axes2, x=sDists, color='green', size=8, alpha=0.5)

In [None]:
# Rebuild distance histogram from plot data (intervals) and distance data
sh = dfProdDensData['Probability Density (sampled)']
bins = [0] + sh.loc[((sh.shift(-1) != sh) | (sh.shift(1) != sh)) & (sh == 0)].index.tolist()
bins[-1] += 0.001
bins

In [None]:
np.histogram(sDists, bins=bins)

In [None]:
%%time

# Dropping useless points actually doesn't save overall "plotting" time !!!
_ = sh.loc[(sh.shift(-1) != sh) | (sh.shift(1) != sh)] \
      .plot(figsize=(16, 6), color='blue', grid=True,
            xlim=(chapter['xMin'], chapter['xMax']), ylim=(chapter['yMin'], chapter['yMax']))

In [None]:
# Fixed-width bins histogram (OK)
distBinWidth = 10
distBins = np.linspace(start=0, stop=distBinWidth * int(chapter['xMax'] / distBinWidth),
                       num=1 + int(chapter['xMax'] / distBinWidth)).tolist()
if distBins[-1] < chapter['xMax']:
    distBins.append(chapter['xMax'])

axes = dfData.DISTANCE.plot.hist(bins=distBins, #xmin=chapter['xMin'], xmax=chapter['xMax'],
                                 figsize=(12, 6), fill=None, edgecolor='blue', rwidth=1.0, linewidth=1, zorder=10)
axes.set_xlim((0, dfData.DISTANCE.max()))
axes.set_xlabel('Distance')
axes.grid(True, which='major', zorder=0)

axes.grid(True, which='minor', zorder=0)
aMTicks = axes.get_xticks()
axes.tick_params(which='minor', grid_linestyle='-.', grid_alpha=0.6)
axes.xaxis.set_minor_locator(pltt.MultipleLocator((aMTicks[1]-aMTicks[0])/5))
axes.legend()

In [None]:
# Fixed-width bins histograms => several bin widths overlaid on the same axes (???)
def fixedWidthBins(xMax, binWidth):
    """Return the edges of fixed-width bins covering [0, xMax].

    :param xMax: upper bound of the range to cover
    :param binWidth: width of the bins
    :return: list of edges 0, binWidth, 2*binWidth, ... ; when xMax is not reached by the last
             multiple of binWidth, xMax is appended as the end of a final, narrower bin
    """
    nFullBins = int(xMax / binWidth)
    edges = np.linspace(start=0, stop=binWidth * nFullBins, num=1 + nFullBins).tolist()
    if edges[-1] < xMax:
        edges.append(xMax)
    return edges

# One histogram per bin width : (bin width, edge color, relative bar width, z-order) ;
# the narrower the bins, the thinner the bars and the higher they are drawn in the stack.
histSpecs = [(40, 'red', 1.00, 10), (20, 'green', 0.85, 20), (10, 'blue', 0.70, 30)]

axes = None  # The 1st histogram creates the axes, the next ones are drawn onto them.
for distBinWidth, edgeColor, relBarWidth, zOrder in histSpecs:
    distBins = fixedWidthBins(chapter['xMax'], distBinWidth)
    axes = dfData.DISTANCE.plot.hist(ax=axes, bins=distBins, #xmin=chapter['xMin'], xmax=chapter['xMax'],
                                     figsize=(12, 6), fill=None, edgecolor=edgeColor,
                                     rwidth=relBarWidth, linewidth=1, zorder=zOrder)

axes.set_xlim((0, dfData.DISTANCE.max()))
axes.set_xlabel('Distance')
axes.grid(True, which='major', zorder=0)

# Minor grid : 5 minor intervals per major tick interval.
axes.grid(True, which='minor', zorder=0)
aMTicks = axes.get_xticks()
axes.tick_params(which='minor', grid_linestyle='-.', grid_alpha=0.6)
axes.xaxis.set_minor_locator(pltt.MultipleLocator((aMTicks[1]-aMTicks[0])/5))
axes.legend([f'{binWidth} m' for binWidth, *_ in histSpecs])

In [None]:
# Fixed-width bins histogram (KO: found no way to force the ticks)
distBinWidth = 10
distBins = np.linspace(start=0, stop=distBinWidth * int(chapter['xMax'] / distBinWidth),
                       num=1 + int(chapter['xMax'] / distBinWidth)).tolist()
if distBins[-1] < chapter['xMax']:
    distBins.append(chapter['xMax'])
distHist, distBins = np.histogram(dfData.DISTANCE, bins=distBins)
sDistHist = pd.Series(data=distHist, index=distBins[:-1]+distBinWidth/2)

axes = sDistHist.plot.bar(figsize=(16, 4), fill=None, edgecolor='blue', width=0.9, zorder=10)

#majTicksDelta = 100
#aMajTicks = np.linspace(start=0, stop=majTicksDelta * int(chapter['xMax'] / majTicksDelta),
#                      num=1 + int(chapter['xMax'] / majTicksDelta)).tolist()
#axes.set_xticks(minor=False, ticks=aMajTicks)
axes.grid(True, which='major', zorder=0)

#axes.xaxis.set_minor_locator(pltt.MultipleLocator(majTicksDelta/5))
#axes.tick_params(which='minor', grid_linestyle='-.', grid_alpha=0.6)
#axes.grid(True, which='minor', zorder=0)

In [None]:
# NOTE(review): as far as visible here, 'aMajTicks' is only assigned in the commented-out code
# of the cell above => this cell presumably raises NameError unless that code is re-enabled -- TODO confirm.
distHist, distBins, aMajTicks

In [None]:
# Histogram with "optimised" bins (max(sturges, fd))
distHist, distBins = np.histogram(dfData.DISTANCE, bins='auto', range=(0, 500))
sDistHist = pd.Series(data=distHist, index=distBins[:-1])

axes = sDistHist.plot.bar(figsize=(16, 4), fill=None, edgecolor='blue', width=0.9, zorder=10)


## Decode stats (actual results) from MCDS work folders

In [None]:
#_ = implib.reload(ads)

In [None]:
# Results set to store results into.
miCustCols = pd.MultiIndex.from_tuples([('id', 'ExecCase', 'Value')])
dfCustColTrans = \
    pd.DataFrame(index=miCustCols, data=dict(en=['ExecCase'], fr=['CasExec']))

results = ads.MCDSAnalysisResultsSet(miCustomCols=miCustCols, dfCustomColTrans=dfCustColTrans, sampleIndCol='SampNum',
                                     distanceUnit='Meter', areaUnit='Hectare',
                                     surveyType='Point', distanceType='Radial', clustering=False)

In [None]:
# Analysis engine
mcds = ads.MCDSEngine(workDir='refout/dist-order-sens-min',
                      distanceUnit='Meter', areaUnit='Hectare',
                      surveyType='Point', distanceType='Radial', clustering=False)

In [None]:
# Process the folders of the engine work folder:
# decode the stats of each MCDS run folder and store them into the results set.
for folder in os.listdir(mcds.workDir):
    
    # Skip anything that is not an MCDS run folder: plain files, folders whose name
    # has an extension, and folders that do not contain a stats.txt file.
    folderPath = os.path.join(mcds.workDir, folder)
    if not os.path.isdir(folderPath):
        continue
    if os.path.splitext(folder)[1] or 'stats.txt' not in os.listdir(folderPath):
        print(f'Skipping {folderPath}, not an MCDS.exe run folder with a stats.txt file')
        continue
        
    # Tell the engine where it has run (even if it does not remember it ;-)
    #_ = mcds.setupRunFolder(forceSubFolder=folder)
    
    # Decode results.
    sRes = mcds.decodeStats(folder)
    print()
    
    # Store them for later, with the folder name as the value of the custom 'ExecCase' column.
    sHead = pd.Series(data=[folder], index=miCustCols)
    results.append(sRes, sCustomHead=sHead)

# Tadaaaaaaa ! Display the collected results, with French column names.
results.dfTransData('fr')

In [None]:
results.dfTransData('en').to_excel(pl.Path('tmp', 'dist-order-sens-auto-results.xlsx'), index=False)

## Development of AnalysisResultsSet.compare

In [None]:
# "Left" and "right" sample tables to compare: same columns a, b, c, indexed by 'i',
# with partly different index values (1 only on the left, 0 only on the right),
# slightly different values and some NaNs on either side.
dfl = pd.DataFrame(data=dict(a=[1.0, 2.0, 4.0, np.nan],
                             b=[2.0, 3.0, 5.0, 6.0],
                             c=[np.nan, 4.0, np.nan, -7.5]),
                   index=pd.Index([1, 2, 3, 4], name='i'))
dfr = pd.DataFrame(data=dict(a=[0.0, 2.01, np.nan, np.nan],
                             b=[np.nan, 3.0, 5.01, 6.0],
                             c=[2.0, np.nan, np.nan, -7.5]),
                   index=pd.Index([0, 2, 3, 4], name='i'))

In [None]:
dfl

In [None]:
dfr

In [None]:
dfl1 = dfl.join(dfr[['a']], rsuffix='_r', how='outer').drop(columns='a_r')
dfl1

In [None]:
dfr1 = dfr.join(dfl[['a']], rsuffix='_l', how='outer').drop(columns='a_l')
dfr1

In [None]:
nColLevels = dfl1.columns.nlevels
KTmpCol = 'tmp' if nColLevels == 1 else tuple('tmp{}'.format(i) for i in range(nColLevels))
dfd = dfl1.copy()
for col2Diff in dfl1.columns:
    dfd[KTmpCol] = dfr1[col2Diff]
    dfd[col2Diff] = dfd[[col2Diff, KTmpCol]].apply(closeness, axis='columns')
    dfd.drop(columns=[KTmpCol], inplace=True)
dfd

In [None]:
dfd.loc[dfl[~dfl.index.isin(dfr.index)].index, :] = 0
dfd.loc[dfr[~dfr.index.isin(dfl.index)].index, :] = 0

In [None]:
dfd

In [None]:
dropNans = True
# Flag the rows to drop: those where every cell is either > 2 or null.
# The parentheses around the comparison are mandatory: '|' binds tighter than '>',
# so 's > 2 | s.isnull()' is parsed as 's > (2 | s.isnull())' and does not OR the 2 masks.
#sbRows2Drop = ((dfd > 2) | dfd.isnull()).all(axis='columns')
sbRows2Drop = dfd.apply(lambda s: (s > 2) | s.isnull(), axis='index').all(axis='columns')
#sbRows2Drop = dfd.applymap(lambda v: v > 2 or pd.isnull(v)).all(axis='columns')
sbRows2Drop

In [None]:
dfd.drop(dfd[sbRows2Drop].index, axis='index', inplace=True)

In [None]:
# Complete the right table with the rows whose index is only present in the left one,
# then restore the index order.
# pd.concat is used rather than DataFrame.append, which is deprecated (and removed in pandas 2.0).
dfr1 = pd.concat([dfr, dfl[~dfl.index.isin(dfr.index)]], sort=False)
dfr1.sort_index(inplace=True)
dfr1

## Generate stats columns translation file

(from documentation stats & modules specs)

In [None]:
tgtTransFileName = 'tmp/stat-mod-trans.auto.txt'

In [None]:
class Translator(object):
    """Callable translating strings through per-language lookup tables.

    :param dTrans: dict of translation tables (dict source string => translation),
                   keyed by language code; must at least hold an 'en' table
    :param lang: initial target language (see setLang)
    """

    def __init__(self, dTrans, lang='en'):
        assert 'en' in dTrans, 'At least "en" translation must be defined'
        self.dTrans = dTrans
        self.setLang(lang)

    def setLang(self, lang):
        """Set the target language: 'en' or 'fr', case-insensitive."""
        self.lang = lang.lower()
        assert self.lang in ('en', 'fr'), f'No support for "{lang}" language'

    def __call__(self, s):
        """Translate s to the current language.

        Fall-backs, in order: the 'en' translation of s, then s itself.
        """
        englishTable = self.dTrans['en']
        currentTable = self.dTrans.get(self.lang, englishTable)
        return currentTable.get(s, englishTable.get(s, s))

In [None]:
DFigureTrans = \
    dict(en=dict(Value='', Cv='CoefVar', Lcl='Min', Ucl='Max', Df='DoF'),
         fr=dict(Value='', Cv='CoefVar', Lcl='Min', Ucl='Max', Df='DegLib'))

figtr = Translator(DFigureTrans, lang='en')

In [None]:
DStatisticTrans = \
    dict(en={ 'number of observations (n)': 'NObs',
              'number of samples (k)': 'NSamp',
              'effort (L or K or T)': 'Effort',
              'encounter rate (n/L or n/K or n/T)': 'EncRate',
              'left truncation distance': 'LeftTruncDist',
              'right truncation distance (w)': 'RightTruncDist',
              'total number of parameters (m)': 'TotNumPars',
              'AIC value': 'AIC',
              'chi-square test probability (distance set 1)': 'Chi2 P 1',
              'chi-square test probability (distance set 2)': 'Chi2 P 2',
              'chi-square test probability (distance set 3)': 'Chi2 P 3',
              'f(0) or h(0)': 'f/h(0)',
              'probability of detection (Pw)': 'PDetec',
              'effective strip width (ESW) or effective detection radius (EDR)': 'EDR/ESW',
              'AICc': 'AICc',
              'BIC': 'BIC',
              'Log likelihood': 'LogLhood',
              'Kolmogorov-Smirnov test probability': 'KS P',
              'Cramér-von Mises (uniform weighting) test probability': 'CvM Uw P',
              'Cramér-von Mises (cosine weighting) test probability': 'CvM Cw P',
              'key function type': 'KeyFn',
              'adjustment series type': 'AdjSer',
              'number of key function parameters (NKP)': 'NumKFnPars',
              'number of adjustment term parameters (NAP)': 'NumASerPars',
              'number of covariate parameters (NCP)': 'NumCovars',
              'estimated value of A(1) adjustment term parameter': 'EstA(1)',
              'estimated value of A(2) adjustment term parameter': 'EstA(2)',
              'estimated value of A(3) adjustment term parameter': 'EstA(3)',
              'estimated value of A(4) adjustment term parameter': 'EstA(4)',
              'estimated value of A(5) adjustment term parameter': 'EstA(5)',
              'estimated value of A(6) adjustment term parameter': 'EstA(6)',
              'estimated value of A(7) adjustment term parameter': 'EstA(7)',
              'estimated value of A(8) adjustment term parameter': 'EstA(8)',
              'estimated value of A(9) adjustment term parameter': 'EstA(9)',
              'estimated value of A(10) adjustment term parameter': 'EstA(10)',
              'average cluster size': 'AvgClustSz',
              'size-bias regression correlation (r)': 'SzBias RegCorr',
              'p-value for correlation significance (r-p)': 'CorSignPVal',
              'estimate of expected cluster size corrected for size bias': 'EstExpFixedCluSz',
              'density of clusters (or animal density if non-clustered)': 'DensClu',
              'density of animals': 'Density',
              'number of animals, if survey area is specified': 'Number',
              'bootstrap density of clusters': 'BootsDensClu',
              'bootstrap density of animals': 'BootDensity',
              'bootstrap number of animals': 'BootNumber' },
         fr={ 'number of samples (k)': 'NEchant',
              'encounter rate (n/L or n/K or n/T)': 'TxContact',
              'left truncation distance': 'DistTroncGche',
              'right truncation distance (w)': 'DistTroncDte',
              'total number of parameters (m)': 'NbTotPars',
              'Log likelihood': 'LogProba',
              'key function type': 'FnClé',
              'adjustment series type': 'SérAjust',
              'number of key function parameters (NKP)': 'NbParsFnClé',
              'number of adjustment term parameters (NAP)': 'NbParsSérAjust',
              'number of covariate parameters (NCP)': 'NbCovars',
              'average cluster size': 'TailMoyClust',
              'size-bias regression correlation (r)': 'CorrReg BiaisTail',
              'p-value for correlation significance (r-p)': 'PVal SignifCorr',
              'estimate of expected cluster size corrected for size bias': 'TailCorrCluAttEst',
              'density of animals': 'Densité',
              'number of animals, if survey area is specified': 'Nombre',
              'bootstrap density of clusters': 'BootsDensClu',
              'bootstrap density of animals': 'DensitéBoot',
              'bootstrap number of animals': 'NombreBoot' })

statr = Translator(DStatisticTrans, lang='en')

In [None]:
dfStatModTrans = ads.MCDSEngine.statModCols().to_frame()
dfStatModTrans.reset_index(drop=True, inplace=True)
dfStatModTrans.rename(columns={ 0: 'Module', 1: 'Statistic', 2: 'Figure' }, inplace=True)
for lang in ['en', 'fr']:
    figtr.setLang(lang)
    statr.setLang(lang)
    dfStatModTrans[lang] = \
        dfStatModTrans.apply(lambda sRow: '{} {}'.format(figtr(sRow.Figure), statr(sRow.Statistic)).strip(),
                             axis='columns')

In [None]:
dfStatModTrans

In [None]:
dfStatModTrans.to_csv(tgtTransFileName, sep='\t', index=False)
tgtTransFileName

In [None]:
pd.DataFrame(index=ads.MCDSAnalysis.MIRunColumns,
             data=dict(en=['ModKeyFn', 'ModAdjSer', 'ModChcCrit', 'ConfInterv', 'LeftTrunc', 'RightTrunc',
                           'FitDistCuts', 'DiscrDistCuts', 'RunCode', 'StartTime', 'ElapsedTime', 'RunFolder'],
                       fr=['FnCléMod', 'SérAjustMod', 'CritChxMod', 'IntervConf', 'TroncGauche', 'TroncDroite',
                           'TranchDistFit', 'TranchDistDiscr', 'CodeExec', 'DébutExec', 'DuréeExec', 'DossierExec']))


In [None]:
dfStatModTransExt = pd.read_csv(tgtTransFileName, sep='\t')
dfStatModTransExt.set_index(['Module', 'Statistic', 'Figure'])

In [None]:
lang = 'fr'
dTrans = dfStatModTransExt.set_index(['Module', 'Statistic', 'Figure'])[lang].to_dict()
results.dfData.columns = [dTrans.get(col, col) for col in results.dfData.columns]
results.dfData

In [None]:
dfStatModTransExt.set_index(['Module', 'Statistic', 'Figure'])[lang].to_dict()

## Test case class

(actually of no use: pd.DataFrame already does the job!)

In [None]:
# Super-class for test cases
class TestCase(object):
    """Simple record of named attributes, whose set of names gets frozen at 1st instantiation.

    The names of the keyword arguments of the 1st instance created are stored in the
    'AttributeNames' attribute of its class; any later instance for which such an attribute
    is found must be created with exactly the same names.
    """

    def __init__(self, **attrs):
        cls = self.__class__
        givenNames = set(attrs.keys())
        if hasattr(cls, 'AttributeNames'):
            # Names already frozen: refuse any missing or new one.
            assert givenNames == self.AttributeNames, \
                   'Some attribute name not in frozen set {{{}}}'.format(','.join(self.AttributeNames))
        else:
            # 1st instance: freeze the names.
            cls.AttributeNames = givenNames
        for name, value in attrs.items():
            setattr(self, name, value)

    def __repr__(self):
        shownAttrs = ','.join(f'{name}:{value}' for name, value in self.__dict__.items())
        return f'{self.__class__.__name__}({shownAttrs})'

In [None]:
# Test this super-class.
class TCTest(TestCase):
    pass

tstTestCases = list()
tstTestCases.append(TCTest(x=1, y='a')) # Define attributes
tstTestCases.append(TCTest(x=2, y='b')) # Check attributes
try:
    tstTestCases.append(TCTest(x=2, z=None)) # Refuse new attributes
except AssertionError as exc:
    print('Good refuse of new attributes:', exc)
else:
    # Raised from the 'else' clause, not from inside the 'try': otherwise the handler above
    # would catch this very error too, and the check could never fail.
    raise AssertionError('Error: New attributes should be refused')
    
[str(tc) for tc in tstTestCases]

## MCDS output decoding : stats file

TODO: Add french translation of variables / parameters names and descriptions

### 1. Name and description of stats table columns

In [None]:
# Open the text file specifying the columns of the stats table
# (the file object is not closed here : it is read in the next cell).
fileName = 'mcds/stat-row-specs.txt'

fStatRowSpecs = open(fileName, mode='r', encoding='utf8')

In [None]:
# Read the lines not starting with '#', then release the file handle opened in the previous cell.
try:
    statRowSpecLines = [line.rstrip('\n') for line in fStatRowSpecs if not line.startswith('#')]
finally:
    fStatRowSpecs.close()

# The lines are read by groups of 3 : column name, column description, and a 3rd line that is ignored.
# Stopping at len-1 (rather than len-2) keeps the last group even when its 3rd line is missing
# at the end of the file ; groups with an empty name (extra trailing blank lines) are skipped.
statRowSpecs = [(statRowSpecLines[i].strip(), statRowSpecLines[i+1].strip())
                for i in range(0, len(statRowSpecLines)-1, 3) if statRowSpecLines[i].strip()]
dfStatRowSpecs = pd.DataFrame(columns=['Name', 'Description'], data=statRowSpecs).set_index('Name')

dfStatRowSpecs

In [None]:
dfStatRowSpecs.index

### 2. Number and description of the modules and relevant stats

(Module and Statistic columns of the table)

In [None]:
# Open the text file specifying the modules and their statistics
# (the file object is not closed here : it is read in the next cell).
fileName = 'mcds/stat-mod-specs.txt'

fStatModSpecs = open(fileName, mode='r', encoding='utf8')

In [None]:
# Number of "A(i)" statistics generated in place of the single '101 ...' spec line (see below).
nMaxAdjParams = 10

# Read the lines not starting with '#', then release the file handle opened in the previous cell.
try:
    statModSpecLines = [line.rstrip('\n') for line in fStatModSpecs if not line.startswith('#')]
finally:
    fStatModSpecs.close()

# '<number> – <description>' lines (module headers as well as statistic lines).
reModSpecNumName = re.compile('(.+) – (.+)')

# Characters making up the trailing list of note numbers of a statistic line
# (raw string : '\d' in a plain string literal is an invalid escape sequence).
reNoteChar = re.compile(r'[\d ,]')

statModSpecs = list()
moModule = None
for line in statModSpecLines:

    # Skip empty lines.
    if not line:
        continue

    # 1st line of a module section : the module header.
    if moModule is None:
        moModule = reModSpecNumName.match(line.strip())
        continue

    # A line made of 1 single space ends the current module section.
    if line == ' ':
        moModule = None
        continue

    # Any other line : a statistic of the current module.
    moStatistic = reModSpecNumName.match(line.strip())
    modNum, modDesc, statNum, statDescNotes = \
        moModule.group(1), moModule.group(2), moStatistic.group(1), moStatistic.group(2)

    # Split the trailing note numbers from the description : scan backwards for the last char.
    # that is not a digit, a space or a comma (defaults = no such char. found : no description,
    # only notes ; avoids re-using the values of the previous statistic in this case).
    statDesc, statNotes = '', statDescNotes.replace(' ', '')
    for i in range(len(statDescNotes)-1, -1, -1):
        if not reNoteChar.match(statDescNotes[i]):
            statDesc = statDescNotes[:i+1]
            statNotes = statDescNotes[i+1:].replace(' ', '')
            break

    modNum = int(modNum)
    if statNum.startswith('101 '):
        for num in range(nMaxAdjParams): # Assume no more than that ... a bit hacky !
            statModSpecs.append((modNum, modDesc, 101+num, # Make statDesc unique for later indexing
                                 statDesc.replace('each', 'A({})'.format(num+1)), statNotes))
    else:
        statNum = int(statNum)
        if modNum == 2 and statNum == 3: # Actually, there are 0 or 3 of these ...
            for num in range(3):
                statModSpecs.append((modNum, modDesc, num+201,
                                     # Change statNum & Make statDesc unique for later indexing
                                     statDesc+' (distance set {})'.format(num+1), statNotes))
        else:
            statModSpecs.append((modNum, modDesc, statNum, statDesc, statNotes))

dfStatModSpecs = pd.DataFrame(columns=['modNum', 'modDesc', 'statNum', 'statDesc', 'statNotes'],
                              data=statModSpecs).set_index(['modNum', 'statNum'])

dfStatModSpecs

In [None]:
# Modules
dfStatModSpecs.modDesc.unique()

### 3. Notes on module statistics

(more information explaining how to use, or not, the last 5 columns: Value, Cv, Lcl, Ucl, Df)

In [None]:
# Open the text file holding the notes on the module statistics
# (the file object is not closed here : it is read in the next cell).
fileName = 'mcds/stat-mod-notes.txt'

fStatModNotes = open(fileName, mode='r', encoding='utf8')

In [None]:
# Keep the lines not starting with '#' (end-of-line character removed).
statModNoteLines = [rawLine.rstrip('\n') for rawLine in fStatModNotes.readlines()
                    if not rawLine.startswith('#')]

# Each non-empty line : the note number in the 2 first characters, then the text of the note.
statModNotes = [(int(noteLine[:2]), noteLine[2:].strip())
                for noteLine in statModNoteLines if noteLine]

# Table of the notes, indexed by note number.
dfStatModNotes = pd.DataFrame(data=statModNotes, columns=['Note', 'Text'])
dfStatModNotes = dfStatModNotes.set_index('Note')

dfStatModNotes

### 4. Read table

In [None]:
eng = mcds

In [None]:
eng.statsFileName

In [None]:
# Load the stats file of the engine : columns separated by 1 or more spaces (regexp separator,
# hence the python parsing engine), column names taken from the index of the row specs table.
dfStatRows = pd.read_csv(eng.statsFileName, sep=' +', engine='python', names=dfStatRowSpecs.index)
dfStatRows

### 5. Decode table

Warning: We assume there is only one stratum (numbered '0'), only one sample (numbered '0') and only one estimator (numbered '1')

#### a. Remove Stratum, Sample and Estimator columns

(see warning above)

In [None]:
dfStatRows.drop(columns=['Stratum', 'Sample', 'Estimator'], inplace=True)
dfStatRows

#### b. Cleanup N/A data

(according to 'notes' on stats)

In [None]:
# Stack the "figures" (Value, Cv, Lcl, Ucl, Df) of each statistic / module : 1 row per figure,
# with the original row label as 'id', the figure name as 'Figure' and its value as 'Value'.
dfStats = dfStatRows.set_index(['Module', 'Statistic'], append=True).stack() \
                    .reset_index().rename(columns={'level_0': 'id', 'level_3': 'Figure', 0: 'Value'})
dfStats.head(10)

In [None]:
# 4. Fix multiple Module=2 & Statistic=3 rows (before joining with self.DfStatModSpecs)
# The rows to fix are selected once, before the loop ; the new statistic number is incremented
# each time a 'Value' figure row is met (201, 202, ...), and applied to the rows that follow it.
newStatNum = 200
for lbl, sRow in dfStats[(dfStats.Module == 2) & (dfStats.Statistic == 3)].iterrows():
    if dfStats.loc[lbl, 'Figure'] == 'Value':
        newStatNum += 1
    dfStats.loc[lbl, 'Statistic'] = newStatNum
dfStats[(dfStats.Module == 2)]

In [None]:
# Add the description / naming columns of the modules and statistics
# (join on the (modNum, statNum) index of dfStatModSpecs).
dfStats = dfStats.join(dfStatModSpecs, on=['Module', 'Statistic'])
dfStats.tail(10)

In [None]:
#dfStats[(dfStats.Module == 2) & (dfStats.Statistic > 200)]

In [None]:
# Check that the "not applicable" figures really are so (all equal to 0.0 ?)
# Warning: There must be a bug in MCDS with Module 2 / Statistic 10x : some Cv are not null ...
# Rows whose notes do not contain '1' : only their 'Value' figure is kept.
sKeepOnlyValueFig = ~dfStats.statNotes.str.contains('1')
sFigs2Drop = (dfStats.Figure != 'Value') & sKeepOnlyValueFig
# 'not' rather than '~' : on a plain Python bool, '~' is a bitwise inversion (~True == -2, ~False == -1),
# both truthy, which would make this assertion always pass.
assert not dfStats[sFigs2Drop & ((dfStats.Module != 2) | (dfStats.Statistic < 100))].Value.any(), \
       'Attention: Des chiffres supposés "sans objet" on des valeurs non nulles !'

In [None]:
# 2nd check, visual : the figures to drop that have a non-null value
# (parentheses needed around the comparison : '&' binds tighter than '!=').
dfStats[sFigs2Drop & (dfStats.Value != 0)].sort_values(by='Value', ascending=False)

In [None]:
# Remove the "not applicable" rows / figures.
dfStats.drop(dfStats[sFigs2Drop].index, inplace=True)
dfStats

In [None]:
dfStats.head()

In [None]:
# Keep only the description and value columns, and index the values
# by (module description, statistic description, figure name).
dfStats = dfStats.reindex(columns=['modDesc', 'statDesc', 'Figure', 'Value'])
dfStats.set_index(['modDesc', 'statDesc', 'Figure'], inplace=True)
dfStats

In [None]:
dfStats.T.iloc[0]

## List MCDS warnings

(from massive analysis runs)

In [None]:
# Shell commands presumably used to collect the warnings loaded below :
# $ cd donnees/cretes-cantal/201006-1527  # A work folder for automated analyses
# $ find . -name "log.txt" | xargs grep "Warning"
# Ex.
# ./TurdMeru-m-haz-cos-l50-r200-olv9yrf2/log.txt:      ** Warning: Parameter  2 is at an upper bound. **
# ./TurdMeru-m-haz-cos-l50-r250-4v1smzl8/log.txt:      ** Warning: Parameters are being constrained to obtain monotonicity. **
# ./TurdMeru-m-haz-cos-la-0v5ylg74/log.txt:      ** Warning: Parameters are being constrained to obtain monotonicity. **
# ./TrogTrog-m-uni-pol-ra-ma-pgspzdn1/log.txt:** Warning: convergence failure **

with open('donnees/cretes-cantal/mcds-warnings.log', 'r') as file:
    lines = file.readlines()
    
# Split each line on ':' : fpn = 1st field (log file path in the examples above), then z, x and y.
dfw = pd.DataFrame(data=[line.split(':') for line in lines], columns=['fpn', 'z', 'x', 'y'])
# t, fn = the 2 first '/'-separated components of fpn, once its 2 first characters dropped ('./' in the examples).
dfw[['t', 'fn']] = dfw.fpn.apply(lambda s: pd.Series(s[2:].split('/')[:2]))
# s = the 10 first characters of fn, a = its characters from index 11 on.
dfw[['s', 'a']] = dfw.fn.apply(lambda s: pd.Series([s[:10], s[11:]]))
# w = warning text : x, followed by y (stripped) when y is not empty / None ; then ' **' markers removed.
dfw['w'] = dfw[['x', 'y']].apply(lambda s: s.x + (' ' + s.y.strip() if s.y else ''), axis='columns').apply(str.strip)
dfw.w = dfw.w.apply(lambda s: s.replace(' **', ''))
dfw.drop(columns=['fn', 'z', 'x', 'y'], inplace=True)
#dfw.set_index(['t', 's', 'a', 'fpn'], inplace=True)

dfw

In [None]:
dict(warning_analyses=dfw.fpn.nunique(), warning_types=dfw.w.nunique())

In [None]:
print(dfw[['a', 'w']].groupby('a').count().value_counts().reset_index().T.to_markdown())

In [None]:
# Shell commands presumably used to build the list of analysis runs loaded below :
# $ cd donnees/cretes-cantal/201006-1527  # A work folder for automated analyses
# $ find . -name "*log.txt" >../mcds-runs.log
# Ex:
# ./AlauArve-m-haz-cos-bqia9v69/log.txt
# ./AlauArve-m-haz-cos-l30-r100-i6y4g1l8/log.txt
# ./AlauArve-m-haz-cos-l30-r150-nbntf7tm/log.txt
# ./AlauArve-m-haz-cos-l30-r200-wlrrb2f5/log.txt
# ./AlauArve-m-haz-cos-l30-r250-tbd2sgrd/log.txt
# ./AlauArve-m-haz-cos-l50-r100-uzc695ac/log.txt
# ./AlauArve-m-haz-cos-l50-r150-2jiy8n9b/log.txt


# Read all the lines of the runs list (1 log file path per line in the examples above).
with open('donnees/cretes-cantal/mcds-runs.log', 'r') as file:
    lines = file.readlines()

In [None]:
dfr = pd.DataFrame(index=[line.strip() for line in lines])
dfr

In [None]:
dict(analyses=len(dfr), warning_analyses=dfw.fpn.nunique(), pct_warning_analyses=100 * dfw.fpn.nunique() / len(dfr))

In [None]:
#dfw = dfr.join(dfw.set_index('fpn'))

In [None]:
# Merge the warnings that only differ by the number of the parameter they are about :
# whatever this number (not only 1 and 2), it is replaced by 'n'.
dfw.w = dfw.w.str.replace(r'Parameter +\d+', 'Parameter  n', regex=True)
dfw

In [None]:
# 1 row per (t, s, a) group : w = its sorted warnings joined with ' # ',
# n = number of '#' in w, plus 1 (the number of joined warnings, assuming no warning text contains '#').
dfp = dfw[['t', 's', 'a', 'w']].groupby(['t', 's', 'a']).apply(lambda dfg: ' # '.join(dfg.w.sort_values()))
dfp = dfp.reset_index().rename(columns={0: 'w'})
dfp['n'] = dfp.w.apply(lambda s: s.count('#')+1)
dfp.sort_values(by=['n', 'w'], ascending=True, inplace=True)
dfp

In [None]:
dfp.w.nunique()

In [None]:
dfp.w.value_counts().reset_index().to_excel('tmp/__.xlsx', index=False)

In [None]:
dfp.to_excel('tmp/_.xlsx', index=False)

In [None]:
print(pd.read_excel('tmp/__.xlsx', sheet_name='top24').to_markdown(index=False).replace('   ', ''))

## Distance cut specs for MCDS

* Mise au point
* tests unitaires

In [None]:
def distanceCutSpecs(minDist, maxDist, distCuts):
    """Build the distance truncation / cut options string from the given parameters.

    :param minDist: left truncation distance, or None
    :param maxDist: right truncation distance, or None
    :param distCuts: None, or a number of distance classes (int),
                     or the list of the inner cut distances
    :return: '' or a sequence of ' /<option>=<value>' items :
       * distCuts is a list : ' /Int=<minDist>,<cuts ...>,<maxDist>' if both minDist and maxDist
         are defined, '' otherwise (no /Left or /Width option in either case),
       * otherwise : ' /NClass=<distCuts>' if distCuts is an int, then ' /Left=<minDist>'
         and ' /Width=<maxDist>' for those of minDist and maxDist that are defined.
    """

    # Explicit list of cuts : all or nothing.
    if isinstance(distCuts, list):
        if minDist is None or maxDist is None:
            return ''
        return ' /Int=' + ','.join(map(str, [minDist, *distCuts, maxDist]))

    # Otherwise : optional number of classes, then optional truncations.
    options = []
    if isinstance(distCuts, int):
        options.append(' /NClass=' + str(distCuts))
    if minDist is not None:
        options.append(' /Left=' + str(minDist))
    if maxDist is not None:
        options.append(' /Width=' + str(maxDist))

    return ''.join(options)

In [None]:
# Unit tests of distanceCutSpecs.

# No parameter at all => no option.
assert distanceCutSpecs(minDist=None, maxDist=None, distCuts=None) == ''

# Truncations only.
assert distanceCutSpecs(minDist=5, maxDist=None, distCuts=None) == ' /Left=5'
assert distanceCutSpecs(minDist=None, maxDist=100, distCuts=None) == ' /Width=100'
assert distanceCutSpecs(minDist=25.2, maxDist=100.5, distCuts=None) == ' /Left=25.2 /Width=100.5'

# Number of classes, with or without truncations.
assert distanceCutSpecs(minDist=None, maxDist=None, distCuts=3) == ' /NClass=3'
assert distanceCutSpecs(minDist=None, maxDist=300, distCuts=8) == ' /NClass=8 /Width=300'
assert distanceCutSpecs(minDist=20, maxDist=None, distCuts=8) == ' /NClass=8 /Left=20'
assert distanceCutSpecs(minDist=20, maxDist=300, distCuts=8) == ' /NClass=8 /Left=20 /Width=300'

# Explicit list of cuts.
assert distanceCutSpecs(minDist=20, maxDist=300, distCuts=[100, 200, 230, 290]) == ' /Int=20,100,200,230,290,300'
assert distanceCutSpecs(minDist=None, maxDist=None, distCuts=[1, 2, 3]) == '' # min & maxDist have to be both defined
assert distanceCutSpecs(minDist=0, maxDist=None, distCuts=[1, 2, 3]) == '' # min & maxDist have to be both defined
assert distanceCutSpecs(minDist=None, maxDist=4, distCuts=[1, 2, 3]) == '' # min & maxDist have to be both defined

## Data tools development

### addAbsenceSightings

In [None]:
dfInSightings = dfObsIndiv

In [None]:
# Names of the transect, taxon and sample identification columns.
transectCol = 'Point'
taxonCol = 'Espece'
sampleCols = ['Passage', 'Adulte', 'Duree']

# The set of expected taxa ... of which we'll look for absence on every location
expectedTaxa = list(dfObsIndiv[taxonCol].unique())
', '.join(expectedTaxa), len(expectedTaxa)

In [None]:
# Add "absence" sightings to field data collected on transects for a given sample
# (cell-level draft of the function whose signature is commented out just below).
#def addAbsenceSightings(dfInSightings, transectCol, taxonCol, expectedTaxa, sampleCols):
    
def absenceSightings(taxonCol, taxon, dAbscSightTmpl):
    """Return a copy of the absence sightings template, with its taxon column set to the given taxon."""
    dAbscSight = dAbscSightTmpl.copy()
    dAbscSight[taxonCol] = taxon
    return dAbscSight

assert not dfInSightings.empty, 'Error : Empty sightings data to complete !'

# List of the absence sightings tables (1 per transect), to be concatenated later.
ldfAbscSightings = list()

# Use 1st sightings of the sample to build the absence sightings prototype
# (all null columns except for the sample identification ones)
dAbscSightTmpl = dfInSightings.iloc[0].to_dict()
dAbscSightTmpl.update({ k: None for k in dAbscSightTmpl.keys() if k not in sampleCols })

# For each transect
for transect in dfInSightings[transectCol].unique():

    # Update absence sightings template with transect id
    dAbscSightTmpl.update({ transectCol: transect })
    
    # Generate the absence sightings from it : 1 per lacking taxon
    # (set difference : the order of the generated rows is the iteration order of the resulting set)
    lackingTaxa = set(expectedTaxa) - set(dfInSightings.loc[dfInSightings[transectCol] == transect, taxonCol].unique())
    dfAbscSights = pd.DataFrame([absenceSightings(taxonCol, txn, dAbscSightTmpl) for txn in lackingTaxa])
    
    ldfAbscSightings.append(dfAbscSights)

In [None]:
# Stack the input sightings and all the absence sightings tables into 1 single table,
# with a brand new 0-based index (for unique labels).
dfOutSightings = pd.concat([dfInSightings, *ldfAbscSightings]).reset_index(drop=True)

In [None]:
len(dfOutSightings), len(dfInSightings)

## Distance truncations : auto-generation of variants

(at least a try ...)

### Data for playing : Individualised ones ...

(dfObsIndiv from somewhere in valtests.ipynb)

In [None]:
dfObsIndiv.head()

In [None]:
dfObsIndiv.groupby('Espece').size().sort_values(ascending=False)[:5]

In [None]:
dfObsIndSpc = dfObsIndiv[dfObsIndiv.Espece == 'Merle noir'].copy()

### Histograms

In [None]:
# Histogramme uniforme
_ = dfObsIndSpc.distMem.hist(figsize=(16, 3), bins=40)

### Empirical distribution

In [None]:
import statsmodels.distributions.empirical_distribution as sted

In [None]:
ecdf = sted.ECDF(dfObsIndSpc.distMem)

In [None]:
sEcdf = pd.Series({ x : ecdf(x) for x in dfObsIndSpc.distMem.unique() }).sort_index()
_ = sEcdf.plot(figsize=(16, 3))

### Quantiles : 2.5, 5 and 10%, left and right sides

In [None]:
aqLims = np.array([0.025, 0.05, 0.1, 0.9, 0.95, 0.975])
aqLims * len(dfObsIndSpc)

In [None]:
np.quantile(a=dfObsIndSpc.distMem, q=aqLims)

In [None]:
dfObsIndSpc[dfObsIndSpc.distMem <= 11.61]

### Brute force combination algorithm

In [None]:
lParams = list() # of dict(ltr=<left trunc dist or None>, rtr=<right trunc dist or None>, nc=<nb of cuts>)

In [None]:
aqtlLims = np.array([1.25, 2.5, 3.75, 5, 7.5, 10])

In [None]:
# Left truncation variants : for each percentage in aqtlLims, truncate at the matching percentile
# of the distances, and try 3 numbers of cuts : 2/3, 1 and 3/2 times the square root
# of the number of retained sightings (rounded).
sLeftTruncs = pd.Series(index=aqtlLims, data=np.percentile(a=dfObsIndSpc.distMem, q=aqtlLims))
for leftPct, leftTrunc in sLeftTruncs.items():
    # Number of sightings at or beyond the left truncation distance.
    nRetSights = len(dfObsIndSpc[dfObsIndSpc.distMem >= leftTrunc])
    sqrNRetSights = math.sqrt(nRetSights)
    for nCuts in [2*sqrNRetSights/3, sqrNRetSights, 3*sqrNRetSights/2]:
        lParams.append(dict(ltr=leftTrunc, rtr=None, nc=round(nCuts), nrs=nRetSights, pct=100-leftPct))

In [None]:
# Right truncation variants : same as for the left side, but with the (100 - percentage) percentiles,
# and retaining the sightings at or below the truncation distance.
sRightTruncs = pd.Series(index=100-aqtlLims, data=np.percentile(a=dfObsIndSpc.distMem, q=100-aqtlLims)).sort_index()
for rightPct, rightTrunc in sRightTruncs.items():
    # Number of sightings at or below the right truncation distance.
    nRetSights = len(dfObsIndSpc[dfObsIndSpc.distMem <= rightTrunc])
    sqrNRetSights = math.sqrt(nRetSights)
    for nCuts in [2*sqrNRetSights/3, sqrNRetSights, 3*sqrNRetSights/2]:
        lParams.append(dict(ltr=None, rtr=rightTrunc, nc=round(nCuts), nrs=nRetSights, pct=rightPct))

In [None]:
# ... etc ... but, why not use an optimisation engine then ?

In [None]:
pd.DataFrame(lParams)

### Optimising algorithm

In [None]:
dfObsIndSpc.head()

In [None]:
mcds = ads.MCDSEngine(workDir=os.path.join('ACDC', '2019-nat-opt'),
                      distanceUnit='Meter', areaUnit='Hectare',
                      surveyType='Point', distanceType='Radial')

In [None]:
# Build the sample data set to analyse from the sightings of the selected species.
# Fields passed as "decimal" ones to the data set.
sampleDecFields = ['Effort', 'Distance']

# Survey area information (see ads.addSurveyAreaInfo for how it gets added to the sightings).
dAreaInfo = dict(Zone='ACDC', Surface=2400) # ha
dfObsIndSpc = ads.addSurveyAreaInfo(dfObsIndSpc, dAreaInfo=dAreaInfo)

# Rename the distance column to 'Distance', and sort the sightings by point.
dfObsIndSpc.rename(columns=dict(distMem='Distance'), inplace=True)
dfObsIndSpc.sort_values(by='Point', inplace=True)

sampDataSet = ads.DataSet(dfObsIndSpc, decimalFields=sampleDecFields)

In [None]:
# Abbreviation of the sample, built from the 1st sighting : the 4 first letters of each word
# of the species name (title-cased and concatenated), then the Passage and Adulte values
# (with any '+' removed), then the Duree value, all separated by '-'.
sSamp = dfObsIndSpc.iloc[0]
abrvSpe = ''.join(word[:4].title() for word in sSamp['Espece'].split(' '))
sampAbbrev = '{}-{}-{}-{}'.format(abrvSpe, sSamp.Passage.replace('+', ''),
                                  sSamp.Adulte.replace('+', ''), sSamp['Duree'])

In [None]:
# Estimator criterion and confidence value interval used for all the analyses below.
KPreEstimCrit = 'AICC'
KPreCVInterval = 95

def dsAnalyser3(aParams, mcdsEngine, sampDataSet, sampAbbrev, keyFn, adjSer):
    """3-parameter objective function for the optimisers : run 1 MCDS analysis and return its AIC.

    :param aParams: sequence of 3 numbers : min. distance, max. distance
                    and number of fit distance cuts (rounded before use)
    :param mcdsEngine: the engine to run the analysis with
    :param sampDataSet: the sample data set to analyse
    :param sampAbbrev: abbreviation of the sample (1st part of the analysis name)
    :param keyFn: name of the key function of the model
    :param adjSer: name of the adjustment series of the model
    :return: the ('detection probability', 'AIC value', 'Value') item of the analysis results
             NOTE(review): dsAnalyser2 returns the 'AICc' item instead, and the estimator criterion
             is 'AICC' ; confirm which one is intended here.
    """

    minDist, maxDist = aParams[0], aParams[1]
    fitDistCuts = round(aParams[2])
    print(minDist, maxDist, fitDistCuts)

    # Analysis name : <sample abbrev.>-<3 first letters of key fn>-<3 first letters of adjust. series>
    analysisName = '-'.join([sampAbbrev, keyFn[:3].lower(), adjSer[:3].lower()])

    analysis = ads.MCDSAnalysis(engine=mcdsEngine, sampleDataSet=sampDataSet,
                                name=analysisName, logData=False,
                                estimKeyFn=keyFn, estimAdjustFn=adjSer,
                                estimCriterion=KPreEstimCrit, cvInterval=KPreCVInterval,
                                minDist=minDist, maxDist=maxDist, fitDistCuts=fitDistCuts)

    # Run the analysis and wait for its results.
    sResult = analysis.submit().getResults()

    return sResult[('detection probability', 'AIC value', 'Value')]

def dsAnalyser2(aParams, mcdsEngine, sampDataSet, sampAbbrev, keyFn, adjSer, fitDistCuts):
    """2-parameter objective function for the optimisers : run 1 MCDS analysis and return its AICc.

    :param aParams: sequence of 2 numbers : min. distance and max. distance
    :param mcdsEngine: the engine to run the analysis with
    :param sampDataSet: the sample data set to analyse
    :param sampAbbrev: abbreviation of the sample (1st part of the analysis name)
    :param keyFn: name of the key function of the model
    :param adjSer: name of the adjustment series of the model
    :param fitDistCuts: fit distance cuts, passed as is to the analysis
    :return: the ('detection probability', 'AICc', 'Value') item of the analysis results
             (an earlier version returned the 'AIC value' item instead)
    """

    minDist, maxDist = aParams[0], aParams[1]
    print(minDist, maxDist)

    # Analysis name : <sample abbrev.>-<3 first letters of key fn>-<3 first letters of adjust. series>
    analysisName = '-'.join([sampAbbrev, keyFn[:3].lower(), adjSer[:3].lower()])

    analysis = ads.MCDSAnalysis(engine=mcdsEngine, sampleDataSet=sampDataSet,
                                name=analysisName, logData=False,
                                estimKeyFn=keyFn, estimAdjustFn=adjSer,
                                estimCriterion=KPreEstimCrit, cvInterval=KPreCVInterval,
                                minDist=minDist, maxDist=maxDist, fitDistCuts=fitDistCuts)

    # Run the analysis and wait for its results.
    sResult = analysis.submit().getResults()

    return sResult[('detection probability', 'AICc', 'Value')]

In [None]:
adjSer = 'COSINE'
keyFn = 'HNORMAL'
#keyFn = 'HAZARD'
#keyFn = 'UNIFORM'
#keyFn = 'NEXPON'

In [None]:
# Just 1 analysis, to test the function to be optimised (AIC) :
# no left truncation, right truncation at 250, number of cuts = rounded square root of the number of data rows.
#              minDist, maxDist, fitDistCuts
aParams = np.array([0, 250, round(math.sqrt(len(sampDataSet.dfData)))])
dsAnalyser3(aParams, mcds, sampDataSet, sampAbbrev, keyFn, adjSer)

In [None]:
# And now, let's run the optimisation : first, the bounds of the parameters.
#              minDist, maxDist, fitDistCuts
#maxMinDist, minMaxDist = np.percentile(a=dfObsIndSpc.Distance, q=[20, 80])
#maxMinDist, minMaxDist = np.percentile(a=dfObsIndSpc.Distance, q=[40, 60])
# Upper bound of the min. distance and lower bound of the max. distance : 49% and 51% distance percentiles.
maxMinDist, minMaxDist = np.percentile(a=dfObsIndSpc.Distance, q=[49, 51])
# Number of fit distance cuts : between 2/3 and 3/2 of the square root of the number of sightings
# (fix : the lower bound used 'sqrNSights', a name that is not defined in this cell).
sqrNRetSights = math.sqrt(len(dfObsIndSpc))
minFitDistCuts, maxFitDistCuts = round(2*sqrNRetSights/3), round(3*sqrNRetSights/2)
paramBounds = [(0, maxMinDist), (minMaxDist, dfObsIndSpc.Distance.max()), (minFitDistCuts, maxFitDistCuts)]
paramBounds

In [None]:
from scipy import optimize

In [None]:
dOptRes = dict()

In [None]:
fitDistCuts = 12
dOptRes['shgo'] = optimize.shgo(func=dsAnalyser2, bounds=paramBounds[:2], iters=2,
                                args=(mcds, sampDataSet, sampAbbrev, keyFn, adjSer, fitDistCuts))
dOptRes['shgo']

In [None]:
dOptRes['shgo'].keys()

In [None]:
# SHGO optimisation on the 3 parameters (min. distance, max. distance, number of fit distance cuts).
# NOTE(review): stored under the same 'shgo' key as the result of the 2-parameter run above,
# which it thus replaces -- confirm this is intended.
dOptRes['shgo'] = optimize.shgo(func=dsAnalyser3, bounds=paramBounds, iters=2,
                                args=(mcds, sampDataSet, sampAbbrev, keyFn, adjSer))
dOptRes['shgo']

In [None]:
dOptRes['da'] = optimize.dual_annealing(func=dsAnalyser3, bounds=paramBounds,
                                        args=(mcds, sampDataSet, sampAbbrev, keyFn, adjSer))
dOptRes['da']

In [None]:
dOptRes['de'] = optimize.differential_evolution(func=dsAnalyser3, bounds=paramBounds,
                                                args=(mcds, sampDataSet, sampAbbrev, keyFn, adjSer))
dOptRes['de']

In [None]:
# Basin-hopping optimisation on the 3 parameters, starting from the middle of each parameter interval
# (the bounds themselves are not passed to the optimiser).
dOptRes['bh'] = optimize.basinhopping(
    func=dsAnalyser3,
    x0=[(lower + upper) / 2 for lower, upper in paramBounds],
    stepsize=2,
    minimizer_kwargs=dict(args=(mcds, sampDataSet, sampAbbrev, keyFn, adjSer)))
dOptRes['bh']

## FieldDataSet, MonoCategoryDataSet dev

In [None]:
# Transform a multi-Category sightings set into an equivalent mono-Category sightings set,
# that is where no sightings has more that one category with positive count (keeping the same total counts).
# Ex: A sightings set with 2 Category count columns nMales and nFemales
#     * in the input set, you may have 1 sightings with nMales = 5 and nFemales = 2
#     * in the output set, this sightings have been separated in 2 distinct ones (all other properties left untouched) :
#       the 1st with nMales = 5 and nFemales = 0, the 2nd with nMales = 0 and nFemales = 2.

# A slower version or ads.separateMultiCategoryCounts :
#  from 9.5s to 0.1s with countColumns = [nMalAd1,nAutAd10,nJuv10,nDetTot10,nMalAd5,nAutAd5,nJuv5,nDetTot5,nTotAd5,nTotAd10]
#  on the "ACDC 2019 Naturalist" multi-Category data set (~4000 rows)
def separateMultiCategoryCounts_slow_version(dfInSightings, countColumns):
    """Split multi-category sightings into mono-category ones (slow, row by row version).

    :param dfInSightings: input sightings table (1 sighting per row)
    :param countColumns: names of the per-category count columns
    :return: new table, with a 0-based integer index, where :
       * a sighting with exactly 1 positive count is kept as is,
       * any other sighting is replaced by 1 copy per positive count, with all the other counts
         set to 0 (so that a sighting with no positive count at all yields no row).
    """

    rows = []

    for _, sSight in dfInSightings.iterrows():

        # Positive counts of the sighting.
        sPosCounts = sSight[countColumns]
        sPosCounts = sPosCounts[sPosCounts > 0]

        if len(sPosCounts) == 1:
            # Already a mono-category sighting : keep it untouched.
            rows.append(sSight)
        else:
            # Multi-category sighting : 1 output sighting per category with a positive count.
            for catCol in sPosCounts.index:
                sSplit = sSight.copy()
                sSplit[countColumns] = 0
                sSplit[catCol] = sSight[catCol]
                rows.append(sSplit)

    return pd.DataFrame(data=rows, index=np.arange(len(rows)))

In [None]:
dfObs = pd.read_csv('refin/ACDC2019-Naturalist-ExtraitObsBrutesAvecDist.txt', sep='\t', decimal=',')
dfObs.head()

In [None]:
countCols =  ['nMalAd10', 'nAutAd10', 'nMalAd5', 'nAutAd5']

In [None]:
sCounts = dfObs[countCols].sum()

In [None]:
len(dfObs), sCounts.to_dict()

In [None]:
assert len(dfObs) == 724
assert not any(sCounts - pd.Series({'nMalAd10': 613, 'nAutAd10': 192, 'nMalAd5': 326, 'nAutAd5': 102}))

In [None]:
%%time

dfObsMonoCat_slow = separateMultiCategoryCounts_slow_version(dfObs, countCols)
len(dfObsMonoCat_slow), dfObsMonoCat_slow[countCols].sum()

In [None]:
# Transform a multi-individual mono-Category sightings set into an equivalent mono-individual mono-Category sightings set
# that is where no sightings has more that one individual per category (keeping the same total counts).
# Ex: A sightings set with 2 mono-Category count columns nMales and nFemales
#     In input set, you may have 1 sightings with nMales = 3 and nFemales = 0 (but none with nMales and nFemales > 0)
#     In out set, no : this sightings have been separated in 3 distinct ones (all other properties left untouched) :
#                      all with nMales = 1 and nFemales = 0.

# A slower version or ads.individualiseMonoCategoryCounts :
#  from 15.5s to 0.06s with countColumns = [nMalAd1,nAutAd10,nJuv10,nDetTot10,nMalAd5,nAutAd5,nJuv5,nDetTot5,nTotAd5,nTotAd10]
#  on the "ACDC 2019 Naturalist" mono-Category data set (~20000 rows)
def individualiseMonoCategoryCounts_slow(dfInSightings, countColumns):
    """Split multi-individual mono-category sightings into mono-individual ones (slow, row by row version).

    :param dfInSightings: input sightings table (1 sighting per row), where each sighting
                          has exactly 1 positive count among countColumns
    :param countColumns: names of the per-category count columns
    :return: new table, with a 0-based integer index, where each sighting with a count of N
             in its positive count column has been replaced by N copies with a count of 1
    :raise AssertionError: if a sighting does not have exactly 1 positive count
    """

    outSightings = list()

    for lbl, sInSight in dfInSightings.iterrows():

        # [a little check] Multi-Category sightings not supported here.
        # (fix : the message was built through str(lbl, sInSight), which raises a TypeError
        #  and so was hiding the intended AssertionError)
        sCounts = sInSight[countColumns]
        sCounts = sCounts[sCounts > 0]
        assert len(sCounts) == 1, \
               'Error: Multi-Category sightings not supported ' + str((lbl, sInSight.to_dict()))

        # Get the positive count column and its value
        posCol = sCounts.index[0]
        count = sCounts[posCol]

        # [a little optimisation ?] If this is a mono-individual sightings, simply append it as is.
        if count == 1:
            outSightings.append(sInSight)
            continue

        # If it is a multi-individual sightings, we need to split it down : 1 copy per individual.
        while count > 0:
            sOutSight = sInSight.copy()
            sOutSight[posCol] = 1
            outSightings.append(sOutSight)
            count -= 1

    return pd.DataFrame(data=outSightings, index=np.arange(len(outSightings)))

In [None]:
%%time

dfObsIndiv_slow = individualiseMonoCategoryCounts_slow(dfObsMonoCat_slow, countCols)
len(dfObsIndiv_slow), dfObsIndiv_slow[countCols].sum()

In [None]:
# Transform a multi-individual multi-Category sightings set into an equivalent mono-individual multi-Category sightings set
# that is where no sightings has more that one individual per category (keeping the same total counts).
# Ex: A sightings set with 2 Category count columns nMales and nFemales
#     In input set, you may have 1 sightings with nMales = 5 and nFemales = 2
#     In out set, no : this sightings have been separated in 5 distinct ones (all other properties left untouched) :
#                      the 2 1st ones with nMales = 1 and nFemales = 1, the last 3 ones with nMales = 1 and nFemales = 0.

# Finally, of no use : simply chain ads.separateMultiCategoryCounts and ads.individualiseMonoCategoryCounts
# And from far much slower !
def individualiseMultiCategoryCounts(dfInSightings, countColumns):
    """Split multi-individual multi-category sightings into ones with at most 1 individual per category.

    :param dfInSightings: input sightings table (1 sighting per row)
    :param countColumns: names of the per-category count columns
    :return: new table, with a 0-based integer index, where :
       * a sighting whose highest count is 1 is kept as is,
       * any other sighting is replaced by as many copies as its highest count, the Nth copy
         having a count of 1 in each category whose original count is at least N, and 0 elsewhere
         (so that a sighting with no positive count at all yields no row).
    """

    rows = []

    for _, sSight in dfInSightings.iterrows():

        # Counts remaining to distribute among the output sightings.
        sRemaining = sSight[countColumns]

        # Already at most 1 individual per category : keep the sighting untouched.
        if sRemaining.max() == 1:
            rows.append(sSight)
            continue

        # Otherwise, peel off 1 individual of each remaining category at a time.
        while sRemaining.max() > 0:

            sSplit = sSight.copy()
            sSplit[countColumns] = sRemaining.apply(lambda n: 1 if n > 0 else 0)
            rows.append(sSplit)

            sRemaining = sRemaining.apply(lambda n: n-1 if n > 0 else 0)

    return pd.DataFrame(data=rows, index=list(range(len(rows))))

In [None]:
# Add "abscence" sightings to field data collected on transects for a given sample
# Warning: A special version for an all-taxon data set
# * dfInSights : input data table
# * transectCol : the name of the transect id column
# * taxonSampleCol : the name of the taxon id column
# * otherSampleCols : the names of the other sample id columns (taxon id not included)
# * expectedTaxa : the expected taxon ids : absence sightings are there to make sure
#                  all of the taxa are found at least once in the output data table
def addAbsenceSightings(dfInSights, transectCol, taxonSampleCol, otherSampleCols, expectedTaxa):
    """Add "absence" sightings to field data collected on transects for a given sample.

    For each transect, 1 absence sighting is appended for each expected taxon with no sighting
    on this transect : a row where all the columns are None, except for the transect id,
    the taxon id and the other sample id columns (these last ones are copied from the 1st input row).

    :param dfInSights: input sightings table (must not be empty)
    :param transectCol: name of the transect id column
    :param taxonSampleCol: name of the taxon id column
    :param otherSampleCols: names of the other sample id columns (taxon id not included)
    :param expectedTaxa: the expected taxon ids
    :return: new table with a 0-based integer index : the input sightings first, then the absence
             sightings, transect by transect (in order of 1st appearance in the input) and,
             for each transect, in the order of expectedTaxa (deterministic row order)
    :raise AssertionError: if dfInSights is empty
    """

    assert not dfInSights.empty, 'Error : Empty sightings data to complete !'

    # Use 1st sightings of the sample to build the absence sightings prototype
    # (all null columns except for the sample identification ones)
    dAbscSightTmpl = {col: (value if col in otherSampleCols else None)
                      for col, value in dfInSights.iloc[0].to_dict().items()}

    # Expected taxa without duplicates, but in their original order
    # (iterating over a set instead would make the output row order unpredictable).
    orderedTaxa = list(dict.fromkeys(expectedTaxa))

    ldfSights = [dfInSights]

    # For each transect
    for transect in dfInSights[transectCol].unique():

        # Generate the absence sightings : 1 per lacking taxon
        seenTaxa = set(dfInSights.loc[dfInSights[transectCol] == transect, taxonSampleCol].unique())
        lackingTaxa = [txn for txn in orderedTaxa if txn not in seenTaxa]

        # Nothing to add for this transect (and no useless empty table to concatenate later).
        if not lackingTaxa:
            continue

        # Save the data frame for later
        ldfSights.append(pd.DataFrame([{**dAbscSightTmpl, transectCol: transect, taxonSampleCol: txn}
                                       for txn in lackingTaxa]))

    # Concat input and absence data frames into one, with a new index (for unique labels).
    return pd.concat(ldfSights, ignore_index=True)



In [None]:
# Tests unitaires de addAbsenceSightings

In [None]:
# Define transect, taxon and sample columns
transectCol = 'Point'
taxonCol = 'Espece'
sampleCols = ['Passage', 'Adulte', 'Duree']

In [None]:
# The set of expected taxa ... of which we'll look for abscence on every location
expectedTaxa = list(dfObsIndiv[taxonCol].unique())

assert len(expectedTaxa) == 58

', '.join(expectedTaxa), len(expectedTaxa)

In [None]:
# Select 1 random sample
passage = 'a'
adulte = 'm'
duree = '10'
dfObsIndivSmpl = dfObsIndiv[(dfObsIndiv.Passage == passage) & (dfObsIndiv.Adulte == adulte) & (dfObsIndiv.Duree == duree)]

assert len(dfObsIndivSmpl) == 322 and dfObsIndivSmpl[transectCol].nunique() == 21

In [None]:
%%time

dfObsIndivAbscSmpl = ads.addAbsenceSightings(dfObsIndivSmpl, transectCol, taxonCol, expectedTaxa, sampleCols)
len(dfObsIndivAbscSmpl)

In [None]:
# Check for no change in sample columns
assert list(dfObsIndivAbscSmpl.columns) == list(dfObsIndivSmpl.columns)

# Check for number of added rows
assert len(dfObsIndivAbscSmpl) == 1333

# Check for no change in number of transect and taxa
assert dfObsIndivAbscSmpl[transectCol].nunique() == 21 and dfObsIndivAbscSmpl[taxonCol].nunique() == 58

# Check for no change in sample identification
assert list(dfObsIndivAbscSmpl.Passage.unique()) == [passage]
assert list(dfObsIndivAbscSmpl.Adulte.unique()) == [adulte]
assert list(dfObsIndivAbscSmpl.Duree.unique()) == [duree]

In [None]:
dfObsIndivSmpl.sort_values(by=['Passage', 'Observateur', 'Point', 'Espece', 'distMem']).head(20)

In [None]:
dfObsIndivAbscSmpl.sort_values(by=['Passage', 'Observateur', 'Point', 'Espece', 'distMem']).head(30)

In [None]:
%%time

# Performance test : run ads.addAbsenceSightings on every combination of
# passage(s), adult category(ies) and duration, and print the resulting number of rows.
print('Passage  Adulte Duree NbDonnees')

for passage in ['a', 'b', 'a+b']: 
    
    for adulte in ['m', 'a', 'm+a']:
    
        for duree in ['5', '10']:
            
            # 'x+y' means both x and y.
            passages = passage.split('+')
            adultes = adulte.split('+')
            dfObsIndivSmpl = dfObsIndiv[dfObsIndiv.Passage.isin(passages) & dfObsIndiv.Adulte.isin(adultes) \
                                        & (dfObsIndiv.Duree == duree)]
            
            # NOTE(review): expectedTaxa is passed before sampleCols here, whereas the local draft
            # of addAbsenceSightings above takes otherSampleCols before expectedTaxa
            # -- confirm against the signature of ads.addAbsenceSightings.
            dfObsIndivAbscSmpl = ads.addAbsenceSightings(dfObsIndivSmpl, transectCol, taxonCol, expectedTaxa, sampleCols)
            
            print(passage, adulte, duree, ':', len(dfObsIndivAbscSmpl))

## ResultsSet.append

Updated version thanks to [pd.DataFrame.append(pd.Series) study](#Appending-series-to-DataFrame-...-columns-order) below

In [None]:
def append(dfData, sdfResult, sCustomHead):

    """Append a result (1 row as a Series, or N rows as a DataFrame) to a results table.

    Parameters:
    :param dfData: the pd.DataFrame to append to; when it has no column yet,
        the (possibly head-prefixed) result simply becomes the new table
    :param sdfResult: the pd.Series (1 row) or pd.DataFrame (N rows) to append
    :param sCustomHead: None, or a pd.Series of leading columns to prepend to each appended row

    :return: the resulting pd.DataFrame (dfData itself is not modified)
    """

    # Prepend the custom head columns, if any, to the result row(s).
    if sCustomHead is not None:
        if isinstance(sdfResult, pd.Series):
            # pd.concat rather than Series.append (removed in pandas 2.0).
            sdfResult = pd.concat([sCustomHead, sdfResult])
        else:  # DataFrame
            # 1 copy of the head per result row, on the same index as the result,
            # otherwise the column-wise concat would mis-align the rows.
            dfCustomHead = pd.DataFrame([sCustomHead] * len(sdfResult)).reset_index(drop=True)
            dfCustomHead.index = sdfResult.index
            sdfResult = pd.concat([dfCustomHead, sdfResult], axis='columns')

    # A Series is 1 row: building a DataFrame from it lets pandas infer 1 type per column,
    # rather than keeping the common (often object) type of the Series.
    dfResult = pd.DataFrame([sdfResult]) if isinstance(sdfResult, pd.Series) else sdfResult

    # Nothing to append to yet: the result is the table, with its original types.
    if dfData.columns.empty:
        return dfResult

    # Normal append (pd.concat rather than DataFrame.append, removed in pandas 2.0).
    return pd.concat([dfData, dfResult], ignore_index=True)

### a. Initialise DataFrame

In [None]:
# Empty
df = pd.DataFrame()

In [None]:
# Not empty, mono-index columns
df = pd.DataFrame([dict(a=1, b=2.5, c='x', x='a', y=1, z=1.78),
                   dict(a=2, b=4.5, c='y', x='b', y=2, z=5.88889)])
df

In [None]:
# Not empty, multi-index columns
df = pd.DataFrame([{('a', 'z'): 1, ('b', 'y'): 2.5, ('c', 'x'): 'x', ('x', 'w'): 'a', ('y', 'v'): 1, ('z', 'u'): 1.78},
                   {('a', 'z'): 2, ('b', 'y'): 4.5, ('c', 'x'): 'y', ('x', 'w'): 'b', ('y', 'v'): 2, ('z', 'u'): 5.88889}])
df.columns = pd.MultiIndex.from_tuples(df.columns)
df

### b. Initialise Series / DataFrame to append

In [None]:
# Mono-index
sh = pd.Series(dict(a=3, b=5.978, c='w'))

In [None]:
sr = pd.Series(dict(x='c', y=4, z=9.567))

In [None]:
sr = pd.DataFrame([dict(x='d', y=9, z=12.9),
                   dict(x='e', y=8, z=7.778)])

In [None]:
# Multi-index
sh = pd.Series({('a', 'z'): 3, ('b', 'y'): 5.978, ('c', 'x'): 'w'})

In [None]:
sr = pd.Series({('x', 'w'): 'c', ('y', 'v'): 4, ('z', 'u'): 9.567})

In [None]:
sr = pd.DataFrame([{('x', 'w'): 'd', ('y', 'v'): 9, ('z', 'u'): 12.9},
                   {('x', 'w'): 'e', ('y', 'v'): 8, ('z', 'u'): 7.778}])

### c. Append Series / DataFrame to DataFrame

In [None]:
df = append(df, sr, sh)
df

### d. See what's happening

In [None]:
df.dtypes

## MCDSAnalysisResultsSet quality indicators functions dev

In [None]:
import plotly.graph_objs as plygo

In [None]:
def normNTotPars(nTotPars, a=0.2, b=0.6, c=2, d=1):
    """Normalise a total number of parameters into a quality indicator.

    The number of parameters is first floored to c, then the indicator
    is computed as 1 / (a * n**d + b).
    """
    # Former formula, 1 / (a * nTotPars + b) with a=0.2, b=1, was too penalising;
    # better with a=0.2, b=0.6 or a=0.2, b=0.8.
    nFlooredPars = max(c, nTotPars)
    denominator = a * nFlooredPars**d + b
    return 1 / denominator

In [None]:
xDomain = [0, 1, 2, 3, 4, 5, 6, 8, 10 , 12]
paramSets = [dict(a=0.2, b=0.6, c=2, d=1), dict(a=0.2, b=0.8, c=1, d=1), dict(a=0.3, b=0.7, c=1, d=1),
             dict(a=0.2, b=0.8, c=1, d=2), dict(a=0.2, b=0.8, c=1, d=1.5)]

In [None]:
fig = plygo.Figure()

for params in paramSets:
    fig.add_trace(plygo.Scatter(x=xDomain,
                                y=[normNTotPars(nTotPars, **params) for nTotPars in xDomain],
                                name=str(params)))

fig.update_layout(title='NTotPars normalisation')

fig

In [None]:
def normCVDens(dCv, a=12):
    """Normalise a density variation coefficient into a quality indicator: exp(-a * dCv^2)."""
    # Former formula, max(0, 1 - a * dCv) with a=1, was not penalising enough;
    # this one is better: already ~0.33 at 30% (a=12).
    exponent = -a * dCv ** 2
    return math.exp(exponent)

In [None]:
xDomain = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.08, 0.15, 0.30, 0.40, 0.5, 0.7, 1.0]
paramSets = [dict(a=8), dict(a=12), dict(a=14), dict(a=16), dict(a=20)]

In [None]:
fig = plygo.Figure()

for params in paramSets:
    fig.add_trace(plygo.Scatter(x=xDomain,
                                y=[normCVDens(cvDens, **params) for cvDens in xDomain],
                                name=str(params)))

fig.update_layout(title='CVDens normalisation')

fig

## Analysis results filtering and sorting tools

### 1. Adaptive filtering

See [devarchives2.ipynb / Development : Automated filtering and sorting of optanalysis results](./devarchives2.ipynb#Development-%3A-Automated-filtering-and-sorting-of-optanalysis-results) for the needed input data.

In [None]:
# Target minimum number of results to keep for each sample
nTgtRmdr = 5
# 'Chi2 P' thresholds to try in turn: 8 evenly spaced values, from 0.8 down to 0.1
chi2Range = np.linspace(start=0.8, stop=0.1, num=8)

# For each sample (1 row of dfStatsEch, labelled by its index) ...
for lblEch, sEch in dfStatsEch.iterrows():

    # Results of this sample only
    dfFSEchRep = dfFilSorRep[dfFilSorRep.Echant == lblEch]
    #print('#{} {} : {}'.format(lblEch, sEch['Abréviation'], len(dfFSEchRep)), end=' => ')

    # Lower the threshold step by step, until dropping the results under it
    # leaves at least nTgtRmdr of them ; if no threshold achieves this,
    # the drop list computed for the last (lowest) one is used anyway.
    for chi2 in chi2Range:
        i2DropEch = dfFSEchRep[dfFSEchRep['Chi2 P'] < chi2].index.to_list()
        #print(len(i2DropEch), end=', ')
        if len(dfFSEchRep) - len(i2DropEch) >= nTgtRmdr:
            break

    #print(' => ', chi2, len(i2DropEch))
    # Actually remove the selected results of this sample from the whole table
    dfFilSorRep.drop(i2DropEch, inplace=True)

# Number of results remaining after filtering all the samples
len(dfFilSorRep)

In [None]:
def filterAdaptScheme(dfRes, sampleIds, sampleIdCol, critCol, ascendCrit=True, nMaxSteps=5, nMinRes=10):

    """Generic filtering function, controlling the number of results kept for each sample
    through an adaptive thresholding scheme on 1 criterion (scheme built from the actual
    value domain of this criterion in the sample results)

    Parameters:
    :param dfRes: the results table (all samples)
    :param sampleIds: the ids of the samples to process
    :param sampleIdCol: name of the column of dfRes holding the sample ids
    :param critCol: name of the column of dfRes holding the criterion
    :param ascendCrit: if True, results with criterion under the threshold are dropped,
        otherwise results with criterion over it
    :param nMaxSteps: number of evenly spaced thresholds of the scheme, the 2 ends included
        (but only the inner ones are tried)
    :param nMinRes: number of kept results from which the scheme stops for a sample

    :return: index of the results to drop (an empty list if no sample was given)
    """

    assert nMaxSteps > 2, 'At least 3 steps are mandatory'

    i2Drop = []
    for sampId in sampleIds:

        # Results and criterion values of this sample only.
        dfSamp = dfRes[dfRes[sampleIdCol] == sampId]
        nSampRes = len(dfSamp)
        sCrit = dfSamp[critCol]
        print('#{}: {} results'.format(sampId, nSampRes), end=' => ')

        # Threshold scheme: from the most selective end of the actual value domain to the least one
        # (N.B. names start and stop are part of the trace printed just below)
        if ascendCrit:
            start, stop = sCrit.max(), sCrit.min()
        else:
            start, stop = sCrit.min(), sCrit.max()
        print(f'{start=:.3f} {stop=:.3f}', end=': ')

        # Try the inner thresholds of the scheme in turn, and stop as soon as enough results are kept
        # (otherwise, the outcome of the last threshold tried is retained).
        innerThresholds = np.linspace(start, stop, num=nMaxSteps)[1:-1]
        for thresh in innerThresholds:

            sbRejected = sCrit < thresh if ascendCrit else sCrit > thresh
            i2DropSamp = dfSamp[sbRejected].index

            nKept = nSampRes - len(i2DropSamp)
            print('t={:.3f}/k={}'.format(thresh, nKept), end=', ')
            if nKept >= nMinRes:
                break

        print('done')

        # Accumulate the index to drop for this sample into the overall one
        if len(i2Drop):
            i2Drop = i2Drop.append(i2DropSamp)
        else:
            i2Drop = i2DropSamp

    return i2Drop

### 2. Close distance truncations grouping from report

In [None]:
# Work on a copy of the report rows of sample 0 only
dfSelRep = dfRep[dfRep.Echant == 0].copy()

# Small margin added to each interval upper bound
epsilDist = 1e-6

# Truncation distance column to process, minimum gap separating 2 intervals,
# and maximum interval length (used in the next cell)
#truncCol, minIntrvDist, maxIntrvLen = 'Dist Tronc Drte', 25.0, 25.0
truncCol, minIntrvDist, maxIntrvLen = 'Dist Tronc Gche', 5.0, 5.0

# Non-null truncation distances, sorted in ascending order
dfIntrv = dfSelRep[[truncCol]].dropna().sort_values(by=truncCol)

# Gap between each distance and the previous one ; the smallest distance has no previous one:
# give it an infinite gap, so that it always starts an interval
dfIntrv['deltaDist'] = dfIntrv[truncCol].diff()
dfIntrv.loc[dfIntrv[truncCol].idxmin(), 'deltaDist'] = np.inf

# Keep strictly positive gaps only (repeated distances are dropped)
dfIntrv.dropna(inplace=True)
dfIntrv = dfIntrv[dfIntrv.deltaDist > 0]

# An interval starts (dMin) at each distance that is more than minIntrvDist away from the previous one,
# and ends (dSup, for now) where the next interval starts, or at infinity for the last one ;
# the rows that do not start an interval get NaNs and are then dropped
dfIntrv['dMin'] = dfIntrv.loc[dfIntrv.deltaDist > minIntrvDist, truncCol]
dfIntrv['dSup'] = dfIntrv.loc[dfIntrv.deltaDist > minIntrvDist, truncCol].shift(-1).dropna()
dfIntrv.loc[dfIntrv['dMin'].idxmax(), 'dSup'] = np.inf
dfIntrv.dropna(inplace=True)

# Shrink each upper bound to just above (+ epsilDist) the largest actual distance under it
sSelDist = dfSelRep[truncCol]
dfIntrv['dSup'] = dfIntrv['dSup'].apply(lambda supV: sSelDist[sSelDist < supV].max() + epsilDist)

# Final intervals: 1 row each, numbered from 0
dfIntrv = dfIntrv[['dMin', 'dSup']].reset_index(drop=True)
dfIntrv

In [None]:
# Split the intervals longer than maxIntrvLen into equal-length sub-intervals
lsNewIntrvs = list()
for _, sIntrv in dfIntrv.iterrows():
    
    if sIntrv.dSup - sIntrv.dMin > maxIntrvLen:
        # Number of sub-intervals: length / maxIntrvLen, rounded to the nearest integer (half rounds up)
        nSubIntrvs = (sIntrv.dSup - sIntrv.dMin) / maxIntrvLen
        nSubIntrvs = int(nSubIntrvs) if nSubIntrvs - int(nSubIntrvs) < 0.5 else int(nSubIntrvs) + 1
        subIntrvLen = (sIntrv.dSup - sIntrv.dMin) / nSubIntrvs
        # Consecutive sub-intervals ; min(...) keeps the last upper bound from exceeding the original one
        lsNewIntrvs += [pd.Series(dict(dMin=sIntrv.dMin + i * subIntrvLen, 
                                       dSup=min(sIntrv.dMin + (i + 1) * subIntrvLen, sIntrv.dSup)))
                        for i in range(nSubIntrvs)]
    else:
        # Short enough: kept as is
        lsNewIntrvs.append(sIntrv)
        
# New intervals table, renumbered from 0
dfIntrv = pd.DataFrame(lsNewIntrvs).reset_index(drop=True)
dfIntrv.sort_values(by='dMin', inplace=True)
dfIntrv

In [None]:
# Truncation group number of each row: 0 when there is no truncation distance, otherwise
# 1 + the index label of the first interval containing it (dMin <= d < dSup)
# N.B. .index[0] raises IndexError for a distance that falls inside no interval
dfSelRep['Grp ' + truncCol] = \
    dfSelRep[truncCol].apply(lambda d: 0 if pd.isnull(d) else 1 + dfIntrv[(dfIntrv.dMin <= d) & (dfIntrv.dSup > d)].index[0])
dfSelRep

In [None]:
dfSelRep['Grp ' + truncCol].unique()

In [None]:
# Show the rows group by group, then by truncation distance inside each group
# (DataFrame.sort_values can't be called without its `by` argument).
dfSelRep.sort_values(by=['Grp ' + truncCol, truncCol])

### 3. Close distance truncations grouping from results

In [None]:
# results: comes from "Visionature-ds-point.ipynb/XVI. Analyses automatiques"
# Keep a copy aside, as the cells below add columns to results._dfData
resultsCopy = results.copy()
# Alias (presumably so that the code below can later be moved as is into a method of the results class)
self = results
self._dfData.head()

In [None]:
self._dfData.columns.to_list()

In [None]:
self.miSampleCols

In [None]:
epsDist = 1e-6

ldTruncIntrvSpecs = [dict(col=self.CLParTruncLeft, minDist=5.0, maxLen=5.0),
                     dict(col=self.CLParTruncRight, minDist=25.0, maxLen=25.0)]

In [None]:
self.miSampleCols.append(pd.MultiIndex.from_tuples([self.sampleIndCol]))

In [None]:
# The samples: distinct combinations of the sample index column and the sample columns,
# then indexed by the sample index column
dfSamples = self._dfData[pd.MultiIndex.from_tuples([self.sampleIndCol]).append(self.miSampleCols)].drop_duplicates()
dfSamples.set_index(self.sampleIndCol, inplace=True)
# Check that each sample index designates only 1 sample
assert len(dfSamples) == dfSamples.index.nunique()

dfSamples

In [None]:
self.CLCAFilSor = 'auto filter sort'
self.CLTTruncGroup = 'Group'

In [None]:
# For each sample,
for lblSamp, sSamp in dfSamples.iterrows():
    
    print('#{} {} :'.format(lblSamp, ','.join([f'{k[1]}:{v}' for k, v in sSamp.items()])))

    # Select the associated results, and only these
    dfSampRes = self._dfData[self._dfData[self.sampleIndCol] == lblSamp]

    # For each kind of truncation (optimised or not),
    for isOpt in sorted(dfSampRes[self.optimTruncFlagMCol].unique()):
        
        print('* {}optimised'.format('' if isOpt else 'non ').title(), end=' : ')

        # Select the associated results, and only these
        dfSampResPerOpt = dfSampRes[dfSampRes[self.optimTruncFlagMCol] == isOpt]

        # For each truncation distance column to group (left, then right),
        for dTrunc in ldTruncIntrvSpecs:

            truncCol = dTrunc['col']
            minIntrvDist = dTrunc['minDist']
            maxIntrvLen = dTrunc['maxLen']

            print(truncCol[1], end=', ')

            # Non-null distances, sorted in ascending order
            sSelDist = dfSampResPerOpt[truncCol]
            dfIntrv = pd.DataFrame(dict(dist=sSelDist.dropna().sort_values().values))

            # Non-zero gaps between consecutive sorted distances
            # (the smallest distance gets an infinite gap, so that it always starts an interval)
            dfIntrv['deltaDist'] = dfIntrv.dist.diff()
            dfIntrv.loc[dfIntrv.dist.idxmin(), 'deltaDist'] = np.inf
            dfIntrv.dropna(inplace=True)
            dfIntrv = dfIntrv[dfIntrv.deltaDist > 0].copy()

            # Start and end of each interval (closed on the left = dMin, open on the right = dSup)
            dfIntrv['dMin'] = dfIntrv.loc[dfIntrv.deltaDist > minIntrvDist, 'dist']
            dfIntrv['dSup'] = dfIntrv.loc[dfIntrv.deltaDist > minIntrvDist, 'dist'].shift(-1).dropna()
            dfIntrv.loc[dfIntrv['dMin'].idxmax(), 'dSup'] = np.inf
            dfIntrv.dropna(inplace=True)

            # Shrink each end to just above (+ epsDist) the largest actual distance under it
            dfIntrv['dSup'] = dfIntrv['dSup'].apply(lambda supV: sSelDist[sSelDist < supV].max() + epsDist)

            dfIntrv = dfIntrv[['dMin', 'dSup']].reset_index(drop=True)

            # If the intervals detected this way are too wide, split them into equal slices
            lsNewIntrvs = list()
            for _, sIntrv in dfIntrv.iterrows():

                if sIntrv.dSup - sIntrv.dMin > maxIntrvLen:
                    # Number of slices: length / maxIntrvLen, rounded to the nearest integer (half rounds up)
                    nSubIntrvs = (sIntrv.dSup - sIntrv.dMin) / maxIntrvLen
                    nSubIntrvs = int(nSubIntrvs) if nSubIntrvs - int(nSubIntrvs) < 0.5 else int(nSubIntrvs) + 1
                    subIntrvLen = (sIntrv.dSup - sIntrv.dMin) / nSubIntrvs
                    lsNewIntrvs += [pd.Series(dict(dMin=sIntrv.dMin + nInd * subIntrvLen, 
                                                   dSup=min(sIntrv.dMin + (nInd + 1) * subIntrvLen, sIntrv.dSup)))
                                    for nInd in range(nSubIntrvs)]
                else:
                    lsNewIntrvs.append(sIntrv)

            dfIntrv = pd.DataFrame(lsNewIntrvs).reset_index(drop=True)
            dfIntrv.sort_values(by='dMin', inplace=True)

            # Assign its truncation group number to each measured distance (0 = no truncation),
            # in a new column of the results table, for the rows of this sample and truncation kind only
            sb = (self._dfData[self.sampleIndCol] == lblSamp) & (self._dfData[self.optimTruncFlagMCol] == isOpt)
            self._dfData.loc[sb, (self.CLCAFilSor, truncCol[1], self.CLTTruncGroup)] = \
                self._dfData.loc[sb, truncCol].apply(lambda d: 0 if pd.isnull(d) \
                                                       else 1 + dfIntrv[(dfIntrv.dMin <= d) & (dfIntrv.dSup > d)].index[0])

        # Number of results for this sample and truncation kind
        print(len(dfSampResPerOpt))   

In [None]:
self._dfData

### 4. Filtering and sorting orders

In [None]:
#class RS(ads.MCDSTruncOptanalysisResultsSet):
#    
#    CLNObs = 'NObs'
#    CLNTotObs = 'NTot Obs'
#    CLNTotPars = 'NbTot Pars'
#    CLChi2  = 'Chi2 P'
#    CLDCv   = 'CoefVar Densité'
#    CLKS    = 'KS P'
#    CLCvMUw = 'CvM Uw P'
#    CLCvMCw = 'CvM Cw P'
#    
#    def __init__(self):
#        pass
#
#rs = RS()

In [None]:
results.AutoFilSorKeySchemes[2]

In [None]:
DRes2RefRepCols = {v:k for k, v in DRefRep2ResCols.items()}
DRes2RefRepCols

In [None]:
# Results
lblSamp = 0
dfSampRes = dfRes[dfRes.Echant == lblSamp].copy()

scheme = results.AutoFilSorKeySchemes[6]
print('group' in scheme, scheme)

# Sort results
dfSampRes.sort_values(by=results.transColumns(scheme['sort'], 'fr'), ascending=scheme['ascend'], 
                      na_position=scheme.get('napos', 'last'), inplace=True)
dfSampRes.set_index('Analyse', inplace=True)

# Compute order (specific to groups or global).
if 'group' in scheme:
    sSampOrder = dfSampRes.groupby(results.transColumns(scheme['group'], 'fr'), dropna=False).cumcount()
else:
    sSampOrder = pd.Series(data=range(len(dfSampRes)), index=dfSampRes.index)

sSampOrder

In [None]:
# Old report, new method
lblSamp = 0
dfSampRep = dfRefRep[dfRefRep.Echant == lblSamp].rename(columns=DRes2RefRepCols).copy()

optimTruncCol = 'OptimTrunc'
#scheme = dict(name='Meil CKCv Tronc Proch',  # Meilleur Chi2&KS&DCV par groupe de troncatures proches
#              sort=[optimTruncCol, 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte',
#                    'Chi2 P', 'KS P', 'CoefVar Densité', 'NObs', 'CodEx'],
#              ascend=[True, True, True, False, False, True, False, True],
#              group=[optimTruncCol, 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte'])
scheme= dict(name='Meil Qual Chi2 Tronc Proch',  # Meilleur Qualité combinée Chi2+ par groupe de troncatures proches
             sort=[optimTruncCol, 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte',
                   'Qual Chi2'],
             ascend=[True, True, True, False],
             group=[optimTruncCol, 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte'])

print('group' in scheme, scheme)

# Sort results
dfSampRep.sort_values(by=scheme['sort'], ascending=scheme['ascend'], 
                      na_position=scheme.get('napos', 'last'), inplace=True)
dfSampRep.set_index('Analyse', inplace=True)

# Compute order (specific to groups or global).
if 'group' in scheme:
    sRepSampOrder = dfSampRep.groupby(scheme['group'], dropna=False).cumcount()
else:
    sRepSampOrder = pd.Series(data=range(len(dfSampRep)), index=dfSampRep.index)

sRepSampOrder

In [None]:
dfComp = sRepSampOrder.to_frame(name='rep').join(sSampOrder.to_frame(name='res')).sort_index()
dfComp[dfComp.res != dfComp.rep]

In [None]:
# Cast the sorting columns to float (suspected pb with least significant bits ?)
# NOTE(review): `float` means 64-bit floats here, so this does not lower the resolution ;
# the reduced, 32-bit comparison is the np.float32 one further below.
dfSampRep[['OptimTrunc', 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte',
           'Chi2 P', 'KS P', 'CoefVar Densité', 'NObs', 'CodEx']] = \
    dfSampRep[['OptimTrunc', 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte',
               'Chi2 P', 'KS P', 'CoefVar Densité', 'NObs', 'CodEx']].astype(float)

# Sort results
dfSampRep = dfSampRep.sort_values(by=scheme['sort'], ascending=scheme['ascend'], 
                                  na_position=scheme.get('napos', 'last'))

# Compute order (specific to groups or global).
# N.B. Without 'group' in the scheme, this is a plain range, not a Series
sRepSampOrder = dfSampRep.groupby(scheme['group'], dropna=False).cumcount() \
                 if 'group' in scheme else range(len(dfSampRep))

sRepSampOrder

In [None]:
dfSampRes[['OptimTrunc', 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte',
           'Chi2 P', 'KS P', 'CoefVar Densité', 'NObs', 'CodEx']] = \
    dfSampRes[['OptimTrunc', 'Groupe Tronc Gche', 'Groupe Tronc Drte',
               'Chi2 P', 'KS P', 'CoefVar Densité', 'NObs', 'CodEx']].astype(float)

# Sort results
dfSampRes = dfSampRes.sort_values(by=results.transColumns(scheme['sort'], 'fr'), ascending=scheme['ascend'], 
                                  na_position=scheme.get('napos', 'last'))

# Compute order (specific to groups or global).
sSampOrder = dfSampRes.groupby(results.transColumns(scheme['group'], 'fr'), dropna=False).cumcount() \
             if 'group' in scheme else range(len(dfSampRes))

sSampOrder

In [None]:
dfComp = sRepSampOrder.to_frame(name='rep').join(sSampOrder.to_frame(name='res')).sort_index()
dfComp[dfComp.res != dfComp.rep]

In [None]:
dfSampRep[['OptimTrunc', 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte',
           'Chi2 P', 'KS P', 'CoefVar Densité', 'NObs', 'CodEx']].to_excel('tmp/_.xlsx')

In [None]:
dfSampRes[['OptimTrunc', 'Groupe Tronc Gche', 'Groupe Tronc Drte',
           'Chi2 P', 'KS P', 'CoefVar Densité', 'NObs', 'CodEx']].to_excel('tmp/__.xlsx')

In [None]:
dfSampRes[['OptimTrunc', 'Groupe Tronc Gche', 'Groupe Tronc Drte',
           'Chi2 P', 'KS P', 'CoefVar Densité', 'NObs', 'CodEx']].rename(columns=DRes2RefRepCols)

In [None]:
dfSampRep[['OptimTrunc', 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte',
           'Chi2 P', 'KS P', 'CoefVar Densité', 'NObs', 'CodEx']]

In [None]:
dfComp2 = dfSampRes[['OptimTrunc', 'Groupe Tronc Gche', 'Groupe Tronc Drte',
                     'Chi2 P', 'KS P', 'CoefVar Densité', 'NObs', 'CodEx']] \
             .rename(columns=DRes2RefRepCols).sort_index() \
             .compare(dfSampRep[['OptimTrunc', 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte',
                                 'Chi2 P', 'KS P', 'CoefVar Densité', 'NObs', 'CodEx']].sort_index())
dfComp2

In [None]:
dfComp2[('Chi2 P', 'self')] - dfComp2[('Chi2 P', 'other')]

In [None]:
dfComp3 = dfSampRes[['OptimTrunc', 'Groupe Tronc Gche', 'Groupe Tronc Drte',
                     'Chi2 P', 'KS P', 'CoefVar Densité', 'NObs', 'CodEx']].astype(np.float32) \
  .rename(columns=DRes2RefRepCols).sort_index() \
  .compare(dfSampRep[['OptimTrunc', 'Grp Dist Tronc Gche', 'Grp Dist Tronc Drte',
                      'Chi2 P', 'KS P', 'CoefVar Densité', 'NObs', 'CodEx']].astype(np.float32).sort_index())
dfComp3

In [None]:
dfComp4 = dfSampRes[ordDiffCols].sort_index().compare(dfSampRep.rename(columns=DRefRep2ResCols)[ordDiffCols].sort_index())
dfComp4

## Analysis elapsed time statistics

In [None]:
[col for col in results.columns if col[0] == 'run output']

In [None]:
results.columns.to_list()

In [None]:
subsetCols = [('parameters', 'estimator key function', 'Value'),
              ('parameters', 'estimator adjustment series', 'Value'),   
              ('run output', 'elapsed time', 'Value'),
              ('run output', 'run status', 'Value'),
              ('encounter rate', 'number of observations (n)', 'Value'),
              ('detection probability', 'number of key function parameters (NKP)', 'Value'),
              ('detection probability', 'number of adjustment term parameters (NAP)', 'Value'),
              ('run output', 'run folder', 'Value')]

In [None]:
df = results.dfTransData('fr', columns=subsetCols)
#df[['NObs', 'NbPars FnClé', 'NbPars SérAjust']] = df[['NObs', 'NbPars FnClé', 'NbPars SérAjust']].astype(int)
df.head()

In [None]:
df['NObs'].describe(), df['DuréeExec'].describe()

In [None]:
np.histogram(df['DuréeExec'], bins=80, range=(0, 400))

In [None]:
df.loc[df['DuréeExec'] > 60].DossierExec.to_list()

In [None]:
df.loc[df['DuréeExec'] < 60, ['NObs', 'DuréeExec']].plot.scatter(y='NObs', x='DuréeExec')

# II. Python recipes

## Function parameters discovery

In [None]:
import inspect

In [None]:
class Base(object):
    """Demo: discover the names and values of a function's own parameters through `inspect`."""
    def __init__(self, a, b, xa=1, xb=2):
        # Frame object of the call being executed (this __init__)
        frame = inspect.currentframe()
        # Names of the arguments, and mapping of the frame local names to their values
        # (the 2 middle items returned by getargvalues are not used here)
        args, _, _, values = inspect.getargvalues(frame)
        print(args, values)
        # Item 2 of the frame info is what gets printed as the function name
        print('function name "{}"'.format(inspect.getframeinfo(frame)[2]))
        # 1 "name = value" line per argument, then the same information as a list of pairs
        for i in args:
            print('    {} = {}'.format(i, values[i]))
        print([(i, values[i]) for i in args])

In [None]:
b = Base(4, 5)

## Named tuple from dictionary

In [None]:
from collections import namedtuple as ntuple

In [None]:
d = dict(a=1, b=[3, 2], c='xxx')

In [None]:
NT = ntuple('NT', d.keys())

In [None]:
nt = NT(**d)

In [None]:
nt

## Appending series to series ... index order

In [None]:
s = pd.Series(index=pd.MultiIndex.from_tuples([('B', 'b'), ('B', 'a'), ('A', 'c')]), data=[1, 2, 3], name=0)
s

In [None]:
s.append(pd.Series(index=[('A', 'b'), ('A', 'a'), ('B', 'c')], data=[1, 2, 3], name=0))

## Appending series to DataFrame ... columns order

### a. Append

In [None]:
df = pd.DataFrame()

In [None]:
s = pd.Series(index=pd.MultiIndex.from_tuples([('B', 'b'), ('B', 'a'), ('A', 'c')]), data=[1, 2, 3], name=0)
#df = df.append(s, ignore_index=False) # => df.columns pas MultiIndex !
df = df.append([s], ignore_index=False)
df

In [None]:
s = pd.Series(index=[('A', 'c'), ('B', 'b'), ('B', 'a')], data=[4, 5, 6], name=1)  # Mêmes colonnes : append ne retrie pas
#s = pd.Series(index=[('A', 'a'), ('A', 'b'), ('B', 'c')], data=[4, 5, 6], name=1)  # Nouvelle colonne : append retrie
df = df.append([s], ignore_index=True)
df

In [None]:
s = pd.Series(index=[('A', 'a'), ('B', 'c')], data=[7, 8])
df = df.append(s, ignore_index=True)
df

In [None]:
s = pd.Series(index=[], data=[])
df = df.append([s], ignore_index=True)
df

In [None]:
s = pd.Series(index=[('C', 'd')], data=[9])
df = df.append([s], ignore_index=True)
df

In [None]:
s = pd.Series(index=[('d',)], data=[10])
df = df.append(s, ignore_index=True)
df

In [None]:
df

### b. Concat

In [None]:
df = pd.DataFrame()

In [None]:
s = pd.Series(index=pd.MultiIndex.from_tuples([('B', 'b'), ('B', 'a'), ('A', 'c')]), data=[1, 2, 3], name=0)
df = pd.concat([df, s], axis='columns')
df

In [None]:
s = pd.Series(index=[('B', 'b'), ('B', 'a'), ('A', 'c')], data=[4, 5, 6], name=1) # Mêmes colonnes : concat ne retrie pas
#s = pd.Series(index=[('A', 'a'), ('A', 'b'), ('B', 'c')], data=[4, 5, 6], name=1) # Nouvelle colonne : concat retrie
df = pd.concat([df, s], axis='columns')
df

### c. Restore desired columns

* desired order,
* desired list of columns : new ones, and / or ignored ones.

In [None]:
df

In [None]:
# Add new A/b, D/a and remove B/c and C/d
i = pd.MultiIndex.from_tuples([('A', 'c'), ('A', 'b'), ('A', 'a'), ('B', 'b'), ('B', 'a'), ('D', 'a')])
i

In [None]:
# Keep added columns (with no data inside)
df2 = df.reindex(i, axis='columns')
df2

In [None]:
# Remove added columns (with no data inside)
df2 .dropna(how='all', axis='columns')

## Appending partially-columned DataFrame to DataFrame

with generation of the missing columns by duplicating a Series used as a row template

In [None]:
df = pd.DataFrame([dict(a=1, b=2, c=3), dict(a=3, b=4, c=5), dict(a=4, b=5, c=6)])
df

In [None]:
s = pd.Series(dict(x=0, y=1), name=9)
s

In [None]:
pd.DataFrame([s]*len(df))

In [None]:
df = pd.concat([pd.DataFrame([s]*len(df)).reset_index(drop=True), df], axis='columns')
df

## Multi-indexing

In [None]:
df = pd.DataFrame(data=[(1, 2, 3), (4, 5, 6), (7, 5, 6)],
                  columns=pd.MultiIndex.from_tuples([('a', 'b'), ('a', 'c'), ('b', 'd')]))
df

In [None]:
df.loc[0, ('a', 'b')] = 9
df

In [None]:
df.columns

In [None]:
# OK: no need for passing a MultiIndex to []
df[[('a', 'c'), ('b', 'd')]] = 9
df

In [None]:
# Neither to duplicated ...
df[df.duplicated(subset=[('a', 'c'), ('b', 'd')])]

## Asymmetric index and columns indexing

In [None]:
df = pd.DataFrame([dict(a=3, b=3, c=2), dict(a=3, b=3, c=3), dict(a=2, b=3, c=3)])
df

In [None]:
df[df == 3]

In [None]:
# Boolean mask of the cells equal to 3: vectorised comparison, with no per-cell python call
# (and no need for applymap, deprecated in recent pandas versions)
dfb = df == 3
dfb

### Label boolean indexing : easy !

In [None]:
dfb.all(axis='columns')

In [None]:
# It works ...
df[dfb.all(axis='columns')]

In [None]:
%%timeit

# ... and it's fast
df[dfb.all(axis='columns')]

In [None]:
# Dropping only works this more classical way...
df.drop(index=df[dfb.all(axis='columns')].index)

In [None]:
%%timeit

# ... and it's fast
df.drop(index=df[dfb.all(axis='columns')].index)

### Columns boolean indexing : asymmetric API (as of pandas 1.3) !

In [None]:
dfb.all(axis='index')

In [None]:
# It works this tortuous way, ...
df.T[dfb.all(axis='index')].T

In [None]:
%%timeit

# ... but it's 3 times slower than for label indexing !
df.T[dfb.all(axis='index')].T

In [None]:
# It works also this more classical way, ...
df[[col for col, b in dfb.all(axis='index').items() if b]]

In [None]:
%%timeit

# ... but it's also 3 times slower than for label indexing !
df[[col for col, b in dfb.all(axis='index').items() if b]]

In [None]:
df.T.loc[df.T[dfb.all(axis='index')].index].T

In [None]:
df.T[dfb.all(axis='index')]

In [None]:
# Dropping works this classical and tortuous way also ...
df.drop(columns=df.T[dfb.all(axis='index')].index)

In [None]:
%%timeit

# ... but it's 50% slower than the following even more classical way
df.drop(columns=df.T[dfb.all(axis='index')].index)

In [None]:
# Dropping works this even more classical way also ...
df.drop(columns=[col for col, b in dfb.all(axis='index').items() if b])

In [None]:
%%timeit

# ... and it's quite fast
df.drop(columns=[col for col, b in dfb.all(axis='index').items() if b])

## Check python derivation and class methods / attributes

In [None]:
class Base:
    """Base class of the demo: f calls g, that derived classes are free to override."""

    A = 'Base.A'
    B = 'Base.B'

    def f(self):
        """Trace the call, then return what the g of the actual class returns."""
        print('Base.f')
        result = self.g()
        return result

    def g(self):
        """Trace the call, then return the A attribute, as seen from the instance."""
        print('Base.g')
        return self.A

    def i(self):
        """Only trace the call."""
        print('Base.i')


class Derived(Base):
    """Derived class of the demo: overrides the A class attribute, and the g and i methods."""

    A = 'Derived.A'

    @classmethod
    def h(cls):
        """Trace the call with the A attribute of the class, then return this attribute."""
        clsA = cls.A
        print('Derived.h: A=', clsA)
        return clsA

    def g(self):
        """Trace the call, then return what the h class method returns."""
        print('Derived.g')
        return self.h()

    def i(self):
        """Trace the call, then chain to the base class implementation."""
        print('Derived.i')
        super().i()


d = Derived()

# Base.f ends up in Derived.g, then in Derived.h, which sees Derived.A
assert d.f() == 'Derived.A'

# Non-overridden class attributes are inherited
print('d.B=', d.B)

d.i()

In [None]:
# Another one: a class attribute overridden in a derived class,
# but read by the constructor of the base class
class A:
    """Base class: stores y multiplied by the X of the actual class of the instance."""

    X = 5

    def __init__(self, y):
        factor = self.X
        self.xy = y * factor


class B(A):
    """Derived class: only changes X, and chains to the base class constructor."""

    X = 10

    def __init__(self, y):
        super().__init__(y=y)


a = A(y=2)
print(a.X, a.xy)

b = B(y=3)
print(b.X, b.xy)

print(A.X, B.X)

## pd.DataFrame.round()

In [None]:
df = pd.DataFrame([dict(a=1.00, b=2.00, c=3.00),
                   dict(a=1.05, b=2.01, c=3.01), 
                   dict(a=1.01, b=1.94, c=3.02), 
                   dict(a=1.09, b=2.00, c=3.00)])
df

In [None]:
dfr = df.round(1)
dfr

In [None]:
df[~dfr.duplicated(keep='last')]

In [None]:
df.round(decimals=dict(a=1, b=2, c=0))

In [None]:
df.insert(df.columns.get_loc('c'), 'x', np.nan)
df

In [None]:
df.a.where(df.a < 1.02)

## fillna(inplace=True) on column subset

Why doesn't it exist ? (pandas <= 1.2.5)

In [None]:
df = pd.DataFrame([dict(a=1.00,   b=2.00,   c=3.00),
                   dict(a=1.05,   b=np.nan, c=3.01), 
                   dict(a=np.nan, b=1.94,   c=np.nan), 
                   dict(a=1.09,   b=np.nan, c=3.00)])
df

In [None]:
pd.__version__

In [None]:
# 1. Does not work, and raises a SettingWithCopyWarning warning with pandas <= 1.2.5 at least
df[['a', 'b']].fillna(-1, inplace=True)
df

In [None]:
# 2. This one works
df[['a', 'b']] = df[['a', 'b']].fillna(-1)
df

In [None]:
# 3. This one also works ... strange, isn't it, after 1. ?
df['c'].fillna(-1, inplace=True)
df

## Number of python code lines in PyAuDiSam project

In [None]:
# Warning: Only python and markdown are supported for the moment
def classifySourceLine(line, lang='python'):
    """Classify 1 source line as empty, comment or code.

    Parameters:
    :param line: the text of the line (leading and trailing white spaces are ignored)
    :param lang: when 'markdown', any non-empty line is a comment ;
        otherwise, only the lines starting with '#' are (so python docstrings count as code)

    :return: tuple of 3 flags (empty, comment, code), exactly one of which is 1, the others being 0
    """
    stripped = line.strip()
    if not stripped:
        return 1, 0, 0
    if stripped.startswith('#') or lang == 'markdown':
        return 0, 1, 0
    return 0, 0, 1

# Auto-tests
assert classifySourceLine(' ') == (1, 0, 0)
assert classifySourceLine(' #') == (0, 1, 0)
assert classifySourceLine('x = 1') == (0, 0, 1)
assert classifySourceLine('x = 1', lang='markdown') == (0, 1, 0)

In [None]:
import json

def _iterLangLines(file, isNotebook, lang):
    """Yield a (language, line) pair for each source line of an open text file."""
    if not isNotebook:  # Normal source file
        for line in file:
            yield lang, line
        return
    for dCell in json.load(file)['cells']:  # Notebook file
        source = dCell['source']
        # The notebook format allows a cell source to be 1 multi-line string, or a list of lines.
        if isinstance(source, str):
            source = source.splitlines()
        for line in source:
            yield dCell['cell_type'], line


def countSourceLinesInFiles(files, lang='python', encoding='utf8'):
    """Count the empty, comment and code lines of each given source file.

    The path of each file is printed when processing it.

    Parameters:
    :param files: iterable of pathlib.Path-like objects (as_posix(), suffix and name are used)
    :param lang: language of the normal source files, as expected by classifySourceLine
        (not used for .ipynb notebooks: the type of each cell is used instead)
    :param encoding: text encoding used to read the files

    :return: a pd.DataFrame with 1 row per file, and columns file, nEmpties, nComments and nCode
        (empty, with no column at all, if there's no file)
    """
    ldCounts = list()
    for fpn in files:
        print(fpn.as_posix())
        # Plain integer counters: much cheaper than 1 pd.Series addition per line.
        totals = [0, 0, 0]
        with open(fpn, encoding=encoding) as file:
            for lineLang, line in _iterLangLines(file, fpn.suffix == '.ipynb', lang):
                for ind, flag in enumerate(classifySourceLine(line, lang=lineLang)):
                    totals[ind] += flag
        # 1 dict per row: no need for Series.append (removed in pandas 2.0).
        ldCounts.append(dict(file=fpn.name, nEmpties=totals[0], nComments=totals[1], nCode=totals[2]))
    return pd.DataFrame(ldCounts)

ExcludeFolderKeys = ['.git', 'venv/', '.ipynb_checkpoints', '__pycache__', 'build/', 'dist/']
def countSourceLinesInFolder(folder, glob='*.py', recurse=False, lang='python'):
    """Count the empty, comment and code lines of the source files of a folder.

    Parameters:
    :param folder: path of the folder (str or pathlib.Path)
    :param glob: glob pattern selecting the files to process inside each folder
    :param recurse: if True, also process the sub-folders, recursively
    :param lang: language of the source files, passed to countSourceLinesInFiles

    :return: None if the folder is excluded (see ExcludeFolderKeys) or if no file was found,
        otherwise a pd.DataFrame with 1 row per file, and columns folder, file, nEmpties, nComments, nCode
    """
    folder = pl.Path(folder)

    # as_posix() never ends with a '/': add one before searching for the exclusion keys,
    # otherwise the keys ending with a '/' would only exclude the sub-folders
    # of the target folders, and not these folders themselves.
    folderKey = folder.as_posix() + '/'
    if any(exclKey in folderKey for exclKey in ExcludeFolderKeys):
        return None

    # Files of the folder itself.
    ldfCounts = list()
    dfCounts = countSourceLinesInFiles(folder.glob(glob), lang=lang)
    if not dfCounts.empty:
        dfCounts.insert(0, 'folder', folder.as_posix())
        ldfCounts.append(dfCounts)

    # Files of the sub-folders, if requested.
    if recurse:
        for fileOrSubFolder in folder.iterdir():
            if fileOrSubFolder.is_dir():
                dfCounts = countSourceLinesInFolder(fileOrSubFolder, glob=glob, recurse=recurse, lang=lang)
                if not (dfCounts is None or dfCounts.empty):
                    ldfCounts.append(dfCounts)

    return pd.concat(ldfCounts) if ldfCounts else None

def countSourceLines(folder, glob='*.py', recurse=True, lang='python'):
    """Count the empty, comment and code lines of the source files of a folder, with a final total.

    Parameters:
    :param folder: path of the folder (str or pathlib.Path)
    :param glob: glob pattern selecting the files to process inside each folder
    :param recurse: if True, also process the sub-folders, recursively
    :param lang: language of the source files, passed to countSourceLinesInFolder

    :return: None if countSourceLinesInFolder returned None, otherwise its pd.DataFrame,
        re-indexed from 0, with 1 more row at the end for the totals
        (file = 'all', and folder = 'all' if recurse)
    """
    dfCounts = countSourceLinesInFolder(folder, glob=glob, recurse=recurse, lang=lang)
    if dfCounts is None:
        return None

    # The row of totals (its folder column is left undefined when not recursing).
    dTotal = dict()
    if recurse:
        dTotal.update(folder='all')
    dTotal.update(file='all')
    dTotal.update(dfCounts[['nEmpties', 'nComments', 'nCode']].sum().to_dict())

    # pd.concat rather than DataFrame.append (removed in pandas 2.0).
    return pd.concat([dfCounts, pd.DataFrame([dTotal])], ignore_index=True)

In [None]:
# In py sources (core and tests)
countSourceLines('..')

In [None]:
countSourceLines('..', glob='*.ipynb')