In [None]:
# Show plots in Jupyter notebooks
%matplotlib inline

# Reload modules whenever they change
# (for development purposes)
%load_ext autoreload
%autoreload 2

# Make clusterking package available even without installation
import sys
sys.path = ["../../"] + sys.path

In [None]:
import clusterking as ck
from clusterking.stability.subsamplestability import SubSampleStabilityTester
from clusterking.stability.fom import *
from clusterking.stability.preprocessor import *
from clusterking.cluster import HierarchyCluster
from clusterking.benchmark import Benchmark
import numpy as np
import pandas as pd
import tqdm

In [None]:
# Create the Data object from output/tutorial_basics.sql
# (presumably written by the basics tutorial -- make sure the file exists
# before running this notebook)
d = ck.Data("output/tutorial_basics.sql")

## SubSampleStabilityTester

In this stability test we test the stability of the clustering and benchmarking by taking a set of sample points and reclustering/rebenchmarking subsets of it and then comparing the results.

In [None]:
# Set up the sub sample stability test:
# sampling fraction 0.8, repeated 100 times
ssst = SubSampleStabilityTester()
ssst.set_sampling(frac=0.8)
ssst.set_repeat(100)

# Build the figures of merit (FOMs) one at a time and register each
# of them with the tester right after it has been created
matching_clusters_fom = MatchingClusters(
    preprocessor=TrivialClusterMatcher(), name="MatchingClusters"
)
ssst.add_fom(matching_clusters_fom)

bm_proximity_fom = AverageBMProximityFOM(
    preprocessor=TrivialClusterMatcher(), name="AverageBMProximity"
)
ssst.add_fom(bm_proximity_fom)

delta_n_clusters_fom = DeltaNClusters(name="DeltaNClusters")
ssst.add_fom(delta_n_clusters_fom)

In [None]:
# Configure clustering and benchmarking
# c: hierarchical clustering worker with max_d = 0.2
#    (presumably the distance cut-off used to form clusters -- TODO confirm)
c = HierarchyCluster()
c.set_max_d(0.2)
# b: benchmark worker using the "euclidean" metric
b = Benchmark()
b.set_metric("euclidean")

In [None]:
# Run the SSST using these clustering and benchmarking workers:
# d is the data object, c the cluster worker and b the benchmark worker.
# The result is stored in r and inspected in the following cells.
r = ssst.run(d, c, benchmark=b)

In [None]:
# The SSST result object carries the FOMs in a dataframe (r.df);
# draw one density histogram per FOM in a single row of three panels
fom_columns = ["MatchingClusters", "DeltaNClusters", "AverageBMProximity"]
r.df.hist(fom_columns, density=True, layout=(1, 3), figsize=(15, 5));

In [None]:
# Summary statistics of the result dataframe
r.df.describe()

## SubSampleStabilityVsFraction

Here, the idea is to repeat the ``SubSampleStabilityTester`` run from before for different values of the sampling fraction.
This has been implemented as the ``SubSampleStabilityVsFraction`` test.

In [None]:
from clusterking.stability.subsamplestability import SubSampleStabilityVsFraction

In [None]:
# Set up the SubSampleStabilityTester that will be run repeatedly
# later on (100 repetitions per run)
ssst = SubSampleStabilityTester()
ssst.set_repeat(repeat=100)

# Create each figure of merit and register it right away
matching_clusters_fom = MatchingClusters(
    preprocessor=TrivialClusterMatcher(), name="MatchingClusters"
)
ssst.add_fom(matching_clusters_fom)

delta_n_clusters_fom = DeltaNClusters(name="DeltaNClusters")
ssst.add_fom(delta_n_clusters_fom)

In [None]:
# Initialize the actual test; it is run in a later cell together with
# the SubSampleStabilityTester configured above
sssvf = SubSampleStabilityVsFraction()

In [None]:
# Fractions to test: 5 evenly spaced values from 0.7 to 0.999
fractions = list(np.linspace(0.7, 0.999, 5))

# Run the test with the data, the cluster worker and the SSST configured above
r = sssvf.run(d, c, ssst, fractions)

In [None]:
# Again the result object contains a dataframe from which we can plot.
# The trailing semicolon keeps the repr of the value returned by the
# plotting call out of the cell output, so that only the figure is shown
# (same convention as the histogram cell above).
r.df.plot.scatter("fraction", "MatchingClusters");

## Noisy sampling

In this stability test, we consider sets of sample points that are very close together: we take an equidistant grid of sample points and then generate several samples in which we add noise to these sample points. We then recluster, rebenchmark and compare the results.

In [None]:
from clusterking.stability.noisysamplestability import NoisySampleStabilityTester, NoisySample

In the first step we set up the scanner, data and cluster workers.

In [None]:
import flavio
import numpy as np

# Scanner over Wilson coefficients (scale 5, WET, flavio basis)
s = ck.scan.WilsonScanner(scale=5, eft="WET", basis="flavio")


def dBrdq2(w, q):
    """Return flavio's prediction for dBR/dq2(B+->Dtaunu).

    Both arguments are handed straight to ``flavio.np_prediction``
    (presumably ``w`` holds the Wilson coefficients and ``q`` the q2
    value -- TODO confirm).
    """
    return flavio.np_prediction("dBR/dq2(B+->Dtaunu)", w, q)


# Three evenly spaced values between 3.2 and 11.6, used as the binning
q2_binning = np.linspace(3.2, 11.6, 3)
s.set_dfunction(
    dBrdq2,
    binning=q2_binning,
    normalize=True,
    variable="q2",  # only sets name of variable
)

# Every coefficient gets the same (-1, 1, 3) specification
# (presumably (min, max, number of points) -- TODO confirm)
spoint_ranges = {
    coefficient: (-1, 1, 3)
    for coefficient in ("CVL_bctaunutau", "CSL_bctaunutau", "CT_bctaunutau")
}
s.set_spoints_equidist(spoint_ranges)

In [None]:
# Create a new Data object without arguments. Note that this rebinds d,
# which held the data loaded from file earlier in this notebook.
d = ck.Data()

In [None]:
# New hierarchical clustering worker, again with max_d = 0.2
# (rebinds c from the sub sample sections above)
c = HierarchyCluster()
c.set_max_d(0.2)

Now we generate samples, i.e. several runs where we scan with some noise applied to the parameter points. This is done by the NoisySample worker class.

In [None]:
# Noisy sampling worker: one repetition with "gauss" noise
# (mean 0, sigma 0.02)
ns = NoisySample()
ns.set_repeat(1)
ns.set_noise("gauss", mean=0., sigma=0.02)
# Run with the scanner s and the data object d set up above.
# NOTE(review): no random seed is set in the visible part of this notebook,
# so the noisy sample presumably differs between runs -- consider seeding.
nsr = ns.run(scanner=s, data=d)

The result of the ``NoisySample`` worker can be fed into the ``NoisySampleStabilityTester``, which does nothing more than recluster, rebenchmark and compute some figures of merit.

In [None]:
# Stability tester for the noisy samples
nsst = NoisySampleStabilityTester()

# Create each figure of merit and register it right away
matching_clusters_fom = MatchingClusters(
    preprocessor=TrivialClusterMatcher(), name="MatchingClusters"
)
nsst.add_fom(matching_clusters_fom)

delta_n_clusters_fom = DeltaNClusters(name="DeltaNClusters")
nsst.add_fom(delta_n_clusters_fom)

In [None]:
# Run the stability test on the noisy sample result nsr,
# using the cluster worker c
r = nsst.run(sample=nsr, cluster=c)

In [None]:
# Show the dataframe of the result
r.df