In [64]:
from negentropy_approx import NegentropyApprox1, NegentropyApprox2, NegentropyApprox3, NegentropyApprox4, NegentropyApprox5
from scipy import stats
import numpy as np
import pandas as pd
import torch
from typing import Callable

Для оценки точности приближений негэнтропии вычислим их значения для нескольких классических распределений. Сравнивать будем с негэнтропией, вычисленной по библиотечным значениям энтропии.

In [5]:
class NormalEntropy:
  """
  Entropy of a normal distribution, evaluated once at construction time.

  The value comes from `stats.norm.entropy(loc, scale)`; calling the instance
  returns that stored value without recomputing it.
  """

  def __init__(self, loc : float = 0, scale : float = 1):
    # computed eagerly so that every later call is a plain attribute read
    gaussian_entropy = stats.norm.entropy(loc, scale)
    self.entropy = gaussian_entropy

  def __call__(self) -> float:
    """Return the entropy stored by the constructor."""
    stored = self.entropy
    return stored

class NegentropyApproxScipy:
  """
  Negentropy estimate built on SciPy's sample entropy estimator.

  The result is the entropy returned by a default-constructed `NormalEntropy`
  minus `stats.differential_entropy` of the sample.
  """

  def __init__(self):
    # the reference entropy is evaluated once and reused for every sample
    reference = NormalEntropy()
    self.normal_entropy = reference()

  def __call__(self, x : np.ndarray) -> float:
    """Return the reference entropy minus the estimated entropy of sample `x`."""
    # NOTE(review): the reference does not depend on x, so x is presumably expected to be standardised -- confirm
    sample_entropy = stats.differential_entropy(x)
    return self.normal_entropy - sample_entropy

Зафиксируем набор тестируемых приближений негэнтропий

In [29]:
# the first approximation is instantiated for several values of its constant
negentropy1_1, negentropy1_15, negentropy1_2 = [NegentropyApprox1(constant) for constant in (1, 1.5, 2)]
# the remaining approximations take no constructor arguments
negentropy2, negentropy3, negentropy4, negentropy5 = (
  NegentropyApprox2(),
  NegentropyApprox3(),
  NegentropyApprox4(),
  NegentropyApprox5(),
)

# display name -> approximation object; insertion order defines the order in which they are evaluated
all_negentropy_approx = {
  "NegentropyApprox1(1)": negentropy1_1,
  "NegentropyApprox1(1.5)": negentropy1_15,
  "NegentropyApprox1(2)": negentropy1_2,
  "NegentropyApprox2": negentropy2,
  "NegentropyApprox3": negentropy3,
  "NegentropyApprox4": negentropy4,
  "NegentropyApprox5": negentropy5,
}

Будем использовать 3 классических распределения: нормальное, равномерное и Лапласа. Параметры loc и scale зададим таким образом, чтобы $E = 0$, $D = 1$.

In [30]:
# name -> [scipy.stats distribution, loc, scale]; loc and scale are chosen so that mean = 0 and variance = 1
distributions = {
  "normal": [stats.norm, 0, 1],
  # uniform on [loc, loc + scale] = [-sqrt(3), sqrt(3)], variance scale**2 / 12 = 1
  "uniform": [stats.uniform, -(3 ** 0.5), 2 * (3 ** 0.5)],
  # Laplace variance is 2 * scale**2, hence scale = 1 / sqrt(2)
  "laplace": [stats.laplace, 0, 1 / (2 ** 0.5)],
}

Также зафиксируем разные размеры выборок для тестирования

In [31]:
sample_sizes = [10, 100, 1000, 10000, 100000]

# columns of the resulting table
columns = [
  "Real negentropy",
  "Negentropy1(1)",
  "Negentropy1(1.5)",
  "Negentropy1(2)",
  "Negentropy2",
  "Negentropy3",
  "Negentropy4",
  "Negentropy5",
  "MeanNegentropy",
]
# one row label per (distribution, sample size) pair, grouped by distribution
rows = [f"{sample_size} {name}" for name in distributions for sample_size in sample_sizes]

Зададим функции для проведения эксперимента

In [57]:
def test_negentropy_for_distribution(scipy_stats_distr, loc : float, scale : float, sample_size : int, random_state = None):
  """
  Evaluate every registered negentropy approximation on one sample drawn from a given distribution.

  Parameters:
    scipy_stats_distr: scipy.stats distribution object providing `entropy` and `rvs` (e.g. stats.norm)
    loc, scale: location and scale parameters passed to the distribution
    sample_size: number of observations to draw
    random_state: optional seed or generator forwarded to `rvs` to make the sample reproducible;
      None (default) keeps SciPy's default random source

  Returns:
    list of the form [reference negentropy, one value per entry of `all_negentropy_approx`
    (in dict order), mean of those approximation values]
  """
  # reference value: entropy of the default NormalEntropy minus the distribution's own entropy
  reference_negentropy = NormalEntropy()() - scipy_stats_distr.entropy(loc = loc, scale = scale)

  # keyword arguments: positional (loc, scale, size) would be read as shape parameters
  # by distributions that have them
  drawn = scipy_stats_distr.rvs(loc = loc, scale = scale, size = sample_size, random_state = random_state)
  sample = torch.from_numpy(drawn)

  approx_values = [negentropy_approx(sample).item() for negentropy_approx in all_negentropy_approx.values()]
  return [reference_negentropy, *approx_values, np.mean(approx_values)]

def experiment():
  """
  Compute all negentropy approximations for every combination of distribution and sample size.

  Row and column labels are built here from `distributions`, `sample_sizes` and
  `all_negentropy_approx` rather than read from the module-level `rows` / `columns`:
  a later cell of the notebook rebinds those names to unrelated values, which made
  this function fail when it was re-run after that cell.

  Returns:
    (values_df, errors_df)
      values_df -- one row per "<sample size> <distribution>" holding the reference negentropy,
                   every approximation and the mean of the approximations
      errors_df -- the reference negentropy, the absolute error of every other column of values_df
                   ("<column> error"), and the names of the error columns with the smallest
                   ("best approx") and the largest ("worst approx") value in each row
  """
  # "NegentropyApprox1(1)" -> "Negentropy1(1)": same column names as before, derived from the registry
  approx_columns = [name.replace("NegentropyApprox", "Negentropy") for name in all_negentropy_approx]
  value_columns = ["Real negentropy", *approx_columns, "MeanNegentropy"]

  row_labels = []
  results = []
  for distr_name, (distr, loc, scale) in distributions.items():
    for sample_size in sample_sizes:
      row_labels.append(f"{sample_size} {distr_name}")
      results.append(test_negentropy_for_distribution(distr, loc, scale, sample_size))
  res = pd.DataFrame(results, index = row_labels, columns = value_columns)

  errors_df = res[["Real negentropy"]].copy()
  error_columns = []
  for column in value_columns[1:]:
    error_column = column + " error"
    errors_df[error_column] = (res["Real negentropy"] - res[column]).abs()
    error_columns.append(error_column)

  # both extremes are searched among the error columns only
  errors_df["best approx"] = errors_df[error_columns].idxmin(axis = 1)
  errors_df["worst approx"] = errors_df[error_columns].idxmax(axis = 1)
  return res, errors_df

In [60]:
# run the experiment once; keep both returned tables and display the approximation values
values, errors = experiment()
values

Unnamed: 0,Real negentropy,Negentropy1(1),Negentropy1(1.5),Negentropy1(2),Negentropy2,Negentropy3,Negentropy4,Negentropy5,MeanNegentropy
10 normal,0.0,0.03429213,0.04963064,0.05949772,0.01704256,0.01706,1.032874,0.576022,0.255203
100 normal,0.0,0.0007630709,0.001334193,0.001641019,0.000779382,0.011422,0.047235,0.038958,0.01459
1000 normal,0.0,1.67894e-05,1.559062e-05,2.617386e-05,8.949327e-07,0.000196,0.000969,0.001015,0.00032
10000 normal,0.0,1.090155e-07,5.121072e-07,2.935232e-07,1.484615e-06,0.000198,4.4e-05,2e-05,3.8e-05
100000 normal,0.0,1.734868e-05,1.63286e-05,2.833693e-05,3.435019e-07,1.1e-05,6.9e-05,4.2e-05,2.6e-05
10 uniform,0.176485,0.001093389,0.0007200277,0.0003833315,7.454236e-06,0.049696,0.220388,0.220281,0.070367
100 uniform,0.176485,0.00192237,0.004131197,0.006197887,0.003474815,0.042621,0.122855,0.118966,0.042881
1000 uniform,0.176485,0.0004954333,0.001295266,0.002129586,0.001530864,0.028168,0.044152,0.05286,0.018662
10000 uniform,0.176485,0.0004822657,0.001237601,0.002007131,0.00147995,0.027199,0.038006,0.051238,0.017379
100000 uniform,0.176485,0.0009006518,0.001998113,0.003033137,0.001968829,0.030662,0.055141,0.067732,0.023062


In [61]:
# second table returned by experiment(): absolute errors plus the best / worst column per row
errors

Unnamed: 0,Real negentropy,Negentropy1(1) error,Negentropy1(1.5) error,Negentropy1(2) error,Negentropy2 error,Negentropy3 error,Negentropy4 error,Negentropy5 error,MeanNegentropy error,best approx,worst approx
10 normal,0.0,0.03429213,0.04963064,0.05949772,0.01704256,0.01706,1.032874,0.576022,0.255203,Negentropy2 error,Negentropy4 error
100 normal,0.0,0.0007630709,0.001334193,0.001641019,0.000779382,0.011422,0.047235,0.038958,0.01459,Negentropy1(1) error,Negentropy4 error
1000 normal,0.0,1.67894e-05,1.559062e-05,2.617386e-05,8.949327e-07,0.000196,0.000969,0.001015,0.00032,Negentropy2 error,Negentropy5 error
10000 normal,0.0,1.090155e-07,5.121072e-07,2.935232e-07,1.484615e-06,0.000198,4.4e-05,2e-05,3.8e-05,Negentropy1(1) error,Negentropy3 error
100000 normal,0.0,1.734868e-05,1.63286e-05,2.833693e-05,3.435019e-07,1.1e-05,6.9e-05,4.2e-05,2.6e-05,Negentropy2 error,Negentropy4 error
10 uniform,0.176485,0.1753918,0.1757652,0.1761019,0.1764778,0.12679,0.043903,0.043795,0.106118,Negentropy5 error,Negentropy2 error
100 uniform,0.176485,0.1745628,0.172354,0.1702873,0.1730104,0.133864,0.05363,0.05752,0.133604,Negentropy4 error,Negentropy1(1) error
1000 uniform,0.176485,0.1759898,0.1751899,0.1743556,0.1749543,0.148317,0.132333,0.123625,0.157824,Negentropy5 error,Negentropy1(1) error
10000 uniform,0.176485,0.1760029,0.1752476,0.1744781,0.1750053,0.149286,0.138479,0.125247,0.159107,Negentropy5 error,Negentropy1(1) error
100000 uniform,0.176485,0.1755846,0.1744871,0.1734521,0.1745164,0.145823,0.121344,0.108753,0.153423,Negentropy5 error,Negentropy1(1) error


------
Попробуем запустить еще серию экспериментов: проверим какой максимальный/минимальный размер ошибки можем получить этими приближениями, какой средний размер ошибки и ее разброс

Для этого для каждого приближения n раз запустим вычисления на зафиксированных распределениях при разных размерах выборки, а потом посчитаем метрики

In [79]:
# NOTE: the names below were already bound by the first experiment; from here on they hold the second series' values

# sample sizes used in the repeated-sampling series
sample_sizes = [10, 100, 500, 1000, 10000, 100000]

# statistics reported for every (approximation, sample size) row
columns = ["Sample size", "Max error", "Min error", "Mean", "Std"]

# accumulators for the row labels and the row values of the summary table
rows, result = [], []

In [80]:
def calculate_metrics_for_approx(scipy_stats_distr, loc : float, scale : float, approx_f : Callable, sample_size : int, n : int = 1000, random_state = None):
  """
  Collect the absolute errors of one negentropy approximation over repeated samples.

  Parameters:
    scipy_stats_distr: scipy.stats distribution object providing `entropy` and `rvs`
    loc, scale: location and scale parameters passed to the distribution
    approx_f: callable applied to a torch tensor sample; its result must support `.item()`
    sample_size: number of observations in every sample
    n: number of independent samples to draw
    random_state: optional seed (or numpy Generator) that makes the whole series reproducible;
      None (default) keeps SciPy's default random source

  Returns:
    list of n absolute differences between the reference negentropy and the approximation
  """
  real_negentropy = NormalEntropy()() - scipy_stats_distr.entropy(loc = loc, scale = scale)

  # one generator shared by all repetitions: handing the same integer seed to every
  # `rvs` call would reproduce a single sample n times
  rng = None if random_state is None else np.random.default_rng(random_state)

  errors = []
  for _ in range(n):
    # keyword arguments: positional (loc, scale, size) would be read as shape parameters
    # by distributions that have them
    drawn = scipy_stats_distr.rvs(loc = loc, scale = scale, size = sample_size, random_state = rng)
    negentropy_approx = approx_f(torch.from_numpy(drawn)).item()
    errors.append(abs(real_negentropy - negentropy_approx))

  return errors

In [81]:
# start from empty accumulators so that re-running this cell does not append a second copy of every row
rows = []
result = []
for approx_name, negentropy_approx in all_negentropy_approx.items():
  for sample_size in sample_sizes:
    # errors are pooled over all distributions for this (approximation, sample size) pair
    row_errors = []
    for distr, loc, scale in distributions.values():
      row_errors += calculate_metrics_for_approx(distr, loc, scale, negentropy_approx, sample_size)

    rows.append(approx_name)
    result.append([sample_size, np.max(row_errors), np.min(row_errors), np.mean(row_errors), np.std(row_errors)])

In [83]:
# assemble the summary table: the index holds the approximation name, one row per sample size
errors_df = pd.DataFrame(result, index = rows, columns = columns)
errors_df

Unnamed: 0,Sample size,Max error,Min error,Mean,Std
NegentropyApprox1(1),10,0.594406,2.608066e-08,0.079577,0.068959
NegentropyApprox1(1),100,0.176485,8.353156e-10,0.081489,0.071251
NegentropyApprox1(1),500,0.176485,1.202305e-09,0.082139,0.071949
NegentropyApprox1(1),1000,0.176485,1.706786e-10,0.082228,0.072052
NegentropyApprox1(1),10000,0.176168,2.712721e-11,0.082273,0.072135
NegentropyApprox1(1),100000,0.175863,1.413568e-11,0.082281,0.072144
NegentropyApprox1(1.5),10,0.479046,1.687357e-08,0.079376,0.067226
NegentropyApprox1(1.5),100,0.176485,1.432465e-09,0.080545,0.070581
NegentropyApprox1(1.5),500,0.176485,2.014585e-11,0.081269,0.071563
NegentropyApprox1(1.5),1000,0.176473,1.752347e-09,0.081352,0.071625


In [84]:
# within each sample size, order the approximations by mean error
errors_df.sort_values(by=['Sample size', 'Mean'])

Unnamed: 0,Sample size,Max error,Min error,Mean,Std
NegentropyApprox1(2),10,1.193198,2.930608e-09,0.079095,0.068947
NegentropyApprox1(1.5),10,0.479046,1.687357e-08,0.079376,0.067226
NegentropyApprox2,10,0.176485,4.588855e-09,0.079477,0.067567
NegentropyApprox1(1),10,0.594406,2.608066e-08,0.079577,0.068959
NegentropyApprox5,10,6.940723,9.041453e-05,0.383112,0.437564
NegentropyApprox3,10,283.047099,5.075092e-08,0.499899,5.813856
NegentropyApprox4,10,16.173287,0.000196038,0.538388,0.686078
NegentropyApprox5,100,0.665296,7.65094e-06,0.072526,0.067293
NegentropyApprox1(2),100,0.176485,9.636471e-09,0.079734,0.070027
NegentropyApprox1(1.5),100,0.176485,1.432465e-09,0.080545,0.070581


In [85]:
# within each sample size, order the approximations by the spread (std) of the error
errors_df.sort_values(by=['Sample size','Std'])

Unnamed: 0,Sample size,Max error,Min error,Mean,Std
NegentropyApprox1(1.5),10,0.479046,1.687357e-08,0.079376,0.067226
NegentropyApprox2,10,0.176485,4.588855e-09,0.079477,0.067567
NegentropyApprox1(2),10,1.193198,2.930608e-09,0.079095,0.068947
NegentropyApprox1(1),10,0.594406,2.608066e-08,0.079577,0.068959
NegentropyApprox5,10,6.940723,9.041453e-05,0.383112,0.437564
NegentropyApprox4,10,16.173287,0.000196038,0.538388,0.686078
NegentropyApprox3,10,283.047099,5.075092e-08,0.499899,5.813856
NegentropyApprox5,100,0.665296,7.65094e-06,0.072526,0.067293
NegentropyApprox1(2),100,0.176485,9.636471e-09,0.079734,0.070027
NegentropyApprox1(1.5),100,0.176485,1.432465e-09,0.080545,0.070581


Какие выводы следуют из этих вычислений:

* на маленьких размерах выборки лучше всего работают приближения 1 и 2
* приближение 3(с эксцессом) ведет себя хуже всего в целом
* приближения 1 и 2 дают самые стабильные результаты: их погрешность практически не зависит от размера выборки, т.е. остаётся небольшой даже при размере выборки < 500
* приближения 4 и 5 на небольших (но не критически малых) выборках дают погрешности, мало отличающиеся от погрешностей приближений 1 и 2, а с ростом размера выборки обгоняют их по точности

------
т.о.

* приближение 3 хуже всех, а исходя из его минуса -- чувствительности к выбросам, лучше не использовать его

* для очень маленьких размеров выборок(<50?) лучше всего использовать приближения 1, 2

* лучше всего себя показывают (но не на критически малых выборках) приближения scipy, 4 и 5