In [1]:
import json
import numpy as np
import os
import pandas as pd
import sklearn
import sys

# Record the interpreter and library versions this run was produced with,
# one per line, so results from different environments can be told apart.
print(sys.version, np.__version__, sklearn.__version__, sep='\n')

3.7.10 (default, Feb 26 2021, 18:47:35) 
[GCC 7.3.0]
1.18.5
0.24.1


## Multivariate Reproducibility

This notebook showcases the reproducibility issues with sampling from a multivariate normal distribution in NumPy. It uses the following [GitHub issue](https://github.com/numpy/numpy/issues/2435) as an example.

In [2]:
import numpy as np
import sys

# Problem setup: a d-dimensional normal with an all-ones mean and a
# covariance that has 1 on the diagonal and alpha everywhere else.
d = 10
alpha = 1 / d**0.5
mu = np.ones(d)
R = alpha * np.ones((d, d)) + (1 - alpha) * np.eye(d)


def run_mn(random_option):
    """Draw the same seeded multivariate-normal sample 100 times and dedupe.

    Each iteration first consumes numbers from NumPy's global random state
    and then builds a *fresh* generator from a fixed seed, so the draw is
    expected to be unaffected by the global state and identical every time.

    Parameters
    ----------
    random_option : str
        Which sampler to use:

        * ``'randomstate'`` - legacy ``np.random.RandomState``.
        * ``'svd'`` - ``np.random.default_rng`` with default arguments.
        * ``'cholesky'`` - ``np.random.default_rng`` with
          ``method='cholesky'``.

    Returns
    -------
    list
        The distinct rows among the 100 draws, squeezed. When every draw is
        identical this is a flat list of ``d`` floats; otherwise it is a
        nested list with one entry per distinct row.

    Raises
    ------
    ValueError
        If ``random_option`` is not one of the supported names.
    """
    seed = 587482

    # Pick the generator factory once; every draw below re-creates the
    # generator from the same seed.
    if random_option == 'randomstate':
        def make_rng():
            return np.random.RandomState(seed=seed)
        extra_kwargs = {}
    elif random_option == 'svd':
        def make_rng():
            return np.random.default_rng(seed)
        extra_kwargs = {}
    elif random_option == 'cholesky':
        def make_rng():
            return np.random.default_rng(seed)
        extra_kwargs = {'method': 'cholesky'}
    else:
        # Previously an unknown option fell through silently and failed later
        # inside np.vstack with an unrelated "need at least one array" error.
        raise ValueError(
            f"Unknown random_option {random_option!r}; "
            "expected 'randomstate', 'svd' or 'cholesky'."
        )

    ls = []
    for _ in range(100):
        # Advance the global RNG between draws; the seeded generators below
        # are separate objects from it.
        np.random.rand(10)
        rng = make_rng()
        ls.append(rng.multivariate_normal(mu, R, 1, **extra_kwargs))

    # Identical draws collapse to a single row, which squeeze() flattens.
    vals = np.unique(np.vstack(ls), axis=0).squeeze()
    return vals.tolist()

### Scenario 1

Using the old `RandomState` class, we sample from the multivariate normal:

In [3]:
# Sample through the legacy np.random.RandomState code path.
randomstate_results = run_mn('randomstate')
randomstate_results  # bare expression so the notebook displays the values

[0.0979690286726922,
 0.4400853794976822,
 0.26094846344373945,
 0.8460852784037585,
 0.7329605282179606,
 1.3780882566415975,
 -0.7805135060417068,
 -0.20018694051127506,
 -0.8869033304500695,
 -0.9147635773761715]

### Scenario 2
Using the new `Generator` class, we sample from the multivariate normal with default arguments (the default `method` is `'svd'`):

In [4]:
# Sample through np.random.default_rng with default arguments.
svd_results = run_mn('svd')
svd_results  # bare expression so the notebook displays the values

[1.5067828253897848,
 1.8202132278333147,
 1.630191108136768,
 1.1709111989663659,
 1.995509336179825,
 2.366191333437567,
 1.8158426438606308,
 0.5965130448771689,
 -1.0887921791227972,
 0.7323486103483059]

### Scenario 3
Using the new `Generator` class with the Cholesky decomposition method, we sample from the multivariate normal:

In [5]:
# Sample through np.random.default_rng with method='cholesky'.
cholesky_results = run_mn('cholesky')
cholesky_results  # bare expression so the notebook displays the values

[0.5895108638453821,
 2.7756236601607247,
 -1.0727796950422066,
 -0.07291690740378276,
 0.8692535007808192,
 1.2042482320343586,
 1.3048694562073413,
 0.9933494734676547,
 0.4586954349656871,
 1.102799093618016]

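Because the Cholesky factor of a positive-definite covariance matrix is unique, the Cholesky method should give us the same samples in every environment, whereas the SVD-based results can differ between platforms and linear-algebra backends.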

In [6]:
# One column per sampling scenario, in the order the scenarios were run.
samples_by_scenario = dict(
    randomstate=randomstate_results,
    svd=svd_results,
    cholesky=cholesky_results,
)
results = pd.DataFrame(samples_by_scenario)
results

Unnamed: 0,randomstate,svd,cholesky
0,0.097969,1.506783,0.589511
1,0.440085,1.820213,2.775624
2,0.260948,1.630191,-1.07278
3,0.846085,1.170911,-0.072917
4,0.732961,1.995509,0.869254
5,1.378088,2.366191,1.204248
6,-0.780514,1.815843,1.304869
7,-0.200187,0.596513,0.993349
8,-0.886903,-1.088792,0.458695
9,-0.914764,0.732349,1.102799


In [7]:
# Persist this environment's samples (without the index column) so they can
# be compared against runs from other environments.
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)
csv_path = os.path.join(output_dir, 'linuxubuntu_mkl_results.csv')
results.to_csv(csv_path, index=False)