In [1]:
import json
import numpy as np
import os
import pandas as pd
import sklearn
import sys

# Record the interpreter and library versions so the sampled values below can
# be tied to the environment that produced them (one version per line).
print(sys.version, np.__version__, sklearn.__version__, sep='\n')

3.7.10 (default, Feb 26 2021, 18:47:35) 
[GCC 7.3.0]
1.18.5
0.24.1


## Multivariate Reproducibility

This notebook showcases the reproducibility issues with sampling from a multivariate normal distribution in NumPy. It uses the following [GitHub issue](https://github.com/numpy/numpy/issues/2435) as an example.

In [2]:
import numpy as np
import sys

# Problem setup: a d-dimensional normal with an all-ones mean and a covariance
# that has `alpha` everywhere off the diagonal and `alpha + (1 - alpha)` on it.
d = 10
alpha = 1 / d**0.5
mu = np.full(d, 1.0)
R = np.full((d, d), alpha) + (1 - alpha) * np.eye(d)
def run_mn(random_option, n_runs=100, seed=587482):
    """Repeat one seeded multivariate-normal draw and return the distinct results.

    A fresh generator is built from ``seed`` before every draw on purpose:
    each repetition is expected to reproduce the same sample, so more than one
    distinct row in the result means sampling was not reproducible within
    this environment.

    Parameters
    ----------
    random_option : str
        Which sampler to use:

        * ``'randomstate'`` -- legacy ``np.random.RandomState``;
        * ``'svd'`` -- ``np.random.default_rng`` with the default method;
        * ``'cholesky'`` -- ``np.random.default_rng`` with
          ``method='cholesky'``.
    n_runs : int, optional
        Number of repetitions (default 100). Must be at least 1.
    seed : int, optional
        Seed used for every freshly created generator (default 587482).

    Returns
    -------
    list
        The de-duplicated samples converted with ``tolist()``. When every
        repetition agreed (and the mean has more than one component) this is
        a flat list of the sample's coordinates; otherwise it is a nested
        list with one entry per distinct sample.

    Raises
    ------
    ValueError
        If ``random_option`` is not a supported name, or ``n_runs`` is
        less than 1.

    Notes
    -----
    Reads the notebook-level mean ``mu`` and covariance ``R``.
    """
    # Resolve the sampler once, up front. Previously an unknown option fell
    # through every branch and surfaced later as an unrelated np.vstack error.
    if random_option == 'randomstate':
        def draw():
            rng = np.random.RandomState(seed=seed)
            return rng.multivariate_normal(mu, R, 1)
    elif random_option == 'svd':
        def draw():
            rng = np.random.default_rng(seed)
            return rng.multivariate_normal(mu, R, 1)
    elif random_option == 'cholesky':
        def draw():
            rng = np.random.default_rng(seed)
            return rng.multivariate_normal(mu, R, 1, method='cholesky')
    else:
        raise ValueError(
            "random_option must be one of 'randomstate', 'svd' or 'cholesky'; "
            f"got {random_option!r}"
        )

    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1; got {n_runs!r}")

    ls = [draw() for _ in range(n_runs)]
    # Collapse identical rows; squeeze drops the leading axis when only one
    # distinct sample remains.
    vals = np.unique(np.vstack(ls), axis=0).squeeze()
    return vals.tolist()

### Scenario 1

Using the legacy `RandomState` class, we sample from the multivariate normal distribution:

In [3]:
# Repeat the seeded draw with the legacy RandomState sampler; the trailing
# expression displays the distinct sample(s) that run_mn returned.
randomstate_results = run_mn('randomstate')
randomstate_results

[0.0979690286726922,
 0.8510258024225754,
 -1.2944922209998215,
 -1.1840145803847344,
 0.4854180841009934,
 -0.2390789724144473,
 0.7711746637352508,
 0.7755532835594922,
 0.08713020483749645,
 0.6230842869687101]

### Scenario 2
Using the new `Generator` class, we sample from the multivariate normal distribution with default arguments (the default factorization method is SVD):

In [4]:
# Repeat the seeded draw with a Generator from default_rng and no explicit
# `method` argument; the trailing expression displays the distinct sample(s).
svd_results = run_mn('svd')
svd_results

[1.5067828253897848,
 1.738522577150377,
 -0.5997607564874361,
 0.530395048480902,
 0.08545111349984924,
 1.8629863885433342,
 1.8233736624413166,
 2.8039635077587115,
 1.5785772381397205,
 1.2154195449903729]

### Scenario 3
Using the new `Generator` class with the Cholesky decomposition method, we sample from the multivariate normal:

In [5]:
# Repeat the seeded draw with a Generator from default_rng and
# method='cholesky'; the trailing expression displays the distinct sample(s).
cholesky_results = run_mn('cholesky')
cholesky_results

[0.5895108638453821,
 2.7756236601607247,
 -1.0727796950422066,
 -0.07291690740378254,
 0.8692535007808191,
 1.2042482320343586,
 1.3048694562073413,
 0.9933494734676546,
 0.4586954349656872,
 1.102799093618016]

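Unlike the default SVD-based factorization, whose result can differ between linear-algebra backends, the Cholesky factor of a positive-definite covariance matrix is unique, so the Cholesky method should give us deterministic samples across all environments.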

In [6]:
# Put the three scenarios side by side, one column per sampler, in the same
# order they were run above.
results = pd.DataFrame(
    dict(
        zip(
            ('randomstate', 'svd', 'cholesky'),
            (randomstate_results, svd_results, cholesky_results),
        )
    )
)
results

Unnamed: 0,randomstate,svd,cholesky
0,0.097969,1.506783,0.589511
1,0.851026,1.738523,2.775624
2,-1.294492,-0.599761,-1.07278
3,-1.184015,0.530395,-0.072917
4,0.485418,0.085451,0.869254
5,-0.239079,1.862986,1.204248
6,0.771175,1.823374,1.304869
7,0.775553,2.803964,0.993349
8,0.08713,1.578577,0.458695
9,0.623084,1.21542,1.102799


In [7]:
# Persist this environment's samples (without the index column) so they can be
# compared with runs from other environments; create the folder if needed.
output_path = os.path.join('output', 'linuxubuntu_openblas_results.csv')
os.makedirs(os.path.dirname(output_path), exist_ok=True)
results.to_csv(output_path, index=False)