# Welford's algorithm for updating variance online

In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
from copy import deepcopy
import seaborn as sns
# Plot styling: scale fonts by 1.5 and use the "whitegrid" style with dashed grid lines.
sns.set(font_scale=1.5)
sns.set_style("whitegrid", {'grid.linestyle':'--'})

In [2]:
# Seed NumPy's global random generator so the sample below is reproducible.
np.random.seed(42)
# Sample size; later cells also use it as the "previous count" for the online updates.
N = 10 ** 6
# N independent draws from a normal distribution with mean 0 and standard deviation 1.
x = np.random.normal(loc=0.0, scale=1.0, size=N)

In [3]:
def native_mean(x):
    """Return the arithmetic mean of ``x``.

    The total is accumulated with Python's built-in ``sum`` (element by
    element) and divided by ``len(x)``; this is the plain-Python baseline
    used in the comparisons below.

    Raises ZeroDivisionError for an empty built-in sequence.
    """
    count = len(x)
    return sum(x) / count

def native_var(x):
    """Return the population variance of ``x`` (divisor ``len(x)``).

    Two passes over the data: the first computes the mean via
    ``native_mean``, the second sums the squared deviations from it.
    """
    count = len(x)
    center = native_mean(x)
    squared_devs = [(e - center) ** 2 for e in x]
    return sum(squared_devs) / count

def semi_native_var(x):
    """Return the population variance of ``x`` via ``mean(x**2) - mean(x)**2``.

    This shortcut subtracts two quantities that can be large and nearly
    equal, so the result may lose precision (or even turn negative) when
    the data sit far from zero.
    """
    count = len(x)
    center = native_mean(x)
    squares = [e ** 2 for e in x]
    mean_of_squares = sum(squares) / count
    return mean_of_squares - center ** 2

def online_mean(old_mean, new_x, N):
    """Update a running mean with one additional observation.

    Parameters
    ----------
    old_mean : mean of the ``N`` observations seen so far.
    new_x    : the new observation.
    N        : number of observations already included in ``old_mean``.

    Returns the mean over all ``N + 1`` observations.
    """
    # Shift the old mean towards the new value by 1 / (N + 1) of their gap.
    gap = new_x - old_mean
    return old_mean + gap / (N + 1)
    
def welford(old_var, old_mean, new_x, new_mean, n=None):
    """Update a population variance with one additional observation (Welford step).

    Parameters
    ----------
    old_var  : population variance (divisor = count) of the observations seen so far.
    old_mean : mean of the observations seen so far.
    new_x    : the new observation.
    new_mean : mean after including ``new_x``.
    n        : number of observations already included in ``old_var``.
               When omitted, the notebook-level global ``N`` is used, which
               keeps existing four-argument calls working; pass it explicitly
               whenever the data length differs from that global.

    Returns the population variance over all ``n + 1`` observations.
    """
    if n is None:
        # Backward-compatible fallback to the module-level sample size.
        n = N

    # Amount by which the sum of squared deviations grows when new_x is added.
    increment = (new_x - old_mean) * (new_x - new_mean)
    return old_var + (increment - old_var) / (n + 1)

## Correctness <a name='correctness'></a>

In [4]:
# One extra observation to append to the sample x.
new_x = 1.
# test mean
# The online update must agree with recomputing the mean over all N + 1 values.
np.testing.assert_allclose(
    native_mean(np.append(x, new_x)), 
    online_mean(old_mean=native_mean(x), new_x=new_x, N=N)
)

# test variance
# new_mean is kept as a global; the next cell reuses it.
new_mean = online_mean(native_mean(x), new_x, N)
# The Welford update must agree with the two-pass variance over all N + 1 values.
np.testing.assert_allclose(
    native_var(np.append(x, new_x)), 
    welford(old_var=native_var(x), old_mean=native_mean(x),
            new_x=new_x, new_mean=new_mean)
)

In [5]:
# test with numpy package
# Same checks as the previous cell, but with NumPy's mean/var as the reference.
np.testing.assert_allclose(
    np.mean(np.append(x, new_x)), online_mean(native_mean(x), new_x, N))
# NOTE: new_mean is the global computed in the previous cell.
np.testing.assert_allclose(
    np.var(np.append(x, new_x)), 
    welford(old_var=native_var(x), old_mean=native_mean(x),
            new_x=new_x, new_mean=new_mean)
)

## Speed <a name='speed'></a>

In [6]:
# Extended sample (x plus the new observation) and the mean of the original sample,
# prepared up front so the timings below cover only the computation itself.
new_xs = np.append(x, new_x)
old_mean = native_mean(x)

# Time three ways of getting the mean of the extended sample:
# pure-Python full pass, NumPy full pass, and the single-step online update.
%time print('\n', native_mean(new_xs))
%time print('\n', np.mean(new_xs))
%time print('\n', online_mean(old_mean=old_mean, new_x=new_x, N=N))
# Trailing no-op; presumably so the cell ends without displaying a value (TODO confirm).
pass


 -0.0015987548555
CPU times: user 98.8 ms, sys: 702 µs, total: 99.5 ms
Wall time: 100 ms

 -0.0015987548555
CPU times: user 1.5 ms, sys: 141 µs, total: 1.64 ms
Wall time: 914 µs

 -0.0015987548555
CPU times: user 61 µs, sys: 18 µs, total: 79 µs
Wall time: 68.2 µs


In [7]:
# Inputs for the Welford step, computed outside the timed statements.
# old_mean is the global defined in the previous cell.
old_var = native_var(x)
new_mean = native_mean(new_xs)

# Time three ways of getting the variance of the extended sample:
# pure-Python two-pass, NumPy, and the single-step Welford update.
%time print('\n', native_var(new_xs))
%time print('\n', np.var(new_xs))
%time print('\n', welford(old_var=old_var, old_mean=old_mean, new_x=new_x, new_mean=new_mean))
# Trailing no-op; presumably so the cell ends without displaying a value (TODO confirm).
pass


 1.00037611179
CPU times: user 535 ms, sys: 16.4 ms, total: 552 ms
Wall time: 553 ms

 1.00037611179
CPU times: user 5.27 ms, sys: 2.57 ms, total: 7.85 ms
Wall time: 7.04 ms

 1.00037611179
CPU times: user 65 µs, sys: 19 µs, total: 84 µs
Wall time: 71 µs


## Numerical stability <a name='ns'></a>

In [8]:
# Shift every value by a large constant: mathematically the variance is unchanged,
# but the magnitudes involved in the floating-point arithmetic become much larger.
offset = 1e9
y = offset + x
new_y = offset + new_x
new_ys = np.append(y, new_y)

# The Welford update on the shifted data must still match NumPy's variance
# within assert_allclose's default tolerance.
np.testing.assert_allclose(
    np.var(new_ys), 
    welford(old_var=native_var(y), old_mean=native_mean(y),
            new_x=new_y, new_mean=native_mean(new_ys))
)

In [9]:
# Print the variance of the shifted sample from each method, in order:
# two-pass, mean-of-squares shortcut, NumPy, and the Welford update.
# In the recorded output the shortcut (semi_native_var) gives a large negative
# number, while the other three agree to about eight significant digits.
print('\n', native_var(new_ys))
print('\n', semi_native_var(new_ys))
print('\n', np.var(new_ys))
print('\n', welford(old_var=native_var(y), old_mean=native_mean(y), 
                    new_x=new_y, new_mean=native_mean(new_ys)))


 1.00037611632

 -116352.0

 1.00037611191

 1.00037611618
