# Welford's algorithm for updating the variance online

In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
from copy import deepcopy
import seaborn as sns
# Plot styling: scale seaborn fonts by 1.5 and use the "whitegrid" style
# with dashed grid lines.
sns.set(font_scale=1.5)
sns.set_style("whitegrid", {'grid.linestyle':'--'})

In [2]:
# Seed NumPy's global RNG so the sample below is reproducible.
np.random.seed(42)

# Sample size, and N draws from a normal distribution (mean 0, std 1).
N = 10 ** 6
x = np.random.normal(loc=0.0, scale=1.0, size=N)

In [10]:
def native_mean(x):
    """Return the arithmetic mean of the sized iterable ``x``.

    The elements are added with the built-in ``sum`` and the total is
    divided by ``len(x)``.
    """
    count = len(x)
    return sum(x) / count

def native_var(x):
    """Return the population variance of ``x`` via the two-pass formula.

    The mean is computed first with ``native_mean``; the squared deviations
    from it are then summed and divided by ``len(x)`` (not ``len(x) - 1``).
    """
    count = len(x)
    centre = native_mean(x)
    squared_deviations = [(value - centre)**2 for value in x]

    return sum(squared_deviations) / count

def semi_native_var(x):
    """Return the population variance of ``x`` as sum(x**2)/n - mean**2.

    The sum of squares is accumulated first and ``n * mean**2`` is subtracted
    from it before dividing by ``n``. The subtraction of two potentially
    large, nearly equal quantities is kept as-is on purpose: this function
    exists to be compared against the other variance implementations.
    """
    count = len(x)
    centre = native_mean(x)
    sum_of_squares = sum([value **2 for value in x])
    numerator = sum_of_squares - count * centre**2

    return numerator / count

def online_mean(old_mean, new_x, N):
    """Return the mean after adding ``new_x`` to ``N`` existing samples.

    ``old_mean`` is the mean of the ``N`` samples seen so far; the update
    shifts it by the new sample's deviation divided by the new count.
    """
    deviation = new_x - old_mean
    return old_mean + deviation / (N + 1)
    
def welford(old_var, old_mean, new_x, new_mean, n_old=None):
    """Update a population variance with one new sample (Welford's update).

    Parameters
    ----------
    old_var : float
        Population variance (divide-by-count) of the samples seen so far.
    old_mean : float
        Mean of the samples seen so far.
    new_x : float
        The sample being added.
    new_mean : float
        Mean of all samples including ``new_x``.
    n_old : int, optional
        Number of samples summarised by ``old_var`` and ``old_mean``.
        When omitted, the module-level ``N`` is used, which is what this
        function always did before the parameter existed.

    Returns
    -------
    float
        Population variance of the ``n_old + 1`` samples.
    """
    # Fall back to the notebook-global N only when the caller does not say
    # how many samples the running statistics cover; passing n_old makes the
    # update correct for data of any length.
    count = N if n_old is None else n_old

    increment = (new_x - old_mean) * (new_x - new_mean) - old_var
    return old_var + increment / (count + 1)

## Correctness

In [4]:
# Extra sample appended to x; later cells reuse new_x as well.
new_x = 1.
# Mean: the batch mean over x plus new_x must match the online update
# started from the mean of x alone.
np.testing.assert_allclose(
    native_mean(np.append(x, new_x)), 
    online_mean(old_mean=native_mean(x), new_x=new_x, N=N)
)

# Variance: the batch variance over x plus new_x must match the Welford
# update started from the variance and mean of x alone.
# new_mean is also reused by the following cell.
new_mean = online_mean(native_mean(x), new_x, N)
np.testing.assert_allclose(
    native_var(np.append(x, new_x)), 
    welford(old_var=native_var(x), old_mean=native_mean(x),
            new_x=new_x, new_mean=new_mean)
)

In [5]:
# test with numpy package
np.testing.assert_allclose(
    np.mean(np.append(x, new_x)), online_mean(native_mean(x), new_x, N))
np.testing.assert_allclose(
    np.var(np.append(x, new_x)), 
    welford(old_var=native_var(x), old_mean=native_mean(x),
            new_x=new_x, new_mean=new_mean)
)

## Speed

In [6]:
# Prepare the inputs outside the timed statements so that only the mean
# computation itself is measured.
new_xs = np.append(x, new_x)
old_mean = native_mean(x)

# Time three ways of getting the mean of new_xs: pure-Python batch,
# NumPy batch, and the online update from old_mean.
%time print('\n', native_mean(new_xs))
%time print('\n', np.mean(new_xs))
%time print('\n', online_mean(old_mean=old_mean, new_x=new_x, N=N))
# NOTE(review): trailing no-op, presumably so the cell ends on a plain statement - confirm.
pass


 -0.0015987548555
CPU times: user 95.7 ms, sys: 375 µs, total: 96 ms
Wall time: 96.3 ms

 -0.0015987548555
CPU times: user 2.32 ms, sys: 369 µs, total: 2.69 ms
Wall time: 1.66 ms

 -0.0015987548555
CPU times: user 82 µs, sys: 40 µs, total: 122 µs
Wall time: 93 µs


In [7]:
# Precompute the running statistics outside the timed statements; old_mean
# comes from the previous cell.
old_var = native_var(x)
new_mean = native_mean(new_xs)

# Time three ways of getting the variance of new_xs: pure-Python batch,
# NumPy batch, and the Welford update from old_var / old_mean.
%time print('\n', native_var(new_xs))
%time print('\n', np.var(new_xs))
%time print('\n', welford(old_var=old_var, old_mean=old_mean, new_x=new_x, new_mean=new_mean))
# NOTE(review): trailing no-op, presumably so the cell ends on a plain statement - confirm.
pass


 1.00037611179
CPU times: user 496 ms, sys: 18.1 ms, total: 514 ms
Wall time: 515 ms

 1.00037611179
CPU times: user 5.74 ms, sys: 2.88 ms, total: 8.62 ms
Wall time: 7.46 ms

 1.00037611179
CPU times: user 60 µs, sys: 14 µs, total: 74 µs
Wall time: 67 µs


## Numerical stability

In [8]:
# Shift every sample by a large constant. Mathematically the variance is
# unchanged by the shift, so any difference between methods is numerical.
offset = 1e9
y = offset + x
new_y = offset + new_x
new_ys = np.append(y, new_y)

# The Welford update on the shifted data must still agree with np.var
# within assert_allclose's default tolerance.
# welford reads the sample count from the global N here; y has the same
# length as x, so that count is still the right one.
np.testing.assert_allclose(
    np.var(new_ys), 
    welford(old_var=native_var(y), old_mean=native_mean(y),
            new_x=new_y, new_mean=native_mean(new_ys))
)

In [11]:
# Print the variance of the shifted data from each implementation in turn:
# two-pass, sum-of-squares, NumPy, and the Welford update.
print('\n', native_var(new_ys))
print('\n', semi_native_var(new_ys))
print('\n', np.var(new_ys))
print('\n', welford(old_var=native_var(y), old_mean=native_mean(y), 
                    new_x=new_y, new_mean=native_mean(new_ys)))


 1.00037611632

 -116366.653809

 1.00037611191

 1.00037611618
