In [1]:
import numpy as np

In [2]:
def single_pass_variance(arr):
    """Compute the variance of ``arr`` in one pass over its elements.

    A running mean and a running sum of squared deviations (``M2``) are
    updated element by element, so the data is only iterated once.

    Parameters
    ----------
    arr : iterable of numbers
        Values to summarise.

    Returns
    -------
    tuple
        ``(variance, meta)`` where ``variance`` is ``M2 / count`` (the
        divisor is the element count, not ``count - 1``), or ``0.0`` when
        fewer than two elements were seen, and ``meta`` is a dict with the
        keys ``"mean"``, ``"count"`` and ``"M2"``.
    """
    n_seen = 0
    running_mean = 0.0
    sum_sq_dev = 0.0

    for n_seen, value in enumerate(arr, start=1):
        # Deviation from the mean *before* this element is folded in.
        offset_before = value - running_mean
        running_mean += offset_before / n_seen
        # Multiply by the deviation from the *updated* mean.
        sum_sq_dev += offset_before * (value - running_mean)

    if n_seen > 1:
        approx_variance = sum_sq_dev / n_seen
    else:
        approx_variance = 0.0

    return approx_variance, {"mean": running_mean, "count": n_seen, "M2": sum_sq_dev}

In [3]:
def combine_variance(meta1, meta2):
    """Merge two variance summaries into the summary of their union.

    Parameters
    ----------
    meta1, meta2 : dict
        Summaries with the keys ``"count"``, ``"mean"`` and ``"M2"``.

    Returns
    -------
    tuple
        ``(variance, meta)`` for the merged data. ``variance`` is the merged
        ``M2`` divided by the merged count, or ``0.0`` when the merged count
        is not greater than one. When the merged count is zero or negative,
        ``0.0`` and an all-zero summary are returned instead.
    """
    n_a, mu_a, ssd_a = (meta1[key] for key in ("count", "mean", "M2"))
    n_b, mu_b, ssd_b = (meta2[key] for key in ("count", "mean", "M2"))

    total = n_a + n_b
    mean_gap = mu_b - mu_a

    # Nothing left to describe: hand back an empty summary.
    if total <= 0:
        return 0.0, {"mean": 0.0, "count": 0, "M2": 0.0}

    merged_mean = (n_a * mu_a + n_b * mu_b) / total
    # Both M2 terms plus a correction for the distance between the two means.
    merged_ssd = ssd_a + ssd_b + mean_gap ** 2 * n_a * n_b / total

    if total > 1:
        merged_variance = merged_ssd / total
    else:
        merged_variance = 0.0

    return merged_variance, {"mean": merged_mean, "count": total, "M2": merged_ssd}


In [4]:
# Seed the generator so that Restart & Run All reproduces the same samples.
SEED = 42
rng = np.random.default_rng(SEED)

# Each tuple is (loc, scale, size) for one synthetic sample. The middle value
# is passed as the sampler's `scale`, i.e. a standard deviation, not a variance.
configs = [(3, 2, 100), (2, 4, 100), (6, 5, 100), (10, 1, 100),  (10, 1, 100),
           (8, 0.5, 100), (3, 10, 100), (20, 5, 100), (10, 1, 100), (10, 0.5, 100),]
arrays = [rng.normal(loc, scale, size) for loc, scale, size in configs]

In [5]:
# Start from an empty summary and fold each array's summary into it in turn.
combined_meta = dict(mean=0.0, count=0, M2=0.0)
for arr in arrays:
    chunk_var, chunk_meta = single_pass_variance(arr)
    print("current arrays' var:", chunk_var)
    var, combined_meta = combine_variance(combined_meta, chunk_meta)
# Variance of everything merged so far, i.e. of all arrays together.
print(var)

current arrays' var: 3.53123956760863
current arrays' var: 14.709870449407413
current arrays' var: 20.99048340919769
current arrays' var: 1.1376232586273503
current arrays' var: 1.0299167510778449
current arrays' var: 0.24311824515259578
current arrays' var: 102.31056122915547
current arrays' var: 21.46079199880259
current arrays' var: 1.1674683619839121
current arrays' var: 0.21279694572204216
42.77890708495115


In [6]:
# Reference computation: concatenate every array and let NumPy compute the
# variance directly, for comparison with the merged streaming result.
merged = np.concatenate(arrays)
for arr in arrays:
    # f-string so the values are actually substituted into the message.
    print(f"current arrays' var: {arr.var()} | mean: {arr.mean()}")
print(f"Aggregated Variance: {merged.var()}")

current arrays' var: {} | mean: {} 3.531239567608631 2.9211137264092586
current arrays' var: {} | mean: {} 14.709870449407413 1.366164067817935
current arrays' var: {} | mean: {} 20.99048340919769 6.370730949533316
current arrays' var: {} | mean: {} 1.1376232586273505 9.977346263949812
current arrays' var: {} | mean: {} 1.029916751077845 10.037239441494103
current arrays' var: {} | mean: {} 0.24311824515259636 7.99605362804817
current arrays' var: {} | mean: {} 102.31056122915541 2.841312120877409
current arrays' var: {} | mean: {} 21.460791998802588 20.097260461568816
current arrays' var: {} | mean: {} 1.1674683619839126 9.960200874248843
current arrays' var: {} | mean: {} 0.212796945722042 9.979575971499772
Aggregated Variance: 42.778907084951115


In [7]:
# Remove the arrays from the aggregate one at a time by merging each summary
# with its count and M2 negated. The running result lives under its own name
# so `combined_meta` keeps the full aggregate and this cell can be re-run.
remaining_meta = combined_meta
for arr in arrays:
    var, meta = single_pass_variance(arr)
    meta = {"mean": meta["mean"], "count": -meta["count"], "M2": -meta["M2"]}
    var, remaining_meta = combine_variance(remaining_meta, meta)
    print(var, remaining_meta["mean"])

43.758225368109414 8.736209308782021
39.750861894123474 9.657464963902532
40.66722282360894 10.126998394526707
47.25113468662522 10.151940416289523
56.49222075846614 10.174880611248607
69.0709691921938 10.719587357048717
30.405673759042088 13.345679102439155
0.690226502451419 9.969888422874321
0.21279694572218402 9.979575971499795
0.0 0.0


In [8]:
# Reference for the removal experiment: drop each array's elements from the
# front of the concatenated data and recompute the statistics with NumPy.
# Slicing a separate name leaves `merged` intact, so the cell can be re-run.
remaining = merged
for arr in arrays:
    remaining = remaining[len(arr):]
    if len(remaining) > 0:
        print(remaining.var(), remaining.mean(), len(remaining))
    else:
        print("END")

43.758225368109365 8.73620930878202 900
39.75086189412344 9.65746496390253 800
40.6672228236089 10.126998394526703 700
47.251134686625164 10.151940416289518 600
56.49222075846606 10.174880611248602 500
69.07096919219372 10.719587357048708 400
30.405673759042056 13.345679102439144 300
0.690226502451348 9.969888422874307 200
0.212796945722042 9.979575971499772 100
END


In [10]:
# Check: merging one-element summaries one by one should reproduce the
# summary obtained from a single pass over the whole array.
test_arr = arrays[0]
print(single_pass_variance(test_arr))

# Seed the aggregate with the first element only.
var, agg_meta = single_pass_variance(test_arr[:1])
print(agg_meta)

for start in range(1, len(test_arr)):
    # Summary of the single element at position `start`.
    meta = single_pass_variance(test_arr[start:start + 1])[1]
    var, agg_meta = combine_variance(agg_meta, meta)
print(agg_meta)

(np.float64(3.53123956760863), {'mean': np.float64(2.92111372640926), 'count': 100, 'M2': np.float64(353.123956760863)})
{'mean': np.float64(1.0547895772439146), 'count': 1, 'M2': np.float64(0.0)}
{'mean': np.float64(2.9211137264092613), 'count': 100, 'M2': np.float64(353.12395676086305)}


In [16]:
# Eleven evenly spaced values from 0 to 10, shown together with their variance.
test = np.linspace(0, 10, num=11)
(test, test.var())

(array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]),
 np.float64(10.0))