In [1]:
import numpy as np

In [2]:
def single_pass_variance(arr):
    """Compute the population variance of ``arr`` in a single pass.

    Uses Welford's running update: for each value the running mean is
    adjusted, and the product of the deviation from the old mean and the
    deviation from the new mean is accumulated.

    Parameters
    ----------
    arr : iterable of numbers
        Values to summarise; iterated exactly once, in order.

    Returns
    -------
    tuple
        ``(variance, meta)``. ``variance`` is ``M2 / count`` when more than
        one value was seen and ``0.0`` otherwise. ``meta`` is a dict with
        the keys ``"mean"``, ``"count"`` and ``"M2"`` (the accumulated sum
        of squared deviations).
    """
    n_seen = 0
    running_mean = 0.0
    sq_dev_sum = 0.0

    # enumerate() leaves n_seen at 0 for an empty input and at the number
    # of values otherwise.
    for n_seen, value in enumerate(arr, start=1):
        offset_before = value - running_mean
        running_mean += offset_before / n_seen
        # Deviation from the old mean times deviation from the updated mean.
        sq_dev_sum += offset_before * (value - running_mean)

    if n_seen > 1:
        approx_variance = sq_dev_sum / n_seen
    else:
        approx_variance = 0.0

    return approx_variance, {"mean": running_mean, "count": n_seen, "M2": sq_dev_sum}

In [3]:
def combine_variance(meta1, meta2):
    """Merge two variance summaries into one summary for both groups.

    Parameters
    ----------
    meta1, meta2 : dict
        Summaries with the keys ``"count"``, ``"mean"`` and ``"M2"``, in the
        form returned by ``single_pass_variance``.

    Returns
    -------
    tuple
        ``(variance, meta)``. ``meta`` is a new dict with the combined
        ``"mean"``, ``"count"`` and ``"M2"``; ``variance`` is
        ``M2 / count`` when the combined count exceeds one and ``0.0``
        otherwise. When the combined count is zero or negative, ``0.0`` and
        an all-zero summary are returned.
    """
    fields = ("count", "mean", "M2")
    n_a, mu_a, ss_a = (meta1[key] for key in fields)
    n_b, mu_b, ss_b = (meta2[key] for key in fields)

    n_total = n_a + n_b
    mean_gap = mu_b - mu_a

    # Nothing left to describe: hand back an empty summary.
    if n_total <= 0:
        return 0.0, {"mean": 0.0, "count": 0, "M2": 0.0}

    # Count-weighted mean of the two groups.
    merged_mean = (n_a * mu_a + n_b * mu_b) / n_total
    # Within-group sums plus the between-group correction term.
    merged_ss = ss_a + ss_b + mean_gap ** 2 * n_a * n_b / n_total

    # A variance is only reported for more than one sample.
    if n_total > 1:
        merged_variance = merged_ss / n_total
    else:
        merged_variance = 0.0

    return merged_variance, {"mean": merged_mean, "count": n_total, "M2": merged_ss}


In [4]:
# One (mean, standard deviation, sample size) triple per synthetic sample.
configs = [(3, 2, 100), (2, 4, 100), (6, 5, 100), (10, 1, 100),  (10, 1, 100),
           (8, 0.5, 100), (3, 10, 100), (20, 5, 100), (10, 1, 100), (10, 0.5, 100),]

# Fixed seed so that Restart Kernel -> Run All draws the same samples every time.
SEED = 42
rng = np.random.default_rng(SEED)

# normal() takes the standard deviation (scale) as its second argument,
# not the variance.
arrays = [rng.normal(loc, scale, size) for loc, scale, size in configs]

In [5]:
# Start from an empty summary and fold in the one-pass summary of each array.
combined_meta = dict(mean=0.0, count=0, M2=0.0)
for arr in arrays:
    arr_var, arr_meta = single_pass_variance(arr)
    print("current arrays' var:", arr_var)
    var, combined_meta = combine_variance(combined_meta, arr_meta)
# Variance reported by the last merge, i.e. over all arrays together.
print(var)

current arrays' var: 3.718343484803066
current arrays' var: 15.97092367835651
current arrays' var: 26.982244945836186
current arrays' var: 0.9837279269261521
current arrays' var: 1.0025548598340166
current arrays' var: 0.26652480289495173
current arrays' var: 104.89735846928266
current arrays' var: 25.814295480783535
current arrays' var: 1.0869866771464165
current arrays' var: 0.258468334481252
40.14453728518154


In [6]:
# Reference values straight from NumPy, one line per array.
# An f-string is used so the var/mean values are actually interpolated;
# passing a "{}" template to print() as a plain argument prints the braces literally.
for arr in arrays:
    print(f"current arrays' var: {arr.var()} | mean: {arr.mean()}")

# Join the arrays end to end, in list order, into one flat array.
merged = np.concatenate(arrays)
print(f"Aggregated Variance: {merged.var()}")

current arrays' var: {} | mean: {} 3.7183434848030674 3.052283344199052
current arrays' var: {} | mean: {} 15.97092367835651 2.1745184667572564
current arrays' var: {} | mean: {} 26.98224494583617 5.497054991476892
current arrays' var: {} | mean: {} 0.9837279269261521 10.043701445489491
current arrays' var: {} | mean: {} 1.0025548598340164 9.942178391025548
current arrays' var: {} | mean: {} 0.26652480289495173 7.996005197232475
current arrays' var: {} | mean: {} 104.89735846928268 2.7847963505271527
current arrays' var: {} | mean: {} 25.81429548078353 18.618266495788863
current arrays' var: {} | mean: {} 1.0869866771464156 9.924722633397502
current arrays' var: {} | mean: {} 0.2584683344812521 10.045586227692215
Aggregated Variance: 40.14453728518155


In [7]:
# Remove the arrays from the aggregate again, first to last, by merging each
# array's summary with its count and M2 negated.
# The running result lives under its own name: combine_variance returns a new
# dict, so `combined_meta` keeps the full aggregate and re-running this cell
# prints the same values instead of zeros.
remaining_meta = combined_meta
for arr in arrays:
    var, meta = single_pass_variance(arr)
    removal_meta = {"mean": meta["mean"], "count": -meta["count"], "M2": -meta["M2"]}
    var, remaining_meta = combine_variance(remaining_meta, removal_meta)
    print(var, remaining_meta["mean"])

41.16000956237442 8.55853668882082
38.577376580114844 9.356538966578764
37.80188655985781 9.907893820164746
43.93466005485547 9.885259215943956
52.52030354366168 9.873875380927634
64.48174934547484 10.343342926851424
25.618045705777394 12.862858452292848
0.676379507920351 9.98515443054484
0.2584683344813707 10.045586227692182
0.0 0.0


In [8]:
# Reference for the removal above: drop each array's elements from the front
# of the merged data and report NumPy's variance, mean and length of the rest.
# Slicing a separate name leaves `merged` intact, so the cell can be re-run
# and still print the same values.
remaining = merged
for arr in arrays:
    n = len(arr)
    remaining = remaining[n:]
    if len(remaining) > 0:
        print(remaining.var(), remaining.mean(), len(remaining))
    else:
        print("END")

41.16000956237442 8.558536688820823 900
38.57737658011484 9.356538966578768 800
37.8018865598578 9.90789382016475 700
43.93466005485545 9.885259215943961 600
52.52030354366167 9.873875380927641 500
64.48174934547481 10.343342926851433 400
25.618045705777288 12.86285845229286 300
0.6763795079202933 9.98515443054486 200
0.2584683344812521 10.045586227692215 100
END


In [9]:
# Check that merging one-element summaries reproduces the one-pass result.
test_arr = arrays[0]
print(single_pass_variance(test_arr))

# Seed the aggregate with the first element, then merge every remaining
# element as its own single-value summary.
var, agg_meta = single_pass_variance(test_arr[:1])
for start in range(1, len(test_arr)):
    single_meta = single_pass_variance(test_arr[start:start + 1])[1]
    var, agg_meta = combine_variance(agg_meta, single_meta)
print(var)

(3.718343484803066, {'mean': 3.0522833441990525, 'count': 100, 'M2': 371.83434848030663})
3.718343484803067
