In [1]:
from math import sqrt

In [2]:
def avg(xs):
    """Return the arithmetic mean of the numbers in ``xs``.

    ``xs`` may be any iterable (including a generator); it is walked
    exactly once. An empty iterable raises ``ZeroDivisionError``.
    """
    running_sum = 0.0
    n = 0
    # enumerate(..., start=1) leaves `n` equal to the number of items seen.
    for n, value in enumerate(xs, start=1):
        running_sum += value
    return running_sum / n

In [3]:
def stddev0(xs):
    """Population standard deviation using the textbook two-pass formula.

    Computes ``sqrt(1/N * sum((x - mean) ** 2))``.

    Args:
        xs: Any iterable of numbers. It is copied into a list once so that
            one-shot iterables (e.g. generators) can be traversed for both
            the mean and the squared deviations, and so that ``len()`` is
            available.

    Returns:
        The population standard deviation as a float.

    Raises:
        ZeroDivisionError: If ``xs`` is empty.
    """
    items = list(xs)
    # Use the materialized list everywhere; reading `xs` again would find a
    # generator already exhausted and `len()` would not be defined for it.
    ibar = avg(items)
    return sqrt(
        sum((x - ibar) ** 2 for x in items) / len(items)
    )

def stddev1(xs):
    """Population standard deviation as ``sqrt(mean(x^2) - mean(x)^2)``.

    Args:
        xs: Any iterable of numbers; it is copied into a list because the
            data is traversed twice (once per mean).

    Returns:
        The population standard deviation as a float.

    Raises:
        ZeroDivisionError: If ``xs`` is empty.
    """
    xs = list(xs)
    xbar2 = avg(xs) ** 2
    x2bar = avg(x ** 2 for x in xs)
    variance = x2bar - xbar2
    # The two means are nearly equal when the data is (almost) constant, so
    # rounding error can push the difference slightly below zero, which
    # would make sqrt() raise ValueError. Clamp such values to zero. The
    # explicit comparison leaves NaN untouched so bad input is not hidden.
    if variance < 0.0:
        variance = 0.0
    return sqrt(variance)

def stddev2(xs):
    """Population standard deviation computed in a single pass.

    Accumulates the sum, the sum of squares and the item count in one loop,
    then evaluates ``sqrt(sum(x^2)/N - (sum(x)/N)^2)``.

    Args:
        xs: Any iterable of numbers (a generator is fine; it is walked once).

    Returns:
        The population standard deviation as a float.

    Raises:
        ZeroDivisionError: If ``xs`` is empty.
    """
    tx = 0.0
    tx2 = 0.0
    count = 0
    for x in xs:
        tx += x
        tx2 += x * x
        count += 1
    variance = tx2 / count - (tx / count) ** 2
    # For (almost) constant data the two terms are nearly equal, and rounding
    # error can make the difference slightly negative, which would make
    # sqrt() raise ValueError. Clamp such values to zero. The explicit
    # comparison leaves NaN untouched so bad input is not hidden.
    if variance < 0.0:
        variance = 0.0
    return sqrt(variance)

# Default implementation used under the plain name `stddev`.
stddev = stddev2

There are at least two equivalent ways to express the standard deviation as a formula. First, the familiar form, which is implemented as `stddev0()` above:

$$\sqrt{\frac{1}{N} \sum_{i=1}^{N} (x_i - \bar{x})^2 }$$

Second, a more computationally friendly variant, which is implemented as `stddev2()` above (and less efficiently as `stddev1()` above):

$$\sqrt{ \frac{1}{N} \sum_{i=1}^{N} x_i^2 - \left( \frac{1}{N} \sum_{i=1}^{N} x_i \right)^2 }$$

In [4]:
# Run every implementation on the same small samples; each printed row is
# the mean followed by the three standard-deviation results.
for items in ([1, 3, 1, 3], [1, 1, 1, 60], [4, 5, 3, 6]):
    print(avg(items), stddev0(items), stddev1(items), stddev2(items))

2.0 1.0 1.0 1.0
15.75 25.54774941164094 25.54774941164094 25.54774941164094
4.5 1.118033988749895 1.118033988749895 1.118033988749895


## Numpy

In [5]:
import numpy as np

# The same three samples, this time wrapped in numpy arrays and summarized
# with np.average() and ndarray.std().
for items in map(np.array, ([1, 3, 1, 3], [1, 1, 1, 60], [4, 5, 3, 6])):
    print(np.average(items), items.std())

2.0 1.0
15.75 25.54774941164094
4.5 1.118033988749895


In [6]:
# A 4x4 table of values; axis=0 reduces down the rows, so each statistic
# below is reported once per column.
items = np.array([[4, 1, 2, 0], [5, -1, -3, 2], [3, 2, 1, 0], [6, -6, 7, -3]])
for column_stat in (np.average(items, axis=0), items.std(axis=0)):
    print(column_stat)

[ 4.5  -1.    1.75 -0.25]
[1.11803399 3.082207   3.56195171 1.78535711]


# Standard Deviation Facts

## Combining Data Sets

When two data sets having the same standard deviation are combined, the combined standard deviation increases from the shared value as the difference between the two sets' averages increases.

In [7]:
import random

# Standard deviation requested for both samples.
sd = 10
# Requested mean and sample size for set A and set B.
aa, counta = 250, 1000
ab, countb = 100, 1000

# Draw each sample from a normal distribution with its own mean and the
# shared standard deviation.
data_a = np.array([random.gauss(aa, sd) for _ in range(counta)])
data_b = np.array([random.gauss(ab, sd) for _ in range(countb)])
# Pool both samples into a single data set (A first, then B).
data_c = np.concatenate((data_a, data_b))

# Report the mean and standard deviation of each set.
for label, sample in (('A', data_a), ('B', data_b), ('C', data_c)):
    print(label, np.average(sample), sample.std())

A 250.58994693490035 9.833214329859887
B 99.91381020376556 10.127518446324235
C 175.25187856933297 75.99640723102445


This applies even when the sizes of the sets are not equal. As the difference in the sizes of the sets increases, the effect on the standard deviation is reduced.

In [8]:
import random

# Standard deviation requested for both samples.
sd = 10
# Requested mean and sample size for each set; B is ten times larger than A.
aa, counta = 250, 1000
ab, countb = 100, counta * 10

# Draw each sample from a normal distribution with its own mean and the
# shared standard deviation.
data_a = np.array([random.gauss(aa, sd) for _ in range(counta)])
data_b = np.array([random.gauss(ab, sd) for _ in range(countb)])
# Pool both samples into a single data set (A first, then B).
data_c = np.concatenate((data_a, data_b))

# Report the mean and standard deviation of each set.
for label, sample in (('A', data_a), ('B', data_b), ('C', data_c)):
    print(label, np.average(sample), sample.std())

A 250.07137414863433 9.718420577996524
B 100.16112043852411 9.94091893482591
C 113.78932532126139 44.22334112934206
