# Census aggregation scratchpad

By [Ben Welsh](https://palewi.re/who-is-ben-welsh/)

In [1]:
import math

### Approximation

![](https://assets.documentcloud.org/documents/6162551/pages/20180418-MOE-p50-normal.gif)
![](https://assets.documentcloud.org/documents/6162551/pages/20180418-MOE-p51-normal.gif)

In [2]:
# Estimate first, then its margin of error, each bound to its own name
males_under_5 = 10154024
males_under_5_moe = 3778

In [3]:
# Estimate first, then its margin of error, each bound to its own name
females_under_5 = 9712936
females_under_5_moe = 3911

In [4]:
# The combined estimate is the plain sum of the component estimates
total_under_5 = sum([males_under_5, females_under_5])

In [5]:
total_under_5

19866960

In [6]:
# The combined margin of error is the square root of the summed squares of each margin
total_under_5_moe = math.sqrt(
    sum(moe**2 for moe in (males_under_5_moe, females_under_5_moe))
)

In [7]:
total_under_5_moe

5437.757350231803

![](https://assets.documentcloud.org/documents/6162551/pages/20180418-MOE-p52-normal.gif?1561126109)

In [8]:
def approximate_margin_of_error(*pairs):
    """
    Approximates the margin of error for the sum of several Census Bureau estimates.

    Each positional argument is a pair (list or tuple) holding the estimated value
    at index 0 and its margin of error at index 1.

    Returns the square root of the sum of the squared margins as a float. When more
    than one estimate equals zero, only the largest margin among those zero
    estimates is included, and it is included once.
    """
    # Split the margins by whether their estimate is zero, keeping input order
    zero_margins = []
    nonzero_margins = []
    for pair in pairs:
        if pair[0] == 0:
            zero_margins.append(pair[1])
        else:
            nonzero_margins.append(pair[1])

    # According to the Census Bureau, when approximating a sum use only the largest zero estimate margin of error, once
    # https://www.documentcloud.org/documents/6162551-20180418-MOE.html#document/p52
    if len(zero_margins) >= 2:
        margins = [max(zero_margins)] + nonzero_margins
    else:
        # With zero or one zero estimate every margin counts, in the order given
        margins = [pair[1] for pair in pairs]

    return math.sqrt(sum(margin**2 for margin in margins))

In [9]:
# Same inputs as the hand calculation of total_under_5_moe, so the result should match it
approximate_margin_of_error(
    (males_under_5, males_under_5_moe),
    (females_under_5, females_under_5_moe)
)

5437.757350231803

In [10]:
# Three of the four estimates are zero, so only the largest zero margin (29) is used, once,
# together with the non-zero estimate's margin (37): sqrt(29**2 + 37**2)
approximate_margin_of_error(
    [0, 22],
    [0, 22],
    [0, 29],
    [41, 37]
)

47.01063709417264

### Aggregating totals

In [11]:
def total(*pairs):
    """
    Adds up a set of Census Bureau estimates and approximates the margin of error of the sum.

    Each positional argument is a pair (list or tuple) holding the estimated value
    at index 0 and its margin of error at index 1.

    Returns a two-item tuple: the summed estimate, then the approximated margin of error.
    """
    combined_estimate = sum(pair[0] for pair in pairs)
    combined_margin = approximate_margin_of_error(*pairs)
    return combined_estimate, combined_margin

In [12]:
# Returns the summed estimate together with the approximated margin of error for the same two inputs
total(
    (males_under_5, males_under_5_moe),
    (females_under_5, females_under_5_moe)
)

(19866960, 5437.757350231803)

In [13]:
# The zero estimates add nothing to the summed value (41), but the largest zero margin
# still contributes to the approximated margin of error
total(
    [0, 22],
    [0, 22],
    [0, 29],
    [41, 37]
)

(41, 47.01063709417264)

### Aggregating medians

![](https://assets.documentcloud.org/documents/6165014/pages/How-to-Recalculate-a-Median-p1-normal.gif?1561138970)
![](https://assets.documentcloud.org/documents/6165014/pages/How-to-Recalculate-a-Median-p2-normal.gif?1561138970)
![](https://assets.documentcloud.org/documents/6165014/pages/How-to-Recalculate-a-Median-p4-normal.gif?1561138970)

In [40]:
def median(range_list):
    """
    Returns the estimated median from a set of ranged totals.

    Expects a list (or any iterable) of dictionaries with three keys:

        start: The minimum value in the range
        end: The maximum value in the range
        total: The count of people, households or other universe figure in the range

    The ranges are processed in ascending order of `start`. The input itself is
    left untouched, so calling this function never reorders the caller's data.

    The estimate is a linear interpolation inside the range that contains the
    midpoint of the universe (half of the summed totals).

    Raises a ValueError if no range contains the midpoint. That includes an empty
    input and an input where every range has a total of zero.
    """
    # Sort a copy rather than calling .sort(), which would reorder the caller's list in place
    ordered_ranges = sorted(range_list, key=lambda x: x['start'])

    # What is the total number in the universe?
    universe = sum(d['total'] for d in ordered_ranges)

    # What is the midpoint of that total?
    midpoint = universe / 2.0

    # What group contains the midpoint?
    running_total = 0
    for range_ in ordered_ranges:
        range_total = range_['total']

        # An empty range cannot contain the median, and interpolating inside it
        # would divide by zero, so move on to the next one
        if range_total == 0:
            continue

        # Here we find it...
        if running_total <= midpoint <= running_total + range_total:
            # How many households in the midrange are needed to reach the midpoint?
            midrange_gap = midpoint - running_total

            # What is the proportion of the group that would be needed to get the midpoint?
            midrange_gap_percent = midrange_gap / range_total

            # Apply this proportion to the width of the midrange
            midrange_gap_adjusted = (range_['end'] - range_['start']) * midrange_gap_percent

            # Estimate the median
            return range_['start'] + midrange_gap_adjusted

        running_total += range_total

    # If we got this far something is wrong
    raise ValueError("The midpoint of the total does not fall within a data range.")

In [41]:
# Ranged totals in the shape median() expects: one dictionary per range with
# its start, end and total.
# NOTE(review): the lowest start (-2500) and the highest end (250001) look like
# stand-in bounds for open-ended ranges; confirm against the source document.
data = [
    {'start': -2500, 'end': 9999, 'total': 186},
    {'start': 10000, 'end': 14999, 'total': 78},
    {'start': 15000, 'end': 19999, 'total': 98},
    {'start': 20000, 'end': 24999, 'total': 287},
    {'start': 25000, 'end': 29999, 'total': 142},
    {'start': 30000, 'end': 34999, 'total': 90},
    {'start': 35000, 'end': 39999, 'total': 107},
    {'start': 40000, 'end': 44999, 'total': 104},
    {'start': 45000, 'end': 49999, 'total': 178},
    {'start': 50000, 'end': 59999, 'total': 106},
    {'start': 60000, 'end': 74999, 'total': 177},
    {'start': 75000, 'end': 99999, 'total': 262},
    {'start': 100000, 'end': 124999, 'total': 77},
    {'start': 125000, 'end': 149999, 'total': 100},
    {'start': 150000, 'end': 199999, 'total': 58},
    {'start': 200000, 'end': 250001, 'total': 18},
]

In [42]:
median(data)

42211.096153846156