/
analyze_2005.py
61 lines (47 loc) · 1.81 KB
/
analyze_2005.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
import numpy as np
from common import TABLES_DIR
from common import analyze_generic
from common import load_non_rectangular
from common import to_int
def load_2005():
    """Load the 2005 table from its CSV file and verify its dimensions.

    Returns:
        The sheet returned by ``load_non_rectangular`` for
        ``2005_Table_1.csv`` inside ``TABLES_DIR``.

    Raises:
        ValueError: If the loaded sheet's shape is not ``(100, 18)``.
    """
    expected_shape = (100, 18)
    csv_path = os.path.join(TABLES_DIR, '2005_Table_1.csv')
    table = load_non_rectangular(csv_path)
    # Downstream parsing uses fixed row/column positions, so reject any
    # sheet whose overall dimensions differ from the expected ones.
    if table.shape == expected_shape:
        return table
    raise ValueError('Unexpected data shape.')
def parse_2005(sheet):
    """Validate the layout of the 2005 table and extract its cohort counts.

    The sheet is addressed by fixed positions: rows 4-5 carry the headers,
    rows 6-26 carry the data, and row 27 must begin the footnotes. The last
    column is required to be empty and is dropped before extraction.

    Args:
        sheet: 2-D array of cell values (e.g. the result of ``load_2005``)
            supporting NumPy-style slicing and element-wise comparison.

    Returns:
        A ``(row_names, column_names, cohort_data)`` tuple. ``row_names`` is
        column 0 of the data rows, ``column_names`` is header row 5 from
        column 2 onward (excluding the dropped trailing column), and
        ``cohort_data`` is the remaining cells converted with ``to_int``.

    Raises:
        ValueError: If the headers, the footnote marker, or the trailing
            empty column are not where they are expected, if the names do
            not match the data dimensions, if any data cell is empty, or if
            a row's summed cells differ from its stated total by more
            than 5.
    """
    # Check header row.
    headers = [
        ['All Races\nand Both Sexes', 'Educational Attainment'],
        ['', 'Total'],
    ]
    if not np.all(sheet[4:6, 0:2] == headers):
        raise ValueError('Unexpected row headers.')
    if sheet[27, 0] != 'Footnotes:':
        raise ValueError('Expected footnotes to begin at 27.')
    data_rows = sheet[6:27, :]

    # Make sure the last column is bogus: its header cell and every data
    # cell must be empty, with 'Doctoral degree' as the last real header.
    if not (np.all(sheet[5, -2:] == ['Doctoral degree', '']) and
            np.all(data_rows[:, -1] == '')):
        raise ValueError('Expected bogus column.')
    data_rows = data_rows[:, :-1]

    column_names = sheet[5, 2:-1]
    row_names = data_rows[:, 0]
    # Build the element-wise converter once and reuse it for both the
    # totals column and the cohort cells.
    cells_to_int = np.vectorize(to_int)
    row_totals = cells_to_int(data_rows[:, 1])
    cohort_data = data_rows[:, 2:]

    # Shapes are tuples, so concatenating the two 1-D name shapes yields the
    # expected (rows, columns) shape of the data.
    if row_names.shape + column_names.shape != cohort_data.shape:
        raise ValueError('Expected row/col names to match data.')
    if np.any(cohort_data == ''):
        raise ValueError('Expected no empty cells in data.')

    # See footnote:
    #     A dash (-) represents zero or rounds to zero.
    cohort_data = cells_to_int(cohort_data)

    # Allow a difference of up to 5 between each stated total and the sum of
    # its row (presumably to absorb rounding in the published figures).
    if np.any(np.abs(np.sum(cohort_data, axis=1) - row_totals) > 5):
        raise ValueError('Row totals do not match observed.')
    return row_names, column_names, cohort_data
def analyze_2005():
    """Run the shared analysis on the 2005 table.

    Returns:
        Whatever ``analyze_generic`` returns when given a zero-argument
        callable that loads and parses the 2005 sheet.
    """
    def parse_func():
        # Loading is deferred into the callback so that analyze_generic
        # decides when the sheet is actually read and parsed.
        return parse_2005(load_2005())

    return analyze_generic(parse_func)