Skip to content

Commit

Permalink
Merge pull request #137 from mini-kep/dev1
Browse files Browse the repository at this point in the history
merge to master
  • Loading branch information
epogrebnyak committed Dec 12, 2017
2 parents 104520e + 116b131 commit 4e60802
Show file tree
Hide file tree
Showing 7 changed files with 242 additions and 205 deletions.
48 changes: 29 additions & 19 deletions issues/namelist.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# part of issue https://github.com/mini-kep/db/blob/master/doc/listing.md

import itertools
import fnmatch

Expand Down Expand Up @@ -52,7 +50,12 @@
"UST_7YEAR",
"WAGE_NOMINAL_rub",
"WAGE_REAL_rog",
"WAGE_REAL_yoy"
"WAGE_REAL_yoy",
"ZZZ"]


def get_names():
    """Return the module-level NAMES list."""
    return NAMES

def extract_varname(label):
words = label.split('_')
Expand All @@ -63,22 +66,29 @@ def is_matched(name, pat):
return fnmatch.fnmatch(varhead, pat)

def make_namelist(patterns, names):
    """Return a sorted list of the entries in *names* matched by *patterns*.

    Every (pattern, name) pair is tested with is_matched(), so a name that
    is matched by several patterns appears once per matching pattern.
    """
    matched = []
    for pat in patterns:
        for name in names:
            if is_matched(name, pat):
                matched.append(name)
    matched.sort()
    return matched

def find_orphans(patterns, names):
    """Return the entries of *names* that no pattern in *patterns* matches.

    The result is a list without duplicates. It is sorted because the
    iteration order of a set of strings is arbitrary and can differ
    between interpreter runs; sorting makes the output reproducible and
    consistent with make_namelist(), which also returns a sorted list.
    """
    found = set(make_namelist(patterns, names))
    return sorted(set(names) - found)


if __name__ == '__main__':
# https://github.com/mini-kep/db/blob/master/doc/listing.md
if __name__ == '__main__':
from collections import OrderedDict
concepts = OrderedDict()
concepts.update({'GDP': ['GDP*']})
concepts.update({'Output': ['IND*', 'TRANSPORT_FREIGHT']})
concepts.update({'Prices': ['CPI*']})
concepts.update({'Retail trade': ['CPI*']})
concepts.update({'Government - revenue': ['GOV_REVENUE*']})
concepts.update({'Government - spending': ['GOV_EXP*']})
concepts.update({'Government - surplus': ['GOV_SURPLUS*']})
concepts.update({'Labour': ['WAGE_*', 'UNEMPL']})
concepts.update({'Exchange rate': ['USDRUR*']})
concepts.update({'Global': ['UST*', 'BRENT']})
print(concepts)
categories = OrderedDict(gdp=['GDP*'],
output=['IND*', 'TRANSPORT_FREIGHT'],
i=['INVESTMENT'],
xpi=['CPI*', 'PPI*'],
retail=['RETAIL_SALES*'],
gov=['GOV*'],
labor=['WAGE_*', 'UNEMPL'],
bop=['EXPORT*', 'IMPORT*'],
fx=['USDRUR*'],
glob=['UST*', 'BRENT'])
cat2names = {key:make_namelist(categories[key], NAMES)
for key in categories.keys()}
patterns = [pat for patterns in categories.values() for pat in patterns]
orp = find_orphans(patterns, NAMES)
assert orp == ['ZZZ']

112 changes: 86 additions & 26 deletions issues/test_namelist.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,95 @@
import pytest

from namelist import NAMES, make_namelist

def test_make_namelist_on_asterisk_retuRns_expected_list_of_strings():
result = make_namelist(patterns=['WAGE_*'], names=NAMES)
assert result == ['WAGE_NOMINAL_rub', 'WAGE_REAL_rog', 'WAGE_REAL_yoy']
from namelist import make_namelist

def test_make_namelist_returns_sorted_list():
names = ['WAGE_Z', 'WAGE_A']
result = make_namelist(patterns=['WAGE_*'], names=names)
assert result == ['WAGE_A', 'WAGE_Z']

def test_make_namelist_on_string_returns_expected_list_of_names():
result = make_namelist(patterns=['UNEMPL'], names=NAMES)
assert result == ['UNEMPL_pct']

def test_make_namelist_misses_name_in_the_middle():
result = make_namelist(patterns=['WAGE_*'], names=['UNNECESSARY_WAGE_1'])
assert result == []
NAMES = [
"BRENT",
"CPI_ALCOHOL_rog",
"CPI_FOOD_rog",
"CPI_NONFOOD_rog",
"CPI_rog",
"CPI_SERVICES_rog",
"EXPORT_GOODS_bln_usd",
"GDP_bln_rub",
"GDP_yoy",
"GOV_EXPENSE_CONSOLIDATED_bln_rub",
"GOV_EXPENSE_FEDERAL_bln_rub",
"GOV_EXPENSE_SUBFEDERAL_bln_rub",
"GOV_REVENUE_CONSOLIDATED_bln_rub",
"GOV_REVENUE_FEDERAL_bln_rub",
"GOV_REVENUE_SUBFEDERAL_bln_rub",
"GOV_SURPLUS_FEDERAL_bln_rub",
"GOV_SURPLUS_SUBFEDERAL_bln_rub",
"IMPORT_GOODS_bln_usd",
"INDPRO_rog",
"INDPRO_yoy",
"INVESTMENT_bln_rub",
"INVESTMENT_rog",
"INVESTMENT_yoy",
"RETAIL_SALES_bln_rub",
"RETAIL_SALES_FOOD_bln_rub",
"RETAIL_SALES_FOOD_rog",
"RETAIL_SALES_FOOD_yoy",
"RETAIL_SALES_NONFOOD_bln_rub",
"RETAIL_SALES_NONFOOD_rog",
"RETAIL_SALES_NONFOOD_yoy",
"RETAIL_SALES_rog",
"RETAIL_SALES_yoy",
"TRANSPORT_FREIGHT_bln_tkm",
"UNEMPL_pct",
"USDRUR_CB",
"UST_10YEAR",
"UST_1MONTH",
"UST_1YEAR",
"UST_20YEAR",
"UST_2YEAR",
"UST_30YEAR",
"UST_3MONTH",
"UST_3YEAR",
"UST_5YEAR",
"UST_6MONTH",
"UST_7YEAR",
"WAGE_NOMINAL_rub",
"WAGE_REAL_rog",
"WAGE_REAL_yoy",
"ZZZ"]

def test_make_namelist_misses_missing_string():
result = make_namelist(patterns="ABC", names=['DEF', 'XYZ'])
assert result == []

def test_make_namelist_ignores_lowercase_pattern():
result = make_namelist(patterns="def", names=['DEF', 'XYZ'])
assert result == []

def test_make_namelist_ignores_lowercase_name():
result = make_namelist(patterns="def", names=['def', 'XYZ'])
assert result == []
class Test_make_namelist:
    """Checks for make_namelist() on wildcard and literal patterns."""

    def test_make_namelist_on_asterisk_returns_expected_list_of_strings(self):
        expected = ['WAGE_NOMINAL_rub', 'WAGE_REAL_rog', 'WAGE_REAL_yoy']
        assert make_namelist(patterns=['WAGE_*'], names=NAMES) == expected

    def test_make_namelist_returns_sorted_list(self):
        unsorted_names = ['WAGE_Z', 'WAGE_A']
        namelist = make_namelist(patterns=['WAGE_*'], names=unsorted_names)
        assert namelist == ['WAGE_A', 'WAGE_Z']

    def test_make_namelist_on_string_returns_expected_list_of_names(self):
        assert make_namelist(patterns=['UNEMPL'], names=NAMES) == ['UNEMPL_pct']

    def test_make_namelist_misses_name_in_the_middle(self):
        namelist = make_namelist(patterns=['WAGE_*'],
                                 names=['UNNECESSARY_WAGE_1'])
        assert namelist == []

    def test_make_namelist_misses_missing_string(self):
        # The pattern argument is deliberately a plain string here.
        assert make_namelist(patterns="ABC", names=['DEF', 'XYZ']) == []

    def test_make_namelist_ignores_lowercase_pattern(self):
        assert make_namelist(patterns="def", names=['DEF', 'XYZ']) == []

    def test_make_namelist_ignores_lowercase_name(self):
        assert make_namelist(patterns="def", names=['def', 'XYZ']) == []

if __name__ == '__main__':
pytest.main([__file__])
pytest.main([__file__])




4 changes: 2 additions & 2 deletions src/csv2df/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ def verify_tables(tables, pdef):
labels_in_tables = [t.label for t in tables]
labels_missed = [x for x in pdef.required if x not in labels_in_tables]
if labels_missed:
import pdb
pdb.set_trace()
#import pdb
#pdb.set_trace()
raise ValueError("Missed labels: {}".format(labels_missed))


Expand Down
23 changes: 19 additions & 4 deletions src/csv2df/specification.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from collections import OrderedDict as odict

from csv2df.util_label import make_label
from csv2df.util_row_splitter import FUNC_MAPPER



# mapper dictionary to convert text in table headers to unit of measurement
Expand Down Expand Up @@ -290,9 +290,7 @@ def get_bounds(self, rows):
ParsingCommand("INDPRO",
"Индекс промышленного производства",
["yoy", "rog"]),
ParsingCommand("UNEMPL",
["Уровень безработицы", "Общая численность безработных"],
"pct"),
# TODO: can transform to one variable WAGE after parsing
ParsingCommand("WAGE_NOMINAL",
["Среднемесячная номинальная начисленная заработная плата работников организаций",
"Среднемесячная номинальная начисленная заработная плата одного работника"],
Expand All @@ -304,6 +302,23 @@ def get_bounds(self, rows):
ParsingCommand("TRANSPORT_FREIGHT",
"Коммерческий грузооборот транспорта",
"bln_tkm"),
ParsingCommand("AGROPROD",
["Индекс производства продукции сельского хозяйства в хозяйствах всех категорий",
"Продукция сельского хозяйства в хозяйствах всех категорий"],
"yoy"),
ParsingCommand("UNEMPL",
["Уровень безработицы", "Общая численность безработных"],
"pct"),
#ParsingCommand("UNEMPL_REGISTERED",
# "Численность официально зарегистрированных безработных в государственных учреждениях службы занятости",
# "pct"),
ParsingCommand("PPI",
["Индексы цен производителей промышленных товаров"],
"rog"),




]
PARSING_DEFINITION['default'] = Def(commands=_commands)

Expand Down
84 changes: 27 additions & 57 deletions src/tests/test_validator.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,25 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 8 11:01:45 2017
@author: PogrebnyakEV
"""

import pytest
import pandas as pd
from io import StringIO

import validator as vldr

from validator import ValidatorAnnual, ValidatorQtr, ValidatorMonthly

ANNUAL = [('GDP_bln_rub', 1999, 4823.0),
('GDP_yoy', 1999, 106.4),
('AGROPROD_yoy', 1999, 103.8),
]
QTR = [('GDP_bln_rub', 1999, {4: 1447}),
('CPI_rog', 1999, {1: 116.0, 2: 107.3, 3: 105.6, 4: 103.9})
]

MONTHLY = [('CPI_rog', 1999, {1: 108.4, 6: 101.9, 12: 101.3}),
('EXPORT_GOODS_bln_usd', 1999, {12: 9.7}),
('IMPORT_GOODS_bln_usd', 1999, {12: 4.0})
]

def to_dataframe(text):
    """Parse tab-separated *text* into a pandas DataFrame."""
    buffer = StringIO(text)
    return pd.read_csv(buffer, sep="\t")


dfa_text = """ year CPI_ALCOHOL_rog CPI_FOOD_rog CPI_NONFOOD_rog CPI_SERVICES_rog CPI_rog EXPORT_GOODS_bln_usd GDP_bln_rub GDP_yoy GOV_EXPENSE_ACCUM_CONSOLIDATED_bln_rub GOV_EXPENSE_ACCUM_FEDERAL_bln_rub GOV_EXPENSE_ACCUM_SUBFEDERAL_bln_rub GOV_REVENUE_ACCUM_CONSOLIDATED_bln_rub GOV_REVENUE_ACCUM_FEDERAL_bln_rub GOV_REVENUE_ACCUM_SUBFEDERAL_bln_rub GOV_SURPLUS_ACCUM_FEDERAL_bln_rub GOV_SURPLUS_ACCUM_SUBFEDERAL_bln_rub IMPORT_GOODS_bln_usd INDPRO_yoy INVESTMENT_bln_rub INVESTMENT_yoy RETAIL_SALES_FOOD_bln_rub RETAIL_SALES_FOOD_yoy RETAIL_SALES_NONFOOD_bln_rub RETAIL_SALES_NONFOOD_yoy RETAIL_SALES_bln_rub RETAIL_SALES_yoy TRANSPORT_FREIGHT_bln_tkm UNEMPL_pct WAGE_NOMINAL_rub WAGE_REAL_yoy
1999-12-31 1999 143.2 135.0 139.2 134.0 136.5 75.6 4823.0 106.4 1258.0 666.9 653.8 1213.6 615.5 660.8 -51.4 7.0 39.5 670.4 105.3 866.1 93.6 931.3 94.7 1797.4 94.2 3372.0 13.0 1523.0 78.0
2000-12-31 2000 125.0 117.1 118.5 133.7 120.2 105.0 7306.0 110.0 1960.1 1029.2 1032.1 2097.7 1132.1 1065.8 102.9 33.8 44.9 1165.2 117.4 1093.2 107.5 1259.1 110.5 2352.3 109.0 3542.0 10.5 2223.0 120.9
Expand Down Expand Up @@ -59,50 +64,15 @@ def to_dataframe(text):
dfq = to_dataframe(dfq_text)
dfm = to_dataframe(dfm_text)


def yield_checkpoints():
for c in vldr.ANNUAL + vldr.QTR + vldr.MONTHLY:
for pt in vldr.serialise(c):
yield(pt)


assert vldr.CHECKPOINTS == list(yield_checkpoints())

checkpoint = ('a', 'GDP_bln_rub', 1999, 4823.0)
assert next(
vldr.serialise(checkpoint)) == {
'freq': 'a',
'label': 'GDP_bln_rub',
'period': False,
'value': 4823.0,
'year': 1999}

checkpoint2 = ('q', 'CPI_rog', 1999, {1: 116.0, 2: 107.3, 3: 105.6, 4: 103.9})
gen2 = vldr.serialise(checkpoint2)
assert next(gen2) == {
'freq': 'q',
'label': 'CPI_rog',
'year': 1999,
'period': 1,
'value': 116.0}
assert next(gen2) == {
'freq': 'q',
'label': 'CPI_rog',
'year': 1999,
'period': 2,
'value': 107.3}

checker = vldr.Validator(dfa, dfq, dfm)

pt = {
'freq': 'a',
'label': 'GDP_bln_rub',
'period': False,
'value': 4823.0,
'year': 1999}
z = checker.get_value(pt)
assert z == 4823
assert checker.is_included(pt)

for p in vldr.CHECKPOINTS:
assert checker.is_included(p)
#FIXME: split tests
def test_validate():
    """Compare each validator's not_found() result with the expected list."""
    cases = [
        (ValidatorAnnual, dfa, ANNUAL, ['AGROPROD_yoy']),
        (ValidatorQtr, dfq, QTR, []),
        (ValidatorMonthly, dfm, MONTHLY, []),
    ]
    for validator_cls, frame, checkpoints, expected in cases:
        assert validator_cls(frame, checkpoints).not_found() == expected

if __name__ == "__main__":
pytest.main([__file__])

0 comments on commit 4e60802

Please sign in to comment.