From d01b1a7998872cab57c8281ff95452210bd342b3 Mon Sep 17 00:00:00 2001 From: Alexander Belikov Date: Sun, 17 Jul 2022 23:56:09 +0200 Subject: [PATCH 1/3] clean up --- graph_cast/architecture/schema.py | 1 + graph_cast/input/json_aux.py | 141 ++++++++++++++---------------- run/arango/ingest_csv.py | 8 +- run/arango/ingest_json.py | 13 +-- 4 files changed, 72 insertions(+), 91 deletions(-) diff --git a/graph_cast/architecture/schema.py b/graph_cast/architecture/schema.py index 0b9b1e13..84050126 100644 --- a/graph_cast/architecture/schema.py +++ b/graph_cast/architecture/schema.py @@ -43,6 +43,7 @@ def __init__( self._numeric_fields = numeric_fields # set of filters self._filters = [Filter(**item) for item in filters] + # currently not used self._transforms = [Transform(**item) for item in transforms] diff --git a/graph_cast/input/json_aux.py b/graph_cast/input/json_aux.py index 50837696..952d6f66 100644 --- a/graph_cast/input/json_aux.py +++ b/graph_cast/input/json_aux.py @@ -39,7 +39,6 @@ def apply_mapper(mapper: Dict, document, vertex_config: VertexConfig): if "transforms" in mapper: for t in mapper["transforms"]: t_ = Transform(**t) - # doc_.update(transform_foo(t, document)) doc_.update(transform_foo(t_, document)) if "map" in mapper: @@ -283,15 +282,6 @@ def pick_indexed_items_anchor_logic(items, indices, set_spec, anchor_key="anchor return items_ -def assign_edge_label(edges, label, condition): - edges_new = [(u, v, label) if condition(u, v) else (u, v, {}) for u, v in edges] - return edges_new - - -def clean_arobas(item): - return {k: v for k, v in item.items() if k[0] != "@"} - - def project_dict(item, keys, how="include"): if how == "include": return {k: v for k, v in item.items() if k in keys} @@ -308,17 +298,6 @@ def project_dicts(items, keys, how="include"): raise ValueError(f" `how` should be exclude or include : instead {how}") -def clean_aux_fields(pack): - pack_out = {} - for k, cpack in pack.items(): - if k != "@edges": - pack_out[k] = [clean_arobas(x) for x in cpack] - else: - pack_out[k] = [ - (clean_arobas(x[0]), clean_arobas(x[1]), x[2:]) for x in cpack - ] - return pack_out - def parse_edges(croot, edge_acc, mapping_fields): # TODO push mapping_fields etc to architecture @@ -353,11 +332,12 @@ def parse_edges(croot, edge_acc, mapping_fields): def merge_documents(docs, main_key="_key", anchor_key="anchor", anchor_value="main"): """ - docs contain docs with main_key and without + docs contain documents with main_key and documents without all docs without main_key should be merged with the doc that has doc[anchor_key] == anchor_value :param docs: :param main_key: :param anchor_key: + :param anchor_value: :return: list of docs, each of which contains main_key """ mains_, mains, auxs, anchors = [], [], [], [] @@ -431,57 +411,66 @@ def smart_merge( return agg -def get_json_data(source, pattern=None): - if source[-2:] == "gz": - open_foo = gzip.GzipFile - else: - open_foo = open - - with open_foo(source, "rb") as fp: - if pattern: - fps = FPSmart(fp, pattern) - else: - fps = fp - data = json.load(fps) - return data - - -# def foo_parallel(data, kwargs, n=None): -# func = partial(process_document_top, **kwargs) -# n_proc = 4 -# if n is not None: -# data = data[:n] -# with mp.Pool(n_proc) as p: -# r = p.map(func, data) -# return r - - -def parse_config(config=None): - """ - only parse_edges depends on json - - :param config: - :param prefix: - :return: - """ - - ( - vmap, - index_fields_dict, - extra_indices, - vfields, - blank_collections, - ) = parse_vcollection(config) - - edge_def, excl_fields = parse_edges(config["json"], [], defaultdict(list)) - - graphs_definition = define_graphs(edge_def, vmap) - graphs_definition = update_graph_extra_edges( - graphs_definition, vmap, config["extra_edges"] - ) - - vcollections = list( - set([graphs_definition[g]["source"] for g in graphs_definition]) - | set([graphs_definition[g]["target"] for g in graphs_definition]) - ) - return vcollections, vmap, graphs_definition, index_fields_dict, extra_indices +# def assign_edge_label(edges, label, condition): +# edges_new = [(u, v, label) if condition(u, v) else (u, v, {}) for u, v in edges] +# return edges_new + + +# def clean_arobas(item): +# return {k: v for k, v in item.items() if k[0] != "@"} + +# def clean_aux_fields(pack): +# pack_out = {} +# for k, cpack in pack.items(): +# if k != "@edges": +# pack_out[k] = [clean_arobas(x) for x in cpack] +# else: +# pack_out[k] = [ +# (clean_arobas(x[0]), clean_arobas(x[1]), x[2:]) for x in cpack +# ] +# return pack_out + +# def get_json_data(source, pattern=None): +# if source[-2:] == "gz": +# open_foo = gzip.GzipFile +# else: +# open_foo = open +# +# with open_foo(source, "rb") as fp: +# if pattern: +# fps = FPSmart(fp, pattern) +# else: +# fps = fp +# data = json.load(fps) +# return data +# +# +# def parse_config(config=None): +# """ +# only parse_edges depends on json +# +# :param config: +# :param prefix: +# :return: +# """ +# +# ( +# vmap, +# index_fields_dict, +# extra_indices, +# vfields, +# blank_collections, +# ) = parse_vcollection(config) +# +# edge_def, excl_fields = parse_edges(config["json"], [], defaultdict(list)) +# +# graphs_definition = define_graphs(edge_def, vmap) +# graphs_definition = update_graph_extra_edges( +# graphs_definition, vmap, config["extra_edges"] +# ) +# +# vcollections = list( +# set([graphs_definition[g]["source"] for g in graphs_definition]) +# | set([graphs_definition[g]["target"] for g in graphs_definition]) +# ) +# return vcollections, vmap, graphs_definition, index_fields_dict, extra_indices diff --git a/run/arango/ingest_csv.py b/run/arango/ingest_csv.py index 49b295d4..8a549588 100644 --- a/run/arango/ingest_csv.py +++ b/run/arango/ingest_csv.py @@ -1,7 +1,7 @@ import argparse import yaml import logging -from graph_cast.db.arango import get_arangodb_client +from graph_cast.db import ConfigFactory from graph_cast.main import ingest_csvs logger = logging.getLogger(__name__) @@ -94,14 +94,12 @@ with open(args.config_path, "r") as f: config = yaml.load(f, Loader=yaml.FullLoader) - db_client = get_arangodb_client( - args.protocol, args.id_addr, args.port, args.db, args.cred_name, args.cred_pass - ) + conn_conf = ConfigFactory.create_config(args=config) ingest_csvs( args.path, config, - db_client, + conn_conf, limit_files=args.limit_files, max_lines=args.max_lines, clean_start=args.clean_start, diff --git a/run/arango/ingest_json.py b/run/arango/ingest_json.py index 6936376b..4ecf559c 100644 --- a/run/arango/ingest_json.py +++ b/run/arango/ingest_json.py @@ -3,7 +3,7 @@ from os.path import expanduser from graph_cast.main import ingest_json_files -from graph_cast.db.arango import get_arangodb_client +from graph_cast.db import ConfigFactory import logging logger = logging.getLogger(__name__) @@ -95,19 +95,12 @@ logging.basicConfig(filename="ingest_json.log", level=logging.INFO) - db_client = get_arangodb_client( - protocol=args.protocol, - ip_addr=args.id_addr, - port=args.port, - database=args.db, - cred_name=args.login_name, - cred_pass=args.login_password, - ) + conn_conf = ConfigFactory.create_config(args=args) ingest_json_files( expanduser(args.datapath), config=config_, - conn_conf=db_client, + conn_conf=conn_conf, keyword=args.keyword, clean_start=clean_start, ) From fd78853186a25a32827cec0116eec1a39074660b Mon Sep 17 00:00:00 2001 From: Alexander Belikov Date: Sat, 22 Oct 2022 17:59:38 +0200 Subject: [PATCH 2/3] update of weights rules --- conf/json/kg_v0.yaml | 4 +++- conf/json/wos.yaml | 9 ++++++--- graph_cast/input/json_aux.py | 32 +++++++++++++++++++++++--------- 3 files changed, 32 insertions(+), 13 deletions(-) diff --git a/conf/json/kg_v0.yaml b/conf/json/kg_v0.yaml index 848e7918..4f949f23 100644 --- a/conf/json/kg_v0.yaml +++ b/conf/json/kg_v0.yaml @@ -9,7 +9,9 @@ json: name: concept vertex: - name: publication - field: id + mapper: + id: publication + edges: - how: all diff --git a/conf/json/wos.yaml b/conf/json/wos.yaml index c7956ad6..45763a2d 100644 --- a/conf/json/wos.yaml +++ b/conf/json/wos.yaml @@ -14,7 +14,8 @@ json: name: publication condition: anchor: main - field: _key + mapper: + _key: publication - source: name: contributor @@ -25,7 +26,8 @@ json: name: publication condition: anchor: main - field: _key + mapper: + _key: publication - source: name: publisher @@ -36,7 +38,8 @@ json: name: publication condition: anchor: main - field: _key + mapper: + _key: publication edges: - how: all diff --git a/graph_cast/input/json_aux.py b/graph_cast/input/json_aux.py index 012ca7ee..768e62d7 100644 --- a/graph_cast/input/json_aux.py +++ b/graph_cast/input/json_aux.py @@ -145,20 +145,34 @@ def add_weights(mapper, agg): if "vertex" in edge_def: for item in edge_def["vertex"]: + # item + # name: publication + # condition: + # anchor: main + # keys: + # mapper: + # k1: q1 + + # should rather become index for the given vcollection (item["name"]) + keys_to_add = item["keys"] if "keys" in item else [] + keys_to_map = item["mapper"] if "mapper" in item else {} + keys_to_map.update({k: k for k in keys_to_add}) + vs = [doc for doc in agg[item["name"]]] if "condition" in item.keys(): c = item["condition"] - vs = [doc for doc in vs if all([q in doc for q in c])] + vs = [ + doc + for doc in vs + if all([doc[q] == v in doc for q, v in c.items()]) + ] if vs: + # TODO : possible issue doc = vs[0] - if "condition" not in item.keys() or ( - "condition" in item.keys() - and all([doc[k] == v for k, v in c.items()]) - ): - for edoc in edges: - edoc["attributes"].update( - {item["name"]: doc[item["field"]]} - ) + for edoc in edges: + edoc["attributes"].update( + {q: doc[k] for k, q in keys_to_map.items()} + ) agg[(source, target)] = edges return agg From 66ab4dc31f0c6ddfa814b3df7cf447dd9670efe4 Mon Sep 17 00:00:00 2001 From: Alexander Belikov Date: Sat, 22 Oct 2022 18:32:40 +0200 Subject: [PATCH 3/3] adding prettifiers --- .pre-commit-config.yaml | 38 +- Makefile | 12 +- conf/json/freshcaller.yaml | 57 +- conf/json/kg_v0.yaml | 122 +- conf/json/wos.yaml | 2161 ++++++++++++++---------------- conf/table/ibes.yaml | 170 ++- conf/table/ticker.yaml | 170 ++- conf/table/wos.yaml | 213 ++- poetry.lock | 135 +- pyproject.toml | 2 + test/data/merge_anchor_test.json | 98 +- test/ref/freshcaller_sizes.yaml | 5 +- test/ref/kg_v0_sizes.yaml | 10 +- 13 files changed, 1542 insertions(+), 1651 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 749aad32..27479d3d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,37 +1,49 @@ fail_fast: true repos: - - repo: https://github.com/myint/autoflake +- repo: https://github.com/myint/autoflake rev: v1.4 hooks: - - id: autoflake + - id: autoflake args: - - --in-place - - --remove-unused-variables + - --in-place + - --remove-unused-variables # - --remove-all-unused-imports - - repo: https://github.com/ambv/black +- repo: https://github.com/ambv/black rev: 22.3.0 hooks: - - id: black + - id: black args: [--diff, --check, -l 79] - - repo: https://github.com/PyCQA/isort +- repo: https://github.com/PyCQA/isort rev: 5.10.1 hooks: - - id: isort + - id: isort args: - - --line-length=79 - - --src=graph_cast - - repo: https://github.com/pre-commit/mirrors-mypy + - --line-length=79 + - --src=graph_cast +- repo: https://github.com/pre-commit/mirrors-mypy rev: v0.971 hooks: - - id: mypy + - id: mypy # args: -# - --module lm_service +# - --module graph_cast exclude: (^test/|^run/) # args: [--strict] +- repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks + rev: v2.4.0 + hooks: + - id: pretty-format-yaml + args: [--autofix, --indent, '4', --preserve-quotes] + +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: pretty-format-json + args: [--autofix, --indent, '4'] + # - repo: https://github.com/pre-commit/mirrors-pylint # rev: v2.14.5 # hooks: diff --git a/Makefile b/Makefile index 115b2192..7f9117a3 100644 --- a/Makefile +++ b/Makefile @@ -21,10 +21,18 @@ isort: autoflake: autoflake --remove-unused-variables --verbose --in-place ./graph_cast/**/*py -all: autoflake black isort mypy +.PHONY: prettyyaml +prettyyaml: + find . -name "*yaml" -and -not -ipath './.*' -type f | xargs pretty-format-yaml --autofix --indent 4 + +.PHONY: prettyjson +prettyjson: + find . -name "*json" -and -not -ipath './.*' -type f | xargs pretty-format-json --autofix --indent 4 + +all: autoflake black isort mypy prettyyaml prettyjson #.PHONY: pylint #pylint: -# pylint lm_service +# pylint package_name diff --git a/conf/json/freshcaller.yaml b/conf/json/freshcaller.yaml index da4d4925..f256b387 100644 --- a/conf/json/freshcaller.yaml +++ b/conf/json/freshcaller.yaml @@ -3,51 +3,44 @@ general: json: type: item edges: - - - how: all - source: - name: participant - target: - name: call - maps: - - - how: dict + - how: all + source: + name: participant + target: name: call - - - type: item - descend_key: participants + maps: + - how: dict + name: call + - type: item + descend_key: participants + maps: + - type: list maps: - - - type: list - maps: - - - type: item - maps: - - - how: dict - name: participant + - type: item + maps: + - how: dict + name: participant vertex_collections: collections: call: basename: calls fields: - - id - - created_time + - id + - created_time index: - - id + - id participant: basename: participants fields: - - id - - participant_type + - id + - participant_type index: - - id + - id extra_index: - - - type: hash - unique: false - fields: - - issn + - type: hash + unique: false + fields: + - issn dummy: basename: dummies diff --git a/conf/json/kg_v0.yaml b/conf/json/kg_v0.yaml index 4f949f23..9d4fd4fb 100644 --- a/conf/json/kg_v0.yaml +++ b/conf/json/kg_v0.yaml @@ -3,25 +3,24 @@ general: json: type: item weights: - - source: - name: concept - target: - name: concept - vertex: - - name: publication - mapper: - id: publication + - source: + name: concept + target: + name: concept + vertex: + - name: publication + mapper: + id: publication edges: - - - how: all - source: - name: publication - target: - name: concept - anchor: meta - fields: - - anchor + - how: all + source: + name: publication + target: + name: concept + anchor: meta + fields: + - anchor # weights: # - source: # name: publication @@ -33,53 +32,44 @@ json: # condition: # anchor: main maps: - - - how: dict - name: publication - map: - publication_id: id + - how: dict + name: publication + map: + publication_id: id - - - type: item - descend_key: triples + - type: item + descend_key: triples + maps: + - type: list maps: - - - type: list + - type: item + edges: + - how: all + source: + name: concept + anchor: meta + target: + name: concept + anchor: core + weight_exclusive: + - tritype + maps: + - type: item + descend_key: triple_meta + maps: + - how: dict + __extra: + anchor: meta + name: concept + - type: list + descend_key: triple maps: - - - type: item - edges: - - - how: all - source: - name: concept - anchor: meta - target: - name: concept - anchor: core - weight_exclusive: - - tritype - maps: - - - type: item - descend_key: triple_meta - maps: - - - how: dict - __extra: - anchor: meta - name: concept - - - type: list - descend_key: triple - maps: - - - how: dict - __extra: - anchor: core - name: concept - map: - type: tritype + - how: dict + __extra: + anchor: core + name: concept + map: + type: tritype vertex_collections: @@ -87,14 +77,14 @@ vertex_collections: publication: basename: publications fields: - - id + - id index: - - id + - id concept: basename: concepts fields: - - hash - - text + - hash + - text index: - - hash + - hash diff --git a/conf/json/wos.yaml b/conf/json/wos.yaml index 45763a2d..1dc49cf4 100644 --- a/conf/json/wos.yaml +++ b/conf/json/wos.yaml @@ -4,1346 +4,1145 @@ json: type: item descend_key: REC weights: - - - source: - name: location - target: - name: organization - vertex: - - - name: publication - condition: - anchor: main - mapper: - _key: publication - - - source: - name: contributor - target: - name: location - vertex: - - - name: publication - condition: - anchor: main - mapper: - _key: publication - - - source: - name: publisher - target: - name: location - vertex: - - - name: publication - condition: - anchor: main - mapper: - _key: publication - edges: - - - how: all - source: - name: contributor - anchor: main - weight_exclusive: - - seq_no - - role - target: - name: publication - anchor: main - fields: - - anchor - - - how: all - source: - name: publication - anchor: main - target: - name: conference - - - how: all - source: - name: publication - anchor: main - fields: - - anchor - target: - name: date - anchor: main - - - how: all - source: - name: publication - anchor: main - fields: - - anchor - target: - name: medium_title - anchor: main - - - how: all - source: - name: publisher - anchor: main - weight_exclusive: - - seq_no - target: - name: publication - anchor: main - fields: - - anchor - - - how: all - source: - name: publication - anchor: main - target: - name: publication_type - - - how: all - source: - name: publication - anchor: main - target: - name: document_type - - - how: all - source: - name: publication - anchor: main - target: - name: language - weight_exclusive: - - type - - - how: all - source: - name: publication - anchor: main - target: - name: bib_id - - - how: all - source: - name: publication - anchor: main - target: - name: heading - - - how: all - source: - name: publication - anchor: main - target: - name: subheading - - - how: all - source: - name: publication - anchor: main - target: - name: subject - - - how: all - source: - name: publication - anchor: main - target: - name: some_id - - - how: all - source: - name: publication - anchor: main - target: - name: medium - - - how: all - source: - name: medium - target: - name: medium_title - anchor: main - - - how: all - source: - name: publication - anchor: main - target: - name: publication - anchor: reference - - - how: 1-n - source: - name: contributor - field: addr_no - anchor: main - type: list - all_value: 0 - target: - name: location - field: addr_no - anchor: main - type: value - - - how: all - source: - name: publication - anchor: main - target: - name: agency - - - how: all - source: - name: publication + - source: + name: location + target: + name: organization + vertex: + - name: publication + condition: anchor: main - target: - name: grant_id - - - how: all - source: - name: publication + mapper: + _key: publication + - source: + name: contributor + target: + name: location + vertex: + - name: publication + condition: anchor: main - target: - name: funding_text - - - how: all - source: - name: publication + mapper: + _key: publication + - source: + name: publisher + target: + name: location + vertex: + - name: publication + condition: anchor: main - target: - name: keyword - - - how: all - source: - name: publication - anchor: main - target: - name: abstract - - - how: all - source: - name: publication - anchor: main - target: - name: edition - maps: - - - how: dict + mapper: + _key: publication + edges: + - how: all + source: + name: contributor + anchor: main + weight_exclusive: + - seq_no + - role + target: name: publication - __extra: - anchor: main - map: - UID: _key - - - type: item - descend_key: static_data + anchor: main + fields: + - anchor + - how: all + source: + name: publication + anchor: main + target: + name: conference + - how: all + source: + name: publication + anchor: main + fields: + - anchor + target: + name: date + anchor: main + - how: all + source: + name: publication + anchor: main + fields: + - anchor + target: + name: medium_title + anchor: main + - how: all + source: + name: publisher + anchor: main + weight_exclusive: + - seq_no + target: + name: publication + anchor: main + fields: + - anchor + - how: all + source: + name: publication + anchor: main + target: + name: publication_type + - how: all + source: + name: publication + anchor: main + target: + name: document_type + - how: all + source: + name: publication + anchor: main + target: + name: language + weight_exclusive: + - type + - how: all + source: + name: publication + anchor: main + target: + name: bib_id + - how: all + source: + name: publication + anchor: main + target: + name: heading + - how: all + source: + name: publication + anchor: main + target: + name: subheading + - how: all + source: + name: publication + anchor: main + target: + name: subject + - how: all + source: + name: publication + anchor: main + target: + name: some_id + - how: all + source: + name: publication + anchor: main + target: + name: medium + - how: all + source: + name: medium + target: + name: medium_title + anchor: main + - how: all + source: + name: publication + anchor: main + target: + name: publication + anchor: reference + - how: 1-n + source: + name: contributor + field: addr_no + anchor: main + type: list + all_value: 0 + target: + name: location + field: addr_no + anchor: main + type: value + - how: all + source: + name: publication + anchor: main + target: + name: agency + - how: all + source: + name: publication + anchor: main + target: + name: grant_id + - how: all + source: + name: publication + anchor: main + target: + name: funding_text + - how: all + source: + name: publication + anchor: main + target: + name: keyword + - how: all + source: + name: publication + anchor: main + target: + name: abstract + - how: all + source: + name: publication + anchor: main + target: + name: edition + maps: + - how: dict + name: publication + __extra: + anchor: main + map: + UID: _key + - type: item + descend_key: static_data + maps: + - type: item + descend_key: summary maps: - - - type: item - descend_key: summary + - type: item + descend_key: EWUID + maps: + - type: list + descend_key: edition maps: - - - type: item - descend_key: EWUID + - how: dict + name: edition + map: + '@value': name + - type: item + descend_key: pub_info + maps: + - how: dict + name: publication + __extra: + anchor: main + map: + '@has_abstract': has_abstract + '@vol': volume + '@issue': issue + '@supplement': supplement + - how: dict + name: date + __extra: + anchor: main + transforms: + - foo: parse_date_standard + module: graph_cast.util.transform + input: + - '@sortdate' + output: + - year + - month + - day + - how: dict + name: publication_type + map: + '@pubtype': type + - type: item + descend_key: page + maps: + - how: dict + name: publication + __extra: + anchor: main + map: + '@begin': first_page + '@end': last_page + '@page_count': page_count + '#text': str_pages + - type: item + descend_key: titles + maps: + - type: list + descend_key: title + maps: + - how: dict + name: publication + __extra: + anchor: main + filter: + '@type': item + map: + '#text': title + - how: dict + name: medium_title + __extra: + anchor: main + unfilter: + '@type': item + map: + '#text': title + - type: item + descend_key: names + maps: + - type: list + descend_key: name + maps: + - type: item + maps: + - how: dict + name: contributor + __extra: + anchor: main + map: + '@seq_no': seq_no + '@addr_no': addr_no + '@dais_id': dais_id + '@role': role + email_addr: email + - type: item + descend_key: doctypes + maps: + - type: list + descend_key: doctype + maps: + - how: dict + name: document_type + map: + '#text': name + - type: item + descend_key: publishers + maps: + - type: list + descend_key: publisher + maps: + - type: item + edges: + - how: all + source: + name: publisher + weight_exclusive: + - seq_no + - role + target: + name: location + maps: + - type: item + descend_key: address_spec maps: - - - type: list - descend_key: edition - maps: - - - how: dict - name: edition - map: - '@value': name - - - type: item - descend_key: pub_info + - how: dict + name: location + map: + '@addr_no': addr_no + - type: item + descend_key: names maps: - - - how: dict - name: publication + - type: list + descend_key: name + maps: + - how: dict + name: publisher __extra: anchor: main map: - '@has_abstract': has_abstract - '@vol': volume - '@issue': issue - '@supplement': supplement - - - how: dict - name: date - __extra: - anchor: main - transforms: - - - foo: parse_date_standard - module: graph_cast.util.transform - input: - - '@sortdate' - output: - - year - - month - - day - - - how: dict - name: publication_type - map: - '@pubtype': type - - - type: item - descend_key: page - maps: - - - how: dict - name: publication - __extra: - anchor: main - map: - '@begin': first_page - '@end': last_page - '@page_count': page_count - '#text': str_pages - - - type: item - descend_key: titles + '@seq_no': seq_no + '@role': role + - type: item + descend_key: conferences + maps: + - type: list + descend_key: conference + edges: + - how: all + source: + name: conference + target: + name: conference_title + - how: all + source: + name: conference + target: + name: conference_info + - how: all + source: + name: conference + target: + name: conference_sponsor + - how: all + source: + name: conference + target: + name: location + - how: all + source: + name: conference + target: + name: date + maps: + - how: dict + name: conference + map: + '@conf_id': id + - type: item + descend_key: conf_titles + maps: + - type: list + descend_key: conf_title maps: - - - type: list - descend_key: title - maps: - - - how: dict - name: publication - __extra: - anchor: main - filter: - '@type': item - map: - '#text': title - - - how: dict - name: medium_title - __extra: - anchor: main - unfilter: - '@type': item - map: - '#text': title - - - type: item - descend_key: names + - how: dict + name: conference_title + map: + '#text': title + - type: item + descend_key: conf_infos + maps: + - type: list + descend_key: conf_info maps: - - - type: list - descend_key: name - maps: - - - type: item - maps: - - - how: dict - name: contributor - __extra: - anchor: main - map: - '@seq_no': seq_no - '@addr_no': addr_no - '@dais_id': dais_id - '@role': role - email_addr: email - - - type: item - descend_key: doctypes + - how: dict + name: conference_info + map: + '#text': text + - type: item + descend_key: conf_dates + maps: + - type: list + descend_key: conf_date maps: - - - type: list - descend_key: doctype - maps: - - - how: dict - name: document_type - map: - '#text': name - - - type: item - descend_key: publishers + - how: dict + name: date + transforms: + - foo: parse_date_conf + module: graph_cast.util.transform + input: + - '@conf_start' + output: + - year + - month + - day + - type: item + descend_key: conf_locations + maps: + - type: list + descend_key: conf_location + edges: + - how: all + source: + name: location + target: + name: organization + weight_exclusive: + - anchor maps: - - - type: list - descend_key: publisher - maps: - - - type: item - edges: - - - how: all - source: - name: publisher - weight_exclusive: - - seq_no - - role - target: - name: location - maps: - - - type: item - descend_key: address_spec - maps: - - - how: dict - name: location - map: - '@addr_no': addr_no - - - type: item - descend_key: names - maps: - - - type: list - descend_key: name - maps: - - - how: dict - name: publisher - __extra: - anchor: main - map: - '@seq_no': seq_no - '@role': role - - - type: item - descend_key: conferences + - how: dict + name: location + __extra: + anchor: conference + map: + conf_city: city + conf_state: state + - how: dict + name: organization + __extra: + anchor: conference + map: + conf_host: name + - type: item + descend_key: sponsors + maps: + - type: list + descend_key: sponsor maps: - - - type: list - descend_key: conference - edges: - - - how: all - source: - name: conference - target: - name: conference_title - - - how: all - source: - name: conference - target: - name: conference_info - - - how: all - source: - name: conference - target: - name: conference_sponsor - - - how: all - source: - name: conference - target: - name: location - - - how: all - source: - name: conference - target: - name: date - maps: - - - how: dict - name: conference - map: - '@conf_id': id - - - type: item - descend_key: conf_titles - maps: - - - type: list - descend_key: conf_title - maps: - - - how: dict - name: conference_title - map: - '#text': title - - - type: item - descend_key: conf_infos - maps: - - - type: list - descend_key: conf_info - maps: - - - how: dict - name: conference_info - map: - '#text': text - - - type: item - descend_key: conf_dates - maps: - - - type: list - descend_key: conf_date - maps: - - - how: dict - name: date - transforms: - - - foo: parse_date_conf - module: graph_cast.util.transform - input: - - '@conf_start' - output: - - year - - month - - day - - - type: item - descend_key: conf_locations - maps: - - - type: list - descend_key: conf_location - edges: - - - how: all - source: - name: location - target: - name: organization - weight_exclusive: - - anchor - maps: - - - how: dict - name: location - __extra: - anchor: conference - map: - conf_city: city - conf_state: state - - - how: dict - name: organization - __extra: - anchor: conference - map: - conf_host: name - - - type: item - descend_key: sponsors - maps: - - - type: list - descend_key: sponsor - maps: - - - how: dict - name: conference_sponsor - map: - '#text': name - - - type: item - descend_key: fullrecord_metadata + - how: dict + name: conference_sponsor + map: + '#text': name + - type: item + descend_key: fullrecord_metadata + maps: + - type: item + descend_key: languages + maps: + - type: list + descend_key: language maps: - - - type: item - descend_key: languages - maps: - - - type: list - descend_key: language - maps: - - - how: dict - name: language - map: - '#text': name - '@type': type - - - type: item - descend_key: normalized_languages - maps: - - - type: list - descend_key: language - maps: - - - how: dict - name: language - map: - '#text': name - '@type': type - - - type: item - descend_key: normalized_doctypes - maps: - - - type: list - descend_key: doctype - maps: - - - how: dict - name: document_type - map: - '#text': name - - - type: item - descend_key: references - maps: - - - type: list - descend_key: reference - maps: - - - type: item - edges: - - - how: all - source: - name: publication - target: - name: date - - - how: all - source: - name: contributor - target: - name: publication - values: - reference: true - - - how: all - source: - name: publication - target: - name: medium_title - maps: - - - how: dict - name: publication - __extra: - anchor: reference - map: - uid: _key - xref_doi: doi - page: first_page - citedTitle: title - - - type: item - descend_key: year - maps: - - - how: dict - name: date - transforms: - - - foo: parse_date_reference - module: graph_cast.util.transform - input: - - '#text' - output: - - year - - - how: dict - name: medium_title - map: - citedWork: title - - - how: dict - descend_key: reference - name: contributor - map: - citedAuthor: wos_standard - - - type: item - descend_key: addresses + - how: dict + name: language + map: + '#text': name + '@type': type + - type: item + descend_key: normalized_languages + maps: + - type: list + descend_key: language + maps: + - how: dict + name: language + map: + '#text': name + '@type': type + - type: item + descend_key: normalized_doctypes + maps: + - type: list + descend_key: doctype + maps: + - how: dict + name: document_type + map: + '#text': name + - type: item + descend_key: references + maps: + - type: list + descend_key: reference + maps: + - type: item + edges: + - how: all + source: + name: publication + target: + name: date + - how: all + source: + name: contributor + target: + name: publication + values: + reference: true + - how: all + source: + name: publication + target: + name: medium_title + maps: + - how: dict + name: publication + __extra: + anchor: reference + map: + uid: _key + xref_doi: doi + page: first_page + citedTitle: title + - type: item + descend_key: year maps: - - - type: list - descend_key: address_name - maps: - - - type: item - descend_key: address_spec - edges: - - - how: all - source: - name: location - target: - name: organization - weight_exclusive: - - pref - - - how: all - source: - name: organization - target: - name: suborganization - maps: - - - how: dict - name: location - __extra: - anchor: main - map: - '@addr_no': addr_no - - - type: item - descend_key: organizations - maps: - - - type: list - descend_key: organization - maps: - - - how: dict - name: organization - map: - '#text': name - '@pref': pref - - - type: item - descend_key: suborganizations - maps: - - - type: list - descend_key: suborganization - maps: - - - how: dict - name: suborganization - map: - '#text': name - - - type: item - descend_key: reprint_addresses + - how: dict + name: date + transforms: + - foo: parse_date_reference + module: graph_cast.util.transform + input: + - '#text' + output: + - year + - how: dict + name: medium_title + map: + citedWork: title + - how: dict + descend_key: reference + name: contributor + map: + citedAuthor: wos_standard + - type: item + descend_key: addresses + maps: + - type: list + descend_key: address_name + maps: + - type: item + descend_key: address_spec + edges: + - how: all + source: + name: location + target: + name: organization + weight_exclusive: + - pref + - how: all + source: + name: organization + target: + name: suborganization + maps: + - how: dict + name: location + __extra: + anchor: main + map: + '@addr_no': addr_no + - type: item + descend_key: organizations maps: - - - type: list - descend_key: address_name - edges: - - - how: 1-n - source: - name: contributor - field: addr_no - anchor: reprint - type: list - all_value: 0 - target: - name: location - anchor: reprint - field: addr_no - type: value - maps: - - - type: item - descend_key: address_spec - edges: - - - how: all - source: - name: location - target: - name: organization - weight_exclusive: - - pref - - - how: all - source: - name: organization - target: - name: suborganization - maps: - - - how: dict - name: location - __extra: - anchor: reprint - map: - '@addr_no': addr_no - - - type: item - descend_key: organizations - maps: - - - type: list - descend_key: organization - maps: - - - how: dict - name: organization - map: - '#text': name - '@pref': pref - - - type: item - descend_key: suborganizations - maps: - - - type: list - descend_key: suborganization - maps: - - - how: dict - name: suborganization - map: - '#text': name - - - type: item - descend_key: names - maps: - - - type: list - descend_key: name - maps: - - - type: item - maps: - - - how: dict - name: contributor - __extra: - anchor: reprint - map: - '@seq_no': seq_no - '@addr_no': addr_no - '@dais_id': dais_id - email_addr: email - - - type: item - descend_key: category_info + - type: list + descend_key: organization + maps: + - how: dict + name: organization + map: + '#text': name + '@pref': pref + - type: item + descend_key: suborganizations maps: - - - type: item - descend_key: headings - maps: - - - type: list - descend_key: heading - maps: - - - how: dict - name: heading - map: - '#text': name - - - type: item - descend_key: subheadings - maps: - - - type: list - descend_key: subheading - maps: - - - how: dict - name: subheading - map: - '#text': name - - - type: item - descend_key: subjects - maps: - - - type: list - descend_key: subject - maps: - - - how: dict - name: subject - map: - '#text': name - - - type: item - descend_key: fund_ack + - type: list + descend_key: suborganization + maps: + - how: dict + name: suborganization + map: + '#text': name + - type: item + descend_key: reprint_addresses + maps: + - type: list + descend_key: address_name + edges: + - how: 1-n + source: + name: contributor + field: addr_no + anchor: reprint + type: list + all_value: 0 + target: + name: location + anchor: reprint + field: addr_no + type: value + maps: + - type: item + descend_key: address_spec + edges: + - how: all + source: + name: location + target: + name: organization + weight_exclusive: + - pref + - how: all + source: + name: organization + target: + name: suborganization + maps: + - how: dict + name: location + __extra: + anchor: reprint + map: + '@addr_no': addr_no + - type: item + descend_key: organizations maps: - - - type: item - descend_key: fund_text - maps: - - - type: list - descend_key: p - maps: - - - how: dict - name: funding_text - map: - '#text': text - - - type: item - descend_key: grants - maps: - - - type: list - descend_key: grant - edges: - - - how: all - source: - name: agency - target: - name: grant_id - maps: - - - how: dict - name: agency - map: - grant_agency: name - - - type: item - descend_key: grant_ids - maps: - - - type: list - descend_key: grant_id - maps: - - - how: dict - name: grant_id - map: - '#text': id - - - type: item - descend_key: keywords + - type: list + descend_key: organization + maps: + - how: dict + name: organization + map: + '#text': name + '@pref': pref + - type: item + descend_key: suborganizations maps: - - - type: list - descend_key: keyword - maps: - - - how: dict - name: keyword - map: - '#text': name - - - type: item - descend_key: abstracts + - type: list + descend_key: suborganization + maps: + - how: dict + name: suborganization + map: + '#text': name + - type: item + descend_key: names + maps: + - type: list + descend_key: name maps: - - - type: list - descend_key: abstract - maps: - - - type: item - descend_key: abstract_text - maps: - - - type: list - descend_key: p - maps: - - - how: dict - name: abstract - map: - '#text': text - - - type: item - descend_key: item + - type: item + maps: + - how: dict + name: contributor + __extra: + anchor: reprint + map: + '@seq_no': seq_no + '@addr_no': addr_no + '@dais_id': dais_id + email_addr: email + - type: item + descend_key: category_info + maps: + - type: item + descend_key: headings + maps: + - type: list + descend_key: heading + maps: + - how: dict + name: heading + map: + '#text': name + - type: item + descend_key: subheadings + maps: + - type: list + descend_key: subheading + maps: + - how: dict + name: subheading + map: + '#text': name + - type: item + descend_key: subjects + maps: + - type: list + descend_key: subject + maps: + - how: dict + name: subject + map: + '#text': name + - type: item + descend_key: fund_ack + maps: + - type: item + descend_key: fund_text maps: - - - type: item - descend_key: ids + - type: list + descend_key: p + maps: + - how: dict + name: funding_text + map: + '#text': text + - type: item + descend_key: grants + maps: + - type: list + descend_key: grant + edges: + - how: all + source: + name: agency + target: + name: grant_id + maps: + - how: dict + name: agency + map: + grant_agency: name + - type: item + descend_key: grant_ids maps: - - - how: dict - name: some_id + - type: list + descend_key: grant_id + maps: + - how: dict + name: grant_id map: '#text': id - - - how: dict - name: bib_id - map: - bib_id: id - - - type: item - descend_key: keywords_plus + - type: item + descend_key: keywords + maps: + - type: list + descend_key: keyword + maps: + - how: dict + name: keyword + map: + '#text': name + - type: item + descend_key: abstracts + maps: + - type: list + descend_key: abstract + maps: + - type: item + descend_key: abstract_text + maps: + - type: list + descend_key: p maps: - - - type: list - descend_key: keyword_plus - maps: - - - how: dict - name: keyword - map: - '#text': name - - - type: item - descend_key: dynamic_data + - how: dict + name: abstract + map: + '#text': text + - type: item + descend_key: item maps: - - - type: item - descend_key: cluster_related + - type: item + descend_key: ids + maps: + - how: dict + name: some_id + map: + '#text': id + - how: dict + name: bib_id + map: + bib_id: id + - type: item + descend_key: keywords_plus + maps: + - type: list + descend_key: keyword_plus maps: - - - type: item - descend_key: identifiers - maps: - - - type: list - descend_key: identifier - maps: - - - how: dict - name: medium - filter: - '@type': issn - map: - '@value': issn - - - how: dict - name: publication - filter: - '@type': xref_doi - map: - '@value': doi + - how: dict + name: keyword + map: + '#text': name + - type: item + descend_key: dynamic_data + maps: + - type: item + descend_key: cluster_related + maps: + - type: item + descend_key: identifiers + maps: + - type: list + descend_key: identifier + maps: + - how: dict + name: medium + filter: + '@type': issn + map: + '@value': issn + - how: dict + name: publication + filter: + '@type': xref_doi + map: + '@value': doi vertex_collections: collections: publication: basename: publications fields: - - _key - - volume - - issue - - title - - accession_no - - has_abstract - - doi - - funding_text - - art_no - - first_page - - last_page - - page_count - - str_pages + - _key + - volume + - issue + - title + - accession_no + - has_abstract + - doi + - funding_text + - art_no + - first_page + - last_page + - page_count + - str_pages index: - - _key + - _key transforms: - - - foo: lower - class: str - fields: - - title + - foo: lower + class: str + fields: + - title medium: basename: media fields: - - issn - - isbn - - eissn - - eisbn - - accession_no - - meeting_abs - - parent_book_doi + - issn + - isbn + - eissn + - eisbn + - accession_no + - meeting_abs + - parent_book_doi index: - - issn - - isbn + - issn + - isbn extra_index: - - - type: hash - unique: false - fields: - - issn + - type: hash + unique: false + fields: + - issn medium_title: basename: media_title fields: - - title + - title index: - - title + - title transforms: - - - foo: lower - class: str - fields: - - title + - foo: lower + class: str + fields: + - title language: basename: languages fields: - - name + - name index: - - name + - name transforms: - - - foo: lower - class: str - fields: - - name + - foo: lower + class: str + fields: + - name contributor: basename: contributors fields: - - first_name - - last_name - - email - - display_name - - wos_standard - - orcid_id - - r_id - - dais_id + - first_name + - last_name + - email + - display_name + - wos_standard + - orcid_id + - r_id + - dais_id index: - - first_name - - last_name - - email - - orcid_id - - r_id - - dais_id - - wos_standard + - first_name + - last_name + - email + - orcid_id + - r_id + - dais_id + - wos_standard transforms: - - - foo: lower - class: str - fields: - - first_name - - last_name - - display_name - - wos_standard - - - foo: standardize - module: graph_cast.util.transform - fields: - - wos_standard + - foo: lower + class: str + fields: + - first_name + - last_name + - display_name + - wos_standard + - foo: standardize + module: graph_cast.util.transform + fields: + - wos_standard location: basename: locations fields: - - country - - state - - city - - street - - full_address + - country + - state + - city + - street + - full_address index: - - country - - state - - city + - country + - state + - city extra_index: - - - type: hash - unique: false - fields: - - country + - type: hash + unique: false + fields: + - country transforms: - - - foo: lower - class: str - fields: - - country - - state - - city - - street - - full_address + - foo: lower + class: str + fields: + - country + - state + - city + - street + - full_address organization: basename: organizations fields: - - name + - name index: - - name + - name transforms: - - - foo: lower - class: str - fields: - - name + - foo: lower + class: str + fields: + - name suborganization: basename: suborganizations fields: - - name + - name index: - - name + - name transforms: - - - foo: lower - class: str - fields: - - name + - foo: lower + class: str + fields: + - name publication_type: basename: publication_types fields: - - type + - type index: - - type + - type document_type: basename: document_types fields: - - name + - name index: - - name + - name subject: basename: subjects fields: - - name + - name index: - - name + - name transforms: - - - foo: lower - class: str - fields: - - name + - foo: lower + class: str + fields: + - name keyword: basename: keywords fields: - - name + - name index: - - name + - name transforms: - - - foo: lower - class: str - fields: - - name + - foo: lower + class: str + fields: + - name edition: basename: editions fields: - - name + - name index: - - name + - name abstract: basename: abstracts fields: - - text + - text index: - - text + - text agency: basename: agencies fields: - - name + - name index: - - name + - name transforms: - - - foo: lower - class: str - fields: - - name + - foo: lower + class: str + fields: + - name grant_id: basename: grant_ids fields: - - id + - id index: - - id + - id heading: basename: headings fields: - - name + - name index: - - name + - name transforms: - - - foo: lower - class: str - fields: - - name + - foo: lower + class: str + fields: + - name subheading: basename: subheadings fields: - - name + - name index: - - name + - name transforms: - - - foo: lower - class: str - fields: - - name + - foo: lower + class: str + fields: + - name funding_text: basename: funding_texts fields: - - text + - text index: - - text + - text date: basename: dates fields: - - year - - month - - day + - year + - month + - day index: - - year - - month - - day + - year + - month + - day transforms: - - - foo: try_int - module: graph_cast.util.transform - fields: - - year - - month - - day + - foo: try_int + module: graph_cast.util.transform + fields: + - year + - month + - day publisher: basename: publishers fields: - - display_name - - full_name + - display_name + - full_name index: - - display_name + - display_name transforms: - - - foo: lower - class: str - fields: - - display_name + - foo: lower + class: str + fields: + - display_name conference: basename: conferences fields: - - id + - id index: - - id + - id conference_title: basename: conference_titles fields: - - title + - title index: - - title + - title transforms: - - - foo: lower - class: str - fields: - - title + - foo: lower + class: str + fields: + - title conference_info: basename: conference_infos fields: - - text + - text index: - - text + - text conference_sponsor: basename: conference_sponsors fields: - - name + - name index: - - name + - name transforms: - - - foo: lower - class: str - fields: - - name + - foo: lower + class: str + fields: + - name some_id: basename: some_ids fields: - - id + - id index: - - id + - id bib_id: basename: bib_ids fields: - - id + - id index: - - id + - id edge_collections: extra: - - - source: contributor - target: organization - by: publication - weight: - _key: pubid - year: year + - source: contributor + target: organization + by: publication + weight: + _key: pubid + year: year extra: merge_collections: - - publication \ No newline at end of file + - publication diff --git a/conf/table/ibes.yaml b/conf/table/ibes.yaml index 2c9e9b92..9c206d7b 100644 --- a/conf/table/ibes.yaml +++ b/conf/table/ibes.yaml @@ -1,118 +1,108 @@ general: name: ibes csv: - - - tabletype: ibes - encoding: ISO-8859-1 - transforms: - - foo: parse_date_ibes - module: graph_cast.util.transform - input: - - ANNDATS - - ANNTIMS - output: - - datetime_announce - - foo: parse_date_ibes - module: graph_cast.util.transform - input: - - REVDATS - - REVTIMS - output: - - datetime_review - - foo: cast_ibes_analyst - module: graph_cast.util.transform - input: - - ANALYST - output: - - last_name - - initial - vertex_collections: - - - type: ticker - map: - CUSIP: cusip - CNAME: cname - OFTIC: oftic - - - type: agency - map: - ESTIMID: aname - - - type: analyst - - - type: recommendation - map: - ERECCD: erec - ETEXT: etext - IRECCD: irec - ITEXT: itext - - - type: publication +- tabletype: ibes + encoding: ISO-8859-1 + transforms: + - foo: parse_date_ibes + module: graph_cast.util.transform + input: + - ANNDATS + - ANNTIMS + output: + - datetime_announce + - foo: parse_date_ibes + module: graph_cast.util.transform + input: + - REVDATS + - REVTIMS + output: + - datetime_review + - foo: cast_ibes_analyst + module: graph_cast.util.transform + input: + - ANALYST + output: + - last_name + - initial + vertex_collections: + - type: ticker + map: + CUSIP: cusip + CNAME: cname + OFTIC: oftic + - type: agency + map: + ESTIMID: aname + - type: analyst + - type: recommendation + map: + ERECCD: erec + ETEXT: etext + IRECCD: irec + ITEXT: itext + - type: publication vertex_collections: blanks: - - publication + - publication collections: publication: basename: publications fields: - - datetime_review - - datetime_announce + - datetime_review + - datetime_announce extra_index: - - type: hash - unique: false - fields: - - datetime_review - - type: hash - unique: false - fields: - - datetime_announce + - type: hash + unique: false + fields: + - datetime_review + - type: hash + unique: false + fields: + - datetime_announce ticker: basename: tickers fields: - - cusip - - cname - - oftic + - cusip + - cname + - oftic index: - - cusip - - cname - - oftic + - cusip + - cname + - oftic agency: basename: agencies fields: - - aname + - aname index: - - aname + - aname analyst: basename: analysts fields: - - last_name - - initial + - last_name + - initial index: - - last_name - - initial + - last_name + - initial recommendation: basename: recommendations fields: - - erec - - etext - - irec - - itext + - erec + - etext + - irec + - itext index: - - irec + - irec edge_collections: main: - - - source: publication - target: ticker - - - source: analyst - target: agency - weight: - - datetime_review - - datetime_announce - - - source: analyst - target: publication - - - source: publication - target: recommendation + - source: publication + target: ticker + - source: analyst + target: agency + weight: + - datetime_review + - datetime_announce + - source: analyst + target: publication + - source: publication + target: recommendation diff --git a/conf/table/ticker.yaml b/conf/table/ticker.yaml index 2684fa92..2ed384b0 100644 --- a/conf/table/ticker.yaml +++ b/conf/table/ticker.yaml @@ -1,107 +1,97 @@ general: name: ticker_history csv: - - - tabletype: _all - transforms: - - foo: round_str - module: graph_cast.util.transform - maps: - - - input: - - Open - - - input: - - Close - params: - ndigits: 3 - - foo: int - module: builtins - maps: - - - input: - - Volume - - foo: parse_date_yahoo - module: graph_cast.util.transform - input: - - Date - output: - - t_obs - vertex_collections: - - - type: ticker - map: - _filename: oftic - - - type: feature - map: - Open: - key: name - - - type: feature - map: - Close: - key: name - - - type: feature - map: - Volume: - key: name +- tabletype: _all + transforms: + - foo: round_str + module: graph_cast.util.transform + maps: + - input: + - Open + - input: + - Close + params: + ndigits: 3 + - foo: int + module: builtins + maps: + - input: + - Volume + - foo: parse_date_yahoo + module: graph_cast.util.transform + input: + - Date + output: + - t_obs + vertex_collections: + - type: ticker + map: + _filename: oftic + - type: feature + map: + Open: + key: name + - type: feature + map: + Close: + key: name + - type: feature + map: + Volume: + key: name vertex_collections: collections: ticker: basename: tickers fields: - - cusip - - cname - - oftic + - cusip + - cname + - oftic index: - - cusip - - cname - - oftic + - cusip + - cname + - oftic feature: basename: features fields: - - name - - value + - name + - value index: - - name - - value + - name + - value extra_index: - - type: hash - unique: false - fields: - - value - - type: hash - unique: false - fields: - - name + - type: hash + unique: false + fields: + - value + - type: hash + unique: false + fields: + - name filters: - - - a: - field: name - foo: __eq__ - value: Open - b: - field: value - foo: __gt__ - value: 0 - - - a: - field: name - foo: __eq__ - value: Close - b: - field: value - foo: __gt__ - value: 0 + - a: + field: name + foo: __eq__ + value: Open + b: + field: value + foo: __gt__ + value: 0 + - a: + field: name + foo: __eq__ + value: Close + b: + field: value + foo: __gt__ + value: 0 edge_collections: main: - - source: ticker - target: feature - weight: - - t_obs - index: - - fields: - - t_obs - unique: false \ No newline at end of file + - source: ticker + target: feature + weight: + - t_obs + index: + - fields: + - t_obs + unique: false diff --git a/conf/table/wos.yaml b/conf/table/wos.yaml index 60113c0d..9f328721 100644 --- a/conf/table/wos.yaml +++ b/conf/table/wos.yaml @@ -1,149 +1,126 @@ general: - name: wos + name: wos csv: - - - tabletype: publications - vertex_collections: - - - type: publication - map: - wos_id: _key - pubyear: year - pubmonth: month - pubday: day - - - type: medium - map: - source: title - - - type: language - - - tabletype: contributors - vertex_collections: - - - type: contributor - - - type: publication - map: - wos_id: _key - - - tabletype: institutions - vertex_collections: - - - type: organization - - - type: publication - map: - wos_id: _key - - - tabletype: refs - vertex_collections: - - - type: publication - map: - wos_id: _key - - - type: publication - map: - uid: _key +- tabletype: publications + vertex_collections: + - type: publication + map: + wos_id: _key + pubyear: year + pubmonth: month + pubday: day + - type: medium + map: + source: title + - type: language +- tabletype: contributors + vertex_collections: + - type: contributor + - type: publication + map: + wos_id: _key +- tabletype: institutions + vertex_collections: + - type: organization + - type: publication + map: + wos_id: _key +- tabletype: refs + vertex_collections: + - type: publication + map: + wos_id: _key + - type: publication + map: + uid: _key vertex_collections: collections: publication: basename: publications fields: - - _key - - accession_no - - title - - year - - month - - day + - _key + - accession_no + - title + - year + - month + - day numeric_fields: - - year - - month - - day + - year + - month + - day index: - - _key + - _key extra_index: - - - type: hash - unique: false - fields: - - title - - - type: hash - unique: false - fields: - - year + - type: hash + unique: false + fields: + - title + - type: hash + unique: false + fields: + - year medium: basename: media fields: - - issn - - isbn - - title - - eissn - - eisbn + - issn + - isbn + - title + - eissn + - eisbn index: - - issn - - isbn - - title + - issn + - isbn + - title extra_index: - - - type: hash - unique: false - fields: - - issn + - type: hash + unique: false + fields: + - issn language: basename: languages fields: - - language + - language index: - - language + - language contributor: basename: contributors fields: - - first_name - - last_name + - first_name + - last_name index: - - first_name - - last_name + - first_name + - last_name organization: basename: organizations fields: - - organization - - country - - city + - organization + - country + - city index: - - organization - - country - - city + - organization + - country + - city extra_index: - - - type: hash - unique: false - fields: - - country + - type: hash + unique: false + fields: + - country edge_collections: main: - - - source: publication - target: medium - - - source: publication - target: language - - - source: contributor - target: publication - - - source: organization - target: publication - - - source: publication - target: publication + - source: publication + target: medium + - source: publication + target: language + - source: contributor + target: publication + - source: organization + target: publication + - source: publication + target: publication extra: - - - source: contributor - target: organization - by: publication - weight: - _key: pubid - year: year + - source: contributor + target: organization + by: publication + weight: + _key: pubid + year: year diff --git a/poetry.lock b/poetry.lock index 27a9a4eb..9986dda4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -77,6 +77,31 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "config-formatter" +version = "1.1.0" +description = "An automatic formatter for .ini and .cfg configuration files" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +configupdater = ">=3.0" + +[package.extras] +dev = ["pytest (>=6.0.0)", "pre-commit (>=2.17.0)"] + +[[package]] +name = "configupdater" +version = "3.1.1" +description = "Parser like ConfigParser but for updating configuration files" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.extras] +testing = ["sphinx", "flake8", "pytest", "pytest-cov", "pytest-virtualenv", "pytest-xdist"] + [[package]] name = "distlib" version = "0.3.5" @@ -130,6 +155,22 @@ requirements_deprecated_finder = ["pipreqs", "pip-api"] colors = ["colorama (>=0.4.3,<0.5.0)"] plugins = ["setuptools"] +[[package]] +name = "language-formatters-pre-commit-hooks" +version = "2.4.0" +description = "List of pre-commit hooks meant to format your source code." +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +config-formatter = "*" +packaging = "*" +requests = "*" +"ruamel.yaml" = "*" +toml-sort = "*" +tomlkit = "*" + [[package]] name = "mypy" version = "0.971" @@ -201,6 +242,17 @@ category = "main" optional = false python-versions = ">=3.8" +[[package]] +name = "packaging" +version = "21.3" +description = "Core utilities for Python packages" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" + [[package]] name = "pandas" version = "1.4.3" @@ -258,6 +310,18 @@ pyyaml = ">=5.1" toml = "*" virtualenv = ">=20.0.8" +[[package]] +name = "pre-commit-hooks" +version = "4.3.0" +description = "Some out-of-the-box hooks for pre-commit." +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +"ruamel.yaml" = ">=0.15" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} + [[package]] name = "pyflakes" version = "2.5.0" @@ -288,6 +352,17 @@ dev = ["sphinx", "sphinx-rtd-theme", "zope.interface", "cryptography (>=3.3.1)", docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"] tests = ["pytest (>=6.0.0,<7.0.0)", "coverage[toml] (==5.0.4)"] +[[package]] +name = "pyparsing" +version = "3.0.9" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +category = "main" +optional = false +python-versions = ">=3.6.8" + +[package.extras] +diagrams = ["railroad-diagrams", "jinja2"] + [[package]] name = "python-arango" version = "7.4.1" @@ -362,6 +437,29 @@ python-versions = "*" [package.dependencies] requests = ">=2.0.1,<3.0.0" +[[package]] +name = "ruamel.yaml" +version = "0.17.21" +description = "ruamel.yaml is a YAML parser/emitter that supports roundtrip preservation of comments, seq/map flow style, and map key order" +category = "main" +optional = false +python-versions = ">=3" + +[package.dependencies] +"ruamel.yaml.clib" = {version = ">=0.2.6", markers = "platform_python_implementation == \"CPython\" and python_version < \"3.11\""} + +[package.extras] +docs = ["ryd"] +jinja2 = ["ruamel.yaml.jinja2 (>=0.2)"] + +[[package]] +name = "ruamel.yaml.clib" +version = "0.2.7" +description = "C version of reader, parser and emitter for ruamel.yaml derived from libyaml" +category = "main" +optional = false +python-versions = ">=3.5" + [[package]] name = "setuptools" version = "63.4.2" @@ -391,6 +489,17 @@ category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +[[package]] +name = "toml-sort" +version = "0.20.1" +description = "Toml sorting library" +category = "main" +optional = false +python-versions = ">=3.7,<4.0" + +[package.dependencies] +tomlkit = ">=0.8.0" + [[package]] name = "tomli" version = "2.0.1" @@ -399,6 +508,14 @@ category = "main" optional = false python-versions = ">=3.7" +[[package]] +name = "tomlkit" +version = "0.11.5" +description = "Style preserving TOML library" +category = "main" +optional = false +python-versions = ">=3.6,<4.0" + [[package]] name = "types-pyyaml" version = "6.0.11" @@ -456,7 +573,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "585faec2b525dbdf189eb20f97636dae52afd8e625d815ca89e3f69f4e27b4c6" +content-hash = "2ba8a7304ec906d232c798e3ec3fa97413113cfc66ef72871c386e88a350f001" [metadata.files] autoflake = [] @@ -499,6 +616,8 @@ colorama = [ {file = "colorama-0.4.5-py2.py3-none-any.whl", hash = "sha256:854bf444933e37f5824ae7bfc1e98d5bce2ebe4160d46b5edf346a89358e99da"}, {file = "colorama-0.4.5.tar.gz", hash = "sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4"}, ] +config-formatter = [] +configupdater = [] distlib = [] filelock = [] identify = [] @@ -507,6 +626,7 @@ idna = [ {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, ] isort = [] +language-formatters-pre-commit-hooks = [] mypy = [] mypy-extensions = [ {file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"}, @@ -516,6 +636,10 @@ neo4j = [] networkx = [] nodeenv = [] numpy = [] +packaging = [ + {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, + {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, +] pandas = [ {file = "pandas-1.4.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d51674ed8e2551ef7773820ef5dab9322be0828629f2cbf8d1fc31a0c4fed640"}, {file = "pandas-1.4.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:16ad23db55efcc93fa878f7837267973b61ea85d244fc5ff0ccbcfa5638706c5"}, @@ -548,6 +672,7 @@ platformdirs = [ {file = "platformdirs-2.5.2.tar.gz", hash = "sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"}, ] pre-commit = [] +pre-commit-hooks = [] pyflakes = [] pygraphviz = [ {file = "pygraphviz-1.9.zip", hash = "sha256:fa18f7c6cea28341a4e466ed0cf05682b0a68288afe8dd7c9426782f7c1ae01c"}, @@ -556,6 +681,10 @@ pyjwt = [ {file = "PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"}, {file = "PyJWT-2.4.0.tar.gz", hash = "sha256:d42908208c699b3b973cbeb01a969ba6a96c821eefb1c5bfe4c390c01d67abba"}, ] +pyparsing = [ + {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, + {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, +] python-arango = [ {file = "python-arango-7.4.1.tar.gz", hash = "sha256:20dfa59a84f5b0a7344b3c053e9f6fb6f6e9ba1cc2eea7290d3f1b52e5cbdfa0"}, {file = "python_arango-7.4.1-py3-none-any.whl", hash = "sha256:2c05d0a0d74754cc2ed36a0f39546215de82bc955cb364592a12bf07c5705fce"}, @@ -608,6 +737,8 @@ requests-toolbelt = [ {file = "requests-toolbelt-0.9.1.tar.gz", hash = "sha256:968089d4584ad4ad7c171454f0a5c6dac23971e9472521ea3b6d49d610aa6fc0"}, {file = "requests_toolbelt-0.9.1-py2.py3-none-any.whl", hash = "sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f"}, ] +"ruamel.yaml" = [] +"ruamel.yaml.clib" = [] setuptools = [] six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, @@ -617,10 +748,12 @@ toml = [ {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, ] +toml-sort = [] tomli = [ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +tomlkit = [] types-pyyaml = [] typing-extensions = [] urllib3 = [] diff --git a/pyproject.toml b/pyproject.toml index f79c72e6..443676da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,8 @@ isort = "^5.10.1" autoflake = "^1.4" pre-commit = "^2.20.0" types-PyYAML = "^6.0.11" +language-formatters-pre-commit-hooks = "^2.4.0" +pre-commit-hooks = "^4.3.0" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/test/data/merge_anchor_test.json b/test/data/merge_anchor_test.json index 4108e594..93a9f8cd 100644 --- a/test/data/merge_anchor_test.json +++ b/test/data/merge_anchor_test.json @@ -1,74 +1,74 @@ [ { - "title":"SURVEY OF THE FINISH CHARACTERISTICS OF MACHINED OPTICAL-SURFACES" + "title": "SURVEY OF THE FINISH CHARACTERISTICS OF MACHINED OPTICAL-SURFACES" }, { - "_key":"WOS:A1979HV28900009", - "anchor":"reference", - "first_page":"93", - "title":"MEASUREMENT OF SURFACE TEXTURE AND TOPOGRAPHY BY DIFFERENTIAL LIGHT-SCATTERING", - "volume":"57" + "_key": "WOS:A1979HV28900009", + "anchor": "reference", + "first_page": "93", + "title": "MEASUREMENT OF SURFACE TEXTURE AND TOPOGRAPHY BY DIFFERENTIAL LIGHT-SCATTERING", + "volume": "57" }, { - "_key":"WOS:A1982PY45800018", - "anchor":"reference", - "first_page":"189", - "title":"SPECTRAL-ANALYSIS OF THE FINISH OF POLISHED OPTICAL-SURFACES", - "volume":"83" + "_key": "WOS:A1982PY45800018", + "anchor": "reference", + "first_page": "189", + "title": "SPECTRAL-ANALYSIS OF THE FINISH OF POLISHED OPTICAL-SURFACES", + "volume": "83" }, { - "_key":"WOS:A1985AHD0600004.2", - "anchor":"reference", - "volume":"24" + "_key": "WOS:A1985AHD0600004.2", + "anchor": "reference", + "volume": "24" }, { - "anchor":"main", - "has_abstract":"N", - "issue":"3", - "volume":"24" + "anchor": "main", + "has_abstract": "N", + "issue": "3", + "volume": "24" }, { - "_key":"WOS:A1977DT56200008", - "anchor":"reference", - "doi":"10.1117/12.7972054", - "first_page":"360", - "title":"MEASUREMENT OF FINISH OF DIAMOND-TURNED METAL-SURFACES BY DIFFERENTIAL LIGHT-SCATTERING", - "volume":"16" + "_key": "WOS:A1977DT56200008", + "anchor": "reference", + "doi": "10.1117/12.7972054", + "first_page": "360", + "title": "MEASUREMENT OF FINISH OF DIAMOND-TURNED METAL-SURFACES BY DIFFERENTIAL LIGHT-SCATTERING", + "volume": "16" }, { - "_key":"WOS:A1985AHD0600005", - "anchor":"main" + "_key": "WOS:A1985AHD0600005", + "anchor": "main" }, { - "_key":"WOS:A1984SL25000002", - "anchor":"reference", - "first_page":"101", - "title":"AN OPTICAL PROFILOMETER FOR SURFACE CHARACTERIZATION OF MAGNETIC MEDIA", - "volume":"27" + "_key": "WOS:A1984SL25000002", + "anchor": "reference", + "first_page": "101", + "title": "AN OPTICAL PROFILOMETER FOR SURFACE CHARACTERIZATION OF MAGNETIC MEDIA", + "volume": "27" }, { - "anchor":"main", - "first_page":"396", - "last_page":"403", - "page_count":"8", - "str_pages":"396-403" + "anchor": "main", + "first_page": "396", + "last_page": "403", + "page_count": "8", + "str_pages": "396-403" }, { - "_key":"WOS:A1984TZ60400012.2", - "anchor":"reference" + "_key": "WOS:A1984TZ60400012.2", + "anchor": "reference" }, { - "_key":"WOS:A1983SU56500014", - "anchor":"reference", - "first_page":"105", - "title":"DIRECT COMPARISON OF MECHANICAL AND OPTICAL MEASUREMENTS OF THE FINISH OF PRECISION-MACHINED SURFACES", - "volume":"429" + "_key": "WOS:A1983SU56500014", + "anchor": "reference", + "first_page": "105", + "title": "DIRECT COMPARISON OF MECHANICAL AND OPTICAL MEASUREMENTS OF THE FINISH OF PRECISION-MACHINED SURFACES", + "volume": "429" }, { - "_key":"WOS:A1984AAJ5100004", - "anchor":"reference", - "first_page":"18", - "title":"STATISTICAL EFFECTS IN THE MEASUREMENT AND CHARACTERIZATION OF SMOOTH SCATTERING SURFACES", - "volume":"511" + "_key": "WOS:A1984AAJ5100004", + "anchor": "reference", + "first_page": "18", + "title": "STATISTICAL EFFECTS IN THE MEASUREMENT AND CHARACTERIZATION OF SMOOTH SCATTERING SURFACES", + "volume": "511" } -] \ No newline at end of file +] diff --git a/test/ref/freshcaller_sizes.yaml b/test/ref/freshcaller_sizes.yaml index 15da0613..132d62ef 100644 --- a/test/ref/freshcaller_sizes.yaml +++ b/test/ref/freshcaller_sizes.yaml @@ -1,6 +1,5 @@ call: 1 participant: 2 -? !!python/tuple -- participant -- call +? - participant + - call : 2 diff --git a/test/ref/kg_v0_sizes.yaml b/test/ref/kg_v0_sizes.yaml index 568a1743..d14647c8 100644 --- a/test/ref/kg_v0_sizes.yaml +++ b/test/ref/kg_v0_sizes.yaml @@ -1,10 +1,8 @@ publication: 1 concept: 22 -? !!python/tuple -- concept -- concept +? - concept + - concept : 21 -? !!python/tuple -- publication -- concept +? - publication + - concept : 7