diff --git a/.mypy.ini b/.mypy.ini index e0edf5ac..614de251 100644 --- a/.mypy.ini +++ b/.mypy.ini @@ -1,3 +1,6 @@ [mypy] ignore_missing_imports = True pretty = True + +[mypy-yaml.*] +ignore_missing_imports = True diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 50e6ca89..27479d3d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,37 +1,49 @@ fail_fast: true repos: - - repo: https://github.com/myint/autoflake +- repo: https://github.com/myint/autoflake rev: v1.4 hooks: - - id: autoflake + - id: autoflake args: - - --in-place - - --remove-unused-variables + - --in-place + - --remove-unused-variables # - --remove-all-unused-imports - - repo: https://github.com/ambv/black +- repo: https://github.com/ambv/black rev: 22.3.0 hooks: - - id: black + - id: black args: [--diff, --check, -l 79] - - repo: https://github.com/PyCQA/isort +- repo: https://github.com/PyCQA/isort rev: 5.10.1 hooks: - - id: isort + - id: isort args: - - --line-length=79 - - --src=lm_service - - repo: https://github.com/pre-commit/mirrors-mypy + - --line-length=79 + - --src=graph_cast +- repo: https://github.com/pre-commit/mirrors-mypy rev: v0.971 hooks: - - id: mypy + - id: mypy # args: -# - --module lm_service - exclude: ^test/ +# - --module graph_cast + exclude: (^test/|^run/) # args: [--strict] +- repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks + rev: v2.4.0 + hooks: + - id: pretty-format-yaml + args: [--autofix, --indent, '4', --preserve-quotes] + +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: pretty-format-json + args: [--autofix, --indent, '4'] + # - repo: https://github.com/pre-commit/mirrors-pylint # rev: v2.14.5 # hooks: diff --git a/Makefile b/Makefile index 53325a32..7f9117a3 100644 --- a/Makefile +++ b/Makefile @@ -19,12 +19,20 @@ isort: .PHONY: autoflake autoflake: - autoflake --remove-unused-variables --verbose --in-place ./lm_service/**/*py + autoflake --remove-unused-variables --verbose --in-place ./graph_cast/**/*py -all: autoflake black isort mypy +.PHONY: prettyyaml +prettyyaml: + find . -name "*yaml" -and -not -ipath './.*' -type f | xargs pretty-format-yaml --autofix --indent 4 + +.PHONY: prettyjson +prettyjson: + find . -name "*json" -and -not -ipath './.*' -type f | xargs pretty-format-json --autofix --indent 4 + +all: autoflake black isort mypy prettyyaml prettyjson #.PHONY: pylint #pylint: -# pylint lm_service +# pylint package_name diff --git a/conf/json/freshcaller.yaml b/conf/json/freshcaller.yaml index da4d4925..f256b387 100644 --- a/conf/json/freshcaller.yaml +++ b/conf/json/freshcaller.yaml @@ -3,51 +3,44 @@ general: json: type: item edges: - - - how: all - source: - name: participant - target: - name: call - maps: - - - how: dict + - how: all + source: + name: participant + target: name: call - - - type: item - descend_key: participants + maps: + - how: dict + name: call + - type: item + descend_key: participants + maps: + - type: list maps: - - - type: list - maps: - - - type: item - maps: - - - how: dict - name: participant + - type: item + maps: + - how: dict + name: participant vertex_collections: collections: call: basename: calls fields: - - id - - created_time + - id + - created_time index: - - id + - id participant: basename: participants fields: - - id - - participant_type + - id + - participant_type index: - - id + - id extra_index: - - - type: hash - unique: false - fields: - - issn + - type: hash + unique: false + fields: + - issn dummy: basename: dummies diff --git a/conf/json/kg_v0.yaml b/conf/json/kg_v0.yaml index 848e7918..9d4fd4fb 100644 --- a/conf/json/kg_v0.yaml +++ b/conf/json/kg_v0.yaml @@ -3,23 +3,24 @@ general: json: type: item weights: - - source: - name: concept - target: - name: concept - vertex: - - name: publication - field: id + - source: + name: concept + target: + name: concept + vertex: + - name: publication + mapper: + id: publication + edges: - - - how: all - source: - name: publication - target: - name: concept - anchor: meta - fields: - - anchor + - how: all + source: + name: publication + target: + name: concept + anchor: meta + fields: + - anchor # weights: # - source: # name: publication @@ -31,53 +32,44 @@ json: # condition: # anchor: main maps: - - - how: dict - name: publication - map: - publication_id: id + - how: dict + name: publication + map: + publication_id: id - - - type: item - descend_key: triples + - type: item + descend_key: triples + maps: + - type: list maps: - - - type: list + - type: item + edges: + - how: all + source: + name: concept + anchor: meta + target: + name: concept + anchor: core + weight_exclusive: + - tritype + maps: + - type: item + descend_key: triple_meta + maps: + - how: dict + __extra: + anchor: meta + name: concept + - type: list + descend_key: triple maps: - - - type: item - edges: - - - how: all - source: - name: concept - anchor: meta - target: - name: concept - anchor: core - weight_exclusive: - - tritype - maps: - - - type: item - descend_key: triple_meta - maps: - - - how: dict - __extra: - anchor: meta - name: concept - - - type: list - descend_key: triple - maps: - - - how: dict - __extra: - anchor: core - name: concept - map: - type: tritype + - how: dict + __extra: + anchor: core + name: concept + map: + type: tritype vertex_collections: @@ -85,14 +77,14 @@ vertex_collections: publication: basename: publications fields: - - id + - id index: - - id + - id concept: basename: concepts fields: - - hash - - text + - hash + - text index: - - hash + - hash diff --git a/conf/json/wos.yaml b/conf/json/wos.yaml index c7956ad6..1dc49cf4 100644 --- a/conf/json/wos.yaml +++ b/conf/json/wos.yaml @@ -4,1343 +4,1145 @@ json: type: item descend_key: REC weights: - - - source: - name: location - target: - name: organization - vertex: - - - name: publication - condition: - anchor: main - field: _key - - - source: - name: contributor - target: - name: location - vertex: - - - name: publication - condition: - anchor: main - field: _key - - - source: - name: publisher - target: - name: location - vertex: - - - name: publication - condition: - anchor: main - field: _key - edges: - - - how: all - source: - name: contributor - anchor: main - weight_exclusive: - - seq_no - - role - target: - name: publication - anchor: main - fields: - - anchor - - - how: all - source: - name: publication - anchor: main - target: - name: conference - - - how: all - source: - name: publication - anchor: main - fields: - - anchor - target: - name: date - anchor: main - - - how: all - source: - name: publication - anchor: main - fields: - - anchor - target: - name: medium_title - anchor: main - - - how: all - source: - name: publisher - anchor: main - weight_exclusive: - - seq_no - target: - name: publication - anchor: main - fields: - - anchor - - - how: all - source: - name: publication - anchor: main - target: - name: publication_type - - - how: all - source: - name: publication - anchor: main - target: - name: document_type - - - how: all - source: - name: publication - anchor: main - target: - name: language - weight_exclusive: - - type - - - how: all - source: - name: publication - anchor: main - target: - name: bib_id - - - how: all - source: - name: publication - anchor: main - target: - name: heading - - - how: all - source: - name: publication - anchor: main - target: - name: subheading - - - how: all - source: - name: publication - anchor: main - target: - name: subject - - - how: all - source: - name: publication - anchor: main - target: - name: some_id - - - how: all - source: - name: publication - anchor: main - target: - name: medium - - - how: all - source: - name: medium - target: - name: medium_title - anchor: main - - - how: all - source: - name: publication - anchor: main - target: - name: publication - anchor: reference - - - how: 1-n - source: - name: contributor - field: addr_no - anchor: main - type: list - all_value: 0 - target: - name: location - field: addr_no - anchor: main - type: value - - - how: all - source: - name: publication - anchor: main - target: - name: agency - - - how: all - source: - name: publication + - source: + name: location + target: + name: organization + vertex: + - name: publication + condition: anchor: main - target: - name: grant_id - - - how: all - source: - name: publication + mapper: + _key: publication + - source: + name: contributor + target: + name: location + vertex: + - name: publication + condition: anchor: main - target: - name: funding_text - - - how: all - source: - name: publication + mapper: + _key: publication + - source: + name: publisher + target: + name: location + vertex: + - name: publication + condition: anchor: main - target: - name: keyword - - - how: all - source: - name: publication - anchor: main - target: - name: abstract - - - how: all - source: - name: publication - anchor: main - target: - name: edition - maps: - - - how: dict + mapper: + _key: publication + edges: + - how: all + source: + name: contributor + anchor: main + weight_exclusive: + - seq_no + - role + target: name: publication - __extra: - anchor: main - map: - UID: _key - - - type: item - descend_key: static_data + anchor: main + fields: + - anchor + - how: all + source: + name: publication + anchor: main + target: + name: conference + - how: all + source: + name: publication + anchor: main + fields: + - anchor + target: + name: date + anchor: main + - how: all + source: + name: publication + anchor: main + fields: + - anchor + target: + name: medium_title + anchor: main + - how: all + source: + name: publisher + anchor: main + weight_exclusive: + - seq_no + target: + name: publication + anchor: main + fields: + - anchor + - how: all + source: + name: publication + anchor: main + target: + name: publication_type + - how: all + source: + name: publication + anchor: main + target: + name: document_type + - how: all + source: + name: publication + anchor: main + target: + name: language + weight_exclusive: + - type + - how: all + source: + name: publication + anchor: main + target: + name: bib_id + - how: all + source: + name: publication + anchor: main + target: + name: heading + - how: all + source: + name: publication + anchor: main + target: + name: subheading + - how: all + source: + name: publication + anchor: main + target: + name: subject + - how: all + source: + name: publication + anchor: main + target: + name: some_id + - how: all + source: + name: publication + anchor: main + target: + name: medium + - how: all + source: + name: medium + target: + name: medium_title + anchor: main + - how: all + source: + name: publication + anchor: main + target: + name: publication + anchor: reference + - how: 1-n + source: + name: contributor + field: addr_no + anchor: main + type: list + all_value: 0 + target: + name: location + field: addr_no + anchor: main + type: value + - how: all + source: + name: publication + anchor: main + target: + name: agency + - how: all + source: + name: publication + anchor: main + target: + name: grant_id + - how: all + source: + name: publication + anchor: main + target: + name: funding_text + - how: all + source: + name: publication + anchor: main + target: + name: keyword + - how: all + source: + name: publication + anchor: main + target: + name: abstract + - how: all + source: + name: publication + anchor: main + target: + name: edition + maps: + - how: dict + name: publication + __extra: + anchor: main + map: + UID: _key + - type: item + descend_key: static_data + maps: + - type: item + descend_key: summary maps: - - - type: item - descend_key: summary + - type: item + descend_key: EWUID + maps: + - type: list + descend_key: edition maps: - - - type: item - descend_key: EWUID + - how: dict + name: edition + map: + '@value': name + - type: item + descend_key: pub_info + maps: + - how: dict + name: publication + __extra: + anchor: main + map: + '@has_abstract': has_abstract + '@vol': volume + '@issue': issue + '@supplement': supplement + - how: dict + name: date + __extra: + anchor: main + transforms: + - foo: parse_date_standard + module: graph_cast.util.transform + input: + - '@sortdate' + output: + - year + - month + - day + - how: dict + name: publication_type + map: + '@pubtype': type + - type: item + descend_key: page + maps: + - how: dict + name: publication + __extra: + anchor: main + map: + '@begin': first_page + '@end': last_page + '@page_count': page_count + '#text': str_pages + - type: item + descend_key: titles + maps: + - type: list + descend_key: title + maps: + - how: dict + name: publication + __extra: + anchor: main + filter: + '@type': item + map: + '#text': title + - how: dict + name: medium_title + __extra: + anchor: main + unfilter: + '@type': item + map: + '#text': title + - type: item + descend_key: names + maps: + - type: list + descend_key: name + maps: + - type: item + maps: + - how: dict + name: contributor + __extra: + anchor: main + map: + '@seq_no': seq_no + '@addr_no': addr_no + '@dais_id': dais_id + '@role': role + email_addr: email + - type: item + descend_key: doctypes + maps: + - type: list + descend_key: doctype + maps: + - how: dict + name: document_type + map: + '#text': name + - type: item + descend_key: publishers + maps: + - type: list + descend_key: publisher + maps: + - type: item + edges: + - how: all + source: + name: publisher + weight_exclusive: + - seq_no + - role + target: + name: location + maps: + - type: item + descend_key: address_spec maps: - - - type: list - descend_key: edition - maps: - - - how: dict - name: edition - map: - '@value': name - - - type: item - descend_key: pub_info + - how: dict + name: location + map: + '@addr_no': addr_no + - type: item + descend_key: names maps: - - - how: dict - name: publication + - type: list + descend_key: name + maps: + - how: dict + name: publisher __extra: anchor: main map: - '@has_abstract': has_abstract - '@vol': volume - '@issue': issue - '@supplement': supplement - - - how: dict - name: date - __extra: - anchor: main - transforms: - - - foo: parse_date_standard - module: graph_cast.util.transform - input: - - '@sortdate' - output: - - year - - month - - day - - - how: dict - name: publication_type - map: - '@pubtype': type - - - type: item - descend_key: page - maps: - - - how: dict - name: publication - __extra: - anchor: main - map: - '@begin': first_page - '@end': last_page - '@page_count': page_count - '#text': str_pages - - - type: item - descend_key: titles + '@seq_no': seq_no + '@role': role + - type: item + descend_key: conferences + maps: + - type: list + descend_key: conference + edges: + - how: all + source: + name: conference + target: + name: conference_title + - how: all + source: + name: conference + target: + name: conference_info + - how: all + source: + name: conference + target: + name: conference_sponsor + - how: all + source: + name: conference + target: + name: location + - how: all + source: + name: conference + target: + name: date + maps: + - how: dict + name: conference + map: + '@conf_id': id + - type: item + descend_key: conf_titles + maps: + - type: list + descend_key: conf_title maps: - - - type: list - descend_key: title - maps: - - - how: dict - name: publication - __extra: - anchor: main - filter: - '@type': item - map: - '#text': title - - - how: dict - name: medium_title - __extra: - anchor: main - unfilter: - '@type': item - map: - '#text': title - - - type: item - descend_key: names + - how: dict + name: conference_title + map: + '#text': title + - type: item + descend_key: conf_infos + maps: + - type: list + descend_key: conf_info maps: - - - type: list - descend_key: name - maps: - - - type: item - maps: - - - how: dict - name: contributor - __extra: - anchor: main - map: - '@seq_no': seq_no - '@addr_no': addr_no - '@dais_id': dais_id - '@role': role - email_addr: email - - - type: item - descend_key: doctypes + - how: dict + name: conference_info + map: + '#text': text + - type: item + descend_key: conf_dates + maps: + - type: list + descend_key: conf_date maps: - - - type: list - descend_key: doctype - maps: - - - how: dict - name: document_type - map: - '#text': name - - - type: item - descend_key: publishers + - how: dict + name: date + transforms: + - foo: parse_date_conf + module: graph_cast.util.transform + input: + - '@conf_start' + output: + - year + - month + - day + - type: item + descend_key: conf_locations + maps: + - type: list + descend_key: conf_location + edges: + - how: all + source: + name: location + target: + name: organization + weight_exclusive: + - anchor maps: - - - type: list - descend_key: publisher - maps: - - - type: item - edges: - - - how: all - source: - name: publisher - weight_exclusive: - - seq_no - - role - target: - name: location - maps: - - - type: item - descend_key: address_spec - maps: - - - how: dict - name: location - map: - '@addr_no': addr_no - - - type: item - descend_key: names - maps: - - - type: list - descend_key: name - maps: - - - how: dict - name: publisher - __extra: - anchor: main - map: - '@seq_no': seq_no - '@role': role - - - type: item - descend_key: conferences + - how: dict + name: location + __extra: + anchor: conference + map: + conf_city: city + conf_state: state + - how: dict + name: organization + __extra: + anchor: conference + map: + conf_host: name + - type: item + descend_key: sponsors + maps: + - type: list + descend_key: sponsor maps: - - - type: list - descend_key: conference - edges: - - - how: all - source: - name: conference - target: - name: conference_title - - - how: all - source: - name: conference - target: - name: conference_info - - - how: all - source: - name: conference - target: - name: conference_sponsor - - - how: all - source: - name: conference - target: - name: location - - - how: all - source: - name: conference - target: - name: date - maps: - - - how: dict - name: conference - map: - '@conf_id': id - - - type: item - descend_key: conf_titles - maps: - - - type: list - descend_key: conf_title - maps: - - - how: dict - name: conference_title - map: - '#text': title - - - type: item - descend_key: conf_infos - maps: - - - type: list - descend_key: conf_info - maps: - - - how: dict - name: conference_info - map: - '#text': text - - - type: item - descend_key: conf_dates - maps: - - - type: list - descend_key: conf_date - maps: - - - how: dict - name: date - transforms: - - - foo: parse_date_conf - module: graph_cast.util.transform - input: - - '@conf_start' - output: - - year - - month - - day - - - type: item - descend_key: conf_locations - maps: - - - type: list - descend_key: conf_location - edges: - - - how: all - source: - name: location - target: - name: organization - weight_exclusive: - - anchor - maps: - - - how: dict - name: location - __extra: - anchor: conference - map: - conf_city: city - conf_state: state - - - how: dict - name: organization - __extra: - anchor: conference - map: - conf_host: name - - - type: item - descend_key: sponsors - maps: - - - type: list - descend_key: sponsor - maps: - - - how: dict - name: conference_sponsor - map: - '#text': name - - - type: item - descend_key: fullrecord_metadata + - how: dict + name: conference_sponsor + map: + '#text': name + - type: item + descend_key: fullrecord_metadata + maps: + - type: item + descend_key: languages + maps: + - type: list + descend_key: language maps: - - - type: item - descend_key: languages - maps: - - - type: list - descend_key: language - maps: - - - how: dict - name: language - map: - '#text': name - '@type': type - - - type: item - descend_key: normalized_languages - maps: - - - type: list - descend_key: language - maps: - - - how: dict - name: language - map: - '#text': name - '@type': type - - - type: item - descend_key: normalized_doctypes - maps: - - - type: list - descend_key: doctype - maps: - - - how: dict - name: document_type - map: - '#text': name - - - type: item - descend_key: references - maps: - - - type: list - descend_key: reference - maps: - - - type: item - edges: - - - how: all - source: - name: publication - target: - name: date - - - how: all - source: - name: contributor - target: - name: publication - values: - reference: true - - - how: all - source: - name: publication - target: - name: medium_title - maps: - - - how: dict - name: publication - __extra: - anchor: reference - map: - uid: _key - xref_doi: doi - page: first_page - citedTitle: title - - - type: item - descend_key: year - maps: - - - how: dict - name: date - transforms: - - - foo: parse_date_reference - module: graph_cast.util.transform - input: - - '#text' - output: - - year - - - how: dict - name: medium_title - map: - citedWork: title - - - how: dict - descend_key: reference - name: contributor - map: - citedAuthor: wos_standard - - - type: item - descend_key: addresses + - how: dict + name: language + map: + '#text': name + '@type': type + - type: item + descend_key: normalized_languages + maps: + - type: list + descend_key: language + maps: + - how: dict + name: language + map: + '#text': name + '@type': type + - type: item + descend_key: normalized_doctypes + maps: + - type: list + descend_key: doctype + maps: + - how: dict + name: document_type + map: + '#text': name + - type: item + descend_key: references + maps: + - type: list + descend_key: reference + maps: + - type: item + edges: + - how: all + source: + name: publication + target: + name: date + - how: all + source: + name: contributor + target: + name: publication + values: + reference: true + - how: all + source: + name: publication + target: + name: medium_title + maps: + - how: dict + name: publication + __extra: + anchor: reference + map: + uid: _key + xref_doi: doi + page: first_page + citedTitle: title + - type: item + descend_key: year maps: - - - type: list - descend_key: address_name - maps: - - - type: item - descend_key: address_spec - edges: - - - how: all - source: - name: location - target: - name: organization - weight_exclusive: - - pref - - - how: all - source: - name: organization - target: - name: suborganization - maps: - - - how: dict - name: location - __extra: - anchor: main - map: - '@addr_no': addr_no - - - type: item - descend_key: organizations - maps: - - - type: list - descend_key: organization - maps: - - - how: dict - name: organization - map: - '#text': name - '@pref': pref - - - type: item - descend_key: suborganizations - maps: - - - type: list - descend_key: suborganization - maps: - - - how: dict - name: suborganization - map: - '#text': name - - - type: item - descend_key: reprint_addresses + - how: dict + name: date + transforms: + - foo: parse_date_reference + module: graph_cast.util.transform + input: + - '#text' + output: + - year + - how: dict + name: medium_title + map: + citedWork: title + - how: dict + descend_key: reference + name: contributor + map: + citedAuthor: wos_standard + - type: item + descend_key: addresses + maps: + - type: list + descend_key: address_name + maps: + - type: item + descend_key: address_spec + edges: + - how: all + source: + name: location + target: + name: organization + weight_exclusive: + - pref + - how: all + source: + name: organization + target: + name: suborganization + maps: + - how: dict + name: location + __extra: + anchor: main + map: + '@addr_no': addr_no + - type: item + descend_key: organizations maps: - - - type: list - descend_key: address_name - edges: - - - how: 1-n - source: - name: contributor - field: addr_no - anchor: reprint - type: list - all_value: 0 - target: - name: location - anchor: reprint - field: addr_no - type: value - maps: - - - type: item - descend_key: address_spec - edges: - - - how: all - source: - name: location - target: - name: organization - weight_exclusive: - - pref - - - how: all - source: - name: organization - target: - name: suborganization - maps: - - - how: dict - name: location - __extra: - anchor: reprint - map: - '@addr_no': addr_no - - - type: item - descend_key: organizations - maps: - - - type: list - descend_key: organization - maps: - - - how: dict - name: organization - map: - '#text': name - '@pref': pref - - - type: item - descend_key: suborganizations - maps: - - - type: list - descend_key: suborganization - maps: - - - how: dict - name: suborganization - map: - '#text': name - - - type: item - descend_key: names - maps: - - - type: list - descend_key: name - maps: - - - type: item - maps: - - - how: dict - name: contributor - __extra: - anchor: reprint - map: - '@seq_no': seq_no - '@addr_no': addr_no - '@dais_id': dais_id - email_addr: email - - - type: item - descend_key: category_info + - type: list + descend_key: organization + maps: + - how: dict + name: organization + map: + '#text': name + '@pref': pref + - type: item + descend_key: suborganizations maps: - - - type: item - descend_key: headings - maps: - - - type: list - descend_key: heading - maps: - - - how: dict - name: heading - map: - '#text': name - - - type: item - descend_key: subheadings - maps: - - - type: list - descend_key: subheading - maps: - - - how: dict - name: subheading - map: - '#text': name - - - type: item - descend_key: subjects - maps: - - - type: list - descend_key: subject - maps: - - - how: dict - name: subject - map: - '#text': name - - - type: item - descend_key: fund_ack + - type: list + descend_key: suborganization + maps: + - how: dict + name: suborganization + map: + '#text': name + - type: item + descend_key: reprint_addresses + maps: + - type: list + descend_key: address_name + edges: + - how: 1-n + source: + name: contributor + field: addr_no + anchor: reprint + type: list + all_value: 0 + target: + name: location + anchor: reprint + field: addr_no + type: value + maps: + - type: item + descend_key: address_spec + edges: + - how: all + source: + name: location + target: + name: organization + weight_exclusive: + - pref + - how: all + source: + name: organization + target: + name: suborganization + maps: + - how: dict + name: location + __extra: + anchor: reprint + map: + '@addr_no': addr_no + - type: item + descend_key: organizations maps: - - - type: item - descend_key: fund_text - maps: - - - type: list - descend_key: p - maps: - - - how: dict - name: funding_text - map: - '#text': text - - - type: item - descend_key: grants - maps: - - - type: list - descend_key: grant - edges: - - - how: all - source: - name: agency - target: - name: grant_id - maps: - - - how: dict - name: agency - map: - grant_agency: name - - - type: item - descend_key: grant_ids - maps: - - - type: list - descend_key: grant_id - maps: - - - how: dict - name: grant_id - map: - '#text': id - - - type: item - descend_key: keywords + - type: list + descend_key: organization + maps: + - how: dict + name: organization + map: + '#text': name + '@pref': pref + - type: item + descend_key: suborganizations maps: - - - type: list - descend_key: keyword - maps: - - - how: dict - name: keyword - map: - '#text': name - - - type: item - descend_key: abstracts + - type: list + descend_key: suborganization + maps: + - how: dict + name: suborganization + map: + '#text': name + - type: item + descend_key: names + maps: + - type: list + descend_key: name maps: - - - type: list - descend_key: abstract - maps: - - - type: item - descend_key: abstract_text - maps: - - - type: list - descend_key: p - maps: - - - how: dict - name: abstract - map: - '#text': text - - - type: item - descend_key: item + - type: item + maps: + - how: dict + name: contributor + __extra: + anchor: reprint + map: + '@seq_no': seq_no + '@addr_no': addr_no + '@dais_id': dais_id + email_addr: email + - type: item + descend_key: category_info + maps: + - type: item + descend_key: headings + maps: + - type: list + descend_key: heading + maps: + - how: dict + name: heading + map: + '#text': name + - type: item + descend_key: subheadings + maps: + - type: list + descend_key: subheading + maps: + - how: dict + name: subheading + map: + '#text': name + - type: item + descend_key: subjects + maps: + - type: list + descend_key: subject + maps: + - how: dict + name: subject + map: + '#text': name + - type: item + descend_key: fund_ack + maps: + - type: item + descend_key: fund_text maps: - - - type: item - descend_key: ids + - type: list + descend_key: p + maps: + - how: dict + name: funding_text + map: + '#text': text + - type: item + descend_key: grants + maps: + - type: list + descend_key: grant + edges: + - how: all + source: + name: agency + target: + name: grant_id + maps: + - how: dict + name: agency + map: + grant_agency: name + - type: item + descend_key: grant_ids maps: - - - how: dict - name: some_id + - type: list + descend_key: grant_id + maps: + - how: dict + name: grant_id map: '#text': id - - - how: dict - name: bib_id - map: - bib_id: id - - - type: item - descend_key: keywords_plus + - type: item + descend_key: keywords + maps: + - type: list + descend_key: keyword + maps: + - how: dict + name: keyword + map: + '#text': name + - type: item + descend_key: abstracts + maps: + - type: list + descend_key: abstract + maps: + - type: item + descend_key: abstract_text + maps: + - type: list + descend_key: p maps: - - - type: list - descend_key: keyword_plus - maps: - - - how: dict - name: keyword - map: - '#text': name - - - type: item - descend_key: dynamic_data + - how: dict + name: abstract + map: + '#text': text + - type: item + descend_key: item maps: - - - type: item - descend_key: cluster_related + - type: item + descend_key: ids + maps: + - how: dict + name: some_id + map: + '#text': id + - how: dict + name: bib_id + map: + bib_id: id + - type: item + descend_key: keywords_plus + maps: + - type: list + descend_key: keyword_plus maps: - - - type: item - descend_key: identifiers - maps: - - - type: list - descend_key: identifier - maps: - - - how: dict - name: medium - filter: - '@type': issn - map: - '@value': issn - - - how: dict - name: publication - filter: - '@type': xref_doi - map: - '@value': doi + - how: dict + name: keyword + map: + '#text': name + - type: item + descend_key: dynamic_data + maps: + - type: item + descend_key: cluster_related + maps: + - type: item + descend_key: identifiers + maps: + - type: list + descend_key: identifier + maps: + - how: dict + name: medium + filter: + '@type': issn + map: + '@value': issn + - how: dict + name: publication + filter: + '@type': xref_doi + map: + '@value': doi vertex_collections: collections: publication: basename: publications fields: - - _key - - volume - - issue - - title - - accession_no - - has_abstract - - doi - - funding_text - - art_no - - first_page - - last_page - - page_count - - str_pages + - _key + - volume + - issue + - title + - accession_no + - has_abstract + - doi + - funding_text + - art_no + - first_page + - last_page + - page_count + - str_pages index: - - _key + - _key transforms: - - - foo: lower - class: str - fields: - - title + - foo: lower + class: str + fields: + - title medium: basename: media fields: - - issn - - isbn - - eissn - - eisbn - - accession_no - - meeting_abs - - parent_book_doi + - issn + - isbn + - eissn + - eisbn + - accession_no + - meeting_abs + - parent_book_doi index: - - issn - - isbn + - issn + - isbn extra_index: - - - type: hash - unique: false - fields: - - issn + - type: hash + unique: false + fields: + - issn medium_title: basename: media_title fields: - - title + - title index: - - title + - title transforms: - - - foo: lower - class: str - fields: - - title + - foo: lower + class: str + fields: + - title language: basename: languages fields: - - name + - name index: - - name + - name transforms: - - - foo: lower - class: str - fields: - - name + - foo: lower + class: str + fields: + - name contributor: basename: contributors fields: - - first_name - - last_name - - email - - display_name - - wos_standard - - orcid_id - - r_id - - dais_id + - first_name + - last_name + - email + - display_name + - wos_standard + - orcid_id + - r_id + - dais_id index: - - first_name - - last_name - - email - - orcid_id - - r_id - - dais_id - - wos_standard + - first_name + - last_name + - email + - orcid_id + - r_id + - dais_id + - wos_standard transforms: - - - foo: lower - class: str - fields: - - first_name - - last_name - - display_name - - wos_standard - - - foo: standardize - module: graph_cast.util.transform - fields: - - wos_standard + - foo: lower + class: str + fields: + - first_name + - last_name + - display_name + - wos_standard + - foo: standardize + module: graph_cast.util.transform + fields: + - wos_standard location: basename: locations fields: - - country - - state - - city - - street - - full_address + - country + - state + - city + - street + - full_address index: - - country - - state - - city + - country + - state + - city extra_index: - - - type: hash - unique: false - fields: - - country + - type: hash + unique: false + fields: + - country transforms: - - - foo: lower - class: str - fields: - - country - - state - - city - - street - - full_address + - foo: lower + class: str + fields: + - country + - state + - city + - street + - full_address organization: basename: organizations fields: - - name + - name index: - - name + - name transforms: - - - foo: lower - class: str - fields: - - name + - foo: lower + class: str + fields: + - name suborganization: basename: suborganizations fields: - - name + - name index: - - name + - name transforms: - - - foo: lower - class: str - fields: - - name + - foo: lower + class: str + fields: + - name publication_type: basename: publication_types fields: - - type + - type index: - - type + - type document_type: basename: document_types fields: - - name + - name index: - - name + - name subject: basename: subjects fields: - - name + - name index: - - name + - name transforms: - - - foo: lower - class: str - fields: - - name + - foo: lower + class: str + fields: + - name keyword: basename: keywords fields: - - name + - name index: - - name + - name transforms: - - - foo: lower - class: str - fields: - - name + - foo: lower + class: str + fields: + - name edition: basename: editions fields: - - name + - name index: - - name + - name abstract: basename: abstracts fields: - - text + - text index: - - text + - text agency: basename: agencies fields: - - name + - name index: - - name + - name transforms: - - - foo: lower - class: str - fields: - - name + - foo: lower + class: str + fields: + - name grant_id: basename: grant_ids fields: - - id + - id index: - - id + - id heading: basename: headings fields: - - name + - name index: - - name + - name transforms: - - - foo: lower - class: str - fields: - - name + - foo: lower + class: str + fields: + - name subheading: basename: subheadings fields: - - name + - name index: - - name + - name transforms: - - - foo: lower - class: str - fields: - - name + - foo: lower + class: str + fields: + - name funding_text: basename: funding_texts fields: - - text + - text index: - - text + - text date: basename: dates fields: - - year - - month - - day + - year + - month + - day index: - - year - - month - - day + - year + - month + - day transforms: - - - foo: try_int - module: graph_cast.util.transform - fields: - - year - - month - - day + - foo: try_int + module: graph_cast.util.transform + fields: + - year + - month + - day publisher: basename: publishers fields: - - display_name - - full_name + - display_name + - full_name index: - - display_name + - display_name transforms: - - - foo: lower - class: str - fields: - - display_name + - foo: lower + class: str + fields: + - display_name conference: basename: conferences fields: - - id + - id index: - - id + - id conference_title: basename: conference_titles fields: - - title + - title index: - - title + - title transforms: - - - foo: lower - class: str - fields: - - title + - foo: lower + class: str + fields: + - title conference_info: basename: conference_infos fields: - - text + - text index: - - text + - text conference_sponsor: basename: conference_sponsors fields: - - name + - name index: - - name + - name transforms: - - - foo: lower - class: str - fields: - - name + - foo: lower + class: str + fields: + - name some_id: basename: some_ids fields: - - id + - id index: - - id + - id bib_id: basename: bib_ids fields: - - id + - id index: - - id + - id edge_collections: extra: - - - source: contributor - target: organization - by: publication - weight: - _key: pubid - year: year + - source: contributor + target: organization + by: publication + weight: + _key: pubid + year: year extra: merge_collections: - - publication \ No newline at end of file + - publication diff --git a/conf/table/ibes.yaml b/conf/table/ibes.yaml index 2c9e9b92..9c206d7b 100644 --- a/conf/table/ibes.yaml +++ b/conf/table/ibes.yaml @@ -1,118 +1,108 @@ general: name: ibes csv: - - - tabletype: ibes - encoding: ISO-8859-1 - transforms: - - foo: parse_date_ibes - module: graph_cast.util.transform - input: - - ANNDATS - - ANNTIMS - output: - - datetime_announce - - foo: parse_date_ibes - module: graph_cast.util.transform - input: - - REVDATS - - REVTIMS - output: - - datetime_review - - foo: cast_ibes_analyst - module: graph_cast.util.transform - input: - - ANALYST - output: - - last_name - - initial - vertex_collections: - - - type: ticker - map: - CUSIP: cusip - CNAME: cname - OFTIC: oftic - - - type: agency - map: - ESTIMID: aname - - - type: analyst - - - type: recommendation - map: - ERECCD: erec - ETEXT: etext - IRECCD: irec - ITEXT: itext - - - type: publication +- tabletype: ibes + encoding: ISO-8859-1 + transforms: + - foo: parse_date_ibes + module: graph_cast.util.transform + input: + - ANNDATS + - ANNTIMS + output: + - datetime_announce + - foo: parse_date_ibes + module: graph_cast.util.transform + input: + - REVDATS + - REVTIMS + output: + - datetime_review + - foo: cast_ibes_analyst + module: graph_cast.util.transform + input: + - ANALYST + output: + - last_name + - initial + vertex_collections: + - type: ticker + map: + CUSIP: cusip + CNAME: cname + OFTIC: oftic + - type: agency + map: + ESTIMID: aname + - type: analyst + - type: recommendation + map: + ERECCD: erec + ETEXT: etext + IRECCD: irec + ITEXT: itext + - type: publication vertex_collections: blanks: - - publication + - publication collections: publication: basename: publications fields: - - datetime_review - - datetime_announce + - datetime_review + - datetime_announce extra_index: - - type: hash - unique: false - fields: - - datetime_review - - type: hash - unique: false - fields: - - datetime_announce + - type: hash + unique: false + fields: + - datetime_review + - type: hash + unique: false + fields: + - datetime_announce ticker: basename: tickers fields: - - cusip - - cname - - oftic + - cusip + - cname + - oftic index: - - cusip - - cname - - oftic + - cusip + - cname + - oftic agency: basename: agencies fields: - - aname + - aname index: - - aname + - aname analyst: basename: analysts fields: - - last_name - - initial + - last_name + - initial index: - - last_name - - initial + - last_name + - initial recommendation: basename: recommendations fields: - - erec - - etext - - irec - - itext + - erec + - etext + - irec + - itext index: - - irec + - irec edge_collections: main: - - - source: publication - target: ticker - - - source: analyst - target: agency - weight: - - datetime_review - - datetime_announce - - - source: analyst - target: publication - - - source: publication - target: recommendation + - source: publication + target: ticker + - source: analyst + target: agency + weight: + - datetime_review + - datetime_announce + - source: analyst + target: publication + - source: publication + target: recommendation diff --git a/conf/table/ticker.yaml b/conf/table/ticker.yaml index 2684fa92..2ed384b0 100644 --- a/conf/table/ticker.yaml +++ b/conf/table/ticker.yaml @@ -1,107 +1,97 @@ general: name: ticker_history csv: - - - tabletype: _all - transforms: - - foo: round_str - module: graph_cast.util.transform - maps: - - - input: - - Open - - - input: - - Close - params: - ndigits: 3 - - foo: int - module: builtins - maps: - - - input: - - Volume - - foo: parse_date_yahoo - module: graph_cast.util.transform - input: - - Date - output: - - t_obs - vertex_collections: - - - type: ticker - map: - _filename: oftic - - - type: feature - map: - Open: - key: name - - - type: feature - map: - Close: - key: name - - - type: feature - map: - Volume: - key: name +- tabletype: _all + transforms: + - foo: round_str + module: graph_cast.util.transform + maps: + - input: + - Open + - input: + - Close + params: + ndigits: 3 + - foo: int + module: builtins + maps: + - input: + - Volume + - foo: parse_date_yahoo + module: graph_cast.util.transform + input: + - Date + output: + - t_obs + vertex_collections: + - type: ticker + map: + _filename: oftic + - type: feature + map: + Open: + key: name + - type: feature + map: + Close: + key: name + - type: feature + map: + Volume: + key: name vertex_collections: collections: ticker: basename: tickers fields: - - cusip - - cname - - oftic + - cusip + - cname + - oftic index: - - cusip - - cname - - oftic + - cusip + - cname + - oftic feature: basename: features fields: - - name - - value + - name + - value index: - - name - - value + - name + - value extra_index: - - type: hash - unique: false - fields: - - value - - type: hash - unique: false - fields: - - name + - type: hash + unique: false + fields: + - value + - type: hash + unique: false + fields: + - name filters: - - - a: - field: name - foo: __eq__ - value: Open - b: - field: value - foo: __gt__ - value: 0 - - - a: - field: name - foo: __eq__ - value: Close - b: - field: value - foo: __gt__ - value: 0 + - a: + field: name + foo: __eq__ + value: Open + b: + field: value + foo: __gt__ + value: 0 + - a: + field: name + foo: __eq__ + value: Close + b: + field: value + foo: __gt__ + value: 0 edge_collections: main: - - source: ticker - target: feature - weight: - - t_obs - index: - - fields: - - t_obs - unique: false \ No newline at end of file + - source: ticker + target: feature + weight: + - t_obs + index: + - fields: + - t_obs + unique: false diff --git a/conf/table/wos.yaml b/conf/table/wos.yaml index 60113c0d..9f328721 100644 --- a/conf/table/wos.yaml +++ b/conf/table/wos.yaml @@ -1,149 +1,126 @@ general: - name: wos + name: wos csv: - - - tabletype: publications - vertex_collections: - - - type: publication - map: - wos_id: _key - pubyear: year - pubmonth: month - pubday: day - - - type: medium - map: - source: title - - - type: language - - - tabletype: contributors - vertex_collections: - - - type: contributor - - - type: publication - map: - wos_id: _key - - - tabletype: institutions - vertex_collections: - - - type: organization - - - type: publication - map: - wos_id: _key - - - tabletype: refs - vertex_collections: - - - type: publication - map: - wos_id: _key - - - type: publication - map: - uid: _key +- tabletype: publications + vertex_collections: + - type: publication + map: + wos_id: _key + pubyear: year + pubmonth: month + pubday: day + - type: medium + map: + source: title + - type: language +- tabletype: contributors + vertex_collections: + - type: contributor + - type: publication + map: + wos_id: _key +- tabletype: institutions + vertex_collections: + - type: organization + - type: publication + map: + wos_id: _key +- tabletype: refs + vertex_collections: + - type: publication + map: + wos_id: _key + - type: publication + map: + uid: _key vertex_collections: collections: publication: basename: publications fields: - - _key - - accession_no - - title - - year - - month - - day + - _key + - accession_no + - title + - year + - month + - day numeric_fields: - - year - - month - - day + - year + - month + - day index: - - _key + - _key extra_index: - - - type: hash - unique: false - fields: - - title - - - type: hash - unique: false - fields: - - year + - type: hash + unique: false + fields: + - title + - type: hash + unique: false + fields: + - year medium: basename: media fields: - - issn - - isbn - - title - - eissn - - eisbn + - issn + - isbn + - title + - eissn + - eisbn index: - - issn - - isbn - - title + - issn + - isbn + - title extra_index: - - - type: hash - unique: false - fields: - - issn + - type: hash + unique: false + fields: + - issn language: basename: languages fields: - - language + - language index: - - language + - language contributor: basename: contributors fields: - - first_name - - last_name + - first_name + - last_name index: - - first_name - - last_name + - first_name + - last_name organization: basename: organizations fields: - - organization - - country - - city + - organization + - country + - city index: - - organization - - country - - city + - organization + - country + - city extra_index: - - - type: hash - unique: false - fields: - - country + - type: hash + unique: false + fields: + - country edge_collections: main: - - - source: publication - target: medium - - - source: publication - target: language - - - source: contributor - target: publication - - - source: organization - target: publication - - - source: publication - target: publication + - source: publication + target: medium + - source: publication + target: language + - source: contributor + target: publication + - source: organization + target: publication + - source: publication + target: publication extra: - - - source: contributor - target: organization - by: publication - weight: - _key: pubid - year: year + - source: contributor + target: organization + by: publication + weight: + _key: pubid + year: year diff --git a/graph_cast/architecture/__init__.py b/graph_cast/architecture/__init__.py index 88ba3d0d..a6fdbce9 100644 --- a/graph_cast/architecture/__init__.py +++ b/graph_cast/architecture/__init__.py @@ -1,3 +1,3 @@ +from .general import Configurator, ConfiguratorType from .json import JConfigurator from .table import TConfigurator -from .general import Configurator, ConfiguratorType diff --git a/graph_cast/architecture/general.py b/graph_cast/architecture/general.py index 398db4c5..8a317aae 100644 --- a/graph_cast/architecture/general.py +++ b/graph_cast/architecture/general.py @@ -1,7 +1,11 @@ +from __future__ import annotations + +import abc import logging +from collections import Iterable, defaultdict from typing import TypeVar -from collections import defaultdict, Iterable -from graph_cast.architecture.schema import VertexConfig, GraphConfig + +from graph_cast.architecture.schema import GraphConfig, VertexConfig logger = logging.getLogger(__name__) @@ -19,6 +23,30 @@ def __init__(self, config): self.vertex_config.dbname, config["json"] if "json" in config else None, ) + self.current_fname: str | None = None + + @abc.abstractmethod + def set_current_resource_name(self, resource): + pass + + @property + def encoding(self): + return "utf-8" + + @property + def current_graphs(self): + return [] + + @property + def current_collections(self): + return [] + + @property + def current_transformations(self): + return [] + + def graph(self, u, v): + return self.graph_config.graph(u, v) class TransformException(BaseException): @@ -34,7 +62,10 @@ def transform_foo(transform, doc): args = [doc[k] for k in transform.input] transform_result = transform(*args) if isinstance(transform_result, Iterable): - upd = {k: v for k, v in zip(transform.output, transform_result)} + upd = { + k: v + for k, v in zip(transform.output, transform_result) + } else: upd = {transform.output[0]: transform_result} else: @@ -82,7 +113,9 @@ def __init__(self, **kwargs): del self._map["_filename"] def _process_maps(self): - self._map = {k: v for k, v in self._raw_map.items() if isinstance(v, str)} + self._map = { + k: v for k, v in self._raw_map.items() if isinstance(v, str) + } self._map_splitter = { k: v for k, v in self._raw_map.items() if not isinstance(v, str) } @@ -90,9 +123,13 @@ def _process_maps(self): def _check_map_splitter(self): for k, item in self._map_splitter.items(): if not isinstance(item, dict): - raise TypeError(f" self._raw_map should be a dict : {self._raw_map}") + raise TypeError( + f" self._raw_map should be a dict : {self._raw_map}" + ) if "key" not in item: - raise KeyError(f" item should contain 'key' and 'value' : {item}") + raise KeyError( + f" item should contain 'key' and 'value' : {item}" + ) if "value" not in item: item["value"] = "value" @@ -156,7 +193,9 @@ def __init__(self, inp): self._vcollections[cc["type"]] += [Mapper(**cc)] def __iter__(self): - return ((k, m) for k in self.collections for m in self._vcollections[k]) + return ( + (k, m) for k in self.collections for m in self._vcollections[k] + ) @property def collections(self): diff --git a/graph_cast/architecture/json.py b/graph_cast/architecture/json.py index 8a423efc..913a9260 100644 --- a/graph_cast/architecture/json.py +++ b/graph_cast/architecture/json.py @@ -1,5 +1,6 @@ from collections import defaultdict from copy import deepcopy + from graph_cast.architecture.general import Configurator @@ -23,3 +24,6 @@ def graph(self, u, v): def exclude_fields(self, k): return self.graph_config.exclude_fields(k) + + def set_current_resource_name(self, resource): + self.current_fname = resource diff --git a/graph_cast/architecture/schema.py b/graph_cast/architecture/schema.py index 0b9b1e13..3635fc92 100644 --- a/graph_cast/architecture/schema.py +++ b/graph_cast/architecture/schema.py @@ -1,6 +1,7 @@ from collections import defaultdict +from typing import Any, Dict, Set, Tuple + from graph_cast.architecture.transform import Transform -from typing import Set, Tuple, Any, Dict, DefaultDict, List class CollectionIndex: @@ -39,10 +40,13 @@ def __init__( self._fields = list(fields) self._index = CollectionIndex(*index) if extra_index is not None: - self._extra_indices = [CollectionIndex(**item) for item in extra_index] + self._extra_indices = [ + CollectionIndex(**item) for item in extra_index + ] self._numeric_fields = numeric_fields # set of filters self._filters = [Filter(**item) for item in filters] + # currently not used self._transforms = [Transform(**item) for item in transforms] @@ -163,7 +167,9 @@ def collections(self): def _init_vcollections(self, vconfig): self._vcollections = set(vconfig.keys()) - self._vcollections_all = {k: Vertex(name=k, **v) for k, v in vconfig.items()} + self._vcollections_all = { + k: Vertex(name=k, **v) for k, v in vconfig.items() + } def _init_names(self, vconfig): try: @@ -206,7 +212,9 @@ def index(self, vertex_name): def _init_extra_indexes(self, vconfig): self._extra_indices = { - k: v["extra_index"] for k, v in vconfig.items() if "extra_index" in v + k: v["extra_index"] + for k, v in vconfig.items() + if "extra_index" in v } def extra_index_list(self, vertex_name): @@ -233,7 +241,9 @@ def blank_collections(self): return iter(self._blank_collections) def _init_fields(self, vconfig): - self._vfields = {k: v["fields"] for k, v in vconfig.items() if "fields" in v} + self._vfields = { + k: v["fields"] for k, v in vconfig.items() if "fields" in v + } def fields(self, vertex_name): if vertex_name in self._vcollections: @@ -248,7 +258,9 @@ def fields(self, vertex_name): def _init_numeric_fields(self, vconfig): self._vcollection_numeric_fields_map = { - k: v["numeric_fields"] for k, v in vconfig.items() if "numeric_fields" in v + k: v["numeric_fields"] + for k, v in vconfig.items() + if "numeric_fields" in v } def numeric_fields_list(self, vertex_name): @@ -297,9 +309,13 @@ def __init__(self, econfig, vmap, jconfig=None): def _init_edges(self, config): # check that the edges are unique if "main" in config: - self._edges = {(item["source"], item["target"]) for item in config["main"]} + self._edges = { + (item["source"], item["target"]) for item in config["main"] + } if len(set(self._edges)) < len(self._edges): - raise ValueError(f" Potentially duplicate edges in edges definition") + raise ValueError( + f" Potentially duplicate edges in edges definition" + ) self._edges = set(self._edges) def _init_extra_edges(self, config): @@ -333,7 +349,7 @@ def _init_jedges(self, jconfig): def _parse_jedges( self, croot, edge_accumulator, exclusion_fields - ) -> (Set, DefaultDict[str, List]): + ) -> tuple[set, defaultdict[str, list]]: # TODO push mapping_fields etc to architecture """ extract edge definition and edge fields from definition dict @@ -359,8 +375,7 @@ def _parse_jedges( if "field" in evw["target"]: exclusion_fields[wname] += [evw["target"]["field"]] return edge_acc_ | edge_accumulator, exclusion_fields - else: - return set(), defaultdict(list) + return set(), defaultdict(list) def _define_graphs(self, config, vmap): @@ -396,7 +411,9 @@ def graph(self, u, v): try: return self._graphs[u, v] except: - raise KeyError(f" Requested graph {u, v} not present in GraphConfig") + raise KeyError( + f" Requested graph {u, v} not present in GraphConfig" + ) def weights(self, u, v): if (u, v) in self._graphs and "weight" in self._graphs[u, v]: diff --git a/graph_cast/architecture/table.py b/graph_cast/architecture/table.py index 01b4b732..9ecf9f42 100644 --- a/graph_cast/architecture/table.py +++ b/graph_cast/architecture/table.py @@ -1,20 +1,20 @@ from collections import defaultdict +from copy import deepcopy from itertools import permutations +from os import listdir +from os.path import isfile, join + from graph_cast.architecture.general import ( Configurator, - Mapper, LocalVertexCollections, + Mapper, ) from graph_cast.architecture.transform import Transform -from os import listdir -from os.path import isfile, join -from copy import deepcopy class TConfigurator(Configurator): def __init__(self, config): super().__init__(config) - self.current_fname = None self.mode = None self.modes2collections = defaultdict(LocalVertexCollections) @@ -26,7 +26,8 @@ def __init__(self, config): def set_mode(self, mode): """ - TConfigurator configure several types of tables, mode tells TConfigurator which type to table to deal with currently + TConfigurator configure several types of tables, mode tells TConfigurator + which type to table to deal with currently :param mode: :return: """ @@ -82,7 +83,9 @@ def _init_modes2graphs(self, subconfig, edges): if (u, v) in edges: self.modes2graphs[table_type] += [(u, v)] - self.modes2graphs = {k: list(set(v)) for k, v in self.modes2graphs.items()} + self.modes2graphs = { + k: list(set(v)) for k, v in self.modes2graphs.items() + } def discover_files(self, fpath, limit_files=None): for keyword in self.modes2graphs: @@ -94,12 +97,16 @@ def discover_files(self, fpath, limit_files=None): [ join(fpath, f) for f in listdir(fpath) - if isfile(join(fpath, f)) and (search_pattern in f) and ("csv" in f) + if isfile(join(fpath, f)) + and (search_pattern in f) + and ("csv" in f) ] ) if limit_files: - self.mode2files = {k: v[:limit_files] for k, v in self.mode2files.items()} + self.mode2files = { + k: v[:limit_files] for k, v in self.mode2files.items() + } def set_current_resource_name(self, tabular_resource): self.current_fname = tabular_resource @@ -108,26 +115,23 @@ def set_current_resource_name(self, tabular_resource): class TablesConfig: - _tables = set() + _tables: set[str] = set() # table_type -> [ {vertex_collection :vc, map: (table field -> collection field)} ] # vertex_collection -> (table field -> collection field) - table_collection_maps = dict() - - # table_type -> transforms - encodings_map = dict() + table_collection_maps: dict[str, dict[str, str]] = dict() - # table_type -> extra logic - logic = {} + # table_type -> encoding + encodings_map: dict[str, str] = dict() # table_type -> vertex_collections - _vertices = {} + _vertices: dict[str, str] = {} # table_type -> edge_collections - _edges = {} + _edges: dict[str, str] = {} # table_type -> transforms - _transforms = defaultdict(list) + _transforms: defaultdict[str, list[dict[str, str]]] = defaultdict(list) def __init__(self, vconfig, graph_config): self._init_tables(vconfig) @@ -178,9 +182,13 @@ def _init_transformations(self, subconfig): kwargs["input"] = cmap["input"] if "output" in cmap: kwargs["output"] = cmap["output"] - self._transforms[item["tabletype"]] += [Transform(**kwargs)] + self._transforms[item["tabletype"]] += [ + Transform(**kwargs) + ] else: - self._transforms[item["tabletype"]] += [Transform(**citem)] + self._transforms[item["tabletype"]] += [ + Transform(**citem) + ] def _init_encodings(self, subconfig): for item in subconfig: @@ -189,13 +197,6 @@ def _init_encodings(self, subconfig): else: self.encodings_map[item["tabletype"]] = None - def parse_logic(self, subconfig): - for item in subconfig: - if "logic" in item: - self.logic[item["tabletype"]] = item["logic"] - else: - self.logic[item["tabletype"]] = None - def vertices(self, table_type): return self._vertices[table_type] diff --git a/graph_cast/db/__init__.py b/graph_cast/db/__init__.py index e120d590..a4a0778a 100644 --- a/graph_cast/db/__init__.py +++ b/graph_cast/db/__init__.py @@ -1,4 +1,3 @@ +from .connection import Connection, ConnectionConfigType, ConnectionType from .factory import ConfigFactory from .manager import ConnectionManager -from .connection import Connection, ConnectionType -from .abstract_config import ConnectionConfigType diff --git a/graph_cast/db/abstract_config.py b/graph_cast/db/abstract_config.py index 22c6f13f..b28b04f6 100644 --- a/graph_cast/db/abstract_config.py +++ b/graph_cast/db/abstract_config.py @@ -1,28 +1,3 @@ -import abc -from typing import TypeVar -ConnectionConfigType = TypeVar("ConnectionConfigType", bound="ConnectionConfig") - -class ConnectionConfig(abc.ABC): - connection_class = None - - def __init__(self, **config): - self.protocol = config.get("protocol", "http") - self.ip_addr = config.get("ip_addr", None) - self.cred_name = config.get("cred_name", None) - self.cred_pass = config.get("cred_pass", None) - self.database = config.get("database", None) - self.port = config.get("port", None) - self.hosts = None - - -class WSGIConfig(ConnectionConfig): - connection_class = None - - def __init__(self, **config): - super(WSGIConfig, self).__init__(**config) - self.path = config.get("path", "/") - self.hosts = f"{self.protocol}://{self.ip_addr}:{self.port}{self.path}" - self.host = config.get("host", None) diff --git a/graph_cast/db/arango/config.py b/graph_cast/db/arango/config.py index 1da252e3..45cdbba5 100644 --- a/graph_cast/db/arango/config.py +++ b/graph_cast/db/arango/config.py @@ -1,5 +1,5 @@ -from graph_cast.db.abstract_config import ConnectionConfig from graph_cast.db.arango.connection import ArangoConnection +from graph_cast.db.connection import ConnectionConfig class ArangoConnectionConfig(ConnectionConfig): diff --git a/graph_cast/db/arango/connection.py b/graph_cast/db/arango/connection.py index 8975665b..ee8fec4b 100644 --- a/graph_cast/db/arango/connection.py +++ b/graph_cast/db/arango/connection.py @@ -1,14 +1,16 @@ import logging + from arango import ArangoClient + +from graph_cast.db import ConnectionConfigType from graph_cast.db.connection import Connection -from graph_cast.db.abstract_config import ConnectionConfigType logger = logging.getLogger(__name__) class ArangoConnection(Connection): def __init__(self, config: ConnectionConfigType): - super().__init__(config) + super().__init__() client = ArangoClient(hosts=config.hosts) self.conn = client.db( @@ -84,7 +86,9 @@ def define_edge_collections(self, graph_config): def define_vertex_indices(self, vertex_config): for c in vertex_config.collections: for index_dict in vertex_config.extra_index_list(c): - general_collection = self.conn.collection(vertex_config.dbname(c)) + general_collection = self.conn.collection( + vertex_config.dbname(c) + ) ih = general_collection.add_hash_index( fields=index_dict["fields"], unique=index_dict["unique"] ) @@ -94,7 +98,9 @@ def define_edge_indices(self, graph_config): item = graph_config.graph(u, v) if "index" in item: for index_dict in item["index"]: - general_collection = self.conn.collection(item["edge_name"]) + general_collection = self.conn.collection( + item["edge_name"] + ) ih = general_collection.add_hash_index( fields=index_dict["fields"], unique=index_dict["unique"], @@ -105,17 +111,25 @@ def create_collection_if_absent(self, g, vcol, index, unique=True): _ = g.create_vertex_collection(vcol) general_collection = self.conn.collection(vcol) if index is not None and index != ["_key"]: - ih = general_collection.add_hash_index(fields=index, unique=unique) + ih = general_collection.add_hash_index( + fields=index, unique=unique + ) return ih else: return None def delete_collections(self, cnames=(), gnames=(), delete_all=False): logger.info("collections (non system):") - logger.info([c for c in self.conn.collections() if c["name"][0] != "_"]) + logger.info( + [c for c in self.conn.collections() if c["name"][0] != "_"] + ) if delete_all: - cnames = [c["name"] for c in self.conn.collections() if c["name"][0] != "_"] + cnames = [ + c["name"] + for c in self.conn.collections() + if c["name"][0] != "_" + ] gnames = [g["name"] for g in self.conn.graphs()] for cn in cnames: @@ -123,7 +137,9 @@ def delete_collections(self, cnames=(), gnames=(), delete_all=False): self.conn.delete_collection(cn) logger.info("collections (after delete operation):") - logger.info([c for c in self.conn.collections() if c["name"][0] != "_"]) + logger.info( + [c for c in self.conn.collections() if c["name"][0] != "_"] + ) logger.info("graphs:") logger.info(self.conn.graphs()) diff --git a/graph_cast/db/arango/query.py b/graph_cast/db/arango/query.py index 4923b6d0..a1acf8e7 100644 --- a/graph_cast/db/arango/query.py +++ b/graph_cast/db/arango/query.py @@ -1,7 +1,8 @@ import gzip import json -from os.path import join import logging +from os.path import join + from arango import ArangoClient logger = logging.getLogger(__name__) @@ -41,7 +42,9 @@ def profile_query(query, nq, profile_times, fpath, limit=None, **kwargs): cursor = basic_query(query, profile=True, **kwargs) profiling += [cursor.profile()] cursor.close() - with open(join(fpath, f"query{nq}_profile{limit_str}.json"), "w") as fp: + with open( + join(fpath, f"query{nq}_profile{limit_str}.json"), "w" + ) as fp: json.dump(profiling, fp, indent=4) logger.info(f"starting actual query at {limit}") diff --git a/graph_cast/db/arango/util.py b/graph_cast/db/arango/util.py index ecf74668..b9af17bc 100644 --- a/graph_cast/db/arango/util.py +++ b/graph_cast/db/arango/util.py @@ -1,7 +1,8 @@ import json -from arango import ArangoClient import logging +from arango import ArangoClient + from graph_cast.util.transform import pick_unique_dict logger = logging.getLogger(__name__) @@ -18,7 +19,9 @@ def create_collection_if_absent(db_client, g, vcol, index, unique=True): return None -def get_arangodb_client(protocol, ip_addr, port, database, cred_name, cred_pass): +def get_arangodb_client( + protocol, ip_addr, port, database, cred_name, cred_pass +): hosts = f"{protocol}://{ip_addr}:{port}" client = ArangoClient(hosts=hosts) @@ -33,7 +36,9 @@ def delete_collections(sys_db, cnames=(), gnames=(), delete_all=False): logger.info([c for c in sys_db.collections() if c["name"][0] != "_"]) if delete_all: - cnames = [c["name"] for c in sys_db.collections() if c["name"][0] != "_"] + cnames = [ + c["name"] for c in sys_db.collections() if c["name"][0] != "_" + ] gnames = [g["name"] for g in sys_db.graphs()] for cn in cnames: diff --git a/graph_cast/db/connection.py b/graph_cast/db/connection.py index 8017b7da..61ccc9f8 100644 --- a/graph_cast/db/connection.py +++ b/graph_cast/db/connection.py @@ -1,18 +1,20 @@ -import logging -from graph_cast.db.abstract_config import ConnectionConfig -from typing import TypeVar import abc +import logging +from typing import Type, TypeVar -from graph_cast.db.arango.util import update_to_numeric, define_extra_edges +from graph_cast.db.arango.util import define_extra_edges, update_to_numeric logger = logging.getLogger(__name__) ConnectionType = TypeVar("ConnectionType", bound="Connection") +ConnectionConfigType = TypeVar( + "ConnectionConfigType", bound="ConnectionConfig" +) class Connection(abc.ABC): - def __init__(self, config: ConnectionConfig): + def __init__(self): pass @abc.abstractmethod @@ -35,10 +37,18 @@ def close(self): def define_indices(self, graph_config, vertex_config): pass + @abc.abstractmethod + def define_collections(self, graph_config, vertex_config): + pass + + @abc.abstractmethod + def delete_collections(self, cnames=(), gnames=(), delete_all=False): + pass + # @abc.abstractmethod - # def define_collections(self, graph_config, vertex_config): + # def get_collections(self): # pass - # + # @abc.abstractmethod # def define_vertex_collections(self, graph_config, vertex_config): # pass @@ -58,14 +68,27 @@ def define_indices(self, graph_config, vertex_config): # @abc.abstractmethod # def create_collection_if_absent(self, g, vcol, index, unique=True): # pass - # - # @abc.abstractmethod - # def delete_collections(self, cnames=(), gnames=(), delete_all=False): - # pass - # - # @abc.abstractmethod - # def get_collections(self): - # pass + + +class ConnectionConfig(abc.ABC): + connection_class: Type[Connection] + + def __init__(self, **config): + self.protocol = config.get("protocol", "http") + self.ip_addr = config.get("ip_addr", None) + self.cred_name = config.get("cred_name", None) + self.cred_pass = config.get("cred_pass", None) + self.database = config.get("database", None) + self.port = config.get("port", None) + self.hosts = None + + +class WSGIConfig(ConnectionConfig): + def __init__(self, **config): + super(WSGIConfig, self).__init__(**config) + self.path = config.get("path", "/") + self.hosts = f"{self.protocol}://{self.ip_addr}:{self.port}{self.path}" + self.host = config.get("host", None) def init_db(db_client: ConnectionType, conf_obj, clean_start): @@ -89,11 +112,13 @@ def concluding_db_transform(db_client: ConnectionType, conf_obj): # TODO this should be made part of atomic etl (not applied to the whole db) for cname in conf_obj.vertex_config.collections: for field in conf_obj.vertex_config.numeric_fields_list(cname): - query0 = update_to_numeric(conf_obj.vertex_config.dbname(cname), field) - cursor = db_client.execute(query0) + query0 = update_to_numeric( + conf_obj.vertex_config.dbname(cname), field + ) + db_client.execute(query0) # create edge u -> v from u->w, v->w edges # find edge_cols uw and vw for u, v in conf_obj.graph_config.extra_edges: query0 = define_extra_edges(conf_obj.graph(u, v)) - cursor = db_client.execute(query0) + db_client.execute(query0) diff --git a/graph_cast/db/factory.py b/graph_cast/db/factory.py index 1137c1df..d9e0f4be 100644 --- a/graph_cast/db/factory.py +++ b/graph_cast/db/factory.py @@ -1,12 +1,13 @@ import json import os from copy import deepcopy -from typing import Type + import yaml +from graph_cast.db import ConnectionConfigType from graph_cast.db.arango.config import ArangoConnectionConfig +from graph_cast.db.connection import ConnectionConfig, WSGIConfig from graph_cast.db.neo4j.config import Neo4jConnectionConfig -from graph_cast.db.abstract_config import ConnectionConfig, WSGIConfig class ConfigFactory: @@ -48,9 +49,3 @@ def create_config(cls, secret_path=None, args=None): return WSGIConfig(**config) else: raise NotImplementedError - - -class ConnectionFactory: - @classmethod - def create_connection(cls, config: Type[ConnectionConfig]): - return config.connection_class(config) diff --git a/graph_cast/db/manager.py b/graph_cast/db/manager.py index c092486c..b092bbf8 100644 --- a/graph_cast/db/manager.py +++ b/graph_cast/db/manager.py @@ -1,7 +1,6 @@ from typing import Optional -from graph_cast.db import ConfigFactory -from graph_cast.db.abstract_config import ConnectionConfigType -from graph_cast.db.factory import ConnectionFactory + +from graph_cast.db import ConfigFactory, ConnectionConfigType class ConnectionManager: @@ -11,16 +10,16 @@ def __init__( args=None, connection_config: Optional[ConnectionConfigType] = None, ): - if connection_config is None: - self.config: ConnectionConfigType = ConfigFactory.create_config( - secret_path, args - ) - else: - self.config: ConnectionConfigType = connection_config + self.config: ConnectionConfigType = ( + ConfigFactory.create_config(secret_path, args) + if connection_config is None + else connection_config + ) self.conn = None def __enter__(self): - self.conn = ConnectionFactory.create_connection(self.config) + cls = self.config.connection_class + self.conn = cls(self.config) return self.conn def __exit__(self, exc_type, exc_value, exc_traceback): diff --git a/graph_cast/db/neo4j/config.py b/graph_cast/db/neo4j/config.py index 6642d3a8..d1663c99 100644 --- a/graph_cast/db/neo4j/config.py +++ b/graph_cast/db/neo4j/config.py @@ -1,4 +1,4 @@ -from graph_cast.db.abstract_config import ConnectionConfig +from graph_cast.db.connection import ConnectionConfig from graph_cast.db.neo4j.connection import Neo4jConnection diff --git a/graph_cast/db/neo4j/connection.py b/graph_cast/db/neo4j/connection.py index 22518519..f6ce76ef 100644 --- a/graph_cast/db/neo4j/connection.py +++ b/graph_cast/db/neo4j/connection.py @@ -1,14 +1,16 @@ import logging + from neo4j import GraphDatabase + +from graph_cast.db import ConnectionConfigType from graph_cast.db.connection import Connection -from graph_cast.db.abstract_config import ConnectionConfigType logger = logging.getLogger(__name__) class Neo4jConnection(Connection): def __init__(self, config: ConnectionConfigType): - super().__init__(config) + super().__init__() driver = GraphDatabase.driver( uri=config.hosts, auth=(config.cred_name, config.cred_pass) ) @@ -127,7 +129,7 @@ def __init__(self, config: ConnectionConfigType): # # def get_collections(self): # return self.conn.collections() - # + def execute(self, query, params=None): cursor = self.conn.run(query, params) return cursor diff --git a/graph_cast/input/__init__.py b/graph_cast/input/__init__.py index 14ce9cc4..8022b76c 100644 --- a/graph_cast/input/__init__.py +++ b/graph_cast/input/__init__.py @@ -1,2 +1,2 @@ -from .table import table_to_collections from .json import jsondoc_to_collections +from .table import table_to_collections diff --git a/graph_cast/input/json.py b/graph_cast/input/json.py index f594545e..b4a9810c 100644 --- a/graph_cast/input/json.py +++ b/graph_cast/input/json.py @@ -1,20 +1,18 @@ -from typing import DefaultDict -from graph_cast.util import timer as timer +import logging import multiprocessing as mp from collections import defaultdict from functools import partial -from typing import List, Tuple -import logging +from typing import DefaultDict, List, Tuple from graph_cast.architecture import JConfigurator from graph_cast.input.json_aux import ( apply_mapper, - project_dicts, merge_documents, + project_dicts, ) +from graph_cast.util import timer as timer from graph_cast.util.transform import pick_unique_dict - logger = logging.getLogger(__name__) diff --git a/graph_cast/input/json_aux.py b/graph_cast/input/json_aux.py index dd3eceb2..768e62d7 100644 --- a/graph_cast/input/json_aux.py +++ b/graph_cast/input/json_aux.py @@ -1,15 +1,19 @@ import gzip import json -from collections import defaultdict, ChainMap +from collections import ChainMap, defaultdict from itertools import product +from typing import Dict -from graph_cast.input.util import parse_vcollection -from graph_cast.util.io import FPSmart -from graph_cast.input.util import define_graphs, update_graph_extra_edges -from graph_cast.architecture.schema import VertexConfig from graph_cast.architecture.general import transform_foo +from graph_cast.architecture.schema import VertexConfig from graph_cast.architecture.transform import Transform -from typing import Dict + +# from graph_cast.input.util import ( +# define_graphs, +# parse_vcollection, +# update_graph_extra_edges, +# ) +from graph_cast.util.io import FPSmart xml_dummy = "#text" @@ -24,12 +28,20 @@ def apply_mapper(mapper: Dict, document, vertex_config: VertexConfig): ("filter" not in mapper and "unfilter" not in mapper) or ( "filter" in mapper - and all([document[kk] == vv for kk, vv in mapper["filter"].items()]) + and all( + [ + document[kk] == vv + for kk, vv in mapper["filter"].items() + ] + ) ) or ( "unfilter" in mapper and any( - [document[kk] != vv for kk, vv in mapper["unfilter"].items()] + [ + document[kk] != vv + for kk, vv in mapper["unfilter"].items() + ] ) ) ): @@ -39,7 +51,6 @@ def apply_mapper(mapper: Dict, document, vertex_config: VertexConfig): if "transforms" in mapper: for t in mapper["transforms"]: t_ = Transform(**t) - # doc_.update(transform_foo(t, document)) doc_.update(transform_foo(t_, document)) if "map" in mapper: @@ -80,7 +91,7 @@ def apply_mapper(mapper: Dict, document, vertex_config: VertexConfig): raise KeyError("Mapper must have map key if it has vertex key") # traverse non terminal nodes elif "type" in mapper: - agg = defaultdict(list) + agg: defaultdict[str, list[str]] = defaultdict(list) if "descend_key" in mapper: if document and mapper["descend_key"] in document: document = document[mapper["descend_key"]] @@ -134,20 +145,34 @@ def add_weights(mapper, agg): if "vertex" in edge_def: for item in edge_def["vertex"]: + # item + # name: publication + # condition: + # anchor: main + # keys: + # mapper: + # k1: q1 + + # should rather become index for the given vcollection (item["name"]) + keys_to_add = item["keys"] if "keys" in item else [] + keys_to_map = item["mapper"] if "mapper" in item else {} + keys_to_map.update({k: k for k in keys_to_add}) + vs = [doc for doc in agg[item["name"]]] if "condition" in item.keys(): c = item["condition"] - vs = [doc for doc in vs if all([q in doc for q in c])] + vs = [ + doc + for doc in vs + if all([doc[q] == v in doc for q, v in c.items()]) + ] if vs: + # TODO : possible issue doc = vs[0] - if "condition" not in item.keys() or ( - "condition" in item.keys() - and all([doc[k] == v for k, v in c.items()]) - ): - for edoc in edges: - edoc["attributes"].update( - {item["name"]: doc[item["field"]]} - ) + for edoc in edges: + edoc["attributes"].update( + {q: doc[k] for k, q in keys_to_map.items()} + ) agg[(source, target)] = edges return agg @@ -199,7 +224,9 @@ def add_edges(mapper, agg, vertex_config): weight[k] = v[k] del v[k] if "values" in edge_def: - weight.update({k: v for k, v in edge_def["values"].items()}) + weight.update( + {k: v for k, v in edge_def["values"].items()} + ) agg[(source, target)] += [ { "source": project_dict(u, source_index), @@ -219,7 +246,9 @@ def add_edges(mapper, agg, vertex_config): target_items, target_index, edge_def["target"] ) - target_items = [item for item in target_items if target_field in item] + target_items = [ + item for item in target_items if target_field in item + ] if target_items: target_items = dict( @@ -232,10 +261,16 @@ def add_edges(mapper, agg, vertex_config): weight = dict() if "fields" in edge_def["source"]: weight.update( - {k: u[k] for k in edge_def["source"]["fields"] if k in u} + { + k: u[k] + for k in edge_def["source"]["fields"] + if k in u + } ) if "values" in edge_def: - weight.update({k: v for k, v in edge_def["values"].items()}) + weight.update( + {k: v for k, v in edge_def["values"].items()} + ) up = project_dict(u, source_index) if source_field in u: pointer = u[source_field] @@ -264,7 +299,9 @@ def add_edges(mapper, agg, vertex_config): return agg -def pick_indexed_items_anchor_logic(items, indices, set_spec, anchor_key="anchor"): +def pick_indexed_items_anchor_logic( + items, indices, set_spec, anchor_key="anchor" +): """ :param items: list of documents (dict) @@ -287,15 +324,6 @@ def pick_indexed_items_anchor_logic(items, indices, set_spec, anchor_key="anchor return items_ -def assign_edge_label(edges, label, condition): - edges_new = [(u, v, label) if condition(u, v) else (u, v, {}) for u, v in edges] - return edges_new - - -def clean_arobas(item): - return {k: v for k, v in item.items() if k[0] != "@"} - - def project_dict(item, keys, how="include"): if how == "include": return {k: v for k, v in item.items() if k in keys} @@ -307,21 +335,13 @@ def project_dicts(items, keys, how="include"): if how == "include": return [{k: v for k, v in item.items() if k in keys} for item in items] elif how == "exclude": - return [{k: v for k, v in item.items() if k not in keys} for item in items] + return [ + {k: v for k, v in item.items() if k not in keys} for item in items + ] else: - raise ValueError(f" `how` should be exclude or include : instead {how}") - - -def clean_aux_fields(pack): - pack_out = {} - for k, cpack in pack.items(): - if k != "@edges": - pack_out[k] = [clean_arobas(x) for x in cpack] - else: - pack_out[k] = [ - (clean_arobas(x[0]), clean_arobas(x[1]), x[2:]) for x in cpack - ] - return pack_out + raise ValueError( + f" `how` should be exclude or include : instead {how}" + ) def parse_edges(croot, edge_acc, mapping_fields): @@ -336,7 +356,9 @@ def parse_edges(croot, edge_acc, mapping_fields): if isinstance(croot, dict): if "maps" in croot: for m in croot["maps"]: - edge_acc_, mapping_fields = parse_edges(m, edge_acc, mapping_fields) + edge_acc_, mapping_fields = parse_edges( + m, edge_acc, mapping_fields + ) edge_acc += edge_acc_ if "edges" in croot: edge_acc_ = [] @@ -355,13 +377,16 @@ def parse_edges(croot, edge_acc, mapping_fields): return [], defaultdict(list) -def merge_documents(docs, main_key="_key", anchor_key="anchor", anchor_value="main"): +def merge_documents( + docs, main_key="_key", anchor_key="anchor", anchor_value="main" +): """ - docs contain docs with main_key and without + docs contain documents with main_key and documents without all docs without main_key should be merged with the doc that has doc[anchor_key] == anchor_value :param docs: :param main_key: :param anchor_key: + :param anchor_value: :return: list of docs, each of which contains main_key """ mains_, mains, auxs, anchors = [], [], [], [] @@ -401,7 +426,10 @@ def smart_merge( # merge non_standard onto standard (not replacing fields are already standard item) for item in agg[collection_name]: if discriminant_key in item: - if "wos_standard" in item and item[discriminant_key] == discriminant_value: + if ( + "wos_standard" in item + and item[discriminant_key] == discriminant_value + ): wos_standard[item["wos_standard"]] += [item] else: without_standard_heap += [item] @@ -460,38 +488,38 @@ def get_json_data(source, pattern=None): # return r -def parse_config(config=None): - """ - only parse_edges depends on json - - :param config: - :param prefix: - :return: - """ - - ( - vmap, - index_fields_dict, - extra_indices, - vfields, - blank_collections, - ) = parse_vcollection(config) - - edge_def, excl_fields = parse_edges(config["json"], [], defaultdict(list)) - - graphs_definition = define_graphs(edge_def, vmap) - graphs_definition = update_graph_extra_edges( - graphs_definition, vmap, config["extra_edges"] - ) - - vcollections = list( - set([graphs_definition[g]["source"] for g in graphs_definition]) - | set([graphs_definition[g]["target"] for g in graphs_definition]) - ) - return ( - vcollections, - vmap, - graphs_definition, - index_fields_dict, - extra_indices, - ) +# def parse_config(config=None): +# """ +# only parse_edges depends on json +# +# :param config: +# :param prefix: +# :return: +# """ +# +# ( +# vmap, +# index_fields_dict, +# extra_indices, +# vfields, +# blank_collections, +# ) = parse_vcollection(config) +# +# edge_def, excl_fields = parse_edges(config["json"], [], defaultdict(list)) +# +# graphs_definition = define_graphs(edge_def, vmap) +# graphs_definition = update_graph_extra_edges( +# graphs_definition, vmap, config["extra_edges"] +# ) +# +# vcollections = list( +# set([graphs_definition[g]["source"] for g in graphs_definition]) +# | set([graphs_definition[g]["target"] for g in graphs_definition]) +# ) +# return ( +# vcollections, +# vmap, +# graphs_definition, +# index_fields_dict, +# extra_indices, +# ) diff --git a/graph_cast/input/json_flow.py b/graph_cast/input/json_flow.py index c778b4c3..48f7e4b4 100644 --- a/graph_cast/input/json_flow.py +++ b/graph_cast/input/json_flow.py @@ -1,12 +1,12 @@ -from typing import List import logging +from typing import List from graph_cast.architecture import JConfigurator from graph_cast.db import ConnectionConfigType, ConnectionManager -from graph_cast.db.arango.util import upsert_docs_batch, insert_edges_batch +from graph_cast.db.arango.util import insert_edges_batch, upsert_docs_batch +from graph_cast.input.json import jsonlike_to_collections from graph_cast.util import timer as timer from graph_cast.util.transform import merge_doc_basis -from graph_cast.input.json import jsonlike_to_collections logger = logging.getLogger(__name__) @@ -34,7 +34,7 @@ def process_jsonlike( True, ) if not dry: - cursor = db_client.execute(query0) + db_client.execute(query0) logger.info(f" ingested {cnt} vertices {t_ingest.elapsed:.2f} sec") @@ -55,7 +55,7 @@ def process_jsonlike( ) if not dry: - cursor = db_client.execute(query0) + db_client.execute(query0) logger.info(f" ingested {cnt} edges {t_ingest_edges.elapsed:.2f} sec") diff --git a/graph_cast/input/table.py b/graph_cast/input/table.py index cc25cecf..156ec99c 100644 --- a/graph_cast/input/table.py +++ b/graph_cast/input/table.py @@ -1,7 +1,9 @@ -from collections import defaultdict, ChainMap -from itertools import product, combinations, chain +from __future__ import annotations + import logging -from typing import List, Dict, DefaultDict, Tuple +from collections import ChainMap, defaultdict +from itertools import chain, combinations, product +from typing import Any, Dict, List from graph_cast.architecture import ConfiguratorType from graph_cast.architecture.general import transform_foo @@ -13,10 +15,15 @@ def table_to_collections( rows: List[List], header_dict: Dict[str, int], conf: ConfiguratorType, -) -> Tuple[DefaultDict, DefaultDict]: - - vdocs = defaultdict(list) - edocs = defaultdict(list) +) -> tuple[ + defaultdict[str, list[dict[str, Any]]], + defaultdict[tuple[str, str], list[dict[str, Any]]], +]: + + vdocs: defaultdict[str, list[list[dict[str, Any]]]] = defaultdict(list) + edocs: defaultdict[tuple[str, str], list[dict[str, Any]]] = defaultdict( + list + ) vertex_conf = conf.vertex_config rows_raw = [{k: item[v] for k, v in header_dict.items()} for item in rows] @@ -42,14 +49,18 @@ def table_to_collections( for vcol, local_map in conf.current_collections: vdoc_acc = [] - current_fields = set(vertex_conf.index(vcol)) | set(vertex_conf.fields(vcol)) + current_fields = set(vertex_conf.index(vcol)) | set( + vertex_conf.fields(vcol) + ) default_input = current_fields & ( transformation_outputs | set(header_dict.keys()) ) if default_input: - vdoc_acc += [[{f: item[f] for f in default_input} for item in rows_working]] + vdoc_acc += [ + [{f: item[f] for f in default_input} for item in rows_working] + ] if local_map.active: vdoc_acc += [[local_map(item) for item in rows_working]] @@ -79,6 +90,7 @@ def table_to_collections( and v not in vertex_conf.blank_collections ): if conf.graph(u, v)["type"] == "direct": + ziter: product | combinations if u != v: ziter = product(vdocs[u], vdocs[v]) else: @@ -106,7 +118,8 @@ def table_to_collections( cfields = conf.graph_config.weights(*g) if cfields: weights = [ - {f: item[f] for f in cfields} for item in rows_working + {f: item[f] for f in cfields} + for item in rows_working ] ebatch = [ {**item, **{"attributes": attr}} @@ -114,17 +127,21 @@ def table_to_collections( ] edocs[g].extend(ebatch) + vdocs_output: defaultdict[str, list[dict[str, Any]]] = defaultdict(list) for u, vlists in vdocs.items(): - vdocs[u] = [ + stub = [ [ item for item in vlist if not any( - [f"_status@{xkey}" in item for xkey in vertex_conf.fields(u)] + [ + f"_status@{xkey}" in item + for xkey in vertex_conf.fields(u) + ] ) ] for vlist in vlists ] - vdocs[u] = list(chain.from_iterable(vdocs[u])) + vdocs_output[u] = list(chain.from_iterable(stub)) - return vdocs, edocs + return vdocs_output, edocs diff --git a/graph_cast/input/table_flow.py b/graph_cast/input/table_flow.py index e956f169..68188a8c 100644 --- a/graph_cast/input/table_flow.py +++ b/graph_cast/input/table_flow.py @@ -1,21 +1,19 @@ -import queue -from typing import Union, Optional import multiprocessing as mp +import queue +from typing import Optional, Union + import pandas as pd -from graph_cast.db import ( - ConnectionConfigType, - ConnectionManager, -) +from graph_cast.architecture import ConfiguratorType +from graph_cast.db import ConnectionConfigType, ConnectionManager from graph_cast.db.arango.util import ( + insert_edges_batch, insert_return_batch, upsert_docs_batch, - insert_edges_batch, ) from graph_cast.input import table_to_collections from graph_cast.input.table import logger -from graph_cast.util.io import ChunkerDataFrame, Chunker -from graph_cast.architecture import ConfiguratorType +from graph_cast.util.io import AbsChunker, Chunker, ChunkerDataFrame def process_table( @@ -40,7 +38,7 @@ def process_table( logger.info(f"batch_size : {batch_size}") if isinstance(tabular_resource, pd.DataFrame): - chk = ChunkerDataFrame( + chk: AbsChunker = ChunkerDataFrame( tabular_resource, batch_size=batch_size, n_lines_max=max_lines ) elif isinstance(tabular_resource, str): @@ -100,7 +98,9 @@ def process_table( edocuments[(vfrom, vto)].extend( [ {"source": x, "target": y} - for x, y in zip(vdocuments[vfrom], vdocuments[vto]) + for x, y in zip( + vdocuments[vfrom], vdocuments[vto] + ) ] ) @@ -122,7 +122,7 @@ def process_table( # query0 = define_extra_edges(conf_obj.graph(u, v)) # cursor = db_config.execute(query0) - logger.info(f" processed so far: {chk.j} lines") + logger.info(f" processed so far: {chk.units_processed} lines") def process_table_with_queue(tasks: mp.Queue, **kwargs): diff --git a/graph_cast/input/util.py b/graph_cast/input/util.py index 387811b6..5f8a9659 100644 --- a/graph_cast/input/util.py +++ b/graph_cast/input/util.py @@ -1,49 +1,50 @@ import logging + from graph_cast.architecture import ConfiguratorType logger = logging.getLogger(__name__) -def parse_vcollection(config, conf_obj: ConfiguratorType): - - # vertex_type -> vertex_collection_name - conf_obj.dbname = { - k: f'{v["basename"]}' for k, v in config["vertex_collections"].items() - } - - # vertex_collection_name -> indices - conf_obj.index = { - k: v["index"] if "index" in v else ["_key"] - for k, v in config["vertex_collections"].items() - } - logger.info("index_fields_dict") - logger.info(f"{conf_obj.index}") - - # vertex_collection_name -> extra_index - # in addition to index from field_definition - conf_obj.extra_indices = { - k: v["extra_index"] - for k, v in config["vertex_collections"].items() - if "extra_index" in v - } - - # vertex_collection_name -> fields - conf_obj.fields = { - k: (v["fields"] if "fields" in v else []) - for k, v in config["vertex_collections"].items() - } - - conf_obj.blank_collections = [ - k - for k, v in config["vertex_collections"].items() - if "extra" in v and "blank" in v["extra"] - ] - - conf_obj.numeric_fields_list = { - k: v["numeric_fields"] - for k, v in config["vertex_collections"].items() - if "numeric_fields" in v - } +# def parse_vcollection(config, conf_obj: ConfiguratorType): +# +# # vertex_type -> vertex_collection_name +# conf_obj.dbname = { +# k: f'{v["basename"]}' for k, v in config["vertex_collections"].items() +# } +# +# # vertex_collection_name -> indices +# conf_obj.index = { +# k: v["index"] if "index" in v else ["_key"] +# for k, v in config["vertex_collections"].items() +# } +# logger.info("index_fields_dict") +# logger.info(f"{conf_obj.index}") +# +# # vertex_collection_name -> extra_index +# # in addition to index from field_definition +# conf_obj.extra_indices = { +# k: v["extra_index"] +# for k, v in config["vertex_collections"].items() +# if "extra_index" in v +# } +# +# # vertex_collection_name -> fields +# conf_obj.fields = { +# k: (v["fields"] if "fields" in v else []) +# for k, v in config["vertex_collections"].items() +# } +# +# conf_obj.blank_collections = [ +# k +# for k, v in config["vertex_collections"].items() +# if "extra" in v and "blank" in v["extra"] +# ] +# +# conf_obj.numeric_fields_list = { +# k: v["numeric_fields"] +# for k, v in config["vertex_collections"].items() +# if "numeric_fields" in v +# } def define_graphs(edge_def, vmap): diff --git a/graph_cast/main.py b/graph_cast/main.py index 13a0bade..74f63822 100644 --- a/graph_cast/main.py +++ b/graph_cast/main.py @@ -1,20 +1,20 @@ import gzip import json +import logging import multiprocessing as mp from functools import partial from os import listdir from os.path import isfile, join -import logging from typing import Optional import graph_cast.input.json -import graph_cast.input.table_flow import graph_cast.input.table +import graph_cast.input.table_flow +from graph_cast.architecture import JConfigurator, TConfigurator +from graph_cast.db import ConnectionConfigType, ConnectionManager from graph_cast.db.connection import init_db from graph_cast.input.json_flow import process_jsonlike from graph_cast.util import timer as timer -from graph_cast.architecture import TConfigurator, JConfigurator -from graph_cast.db import ConnectionManager, ConnectionConfigType logger = logging.getLogger(__name__) @@ -56,7 +56,9 @@ def ingest_json_files( init_db(db_client, conf_obj, clean_start) # file discovery <- move this foo to JConfigurator - files = sorted([f for f in listdir(fpath) if isfile(join(fpath, f)) if "json" in f]) + files = sorted( + [f for f in listdir(fpath) if isfile(join(fpath, f)) if "json" in f] + ) if keyword is not None: files = [f for f in files if keyword in f] @@ -66,7 +68,9 @@ def ingest_json_files( with gzip.GzipFile(join(fpath, filename), "rb") as fps: with timer.Timer() as t_pro: data = json.load(fps) - process_jsonlike(data, conf_obj, conn_conf, ncores=ncores, dry=dry) + process_jsonlike( + data, conf_obj, conn_conf, ncores=ncores, dry=dry + ) logger.info(f" processing {filename} took {t_pro.elapsed:.2f} sec") @@ -122,13 +126,14 @@ def ingest_csvs( with timer.Timer() as klepsidra: if n_thread > 1: func = partial( - graph_cast.input.table_flow.process_table_with_queue, **kwargs + graph_cast.input.table_flow.process_table_with_queue, + **kwargs, ) assert ( mp.get_start_method() == "fork" ), "Requires 'forking' operating system" processes = [] - tasks = mp.Queue() + tasks: mp.Queue = mp.Queue() for item in conf_obj.mode2files[mode]: tasks.put(item) for w in range(n_thread): diff --git a/graph_cast/util/io.py b/graph_cast/util/io.py index d93788a2..aa915c20 100644 --- a/graph_cast/util/io.py +++ b/graph_cast/util/io.py @@ -1,17 +1,36 @@ from __future__ import annotations -import gzip + +import abc import csv -import re +import gc +import gzip +import io import json import logging -import io import pkgutil -import gc +import re +from typing import TypeVar + +AbsChunkerType = TypeVar("AbsChunkerType", bound="AbsChunker") logger = logging.getLogger(__name__) -class Chunker: +class AbsChunker(abc.ABC): + def __init__(self): + self.units_processed = 0 + + def pop(self): + pass + + def pop_header(self): + pass + + def done(self): + pass + + +class Chunker(AbsChunker): def __init__( self, fname=None, @@ -29,10 +48,10 @@ def __init__( :param n_lines_max: :param encoding: """ + super().__init__() if fname is None and pkg_spec is None: raise ValueError(f" both fname and file_obj are None") - self.acc = [] - self.j = 0 + self.batch_size = batch_size self.n_lines_max: int | None = n_lines_max @@ -46,12 +65,18 @@ def __init__( self.file_obj = open(fname, "rt") else: bytes_ = pkgutil.get_data(*pkg_spec) - if pkg_spec[1][-2:] == "gz": - self.file_obj = gzip.GzipFile(fileobj=io.BytesIO(bytes_), mode="r") + if isinstance(bytes_, bytes): + if pkg_spec[1][-2:] == "gz": + self.file_obj = gzip.GzipFile( + fileobj=io.BytesIO(bytes_), mode="r" + ) + else: + self.file_obj = io.BytesIO(bytes_) else: - self.file_obj = io.BytesIO(bytes_) - self.file_obj = io.TextIOWrapper(self.file_obj, encoding="utf-8") - self.done = False + raise TypeError(f"bytes_ should be a bytes Type") + + self.file_obj = io.TextIOWrapper(self.file_obj, encoding="utf-8") # type: ignore + self._done = False def pop_header(self): header = self.file_obj.readline().rstrip("\n") @@ -60,7 +85,8 @@ def pop_header(self): def pop(self): if self.n_lines_max is None or ( - self.n_lines_max is not None and self.j < self.n_lines_max + self.n_lines_max is not None + and self.units_processed < self.n_lines_max ): lines = self.file_obj.readlines(self.batch_size) lines2 = [ @@ -68,35 +94,37 @@ def pop(self): for line in lines ] if self.n_lines_max is not None and ( - self.j + len(lines2) > self.n_lines_max + self.units_processed + len(lines2) > self.n_lines_max ): - lines2 = lines2[: (self.n_lines_max - self.j)] - self.j += len(lines2) + lines2 = lines2[: (self.n_lines_max - self.units_processed)] + self.units_processed += len(lines2) if not lines2: - self.done = True + self._done = True self.file_obj.close() return [] else: return lines2 else: - self.done = True + self._done = True self.file_obj.close() return [] + @property def done(self): - return self.done + return self._done -class ChunkerDataFrame: +class ChunkerDataFrame(AbsChunker): def __init__(self, df, batch_size, n_lines_max=None): - self.acc = [] - self.j = 0 + super().__init__() self.batch_size = batch_size self.n_lines_max = n_lines_max self.file_obj = df self.done = False - self.idx = [i for i in range(0, self.file_obj.shape[0], self.batch_size)][::-1] + self.idx = [ + i for i in range(0, self.file_obj.shape[0], self.batch_size) + ][::-1] def pop_header(self): return self.file_obj.columns @@ -104,8 +132,10 @@ def pop_header(self): def pop(self): if self.idx: cid = self.idx.pop() - lines = self.file_obj.iloc[cid : cid + self.batch_size].values.tolist() - self.j += len(lines) + lines = self.file_obj.iloc[ + cid : cid + self.batch_size + ].values.tolist() + self.units_processed += len(lines) if not lines: self.done = True return False @@ -150,7 +180,9 @@ def push(self, item): gc.collect() def stop(self): - return self.maxchunks is not None and (self.chunk_count >= self.maxchunks) + return self.maxchunks is not None and ( + self.chunk_count >= self.maxchunks + ) def items_processed(self): return self.iprocessed @@ -173,7 +205,7 @@ def read(self, n): return self.transform(s).encode() def transform(self, s): - m = self.p.search(s) + self.p.search(s) r = self.p.sub(self.sub, s, count=self.count) return r diff --git a/graph_cast/util/pkg.py b/graph_cast/util/pkg.py index 6ccfb8ab..a83f984e 100644 --- a/graph_cast/util/pkg.py +++ b/graph_cast/util/pkg.py @@ -1,14 +1,14 @@ import gzip +import io import json -import yaml +import logging import pickle -import pandas as pd import pkgutil -import io -import logging -from os.path import expanduser from collections.abc import Iterable +from os.path import expanduser +import pandas as pd +import yaml logger = logging.getLogger(__name__) diff --git a/graph_cast/util/transform.py b/graph_cast/util/transform.py index f69b5840..d4380504 100644 --- a/graph_cast/util/transform.py +++ b/graph_cast/util/transform.py @@ -1,7 +1,7 @@ import json import logging -from datetime import datetime import time +from datetime import datetime day_endings = ["st", "nd", "rd", "th"] @@ -161,7 +161,8 @@ def try_int(x): def clear_first_level_nones(docs, keys_keep_nones=None): docs = [ - {k: v for k, v in tdict.items() if v or k in keys_keep_nones} for tdict in docs + {k: v for k, v in tdict.items() if v or k in keys_keep_nones} + for tdict in docs ] return docs diff --git a/graph_cast/xml/io.py b/graph_cast/xml/io.py index b69da4da..f48966f1 100644 --- a/graph_cast/xml/io.py +++ b/graph_cast/xml/io.py @@ -1,11 +1,13 @@ -import logging import gzip -from shutil import copyfileobj -from contextlib import contextmanager -from graph_cast.util.io import ChunkFlusherMono, FPSmart +import logging import xml.etree.ElementTree as et +from contextlib import contextmanager +from shutil import copyfileobj + import xmltodict +from graph_cast.util.io import ChunkFlusherMono, FPSmart + logger = logging.getLogger(__name__) @@ -36,7 +38,9 @@ def parse_simple(fp, good_cf): rec_ = "REC" for event, pub in context: if event == "end" and pub.tag == rec_: - item = et.tostring(pub, encoding="utf8", method="xml").decode("utf") + item = et.tostring(pub, encoding="utf8", method="xml").decode( + "utf" + ) obj = xmltodict.parse( item, force_cdata=True, @@ -92,7 +96,9 @@ def convert( target_prefix = target.split(".")[0] good_cf = ChunkFlusherMono(target_prefix, chunksize, maxchunks) if how == "standard": - bad_cf = ChunkFlusherMono(target_prefix, chunksize, maxchunks, suffix="bad") + bad_cf = ChunkFlusherMono( + target_prefix, chunksize, maxchunks, suffix="bad" + ) if isinstance(source, str): if source[-2:] == "gz": @@ -102,7 +108,9 @@ def convert( else: raise ValueError("Unknown file type") - with (open_foo(source, "rb") if isinstance(source, str) else nullcontext()) as fp: + with ( + open_foo(source, "rb") if isinstance(source, str) else nullcontext() + ) as fp: if pattern: fp = FPSmart(fp, pattern) else: @@ -113,7 +121,9 @@ def convert( # terminal flush good_cf.flush_chunk() - logger.error(f" not an error : {good_cf.items_processed()} good records") + logger.error( + f" not an error : {good_cf.items_processed()} good records" + ) if how == "standard": bad_cf.flush_chunk() logger.error(f"{bad_cf.items_processed()} bad records") diff --git a/poetry.lock b/poetry.lock index a05ac8d5..9986dda4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -77,6 +77,31 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "config-formatter" +version = "1.1.0" +description = "An automatic formatter for .ini and .cfg configuration files" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +configupdater = ">=3.0" + +[package.extras] +dev = ["pytest (>=6.0.0)", "pre-commit (>=2.17.0)"] + +[[package]] +name = "configupdater" +version = "3.1.1" +description = "Parser like ConfigParser but for updating configuration files" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.extras] +testing = ["sphinx", "flake8", "pytest", "pytest-cov", "pytest-virtualenv", "pytest-xdist"] + [[package]] name = "distlib" version = "0.3.5" @@ -130,6 +155,22 @@ requirements_deprecated_finder = ["pipreqs", "pip-api"] colors = ["colorama (>=0.4.3,<0.5.0)"] plugins = ["setuptools"] +[[package]] +name = "language-formatters-pre-commit-hooks" +version = "2.4.0" +description = "List of pre-commit hooks meant to format your source code." +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +config-formatter = "*" +packaging = "*" +requests = "*" +"ruamel.yaml" = "*" +toml-sort = "*" +tomlkit = "*" + [[package]] name = "mypy" version = "0.971" @@ -201,6 +242,17 @@ category = "main" optional = false python-versions = ">=3.8" +[[package]] +name = "packaging" +version = "21.3" +description = "Core utilities for Python packages" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" + [[package]] name = "pandas" version = "1.4.3" @@ -258,6 +310,18 @@ pyyaml = ">=5.1" toml = "*" virtualenv = ">=20.0.8" +[[package]] +name = "pre-commit-hooks" +version = "4.3.0" +description = "Some out-of-the-box hooks for pre-commit." +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +"ruamel.yaml" = ">=0.15" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} + [[package]] name = "pyflakes" version = "2.5.0" @@ -288,6 +352,17 @@ dev = ["sphinx", "sphinx-rtd-theme", "zope.interface", "cryptography (>=3.3.1)", docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"] tests = ["pytest (>=6.0.0,<7.0.0)", "coverage[toml] (==5.0.4)"] +[[package]] +name = "pyparsing" +version = "3.0.9" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +category = "main" +optional = false +python-versions = ">=3.6.8" + +[package.extras] +diagrams = ["railroad-diagrams", "jinja2"] + [[package]] name = "python-arango" version = "7.4.1" @@ -362,6 +437,29 @@ python-versions = "*" [package.dependencies] requests = ">=2.0.1,<3.0.0" +[[package]] +name = "ruamel.yaml" +version = "0.17.21" +description = "ruamel.yaml is a YAML parser/emitter that supports roundtrip preservation of comments, seq/map flow style, and map key order" +category = "main" +optional = false +python-versions = ">=3" + +[package.dependencies] +"ruamel.yaml.clib" = {version = ">=0.2.6", markers = "platform_python_implementation == \"CPython\" and python_version < \"3.11\""} + +[package.extras] +docs = ["ryd"] +jinja2 = ["ruamel.yaml.jinja2 (>=0.2)"] + +[[package]] +name = "ruamel.yaml.clib" +version = "0.2.7" +description = "C version of reader, parser and emitter for ruamel.yaml derived from libyaml" +category = "main" +optional = false +python-versions = ">=3.5" + [[package]] name = "setuptools" version = "63.4.2" @@ -391,6 +489,17 @@ category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +[[package]] +name = "toml-sort" +version = "0.20.1" +description = "Toml sorting library" +category = "main" +optional = false +python-versions = ">=3.7,<4.0" + +[package.dependencies] +tomlkit = ">=0.8.0" + [[package]] name = "tomli" version = "2.0.1" @@ -399,6 +508,22 @@ category = "main" optional = false python-versions = ">=3.7" +[[package]] +name = "tomlkit" +version = "0.11.5" +description = "Style preserving TOML library" +category = "main" +optional = false +python-versions = ">=3.6,<4.0" + +[[package]] +name = "types-pyyaml" +version = "6.0.11" +description = "Typing stubs for PyYAML" +category = "main" +optional = false +python-versions = "*" + [[package]] name = "typing-extensions" version = "4.3.0" @@ -448,7 +573,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "af2d81ae4fe1fc303270ec6504661c426804163deb0427ed3289137f352539d5" +content-hash = "2ba8a7304ec906d232c798e3ec3fa97413113cfc66ef72871c386e88a350f001" [metadata.files] autoflake = [] @@ -491,6 +616,8 @@ colorama = [ {file = "colorama-0.4.5-py2.py3-none-any.whl", hash = "sha256:854bf444933e37f5824ae7bfc1e98d5bce2ebe4160d46b5edf346a89358e99da"}, {file = "colorama-0.4.5.tar.gz", hash = "sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4"}, ] +config-formatter = [] +configupdater = [] distlib = [] filelock = [] identify = [] @@ -499,6 +626,7 @@ idna = [ {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, ] isort = [] +language-formatters-pre-commit-hooks = [] mypy = [] mypy-extensions = [ {file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"}, @@ -508,6 +636,10 @@ neo4j = [] networkx = [] nodeenv = [] numpy = [] +packaging = [ + {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, + {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, +] pandas = [ {file = "pandas-1.4.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d51674ed8e2551ef7773820ef5dab9322be0828629f2cbf8d1fc31a0c4fed640"}, {file = "pandas-1.4.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:16ad23db55efcc93fa878f7837267973b61ea85d244fc5ff0ccbcfa5638706c5"}, @@ -540,6 +672,7 @@ platformdirs = [ {file = "platformdirs-2.5.2.tar.gz", hash = "sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"}, ] pre-commit = [] +pre-commit-hooks = [] pyflakes = [] pygraphviz = [ {file = "pygraphviz-1.9.zip", hash = "sha256:fa18f7c6cea28341a4e466ed0cf05682b0a68288afe8dd7c9426782f7c1ae01c"}, @@ -548,6 +681,10 @@ pyjwt = [ {file = "PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"}, {file = "PyJWT-2.4.0.tar.gz", hash = "sha256:d42908208c699b3b973cbeb01a969ba6a96c821eefb1c5bfe4c390c01d67abba"}, ] +pyparsing = [ + {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, + {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, +] python-arango = [ {file = "python-arango-7.4.1.tar.gz", hash = "sha256:20dfa59a84f5b0a7344b3c053e9f6fb6f6e9ba1cc2eea7290d3f1b52e5cbdfa0"}, {file = "python_arango-7.4.1-py3-none-any.whl", hash = "sha256:2c05d0a0d74754cc2ed36a0f39546215de82bc955cb364592a12bf07c5705fce"}, @@ -600,6 +737,8 @@ requests-toolbelt = [ {file = "requests-toolbelt-0.9.1.tar.gz", hash = "sha256:968089d4584ad4ad7c171454f0a5c6dac23971e9472521ea3b6d49d610aa6fc0"}, {file = "requests_toolbelt-0.9.1-py2.py3-none-any.whl", hash = "sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f"}, ] +"ruamel.yaml" = [] +"ruamel.yaml.clib" = [] setuptools = [] six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, @@ -609,10 +748,13 @@ toml = [ {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, ] +toml-sort = [] tomli = [ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +tomlkit = [] +types-pyyaml = [] typing-extensions = [] urllib3 = [] virtualenv = [] diff --git a/pyproject.toml b/pyproject.toml index 266db956..443676da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,9 @@ mypy = "^0.971" isort = "^5.10.1" autoflake = "^1.4" pre-commit = "^2.20.0" +types-PyYAML = "^6.0.11" +language-formatters-pre-commit-hooks = "^2.4.0" +pre-commit-hooks = "^4.3.0" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/run/arango/delete_collections.py b/run/arango/delete_collections.py index 76ee9263..0348ad22 100644 --- a/run/arango/delete_collections.py +++ b/run/arango/delete_collections.py @@ -1,7 +1,8 @@ -from arango import ArangoClient import argparse from os import environ +from arango import ArangoClient + parser = argparse.ArgumentParser() parser.add_argument("-c", "--collection", default="all", help="test setting") parser.add_argument( diff --git a/run/arango/ingest_csv.py b/run/arango/ingest_csv.py index a103c585..6d9c80b4 100644 --- a/run/arango/ingest_csv.py +++ b/run/arango/ingest_csv.py @@ -1,8 +1,9 @@ import argparse import logging -from graph_cast.util import ResourceHandler -from graph_cast.main import ingest_csvs + from graph_cast.db import ConfigFactory +from graph_cast.main import ingest_csvs +from graph_cast.util import ResourceHandler logger = logging.getLogger(__name__) diff --git a/run/arango/ingest_json.py b/run/arango/ingest_json.py index 3ab3605f..ec4d6aa8 100644 --- a/run/arango/ingest_json.py +++ b/run/arango/ingest_json.py @@ -1,10 +1,12 @@ import argparse -import yaml +import logging from os.path import expanduser +import yaml + +from graph_cast.db import ConfigFactory from graph_cast.main import ingest_json_files -from graph_cast.db.arango import get_arangodb_client -import logging +from graph_cast.util import ResourceHandler logger = logging.getLogger(__name__) @@ -68,7 +70,9 @@ "--keyword", default="DSSHPSH", help="prefix for files to be processed" ) - parser.add_argument("--prefix", default="wos", help="prefix for collection names") + parser.add_argument( + "--prefix", default="wos", help="prefix for collection names" + ) parser.add_argument( "--clean-start", diff --git a/run/arango/ingest_pipeline.py b/run/arango/ingest_pipeline.py deleted file mode 100644 index 1de12ff1..00000000 --- a/run/arango/ingest_pipeline.py +++ /dev/null @@ -1,220 +0,0 @@ -from os import listdir -from os.path import isfile, join, expanduser -import argparse -import logging -import zipfile -import yaml -import random -import string - -from pathlib import Path -import shutil -from graph_cast.xml.io import convert -import graph_cast.util.timer as timer -from graph_cast.main import ingest_json_files -from graph_cast.db.arango import get_arangodb_client - -logger = logging.getLogger(__name__) - - -def file_is(fname, patterns): - morphemes = fname.split(".") - l_aux = len(patterns) - flag = all(x == y for x, y in zip(morphemes[-l_aux:], patterns)) - return flag - - -def fetch_proper_filenames(fpath, maxyears, maxunits): - files = sorted([join(fpath, f) for f in listdir(fpath) if isfile(join(fpath, f))]) - - if args.maxyears: - files = files[:maxyears] - - for f in files: - logger.info(f) - - acc = [] - - for f in files: - with zipfile.ZipFile(f, "r") as zip_ref: - fcontent = sorted( - [item for item in zip_ref.namelist() if file_is(item, ["xml", "gz"])] - ) - logger.info(fcontent) - acc += [(f, y) for y in fcontent] - - if args.maxunits: - acc = acc[:maxunits] - logger.info(acc) - return acc - - -def create_tmp(fpath): - Path(fpath).mkdir(parents=True, exist_ok=True) - - -def process_units( - units, - tmp_dir, - config, - db_client, - init_collections, - dry, -): - with timer.Timer() as t_full: - for j, (zfile, unit) in enumerate(units): - with timer.Timer() as t: - convert_unit(zfile, unit, tmp_dir) - logger.info( - f"{unit} from {zfile.split('/')[-1]} conversion to json took {t.elapsed:.2f} sec" - ) - with timer.Timer() as t_ingest: - ingest_json_files( - tmp_dir, - config, - conn_conf=db_client, - keyword="json", - clean_start="all" if j == 0 and init_collections else None, - dry=dry, - ) - - logger.info( - f"{unit} from {zfile.split('/')[0]} ingestion to db took {t_ingest.elapsed:.2f} sec" - ) - clean_tmp(tmp_dir) - - logger.info(f"complete processing of {len(units)} took {t_full.elapsed:.2f} sec") - - -def convert_unit(fpath, unit, tmp_dir): - prefix = unit.split(".") - tmp_fname = prefix[0] + ".json.gz" - with zipfile.ZipFile(fpath) as z: - z.extract(unit, tmp_dir) - convert( - join(tmp_dir, unit), - join(tmp_dir, tmp_fname), - chunksize=10000, - maxchunks=None, - how="simple", - ) - - -def clean_tmp(tmp_dir): - shutil.rmtree(tmp_dir) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - logging.basicConfig( - filename="ingest_pipeline.log", - format="%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - level=logging.INFO, - ) - - parser.add_argument("--path", type=str, help="path to zip file") - - parser.add_argument( - "--maxyears", - type=int, - nargs="?", - const=1, - help="max number of years to process", - ) - - parser.add_argument( - "--maxunits", - type=int, - nargs="?", - const=1, - help="max number of units to process", - ) - - parser.add_argument( - "--config-path", - type=str, - default="../conf/wos.yaml", - help="", - ) - - parser.add_argument( - "-i", - "--id-addr", - default="127.0.0.1", - type=str, - help="port for arangodb connection", - ) - - parser.add_argument( - "--protocol", - default="http", - type=str, - help="protocol for arangodb connection", - ) - - parser.add_argument( - "-p", - "--port", - default=8529, - type=int, - help="port for arangodb connection", - ) - - parser.add_argument( - "-l", - "--cred-name", - default="root", - help="login name for arangodb connection", - ) - - parser.add_argument( - "-w", - "--cred-pass", - default="123", - help="login password for arangodb connection", - ) - - parser.add_argument("--db", default="_system", help="db for arangodb connection") - - parser.add_argument("--dry", action="store_true") - - parser.add_argument("--init-collections", action="store_true") - - args = parser.parse_args() - - logger.info( - f" Initialize collections : {args.init_collections}; dry ingestion: {args.dry} " - ) - logger.info(f" max years : {args.maxyears} | max total units : {args.maxunits}") - - with open(args.config_path, "r") as f: - config_ = yaml.load(f, Loader=yaml.FullLoader) - - working_path = expanduser(args.path) - units = fetch_proper_filenames(working_path, args.maxyears, args.maxunits) - - tmp_path = join( - "/tmp", - "tmp_" + "".join(random.choices(string.ascii_uppercase + string.digits, k=6)), - ) - create_tmp(tmp_path) - - sys_db = get_arangodb_client( - args.protocol, - args.ip_addr, - args.port, - args.database, - args.cred_name, - args.cred_pass, - ) - - process_units( - units, - tmp_path, - config_, - db_client=sys_db, - init_collections=args.init_collections, - dry=args.dry, - ) diff --git a/run/arango/queries.py b/run/arango/queries.py index 84146f95..d6d15db9 100644 --- a/run/arango/queries.py +++ b/run/arango/queries.py @@ -1,6 +1,7 @@ -import pandas as pd import string +import pandas as pd + puncts = list(string.punctuation) all_stops = puncts + stop_words_nltk diff --git a/run/arango/run_query.py b/run/arango/run_query.py index e51a17f6..72c72197 100644 --- a/run/arango/run_query.py +++ b/run/arango/run_query.py @@ -1,7 +1,10 @@ +import argparse + import numpy as np from arango import ArangoClient + from graph_cast.db.arango import profile_query -import argparse + from .queries import qdict @@ -18,7 +21,9 @@ def str2bool(v): parser = argparse.ArgumentParser() -parser.add_argument("-t", "--test", default=False, type=str2bool, help="test setting") +parser.add_argument( + "-t", "--test", default=False, type=str2bool, help="test setting" +) parser.add_argument("--verbose", default=True, type=str2bool, help="verbosity") parser.add_argument( "-n", "--nprofile", default=1, type=int, help="number of times to profile" @@ -53,7 +58,9 @@ def str2bool(v): "-p", "--port", default=8529, type=int, help="port for arangodb connection" ) -parser.add_argument("--db", default="_system", help="db for arangodb connection") +parser.add_argument( + "--db", default="_system", help="db for arangodb connection" +) parser.add_argument( "-l", @@ -110,7 +117,9 @@ def str2bool(v): elif nq == "6": r = [current_query["__pids_head"]] else: - r = sys_db.aql.execute(f'RETURN LENGTH({current_query["main_collection"]})') + r = sys_db.aql.execute( + f'RETURN LENGTH({current_query["main_collection"]})' + ) n = list(r)[0] if current_query["main_collection"] == "publications" and nq != "6": @@ -141,7 +150,9 @@ def str2bool(v): for limit in limits: print(q0) if limit: - q = q0.replace("__insert_limit", f"LIMIT {2*limit} SORT RAND() LIMIT {limit} ") + q = q0.replace( + "__insert_limit", f"LIMIT {2*limit} SORT RAND() LIMIT {limit} " + ) else: q = q0.replace("__insert_limit", f"") if "__issns" in current_query: diff --git a/run/plot_schemas.py b/run/plot_schemas.py index d01d5847..86a4cd39 100644 --- a/run/plot_schemas.py +++ b/run/plot_schemas.py @@ -1,15 +1,15 @@ -import yaml +import argparse import os -import networkx as nx from collections import defaultdict from itertools import product -from os.path import join, dirname, realpath -import argparse +from os.path import dirname, join, realpath -from graph_cast.input.json_aux import parse_edges -from graph_cast.architecture.table import TConfigurator -from graph_cast.architecture.json import JConfigurator +import networkx as nx +import yaml +from graph_cast.architecture.json import JConfigurator +from graph_cast.architecture.table import TConfigurator +from graph_cast.input.json_aux import parse_edges """ @@ -152,7 +152,9 @@ def plot_vc2fields(self): ( f"{k}:{item}", { - "type": "def_field" if item in index_fields else "field", + "type": "def_field" + if item in index_fields + else "field", "label": item, }, ) @@ -160,7 +162,10 @@ def plot_vc2fields(self): ] nodes += nodes_collection nodes += nodes_fields - edges += [(x[0], y[0]) for x, y in product(nodes_collection, nodes_fields)] + edges += [ + (x[0], y[0]) + for x, y in product(nodes_collection, nodes_fields) + ] g.add_nodes_from(nodes) g.add_edges_from(edges) @@ -207,7 +212,9 @@ def plot_vc2fields(self): for k in vconf.collections: level_index = [f"{k}:{item}" for item in vconf.index(k)] - index_subgraph = ag.add_subgraph(level_index, name=f"cluster_{k}:def") + index_subgraph = ag.add_subgraph( + level_index, name=f"cluster_{k}:def" + ) index_subgraph.node_attr["style"] = "filled" index_subgraph.node_attr["label"] = "definition" @@ -250,7 +257,8 @@ def plot_source2vc(self): nodes += nodes_table nodes += nodes_collection edges += [ - (nt[0], nc[0]) for nt, nc in product(nodes_table, nodes_collection) + (nt[0], nc[0]) + for nt, nc in product(nodes_table, nodes_collection) ] g.add_nodes_from(nodes) @@ -375,10 +383,16 @@ def plot_source2vc_detailed(self): cmap = item["map"] else: cmap = dict() - fields_collection_complementary = set(ref_fields) - set(cmap.values()) - cmap.update({qq: qq for qq in list(fields_collection_complementary)}) + fields_collection_complementary = set(ref_fields) - set( + cmap.values() + ) + cmap.update( + {qq: qq for qq in list(fields_collection_complementary)} + ) - index_fields = self.config["vertex_collections"][cname]["index"] + index_fields = self.config["vertex_collections"][cname][ + "index" + ] node_collection = ( f"collection:{cname}", @@ -392,7 +406,9 @@ def plot_source2vc_detailed(self): ( f"collection:field:{kk}", { - "type": "def_field" if kk in index_fields else "field", + "type": "def_field" + if kk in index_fields + else "field", "label": kk, }, ) @@ -402,7 +418,9 @@ def plot_source2vc_detailed(self): (f"table:field:{kk}", f"collection:field:{vv}") for kk, vv in cmap.items() ] - edge_table_fields = [(f"table:{k}", q) for q, _ in nodes_fields_table] + edge_table_fields = [ + (f"table:{k}", q) for q, _ in nodes_fields_table + ] edge_collection_fields = [ (q, node_collection[0]) for q, _ in nodes_fields_collection ] @@ -412,7 +430,9 @@ def plot_source2vc_detailed(self): + nodes_fields_table + nodes_fields_collection ) - edges += edges_fields + edge_table_fields + edge_collection_fields + edges += ( + edges_fields + edge_table_fields + edge_collection_fields + ) g.add_nodes_from(nodes) g.add_edges_from(edges) @@ -431,7 +451,7 @@ def plot_source2vc_detailed(self): for e in g.edges(data=True): s, t, _ = e - target_props = g.nodes[s] + g.nodes[s] upd_dict = { # "style": edge_status[target_props["type"]], "arrowhead": "vee" @@ -442,13 +462,19 @@ def plot_source2vc_detailed(self): ag = nx.nx_agraph.to_agraph(g) for k, props in self.config["vertex_collections"].items(): - level_index = [f"collection:field:{item}" for item in props["index"]] - index_subgraph = ag.add_subgraph(level_index, name=f"cluster_{k[:3]}:def") + level_index = [ + f"collection:field:{item}" for item in props["index"] + ] + index_subgraph = ag.add_subgraph( + level_index, name=f"cluster_{k[:3]}:def" + ) index_subgraph.node_attr["style"] = "filled" index_subgraph.node_attr["label"] = "definition" ag.draw( - os.path.join(self.figgpath, f"{self.prefix}_source2vc_detailed.pdf"), + os.path.join( + self.figgpath, f"{self.prefix}_source2vc_detailed.pdf" + ), "pdf", prog="dot", ) @@ -457,7 +483,9 @@ def plot_source2vc_detailed(self): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("-c", "--config", default=None, help="config file name") + parser.add_argument( + "-c", "--config", default=None, help="config file name" + ) parser.add_argument( "-p", "--prune-low-degree-nodes", diff --git a/run/unzip.py b/run/unzip.py index ca22045a..19aa310d 100644 --- a/run/unzip.py +++ b/run/unzip.py @@ -1,5 +1,5 @@ -import zipfile import argparse +import zipfile if __name__ == "__main__": parser = argparse.ArgumentParser() diff --git a/test/arangos/test_db_access.py b/test/arangos/test_db_access.py index a08c4cfc..5384c31b 100644 --- a/test/arangos/test_db_access.py +++ b/test/arangos/test_db_access.py @@ -1,10 +1,10 @@ -import unittest -from os.path import join, dirname, realpath import logging import sys -from graph_cast.db.arango.util import insert_return_batch +import unittest +from os.path import dirname, realpath -from graph_cast.db import ConnectionManager, ConfigFactory +from graph_cast.db import ConfigFactory, ConnectionManager +from graph_cast.db.arango.util import insert_return_batch logger = logging.getLogger(__name__) @@ -30,7 +30,9 @@ def test_db_access(self): with ConnectionManager(connection_config=conn_conf) as db_client: cnames = [ - c["name"] for c in db_client.get_collections() if c["name"][0] != "_" + c["name"] + for c in db_client.get_collections() + if c["name"][0] != "_" ] for c in cnames: logger.info(c) @@ -42,7 +44,9 @@ def test_insert_return(self): conn_conf = ConfigFactory.create_config(args=db_args) with ConnectionManager(connection_config=conn_conf) as db_client: cnames = [ - c["name"] for c in db_client.get_collections() if c["name"][0] != "_" + c["name"] + for c in db_client.get_collections() + if c["name"][0] != "_" ] docs = [{"value": i} for i in range(5)] diff --git a/test/arangos/test_ingest_csv.py b/test/arangos/test_ingest_csv.py index d956d6ff..4c2fc269 100644 --- a/test/arangos/test_ingest_csv.py +++ b/test/arangos/test_ingest_csv.py @@ -1,12 +1,12 @@ -import unittest -from os.path import join, dirname, realpath -import logging import argparse +import logging +import unittest +from os.path import dirname, join, realpath from pprint import pprint +from graph_cast.db import ConfigFactory, ConnectionManager from graph_cast.main import ingest_csvs from graph_cast.util import ResourceHandler, equals -from graph_cast.db import ConnectionManager, ConfigFactory logger = logging.getLogger(__name__) @@ -35,7 +35,7 @@ def __init__(self, reset): self.reset = reset def _atomic(self, mode): - db = f"{mode}_test" + f"{mode}_test" path = join(self.cpath, f"../data/csv/{mode}") schema_config = ResourceHandler.load(f"conf.table", f"{mode}.yaml") @@ -64,7 +64,9 @@ def _atomic(self, mode): db_client.delete_collections([], [], delete_all=True) if not self.reset: - ref_vc = ResourceHandler.load(f"test.ref.csv", f"{mode}_sizes.yaml") + ref_vc = ResourceHandler.load( + f"test.ref.csv", f"{mode}_sizes.yaml" + ) flag = equals(vc, ref_vc) if not flag: pprint(f"ref keys: {sorted(ref_vc.keys())}") @@ -74,7 +76,9 @@ def _atomic(self, mode): self.assertTrue(flag) else: - ResourceHandler.dump(vc, join(self.cpath, f"../ref/csv/{mode}_sizes.yaml")) + ResourceHandler.dump( + vc, join(self.cpath, f"../ref/csv/{mode}_sizes.yaml") + ) def runTest(self): for mode in self.modes: @@ -83,7 +87,9 @@ def runTest(self): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--reset", action="store_true", help="reset test results") + parser.add_argument( + "--reset", action="store_true", help="reset test results" + ) args = parser.parse_args() suite = unittest.TestSuite() suite.addTest(TestIngestCSV(args.reset)) diff --git a/test/arangos/test_ingest_json.py b/test/arangos/test_ingest_json.py index 60d525f5..097d087f 100644 --- a/test/arangos/test_ingest_json.py +++ b/test/arangos/test_ingest_json.py @@ -1,13 +1,14 @@ -import unittest -from os.path import join, dirname, realpath -import yaml -import logging import argparse -from pprint import pprint +import logging +import unittest +from os.path import dirname, join, realpath + import pandas as pd -from graph_cast.db import ConnectionManager, ConfigFactory -from graph_cast.util import ResourceHandler, equals +import yaml + +from graph_cast.db import ConfigFactory, ConnectionManager from graph_cast.main import ingest_json_files +from graph_cast.util import ResourceHandler, equals logger = logging.getLogger(__name__) @@ -52,7 +53,9 @@ def _atomic(self, mode): vc[c["name"]] = size if not self.reset: - ref_vc = ResourceHandler.load(f"test.ref.json", f"{mode}_sizes.yaml") + ref_vc = ResourceHandler.load( + f"test.ref.json", f"{mode}_sizes.yaml" + ) flag = equals(vc, ref_vc) if not flag: print(vc) @@ -60,7 +63,9 @@ def _atomic(self, mode): self.assertTrue(flag) else: - ResourceHandler.dump(vc, join(self.cpath, f"../ref/json/{mode}_sizes.yaml")) + ResourceHandler.dump( + vc, join(self.cpath, f"../ref/json/{mode}_sizes.yaml") + ) def test_modes(self): for mode in self.modes: @@ -95,7 +100,9 @@ def runTest(self): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--reset", action="store_true", help="reset test results") + parser.add_argument( + "--reset", action="store_true", help="reset test results" + ) args = parser.parse_args() suite = unittest.TestSuite() suite.addTest(TestIngestJSON(args.reset)) diff --git a/test/data/merge_anchor_test.json b/test/data/merge_anchor_test.json index 4108e594..93a9f8cd 100644 --- a/test/data/merge_anchor_test.json +++ b/test/data/merge_anchor_test.json @@ -1,74 +1,74 @@ [ { - "title":"SURVEY OF THE FINISH CHARACTERISTICS OF MACHINED OPTICAL-SURFACES" + "title": "SURVEY OF THE FINISH CHARACTERISTICS OF MACHINED OPTICAL-SURFACES" }, { - "_key":"WOS:A1979HV28900009", - "anchor":"reference", - "first_page":"93", - "title":"MEASUREMENT OF SURFACE TEXTURE AND TOPOGRAPHY BY DIFFERENTIAL LIGHT-SCATTERING", - "volume":"57" + "_key": "WOS:A1979HV28900009", + "anchor": "reference", + "first_page": "93", + "title": "MEASUREMENT OF SURFACE TEXTURE AND TOPOGRAPHY BY DIFFERENTIAL LIGHT-SCATTERING", + "volume": "57" }, { - "_key":"WOS:A1982PY45800018", - "anchor":"reference", - "first_page":"189", - "title":"SPECTRAL-ANALYSIS OF THE FINISH OF POLISHED OPTICAL-SURFACES", - "volume":"83" + "_key": "WOS:A1982PY45800018", + "anchor": "reference", + "first_page": "189", + "title": "SPECTRAL-ANALYSIS OF THE FINISH OF POLISHED OPTICAL-SURFACES", + "volume": "83" }, { - "_key":"WOS:A1985AHD0600004.2", - "anchor":"reference", - "volume":"24" + "_key": "WOS:A1985AHD0600004.2", + "anchor": "reference", + "volume": "24" }, { - "anchor":"main", - "has_abstract":"N", - "issue":"3", - "volume":"24" + "anchor": "main", + "has_abstract": "N", + "issue": "3", + "volume": "24" }, { - "_key":"WOS:A1977DT56200008", - "anchor":"reference", - "doi":"10.1117/12.7972054", - "first_page":"360", - "title":"MEASUREMENT OF FINISH OF DIAMOND-TURNED METAL-SURFACES BY DIFFERENTIAL LIGHT-SCATTERING", - "volume":"16" + "_key": "WOS:A1977DT56200008", + "anchor": "reference", + "doi": "10.1117/12.7972054", + "first_page": "360", + "title": "MEASUREMENT OF FINISH OF DIAMOND-TURNED METAL-SURFACES BY DIFFERENTIAL LIGHT-SCATTERING", + "volume": "16" }, { - "_key":"WOS:A1985AHD0600005", - "anchor":"main" + "_key": "WOS:A1985AHD0600005", + "anchor": "main" }, { - "_key":"WOS:A1984SL25000002", - "anchor":"reference", - "first_page":"101", - "title":"AN OPTICAL PROFILOMETER FOR SURFACE CHARACTERIZATION OF MAGNETIC MEDIA", - "volume":"27" + "_key": "WOS:A1984SL25000002", + "anchor": "reference", + "first_page": "101", + "title": "AN OPTICAL PROFILOMETER FOR SURFACE CHARACTERIZATION OF MAGNETIC MEDIA", + "volume": "27" }, { - "anchor":"main", - "first_page":"396", - "last_page":"403", - "page_count":"8", - "str_pages":"396-403" + "anchor": "main", + "first_page": "396", + "last_page": "403", + "page_count": "8", + "str_pages": "396-403" }, { - "_key":"WOS:A1984TZ60400012.2", - "anchor":"reference" + "_key": "WOS:A1984TZ60400012.2", + "anchor": "reference" }, { - "_key":"WOS:A1983SU56500014", - "anchor":"reference", - "first_page":"105", - "title":"DIRECT COMPARISON OF MECHANICAL AND OPTICAL MEASUREMENTS OF THE FINISH OF PRECISION-MACHINED SURFACES", - "volume":"429" + "_key": "WOS:A1983SU56500014", + "anchor": "reference", + "first_page": "105", + "title": "DIRECT COMPARISON OF MECHANICAL AND OPTICAL MEASUREMENTS OF THE FINISH OF PRECISION-MACHINED SURFACES", + "volume": "429" }, { - "_key":"WOS:A1984AAJ5100004", - "anchor":"reference", - "first_page":"18", - "title":"STATISTICAL EFFECTS IN THE MEASUREMENT AND CHARACTERIZATION OF SMOOTH SCATTERING SURFACES", - "volume":"511" + "_key": "WOS:A1984AAJ5100004", + "anchor": "reference", + "first_page": "18", + "title": "STATISTICAL EFFECTS IN THE MEASUREMENT AND CHARACTERIZATION OF SMOOTH SCATTERING SURFACES", + "volume": "511" } -] \ No newline at end of file +] diff --git a/test/db/test_config_factory.py b/test/db/test_config_factory.py index a609e972..334bf163 100644 --- a/test/db/test_config_factory.py +++ b/test/db/test_config_factory.py @@ -1,4 +1,5 @@ import unittest + from graph_cast.db.factory import ConfigFactory diff --git a/test/neo4js/test_db_access.py b/test/neo4js/test_db_access.py index 4d90d187..53decd23 100644 --- a/test/neo4js/test_db_access.py +++ b/test/neo4js/test_db_access.py @@ -1,10 +1,10 @@ -import unittest -from os.path import join, dirname, realpath import logging import sys -from graph_cast.db.arango.util import insert_return_batch +import unittest +from os.path import dirname, realpath -from graph_cast.db import ConnectionManager, ConfigFactory +from graph_cast.db import ConfigFactory, ConnectionManager +from graph_cast.db.arango.util import insert_return_batch logger = logging.getLogger(__name__) @@ -29,7 +29,7 @@ def test_db_access(self): with ConnectionManager(connection_config=conn_conf) as db_client: query = "MATCH(n) RETURN n LIMIT 3" result = db_client.execute(query, params=None) - r = [record for record in result] + [record for record in result] if __name__ == "__main__": diff --git a/test/ref/freshcaller_sizes.yaml b/test/ref/freshcaller_sizes.yaml index 15da0613..132d62ef 100644 --- a/test/ref/freshcaller_sizes.yaml +++ b/test/ref/freshcaller_sizes.yaml @@ -1,6 +1,5 @@ call: 1 participant: 2 -? !!python/tuple -- participant -- call +? - participant + - call : 2 diff --git a/test/ref/kg_v0_sizes.yaml b/test/ref/kg_v0_sizes.yaml index 568a1743..d14647c8 100644 --- a/test/ref/kg_v0_sizes.yaml +++ b/test/ref/kg_v0_sizes.yaml @@ -1,10 +1,8 @@ publication: 1 concept: 22 -? !!python/tuple -- concept -- concept +? - concept + - concept : 21 -? !!python/tuple -- publication -- concept +? - publication + - concept : 7 diff --git a/test/transform/test_cast_analyst.py b/test/transform/test_cast_analyst.py index 8642fce3..9648fc67 100644 --- a/test/transform/test_cast_analyst.py +++ b/test/transform/test_cast_analyst.py @@ -1,4 +1,5 @@ import unittest + from graph_cast.util.transform import cast_ibes_analyst diff --git a/test/transform/test_filter.py b/test/transform/test_filter.py index 8b777b1b..6220c08e 100644 --- a/test/transform/test_filter.py +++ b/test/transform/test_filter.py @@ -1,8 +1,9 @@ -import unittest +import argparse import logging +import unittest from os.path import dirname, realpath -import argparse -from graph_cast.architecture.schema import Filter, Condition + +from graph_cast.architecture.schema import Condition, Filter logger = logging.getLogger(__name__) @@ -43,7 +44,9 @@ def runTest(self): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--reset", action="store_true", help="reset test results") + parser.add_argument( + "--reset", action="store_true", help="reset test results" + ) args = parser.parse_args() suite = unittest.TestSuite() unittest.TextTestRunner(verbosity=2).run(suite) diff --git a/test/transform/test_localvertexcol.py b/test/transform/test_localvertexcol.py index 5bcb0fd8..020b6697 100644 --- a/test/transform/test_localvertexcol.py +++ b/test/transform/test_localvertexcol.py @@ -1,5 +1,6 @@ -import unittest import logging +import unittest + from graph_cast.architecture.general import LocalVertexCollections logger = logging.getLogger(__name__) diff --git a/test/transform/test_mapper.py b/test/transform/test_mapper.py index 3627eaf9..2f7e35b4 100644 --- a/test/transform/test_mapper.py +++ b/test/transform/test_mapper.py @@ -1,5 +1,6 @@ -import unittest import logging +import unittest + from graph_cast.architecture.general import Mapper logger = logging.getLogger(__name__) diff --git a/test/transform/test_transform.py b/test/transform/test_transform.py index 50b0fa20..9d27b2c2 100644 --- a/test/transform/test_transform.py +++ b/test/transform/test_transform.py @@ -1,12 +1,14 @@ -import unittest -from os.path import join, dirname, realpath import logging -from graph_cast.architecture.transform import Transform +import unittest +from os.path import dirname, realpath + import yaml + from graph_cast.architecture.table import TConfigurator -from graph_cast.util.io import Chunker +from graph_cast.architecture.transform import Transform from graph_cast.input import table_to_collections from graph_cast.util import ResourceHandler, equals +from graph_cast.util.io import Chunker logger = logging.getLogger(__name__) @@ -52,7 +54,7 @@ def test_transform_problems(self): header = chk.pop_header() header_dict = dict(zip(header, range(len(header)))) - while not chk.done: + while not chk._done: lines = chk.pop() if lines: vdocuments, edocuments = table_to_collections( diff --git a/test/transform/test_transform_jsonlike.py b/test/transform/test_transform_jsonlike.py index c68447ba..81492dc1 100644 --- a/test/transform/test_transform_jsonlike.py +++ b/test/transform/test_transform_jsonlike.py @@ -1,10 +1,11 @@ -import unittest -import logging import argparse -from os.path import join, dirname, realpath +import logging +import unittest +from os.path import dirname, join, realpath + +from graph_cast.architecture import JConfigurator from graph_cast.input import jsondoc_to_collections from graph_cast.util import ResourceHandler, equals -from graph_cast.architecture import JConfigurator logger = logging.getLogger(__name__) @@ -18,7 +19,9 @@ def __init__(self, reset): self.reset = reset def _atomic(self, mode): - jsonlike = ResourceHandler.load(f"test.data.json.{mode}", f"{mode}.json.gz") + jsonlike = ResourceHandler.load( + f"test.data.json.{mode}", f"{mode}.json.gz" + ) config = ResourceHandler.load(f"conf.json", f"{mode}.yaml") conf_obj = JConfigurator(config) @@ -34,7 +37,9 @@ def _atomic(self, mode): print(ref_vc) self.assertTrue(flag) else: - ResourceHandler.dump(vc, join(self.cpath, f"../ref/{mode}_sizes.yaml")) + ResourceHandler.dump( + vc, join(self.cpath, f"../ref/{mode}_sizes.yaml") + ) def runTest(self): for mode in self.modes: @@ -43,7 +48,9 @@ def runTest(self): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--reset", action="store_true", help="reset test results") + parser.add_argument( + "--reset", action="store_true", help="reset test results" + ) args = parser.parse_args() suite = unittest.TestSuite() suite.addTest(TestTransformJsonlike(args.reset)) diff --git a/test/transform/test_transform_table.py b/test/transform/test_transform_table.py index 66a3131e..b0704631 100644 --- a/test/transform/test_transform_table.py +++ b/test/transform/test_transform_table.py @@ -1,12 +1,12 @@ -import unittest -import logging import argparse +import logging +import unittest + +from graph_cast.architecture import TConfigurator from graph_cast.input import table_to_collections from graph_cast.util import ResourceHandler, equals -from graph_cast.architecture import TConfigurator from graph_cast.util.transform import pick_unique_dict - logger = logging.getLogger(__name__) @@ -52,7 +52,9 @@ def runTest(self): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--reset", action="store_true", help="reset test results") + parser.add_argument( + "--reset", action="store_true", help="reset test results" + ) args = parser.parse_args() suite = unittest.TestSuite() suite.addTest(TestTransformTable(args.reset))