# Use to explore and find issues with registry metadata

## Setup

In [1]:
## CX: allows multiple lines of code to print from one code block
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## show time that this notebook was executed 
from datetime import datetime

## packages to work with objects 
import re
import requests
import pandas as pd
from textwrap import wrap  ## for plot labels

## to get around bugs
import nest_asyncio
nest_asyncio.apply()

In [2]:
## record when cell blocks are executed
print('The time that this notebook was executed is...')
print('Local time (PST, West Coast USA): ')
print(datetime.now())
print('UTC time: ')
print(datetime.utcnow())

The time that this notebook was executed is...
Local time (PST, West Coast USA): 
2021-04-19 12:22:14.717728
UTC time: 
2021-04-19 19:22:14.717849


In [46]:
## old request, doesn't work

## setup request
headers = {"content-type": "application/json"}
registry_url = "https://smart-api.info/api/query/?q=%22%22&fields=info%2Ctags%2C_meta.uptime_status&raw=1&size=1000"

## make request: check for success (200)
registry_request = requests.get(registry_url, headers=headers)
registry_request.status_code

400

In [None]:
registry = registry_request.json()
registry

### Getting and parsing registry info

In [47]:
## setup request
# headers = {"content-type": "application/json"}
registry_url = "https://smart-api.info/api/query/?q=__all__&fields=info,tags,_status&raw=1&size=1000"

## make request: check for success (200)
registry_request = requests.get(registry_url, headers={})
registry_request.status_code

200

In [48]:
registry = registry_request.json()

In [49]:
registry['hits']['hits'][0]

{'_index': 'smartapi_docs',
 '_type': '_doc',
 '_id': 'be87344696148a41f577aca202ce84df',
 '_score': 1.0,
 '_source': {'_status': {'refresh_status': 200,
   'refresh_ts': '2021-03-31T07:00:28+00:00',
   'uptime_status': 'unknown',
   'uptime_ts': '2021-03-31T07:05:16.381984'},
  'info': {'termsOfService': 'https://creativecommons.org/licenses/by/4.0/',
   'title': 'dcm4chee-arc',
   'version': '5.23.0'},
  'tags': [{'name': 'QIDO-RS',
    'description': 'Query based on ID for DICOM Objects by RESTful Services',
    'externalDocs': {'description': 'DICOM PS3.18',
     'url': 'http://dicom.nema.org/medical/dicom/current/output/html/part18.html'}},
   {'name': 'WADO-RS',
    'description': 'Web Access to DICOM Objects by RESTful Services',
    'externalDocs': {'description': 'DICOM PS3.18',
     'url': 'http://dicom.nema.org/medical/dicom/current/output/html/part18.html'}},
   {'name': 'WADO-URI',
    'description': 'Web Access to DICOM Objects by URI',
    'externalDocs': {'description':

In [50]:
## get response
registry = registry_request.json()

## parse response to get the information: uptime status, tags, x-translator, api name
registry_data = []
for ele in registry['hits']['hits']:
    tempDict = {'uptimeStatus': ele['_source']['_status'].get('uptime_status'),
                'refreshStatus': ele['_source']['_status']['refresh_status'],
                'name': ele['_source']['info']['title']}
    
    if ele['_source'].get('tags'):
        temp = [i.get('name') for i in ele['_source']['tags']]
        tempDict.update({"tags": temp})
    else:
        tempDict.update({"tags": []})
    
    if ele['_source']['info'].get('x-translator'):
        tempDict.update({            
            'team': ele['_source']['info']['x-translator'].get('team'),
            'component': ele['_source']['info']['x-translator'].get('component')
        })
    else:
        tempDict.update({            
            'team': [],
            'component': None
        })
        
    if ele['_source']['info'].get('x-trapi'):
        tempDict.update({            
            'trapi_version': ele['_source']['info']['x-trapi'].get('version')
        })
    else:
        tempDict.update({            
            'trapi_version': None
        })
        
    registry_data.append(tempDict)

In [51]:
registry_df = pd.DataFrame.from_dict(registry_data)
registry_df = registry_df[['name', 'team', 'component', 'trapi_version', 'uptimeStatus', 'refreshStatus', 'tags']]

registry_df.shape

(203, 7)

### Issues

In [52]:
## currently not a problem since changed schema to allow team to be a string 
##   (as long as it matches one of the enums)
teamNotList = [True if not isinstance(x, list) else False for x in registry_df['team']]
registry_df[teamNotList]

Unnamed: 0,name,team,component,trapi_version,uptimeStatus,refreshStatus,tags
75,ARAX Translator Reasoner,Expander Agent,ARA,,good,499,"[predicates, query, translator, reasoner, resp..."
76,RTX KG2,Expander Agent,ARA,1.0.0,good,499,"[predicates, query, translator, trapi, entity]"


In [53]:
## allow these entries' team value (currently not in enum)
registry_df[registry_df['name'].str.contains('Ontology-KP')]

Unnamed: 0,name,team,component,trapi_version,uptimeStatus,refreshStatus,tags
10,Ontology-KP API,[Standards Reference Implementation Team],KP,,unknown,200,[translator]


In [54]:
## who has x-trapi?
registry_df[~ registry_df['trapi_version'].isna()]

Unnamed: 0,name,team,component,trapi_version,uptimeStatus,refreshStatus,tags
4,imProving Agent,[imProving Agent],ARA,1.0.0,unknown,499,"[predicates, query, translator, reasoner, SPOKE]"
12,Columbia Open Health Data (COHD),[Clinical Data Provider],KP,1.0.0,good,200,"[Metadata, OMOP, Clinical Frequencies, Concept..."
17,Columbia Open Health Data (COHD) for COVID-19 ...,[Clinical Data Provider],KP,1.0.0,good,200,"[Metadata, OMOP, Clinical Frequencies, Concept..."
51,Molecular Data Provider for NCATS Biomedical T...,[Molecular Data Provider],KP,1.0.0,unknown,200,"[predicates, query, translator, trapi]"
76,RTX KG2,Expander Agent,ARA,1.0.0,good,499,"[predicates, query, translator, trapi, entity]"
81,BioThings Explorer ReasonerStdAPI,[Exploring Agent],ARA,1.0.0,good,200,"[1.0.0, 0.9.2, translator, trapi, biothings, m..."
144,OpenPredict API,[Clinical Data Provider],KP,1.0.0,good,200,"[translator, trapi]"
177,Translator Knowledge Collaboratory API,[Clinical Data Provider],KP,1.0.0,unknown,200,"[trapi, translator]"


In [11]:
registry_df[registry_df['name'].str.contains('COHD')].tags.to_list()

[['Metadata',
  'OMOP',
  'Clinical Frequencies',
  'Concept Associations',
  'Temporal Clinical Data',
  'translator',
  'trapi'],
 ['Metadata',
  'OMOP',
  'Clinical Frequencies',
  'Concept Associations',
  'Temporal Clinical Data',
  'translator',
  'trapi']]

### Finishing that parsing

Having team as an array is good when there is more than one team involved in making the API (this happens often with Service Provider). 

Having team (and tags) as an array is bad when you want to group-by/summarize....so for now I turned them to strings

In [55]:
## changing the column to strings to make it easier to sort values
registry_df['team'] = [",".join(x) if isinstance(x, list) else x for x in registry_df['team']]
registry_df['tags'] = [",".join(x) if isinstance(x, list) else x for x in registry_df['tags']]

## replacing empty strings with None
registry_df['team'] = [None if not x else x for x in registry_df['team']]
registry_df['tags'] = [None if not x else x for x in registry_df['tags']]

### Getting and parsing metaKG info

The MetaKG is currently based on annotations from x-bte. The operations are all from APIs tagged translator 

In [3]:
headers = {"content-type": "application/json"}

metaKG_url = "https://smart-api.info/api/metakg"

metaKG_request = requests.get(metaKG_url, headers=headers)
metaKG_request.status_code
## if the output is successful, code 200

200

In [4]:
metaKG = metaKG_request.json()

In [13]:
multiwellness = []
clinicalrisk = []

for edge in metaKG['associations']:
    if edge['api']['name'] == 'Clinical Risk KP API':
        clinicalrisk.append(edge)
    elif edge['api']['name'] == 'Multiomics Wellness KP API':
        multiwellness.append(edge)

In [15]:
clinicalrisk[0]

{'subject': 'Disease',
 'object': 'Disease',
 'predicate': 'associated_with_risk_for',
 'provided_by': 'Multiomics Provider',
 'api': {'name': 'Clinical Risk KP API',
  'smartapi': {'metadata': 'https://storage.googleapis.com/multiomics_provider_kp_data/clinical_risk_kp.yaml',
   'id': '1bef5ecbb0b9aee90023ce9faa2c8974',
   'ui': 'https://smart-api.info/ui/1bef5ecbb0b9aee90023ce9faa2c8974'},
  'x-translator': {'component': 'KP',
   'team': ['Multiomics Provider', 'Service Provider']}}}

In [5]:
metaKG_operations = []
for edge in metaKG['associations']:
    tempdict = {
        "subject": edge['subject'],
        "object": edge['object'],
        "predicate": edge['predicate'],
        "provided_by": edge.get('provided_by'),
        "api_name": edge['api']['name']
    }
    if edge['api'].get('x-translator'):
        tempteam = edge['api']['x-translator'].get('team')
        if isinstance(tempteam, list):
            tempteam = ",".join(tempteam)
            if not tempteam:
                tempteam = None  
        tempdict.update({
            "api_team": tempteam,
            "api_component": edge['api']['x-translator'].get('component'),
        })
    else:
        tempdict.update({
            "api_team": None,
            "api_component": None,
        })
    metaKG_operations.append(tempdict)

In [7]:
operations_summary = pd.DataFrame.from_dict(metaKG_operations)
operations_summary.shape
## describes this many operations

(21896, 7)

In [8]:
operations_summary[operations_summary['api_name'] == 'Multiomics Wellness KP API']

Unnamed: 0,subject,object,predicate,provided_by,api_name,api_team,api_component
837,ChemicalSubstance,ChemicalSubstance,correlated_with,Multiomics Provider,Multiomics Wellness KP API,"Multiomics Provider,Service Provider",KP
838,ChemicalSubstance,ChemicalSubstance,correlated_with,Multiomics Provider,Multiomics Wellness KP API,"Multiomics Provider,Service Provider",KP
839,ChemicalSubstance,ChemicalSubstance,correlated_with,Multiomics Provider,Multiomics Wellness KP API,"Multiomics Provider,Service Provider",KP
840,ChemicalSubstance,ChemicalSubstance,correlated_with,Multiomics Provider,Multiomics Wellness KP API,"Multiomics Provider,Service Provider",KP
841,ChemicalSubstance,ChemicalSubstance,correlated_with,Multiomics Provider,Multiomics Wellness KP API,"Multiomics Provider,Service Provider",KP
...,...,...,...,...,...,...,...
960,Protein,Protein,correlated_with,Multiomics Provider,Multiomics Wellness KP API,"Multiomics Provider,Service Provider",KP
961,Protein,Protein,correlated_with,Multiomics Provider,Multiomics Wellness KP API,"Multiomics Provider,Service Provider",KP
962,Protein,Protein,correlated_with,Multiomics Provider,Multiomics Wellness KP API,"Multiomics Provider,Service Provider",KP
963,Protein,Protein,correlated_with,Multiomics Provider,Multiomics Wellness KP API,"Multiomics Provider,Service Provider",KP


### DON'T WRITE OVER THIS

Previously there were more APIs specified (SEMMED, Automat) that are now missing...

In [60]:
operations_summary.api_name.nunique()

32

In [61]:
operations_summary.api_name.value_counts().reset_index().sort_values(by = ["api_name", "index"], ascending=[True,True])

Unnamed: 0,index,api_name
29,LINCS Data Portal API,1
27,LitVar API,1
30,Ontology Lookup Service API,1
28,OpenTarget API,1
31,QuickGO API,1
23,BioThings DGIdb API,2
20,DISEASES API,2
21,Drug Response KP API,2
22,EBIgene2phenotype API,2
24,Human Phenotype Ontology API,2


## Registry API-level summary

### Comparing translator tag and x-translator use

In [None]:
registry_df['hasTranslatorTag'] =  [False if x is None else x for x in registry_df.tags.str.contains('translator')]
registry_df['hasXTranslator'] =  [False if x is None else True for x in registry_df.team]
registry_df[['hasTranslatorTag', 'hasXTranslator']].value_counts().reset_index(name = "Num of APIs")

So there are 97 APIs with the "translator" tag, and 87 of them currently have the x-translator extension. There is one API with the x-translator extension and no "translator" tag. 

In [None]:
## This API doesn't have the translator tag but has x-translator
registry_df[(registry_df['hasXTranslator'] == True) & 
            (registry_df['hasTranslatorTag'] == False)]

### Translator-tagged / x-translator APIs

In [None]:
## grab only the APIs with the translator tag 
translator_apis = registry_df[(registry_df['hasTranslatorTag']) | 
            (registry_df['hasXTranslator'])].copy()
translator_apis.drop(columns = ['hasTranslatorTag'], inplace = True)
translator_apis = translator_apis[['team', 'component', 'name', 'uptimeStatus', 'refreshStatus', 'tags']]
translator_apis.sort_values(by=['team', 'component', 'name'], 
                            ignore_index = True, na_position = 'last', inplace = True)

Translator-tagged APIs with x-translator information. Another review may need to be done to address duplicate API registrations / different API registrations for BTE. 

In [None]:
summarytable1 = translator_apis[~ translator_apis['team'].isna()].copy()
summarytable1

The APIs WITHOUT x-translator information

In [None]:
## APIs without x-translator information
summarytable2 = translator_apis[translator_apis['team'].isna()].copy()
summarytable2

#### APIs by team

In [None]:
team_summary = translator_apis.team.value_counts().reset_index()
team_summary.columns = ['name', 'Num of APIs']
team_summary['plot_names'] = [ '\n'.join(wrap(i, 30)) for i in team_summary['name']]

team_summary.plot.barh(x='plot_names', y='Num of APIs', fontsize = 10, figsize=(11,8), rot=0, 
                       legend=False, title = "Num of APIs by team", xlabel="")
## table view
team_summary[['name', 'Num of APIs']]

#### APIs by component

In [None]:
component_summary = translator_apis.component.value_counts().reset_index()
component_summary
component_summary.plot.bar(x='index', y='component', rot=0,
                       legend=False, title = "Num of APIs by team", xlabel="")

#### APIs by uptime status

In [None]:
status_summary = translator_apis.uptimeStatus.value_counts().reset_index()
status_summary
status_summary.plot.bar(x='index', y='uptimeStatus', rot=0,
                       legend=False, title = "Num of APIs by team", xlabel="")

## using x-translator and metaKG together

The MetaKG is currently based on annotations from x-bte. The operations are all from APIs tagged translator 

### predicates by team

In [None]:
operations1 = operations_summary[['predicate', 'api_name', 'api_team']].drop_duplicates()
operations1.api_team.value_counts().reset_index(name = 'Num of unique predicates')