# Explore registry metadata and find issues with it

## Setup

In [1]:
## CX: display the value of every top-level expression in a cell, not only the last one
## (several cells below rely on this to show two results from one code block)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## show time that this notebook was executed 
from datetime import datetime, timezone

## packages to work with objects 
import re
import requests
import pandas as pd
from textwrap import wrap  ## for plot labels

## to get around bugs
## NOTE(review): nest_asyncio.apply() patches asyncio so event loops can be nested;
## the specific bug this works around is not recorded here -- confirm it is still needed
import nest_asyncio
nest_asyncio.apply()

In [2]:
## record when this cell was executed
## the clock is read once, as a timezone-aware UTC value, so both printed times
## describe the same instant (datetime.utcnow() is deprecated since Python 3.12)
executed_at_utc = datetime.now(timezone.utc)
print('The time that this notebook was executed is...')
## the label below is hard-coded; the value is in whatever zone the kernel's machine uses
print('Local time (PST, West Coast USA): ')
## astimezone() converts to the machine's local zone; tzinfo is dropped only to keep
## the printed format unchanged (no "+HH:MM" suffix)
print(executed_at_utc.astimezone().replace(tzinfo=None))
print('UTC time: ')
print(executed_at_utc.replace(tzinfo=None))

The time that this notebook was executed is...
Local time (PST, West Coast USA): 
2021-06-07 22:54:02.082662
UTC time: 
2021-06-08 05:54:02.082866


### Getting and parsing metaKG info

In [3]:
## download the meta knowledge graph (metaKG) from smart-api.info
headers = {"content-type": "application/json"}

metaKG_url = "https://smart-api.info/api/metakg"

## the timeout (in seconds) keeps "Run All" from hanging forever if the server never answers
metaKG_request = requests.get(metaKG_url, headers=headers, timeout=60)
## stop here with the HTTP error on a 4xx/5xx response, instead of failing later
## when the response body is parsed as JSON
metaKG_request.raise_for_status()
metaKG_request.status_code
## if the output is successful, code 200

200

In [4]:
## parse the JSON body of the response; the cells below read its 'associations' list
metaKG = metaKG_request.json()

In [5]:
## flatten every metaKG association into one flat record (dict) per operation
metaKG_operations = []
for edge in metaKG['associations']:
    api_info = edge['api']
    translator_info = api_info.get('x-translator')
    if translator_info:
        team = translator_info.get('team')
        if isinstance(team, list):
            ## several teams become one comma-separated string;
            ## a list that joins to an empty string is recorded as None
            team = ",".join(team) or None
        component = translator_info.get('component')
    else:
        ## this API carries no x-translator metadata
        team = None
        component = None
    metaKG_operations.append({
        "subject": edge['subject'],
        "object": edge['object'],
        "predicate": edge['predicate'],
        "provided_by": edge.get('provided_by'),
        "api_name": api_info['name'],
        "api_team": team,
        "api_component": component,
    })

In [6]:
## one row per operation record, one column per record field
operations_summary = pd.DataFrame(metaKG_operations)
## (rows, columns): the row count is the number of operations described
operations_summary.shape

(10063, 7)

## Registry API-level summary

In [7]:
## number of operations per API, smallest count first (ties ordered by API name)
## the output columns are named explicitly because value_counts().reset_index()
## labels them differently across pandas versions: "index"/"api_name" on 1.x,
## "api_name"/"count" on 2.x -- the latter made the sort fail with KeyError: 'index'
(
    operations_summary['api_name']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='api_name')
    .sort_values(by=['api_name', 'index'], ascending=[True, True])
)

Unnamed: 0,index,api_name
34,LINCS Data Portal API,1
36,LitVar API,1
35,Ontology Lookup Service API,1
37,OpenTarget API,1
38,QuickGO API,1
30,BioThings DGIdb API,2
26,DISEASES API,2
33,Drug Response KP API,2
28,EBI Proteins API,2
32,EBIgene2phenotype API,2


In [8]:
## number of distinct API names in the operations table
operations_summary['api_name'].nunique()

39

In [9]:
## row count before and after dropping exact duplicate rows;
## equal counts mean every RTX KG2 subject-predicate-object group is unique
rtx_kg2_rows = operations_summary[operations_summary['api_name'] == 'RTX KG2']
rtx_kg2_rows.shape
rtx_kg2_rows.drop_duplicates().shape

(8315, 7)

(8315, 7)

In [10]:
## row count before and after dropping exact duplicate rows;
## far fewer rows after de-duplication means something seems off with the
## Clinical Risk KP API ingest...
clinical_risk_rows = operations_summary[operations_summary['api_name'] == 'Clinical Risk KP API']
clinical_risk_rows.shape
clinical_risk_rows.drop_duplicates().shape

(648, 7)

(64, 7)

In [24]:
## collect the Clinical Risk KP API associations again, this time keeping the
## SmartAPI record fields (metadata source and record id) next to each operation
clinical_risk = []
for edge in metaKG['associations']:
    api_info = edge['api']
    if api_info['name'] != 'Clinical Risk KP API':
        continue
    smartapi_info = api_info['smartapi']
    clinical_risk.append({
        "subject": edge['subject'],
        "object": edge['object'],
        "predicate": edge['predicate'],
        "provided_by": edge.get('provided_by'),
        "record_source": smartapi_info['metadata'],
        "record_id": smartapi_info['id'],
    })

## one row per collected association
clinical_risk_df = pd.DataFrame(clinical_risk)

In [27]:
## yeah, so all the metadata we have is the same: adding record_source and
## record_id does not tell the duplicated rows apart
clinical_risk_df.drop_duplicates().shape

(64, 6)

In [33]:
## distinct predicate values among the Clinical Risk KP API records
clinical_risk_df['predicate'].unique()

array(['related_to', 'associated_with_risk_for',
       'negatively_correlated_with',
       'negatively_associated_with_risk_for'], dtype=object)

In [39]:
## count the records that go from a Disease subject to a PhenotypicFeature object
is_disease_subject = clinical_risk_df['subject'] == 'Disease'
is_phenotype_object = clinical_risk_df['object'] == 'PhenotypicFeature'
clinical_risk_df[is_disease_subject & is_phenotype_object].shape

(72, 6)

In [31]:
## record_source of the first ChemicalSubstance / related_to / ChemicalSubstance record
chem_related_to_chem = (
    (clinical_risk_df['subject'] == 'ChemicalSubstance') &
    (clinical_risk_df['object'] == 'ChemicalSubstance') &
    (clinical_risk_df['predicate'] == 'related_to')
)
## .iloc[0] takes the first matching row by position; a plain [0] looks up the row
## *labelled* 0 and raises KeyError unless the very first record passes the filter
clinical_risk_df.loc[chem_related_to_chem, 'record_source'].iloc[0]

'https://raw.githubusercontent.com/Hadlock-Lab/clinical_risk_kp/master/ehr_risk_kp.yaml'