# eTransafe Concordance analysis

This is the use case described by Thomas Steger-Hartmann in a publication with Matthew Clark.
The idea is to compare animal observations with clinical observations for the various drugs:
1. determine the drugs that have been used in both the preclinical and the clinical domain
2. compare the individual SOCs (system organ classes) for preclinical and clinical findings
3. compute the concordance matrix
4. visualize the matrix

(C) 2021 Erasmus University Medical Center, Rotterdam, The Netherlands
Author: Erik M. van Mulligen, e.vanmulligen@erasmusmc.nl

In [55]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from knowledgehub.api import KnowledgeHubAPI
from Concordance.condordance_utils import getAllFindings, getDrugsMapping, map_soc, create_soc
from Concordance.mapper import Mapper

import ipywidgets as w
from IPython.display import display, Markdown, clear_output, Javascript
from ipypublish import nb_setup
import numpy as np
import seaborn as sns
import pandas
import json
import matplotlib.pyplot as plt
from pprint import pprint
import mysql.connector

In [56]:
# Connection to the KnowledgeHub DEV server.
# The client secret can be supplied through the KH_CLIENT_SECRET environment
# variable so that it does not have to live in the notebook; the literal below
# is kept only as a backward-compatible fallback.
# NOTE(review): a secret that has been committed should be rotated — confirm.
api = KnowledgeHubAPI(
    server='DEV',
    client_secret=os.environ.get('KH_CLIENT_SECRET', '3db5a6d7-4694-48a4-8a2e-e9c30d78f9ab'),
)
# Mapper built on top of the API; mapper.getKey() is used below to key findings.
mapper = Mapper(api)

## Authenticate for KnowledgeHub

In [57]:
# Input widgets of the KnowledgeHub login form.
username = w.Text(
    value='tester',
    placeholder='Knowledge Hub account',
    description='username:',
    disabled=False,
)
password = w.Password(
    value='',
    placeholder='Knowledge Hub password',
    description='password:',
    disabled=False,
)
loginBtn = w.Button(description='Login')
# Output widget; it is created here but not used by the handler below.
status = w.Output()

def on_button_clicked(_):
    """Handle a click on the KnowledgeHub login button.

    Logs in with the values of the ``username`` and ``password`` widgets.
    On failure a message is printed; on success a message is printed and a
    Javascript snippet is displayed that asks the notebook to execute the
    cell following the selected one.

    The single argument (the clicked button) is ignored.
    """
    login_result = api.login(username.value, password.value)
    # Explicit comparison on purpose: only a result equal to False counts as a
    # failure, any other return value is treated as success.
    if login_result == False:
        print("Failed to login")
        return

    print("successfully logged in")
    run_next_cell = 'IPython.notebook.execute_cell_range(IPython.notebook.get_selected_index()+1, IPython.notebook.get_selected_index()+2)'
    display(Javascript(run_next_cell))

# Register the click handler on the login button.
loginBtn.on_click(on_button_clicked)
# Last expression of the cell: the notebook renders the stacked login form.
w.VBox([username, password, loginBtn])

VBox(children=(Text(value='tester', description='username:', placeholder='Knowledge Hub account'), Password(de…

successfully logged in


<IPython.core.display.Javascript object>

## Authenticate for the data stored in the database

In [58]:
# `db` holds the database connection; it is assigned by the login handler.
global db

# Input widgets of the database login form.
dbhost = w.Text(
    value='localhost',
    placeholder='database host',
    description='host:',
    disabled=False,
)
dbdatabase = w.Text(
    value='concordance',
    placeholder='database name',
    description='database:',
    disabled=False,
)
dbusername = w.Text(
    value='root',
    placeholder='database username',
    description='username:',
    disabled=False,
)
dbpassword = w.Password(
    value='',
    placeholder='database password',
    description='password:',
    disabled=False,
)
dbLoginBtn = w.Button(description='Login')
# Output widget; it is created here but not used by the handler below.
status = w.Output()

def dbLoginBtn_click(_):
    """Handle a click on the database login button.

    Opens a MySQL connection using the values of the host, database, username
    and password widgets and stores it in the module-level ``db``.
    On success a message is printed and a Javascript snippet is displayed that
    asks the notebook to execute the cell following the selected one.
    On failure the reason is printed and ``sys.exit(0)`` is called.

    The single argument (the clicked button) is ignored.
    """
    global db
    # Only the connection attempt is guarded, so that a problem while
    # displaying the follow-up Javascript is not reported as a login failure.
    try:
        db = mysql.connector.connect(host=dbhost.value, database=dbdatabase.value, user=dbusername.value, password=dbpassword.value)
    except Exception as e:
        # Report the cause instead of discarding it.
        print(f"failed to log in database: {e}")
        # NOTE(review): sys.exit raises SystemExit inside the widget callback;
        # kept for backward compatibility — confirm this is the intended way
        # to abort.
        sys.exit(0)

    print("successfully logged in database")
    display(Javascript('IPython.notebook.execute_cell_range(IPython.notebook.get_selected_index()+1, IPython.notebook.get_selected_index()+2)'))
    
# Register the click handler on the database login button.
dbLoginBtn.on_click(dbLoginBtn_click)
# Last expression of the cell: the notebook renders the stacked login form.
w.VBox([dbhost, dbdatabase, dbusername, dbpassword, dbLoginBtn])

VBox(children=(Text(value='localhost', description='host:', placeholder='database host'), Text(value='concorda…

successfully logged in database


<IPython.core.display.Javascript object>

## The database
In order to compute the concordance tables we have constructed a database with all preclinical and clinical findings for drugs (identified by their InChIKeys) that occur in both the preclinical and the clinical data. For the preclinical data we restricted ourselves to findings that are treatment-related and not in the control group. For each preclinical finding we checked with the semantic service whether the equivalent clinical finding was present among the clinical findings, and vice versa. The result is stored in the database as 'mapped' per finding. For each preclinical and clinical finding we also derived the system organ class it belongs to. This is stored in the database as 'SOC' per finding.

In [59]:
def _load_findings(table, where_clause):
    """Query `table` with `where_clause` and index the rows by mapper.getKey(row).

    Each value keeps the first three fields of the row under the names
    'findingCode', 'specimenOrganCode' and 'SOC'.
    """
    return {
        mapper.getKey(row): {'findingCode': row[0], 'specimenOrganCode': row[1], 'SOC': row[2]}
        for row in getAllFindings(db, table, where_clause)
    }


# Look-up tables of findings, queried in the same order as before:
# all findings per domain first, then the subsets selected on the 'mapped' flag.
all_preclinical_findings = _load_findings("preclinical_findings", "WHERE findingCode IS NOT NULL")
all_clinical_findings = _load_findings("clinical_findings", "WHERE findingCode IS NOT NULL")
all_preclinical_mapped_findings = _load_findings("preclinical_findings", "WHERE findingCode IS NOT NULL AND mapped IS true")
all_preclinical_non_mapped_findings = _load_findings("preclinical_findings", "WHERE findingCode IS NOT NULL AND mapped IS false")
all_clinical_mapped_findings = _load_findings("clinical_findings", "WHERE findingCode IS NOT NULL AND mapped IS true")

# Report the size of every look-up table.
print(f'{len(all_preclinical_findings)} all_preclinical_findings')
print(f'{len(all_preclinical_mapped_findings)} all_preclinical_mapped_findings')
print(f'{len(all_preclinical_non_mapped_findings)} all_preclinical_non_mapped_findings')
print(f'{len(all_clinical_findings)} all_clinical_findings')
print(f'{len(all_clinical_mapped_findings)} all_clinical_mapped_findings')

6048 all_preclinical_findings
4195 all_preclinical_mapped_findings
1689 all_preclinical_non_mapped_findings
8730 all_clinical_findings
2339 all_clinical_mapped_findings


## Drug mapping
We maintain a list of drugs, identified by their InChIKeys, that can be found in both the preclinical and the clinical data. Per drug we have stored, for each database, the ids of the findings associated with that drug.

In [60]:
# Load the drug mapping from its JSON file.
# The encoding is given explicitly so that reading does not depend on the
# platform's default locale encoding.
with open('../data/drug_mappings_inchikey.json', 'r', encoding='utf-8') as drug_file:
    drugs = json.load(drug_file)
print(f'{len(drugs)} drugs found')

207 drugs found


## Overview of the drugs

In [61]:
pd = nb_setup.setup_pandas(escape_latex=False)
# Overview frame with one row per entry of `drugs`, built directly from the
# mapping rather than from a random placeholder matrix whose columns are all
# overwritten afterwards.
# The inchiKey column of this frame drives the per-drug loop further down.
df = pd.DataFrame({
    'inchiKey': [drug['inchiKey'] for drug in drugs.values()],
    'clinicalName': [drug['clinicalName'] for drug in drugs.values()],
    'preclinicalName': [drug['preclinicalName'] for drug in drugs.values()],
})
# Last expression of the cell: the notebook renders the frame.
df.round(3)

Unnamed: 0,inchiKey,clinicalName,preclinicalName
0,MWTBKTRZPHJQLH-UHFFFAOYSA-N,alcaftadine,Alcaftadine
1,KKGQTZUTZRNORY-UHFFFAOYSA-N,fingolimod,Fingolimod
2,JLKIGFTWXXRPMT-UHFFFAOYSA-N,sulfamethoxazole,Sulfamethoxazole
3,XWTYSIMOBUGWOL-UHFFFAOYSA-N,terbutaline,
4,MUMGGOZAMZWBJJ-DYKIIFRCSA-N,testosterone,
...,...,...,...
202,QPGGEKPRGVJKQB-UHFFFAOYSA-N,dibenzepin,Dibenzepin
203,PVLJETXTTWAYEW-UHFFFAOYSA-N,mizolastine,Mizolastine
204,RKLNONIVDFXQRX-UHFFFAOYSA-N,bromperidol,Bromperidol
205,ZIXNZOBDFKSQTC-UHFFFAOYSA-N,cloxazolam,Cloxazolam


## Concordance table
Per drug, retrieve the preclinical and clinical findings.
- true positives are the findings that are present in both the preclinical and the clinical data.
- false positives are the findings that are present in the preclinical data but not in the clinical data.
- false negatives are the clinical findings that cannot be found in the preclinical data.
- true negatives are all unmapped preclinical findings that are not part of the drug-specific preclinical findings.

In [62]:
# Database handles obtained from the KnowledgeHub API, keyed by the name under
# which the per-drug finding ids are looked up in `drugs`.
ClinicalDatabases = {
    'ClinicalTrials': api.ClinicalTrials(),
    'Medline': api.Medline(),
    'Faers': api.Faers(),
    'DailyMed': api.DailyMed(),
}
PreclinicalDatabases = {
    'eToxSys': api.eToxSys(),
}

# Confusion-matrix counters per group. 'all' accumulates every finding and
# 'other' is the fallback group; further groups are added while counting.
# Each group gets its own counter dict.
groups = {
    name: dict.fromkeys(('tp', 'fp', 'fn', 'tn'), 0)
    for name in ('all', 'other')
}

# count per drug the animal observations that have or don't have a corresponding clinical observation
# The loop runs over the inchiKey column of the drug overview frame and updates
# the tp/fp/fn/tn counters in `groups`, both for 'all' and for the SOC group of
# each finding.
for inchiKey in df.inchiKey:
    clinical_codes = set()
    preclinical_codes = set()

    # collect list of preclinical findings
    # NOTE(review): unlike the clinical branch below there is no check that
    # drugs[inchiKey] has an entry for this database or that it is not None.
    for database in PreclinicalDatabases:
        for f in [f['FINDING'] for f in PreclinicalDatabases[database].getAllFindingByIds(drugs[inchiKey][database])]:
            preclinical_codes.add(mapper.getKey(f))

    # collect the clinical findings, skipping databases for which the drug has
    # no entry or a None entry
    for database in ClinicalDatabases:
        if database in drugs[inchiKey] and drugs[inchiKey][database] is not None:
            for f in [f['FINDING'] for f in ClinicalDatabases[database].getAllFindingByIds(drugs[inchiKey][database])]:
                clinical_codes.add(mapper.getKey(f))

    # a preclinical code of this drug counts as true positive when it is in the
    # mapped preclinical findings, otherwise as false positive
    # NOTE(review): the lookup in all_preclinical_findings raises KeyError for
    # a code missing from that table; the clinical branch falls back to 'other'.
    for preclinical_code in preclinical_codes:
        soc = map_soc(all_preclinical_findings[preclinical_code]['SOC'])
        # presumably makes sure groups has an entry for soc — confirm in condordance_utils
        create_soc(groups, soc)
        if preclinical_code in all_preclinical_mapped_findings:
            groups['all']['tp'] += 1
            groups[soc]['tp'] += 1
        else:
            groups['all']['fp'] += 1
            groups[soc]['fp'] += 1

    # if a clinical code is not mapped to a preclinical we deal with a false negative
    # clinical codes missing from all_clinical_findings are counted under 'other'
    for clinical_code in clinical_codes:
        if clinical_code not in all_clinical_mapped_findings:
            soc = map_soc(all_clinical_findings[clinical_code]['SOC']) if clinical_code in all_clinical_findings else 'other'
            create_soc(groups, soc)
            groups['all']['fn'] += 1
            groups[soc]['fn'] += 1

    # if a preclinical code is not mapped and not part of the drug preclinical findings
    # it counts as a true negative; note that create_soc is called for every
    # non-mapped finding, also for those that are not counted
    for preclinical_code in all_preclinical_non_mapped_findings:
        soc = map_soc(all_preclinical_findings[preclinical_code]['SOC'])
        create_soc(groups, soc)
        if preclinical_code not in preclinical_codes:
            groups['all']['tn'] += 1
            groups[soc]['tn'] += 1


## Concordance tables

In [63]:
def compute_lrp(group):
    """Return the positive likelihood ratio, sensitivity / (1 - specificity).

    `group` is a dict with the counters 'tp', 'fp', 'fn' and 'tn'.
    Returns None when sensitivity or specificity is undefined, or when the
    specificity equals 1 (the ratio would divide by zero).
    """
    sens = compute_sensitivity(group)
    spec = compute_specificity(group)
    if spec is None or sens is None:
        return None
    if spec == 1:
        return None
    return sens / (1 - spec)

def compute_lrn(group):
    """Return the negative likelihood ratio, (1 - sensitivity) / specificity.

    `group` is a dict with the counters 'tp', 'fp', 'fn' and 'tn'.
    Returns None when sensitivity or specificity is undefined, or when the
    specificity equals 0 (the ratio would divide by zero).
    """
    sens = compute_sensitivity(group)
    spec = compute_specificity(group)
    if spec is None or sens is None:
        return None
    if spec == 0:
        return None
    return (1 - sens) / spec
    
def compute_chisquare(group):
    """Return the chi-square statistic of the 2x2 table in `group`.

    `group` is a dict with the counters 'tp', 'fp', 'fn' and 'tn'. For every
    cell the expected count is (row total * column total) / grand total, and
    the statistic is the sum over the four cells of
    (observed - expected)**2 / expected.

    Returns None when the statistic is undefined, i.e. when the table is empty
    or when one of the expected counts is zero.
    """
    tp = group['tp']
    fp = group['fp']
    fn = group['fn']
    tn = group['tn']
    total = tp + fp + fn + tn
    # The expected counts are computed inside the try block as well, so an
    # empty table (total == 0) yields None instead of raising.
    try:
        e11 = ((tp + fp) * (tp + fn)) / total
        e12 = ((tp + fp) * (fp + tn)) / total
        e21 = ((fn + tn) * (tp + fn)) / total
        e22 = ((fn + tn) * (fp + tn)) / total
        return (((tp - e11)**2)/e11) + (((fp - e12)**2)/e12) + (((fn - e21)**2)/e21) + (((tn - e22)**2)/e22)
    except ZeroDivisionError:
        return None


def compute_sensitivity(group):
    """Return the sensitivity tp / (tp + fn) of the counters in `group`.

    `group` is a dict holding at least the counters 'tp' and 'fn'.
    Returns None when tp + fn is not positive, i.e. the ratio is undefined.
    """
    # Only the counters that enter the formula are read.
    tp = group['tp']
    fn = group['fn']
    return tp / (tp + fn) if (tp + fn) > 0 else None

def compute_specificity(group):
    """Return the specificity tn / (fp + tn) of the counters in `group`.

    `group` is a dict holding at least the counters 'fp' and 'tn'.
    Returns None when fp + tn is not positive, i.e. the ratio is undefined.
    """
    # Only the counters that enter the formula are read.
    fp = group['fp']
    tn = group['tn']
    return tn / (fp + tn) if (fp + tn) > 0 else None
                
# Result table with one row per group, in the insertion order of `groups`.
# The frame is built directly from the counters and derived statistics rather
# than from a random placeholder matrix whose columns are all overwritten.
# Note that this rebinds `df`, replacing the drug overview frame.
df = pd.DataFrame({
    'soc': list(groups),
    'TP': [counts['tp'] for counts in groups.values()],
    'FP': [counts['fp'] for counts in groups.values()],
    'FN': [counts['fn'] for counts in groups.values()],
    'TN': [counts['tn'] for counts in groups.values()],
    'Sensitivity': [compute_sensitivity(counts) for counts in groups.values()],
    'Specificity': [compute_specificity(counts) for counts in groups.values()],
    'LR+': [compute_lrp(counts) for counts in groups.values()],
    'LR-': [compute_lrn(counts) for counts in groups.values()],
    'chi-square': [compute_chisquare(counts) for counts in groups.values()],
})

# Last expression of the cell: the notebook renders the rounded table.
df.round(3)

Unnamed: 0,soc,TP,FP,FN,TN,Sensitivity,Specificity,LR+,LR-,chi-square
0,all,15165,11468,55803,338702,0.214,0.967,6.525,0.813,32610.177
1,other,0,547,67,0,0.0,0.0,0.0,,614.0
2,Renal and urinary disorders,1485,619,1349,13664,0.524,0.957,12.091,0.498,5067.604
3,General disorders and administration site cond...,2068,1678,4891,90644,0.297,0.982,16.35,0.716,13873.193
4,Endocrine disorders,1057,952,1454,20369,0.421,0.955,9.428,0.606,4120.804
5,Reproductive system and breast disorders,993,1109,707,27664,0.584,0.961,15.155,0.433,7439.61
6,"Respiratory, thoracic and mediastinal disorders",1370,661,1954,17555,0.412,0.964,11.358,0.61,4650.33
7,Hepatobiliary disorders,1295,481,818,9662,0.613,0.953,12.924,0.406,4512.322
8,Blood and lymphatic system disorders,586,408,2050,13047,0.222,0.97,7.331,0.802,1401.721
9,Eye disorders,606,480,1485,11733,0.29,0.961,7.374,0.739,1596.954
