# eTransafe Concordance analysis

This is the use case described by Thomas Steger-Hartmann in a publication with Matthew Clark.
The idea is to compare animal (preclinical) observations with clinical observations for the various drugs:
1. Determine the drugs that have been used in both the preclinical and the clinical domain.
2. Compare the individual system organ classes (SOCs) for preclinical and clinical findings.
3. Compute the concordance matrix.
4. Visualize the matrix.

(C) 2021 Erasmus University Medical Center, Rotterdam, The Netherlands
Author: Erik M. van Mulligen, e.vanmulligen@erasmusmc.nl

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from knowledgehub.api import KnowledgeHubAPI
# from Concordance.condordance_utils import getDrugsMapping, getClinicalDatabases, getPreclinicalDatabases, getSocs, getSocDrugFindings
from Concordance.mapper import Mapper
from Concordance.meddra import MedDRA
import Concordance3.concordance_utils as conutils

import ipywidgets as w
from IPython.display import display, Markdown, clear_output, Javascript
from ipypublish import nb_setup
import numpy as np
import seaborn as sns
import pandas
import json
import matplotlib.pyplot as plt
from pprint import pprint
import mysql.connector
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Create the KnowledgeHub API client for the DEV server and pass it to the Mapper.
# NOTE(review): the client secret is hard-coded in the notebook source; consider
# loading it from the environment or an uncommitted config file instead.
api = KnowledgeHubAPI(server='DEV', client_secret='3db5a6d7-4694-48a4-8a2e-e9c30d78f9ab')
mapper = Mapper(api)

## Authenticate for KnowledgeHub

In [3]:
# Input widgets for the Knowledge Hub credentials and the button that submits them.
username = w.Text(value='tester',placeholder='Knowledge Hub account', description='username:', disabled=False)
password = w.Password(value='', placeholder='Knowledge Hub password', description='password:', disabled=False)
loginBtn = w.Button(description='Login')
# Output widget; it is not referenced again in this cell.
status = w.Output()

def on_button_clicked(_):
    """Handle a click on the Knowledge Hub login button.

    Logs in through ``api`` with the current values of the ``username`` and
    ``password`` widgets. On failure a message is printed; on success a
    message is printed and a Javascript snippet is displayed that asks the
    notebook front end to execute the cell following the selected one.

    Args:
        _: the widget that fired the event (unused).
    """
    login_result = api.login(username.value, password.value)
    # Only an explicit False counts as a failed login.
    if login_result == False:
        print("Failed to login")
        return

    print("successfully logged in")
    run_next_cell = Javascript('IPython.notebook.execute_cell_range(IPython.notebook.get_selected_index()+1, IPython.notebook.get_selected_index()+2)')
    display(run_next_cell)

# Call on_button_clicked whenever the login button is pressed.
loginBtn.on_click(on_button_clicked)
# Last expression of the cell: the vertical box with the form is what the notebook renders.
w.VBox([username, password, loginBtn])

VBox(children=(Text(value='tester', description='username:', placeholder='Knowledge Hub account'), Password(de…

successfully logged in


<IPython.core.display.Javascript object>

## Authenticate for the data stored in the database

In [4]:
# Input widgets for the database connection settings and the button that submits them.
dbhost = w.Text(value='localhost',placeholder='database host', description='host:', disabled=False)
dbdatabase = w.Text(value='concordance-30082022',placeholder='database name', description='database:', disabled=False)
dbusername = w.Text(value='root',placeholder='database username', description='username:', disabled=False)
dbpassword = w.Password(value='', placeholder='database password', description='password:', disabled=False)
dbLoginBtn = w.Button(description='Login')
# Rebinds the module-level `status` name to a fresh Output widget; it is not used in this cell.
status = w.Output()

def dbLoginBtn_click(_):
    """Handle a click on the database login button.

    Opens a MySQL connection with the values of the ``dbhost``, ``dbdatabase``,
    ``dbusername`` and ``dbpassword`` widgets and creates the ``MedDRA`` helper
    with the same user name and password. Only when both succeed are the
    module-level ``db`` and ``meddra`` names (re)bound; the notebook is then
    asked to execute the next cell.

    On failure the reason is printed and the handler returns, mirroring the
    Knowledge Hub login handler; a connection that was already opened is
    closed again so that it is not leaked.

    Args:
        _: the widget that fired the event (unused).
    """
    global db
    global meddra

    connection = None
    try:
        connection = mysql.connector.connect(host=dbhost.value, database=dbdatabase.value, user=dbusername.value, password=dbpassword.value)
        terminology = MedDRA(username=dbusername.value, password=dbpassword.value)
    except Exception as e:
        # The connection may be open while MedDRA() failed: release it.
        if connection is not None:
            connection.close()
        # Show why the login failed (wrong credentials, unreachable host, ...).
        print(f"failed to log in database: {e}")
        return

    # Publish the handles only after everything succeeded, so a failed attempt
    # never leaves a half-initialised global behind.
    db = connection
    meddra = terminology

    print("successfully logged in database")
    display(Javascript('IPython.notebook.execute_cell_range(IPython.notebook.get_selected_index()+1, IPython.notebook.get_selected_index()+2)'))
    
# Call dbLoginBtn_click whenever the database login button is pressed.
dbLoginBtn.on_click(dbLoginBtn_click)
# Last expression of the cell: the vertical box with the form is what the notebook renders.
w.VBox([dbhost, dbdatabase, dbusername, dbpassword, dbLoginBtn])

VBox(children=(Text(value='localhost', description='host:', placeholder='database host'), Text(value='concorda…

successfully logged in database


<IPython.core.display.Javascript object>

## The database
To compute the concordance tables we have constructed a database with all preclinical and clinical findings for drugs (i.e. InChIKeys) that are found in both the preclinical and the clinical data. For the preclinical data we restricted ourselves to findings that are treatment-related and not in the control group. For each preclinical finding we checked with the semantic service whether the equivalent clinical finding occurs among the clinical findings, and vice versa. This is stored in the database as 'mapped' per finding. For each preclinical and clinical finding we derived the system organ class it belongs to. This is stored in the database as 'SOC' per finding.

## Drug mapping
We maintain a list of the drugs that can be found in both the preclinical and the clinical data, each with its InChIKey. For each drug we store the finding ids that are associated with it per database.

In [5]:
def getDrugs(db):
    """Load all records from the ``drugs`` table.

    Args:
        db: open database connection offering ``cursor(prepared=True)``.

    Returns:
        A list with one dict per row, holding
        ``names`` (the ``names`` column split on ``', '``),
        ``inchi_group`` (as stored) and
        ``inchi_keys`` (the ``inchi_keys`` column split on ``', '``).
    """
    cursor = db.cursor(prepared=True)
    try:
        cursor.execute('select names, inchi_group, inchi_keys from drugs')
        rows = cursor.fetchall()
    finally:
        # Always release the cursor, also when the query raises.
        cursor.close()

    return [
        {
            'names': names.split(', '),
            'inchi_group': inchi_group,
            'inchi_keys': inchi_keys.split(', '),
        }
        for names, inchi_group, inchi_keys in rows
    ]

# Load the drug list once; later cells read the module-level `drugs`.
drugs = getDrugs(db)
print(f'{len(drugs)} drugs found')

266 drugs found


## Overview of the drugs

In [6]:
pd = nb_setup.setup_pandas(escape_latex=False)
# Build the overview directly from the drug records (first InChIKey and first
# name of every drug) instead of seeding the frame with random placeholder
# numbers and overwriting them column by column.
df = pd.DataFrame({
    'inchiKey': [drug['inchi_keys'][0] for drug in drugs],
    'inchiGroup': [drug['inchi_group'] for drug in drugs],
    'name': [drug['names'][0] for drug in drugs],
})
# Last expression of the cell: this is the table the notebook renders.
df.round(3)

Unnamed: 0,inchiKey,inchiGroup,name
0,GDLIGKIOYRNHDA-UHFFFAOYSA-N,GDLIGKIOYRNHDA,Clomipramine HCl
1,OZVBMTJYIDMWIL-AYFBDAFISA-N,OZVBMTJYIDMWIL,bromocriptine
2,KPYSYYIEGFHWSV-UHFFFAOYSA-N,KPYSYYIEGFHWSV,baclofen
3,QZUDBNBUXVUHMW-UHFFFAOYSA-N,QZUDBNBUXVUHMW,clozapine
4,ZNRGQMMCGHDTEI-ITGUQSILSA-N,ZNRGQMMCGHDTEI,tropisetron
...,...,...,...
261,IKBKZGMPCYNSLU-RGVLZGJSSA-N,IKBKZGMPCYNSLU,Tegaserod
262,UUOJIACWOAYWEZ-UHFFFAOYSA-N,UUOJIACWOAYWEZ,bopindolol
263,DEQANNDTNATYII-OULOTJBUSA-N,DEQANNDTNATYII,octreotide
264,KPJZHOPZRAFDTN-NQUBZZJWSA-N,KPJZHOPZRAFDTN,methysergide


## Concordance table
For each drug, retrieve the preclinical and clinical findings.
- True positives are the findings that are present in both the preclinical and the clinical data.
- False positives are the findings that are present in the preclinical data but not in the clinical data.
- False negatives are the clinical findings that cannot be found in the preclinical data.
- True negatives are all preclinical unmapped findings that are not part of the drug-specific preclinical findings.

In [7]:
# Distance limit handed to conutils.getMappings below.
max_distance = 4
# All clinical (cc) and preclinical (pc) findings, and the key of each finding
# as produced by conutils.getKey (second argument True for clinical, False for preclinical).
all_cc_findings = conutils.getAllClinicalFindings(db)
all_pc_findings = conutils.getAllPreclinicalFindings(db)
all_cc_names = [conutils.getKey(f, True) for f in all_cc_findings]
all_pc_names = [conutils.getKey(f, False) for f in all_pc_findings]

all_mappings = conutils.getMappings(db, max_distance)
# Matrix with one row per preclinical key and one column per clinical key.
# Every cell starts as None; later code tests `value is not None` to find the mapped pairs.
all_df = pd.DataFrame([[None] * len(all_cc_names)] * len(all_pc_names), all_pc_names, all_cc_names)
for m in all_mappings:
    pc_name = conutils.getKey(m, False)
    cc_name = conutils.getKey(m, True)
    # Store the mapping's 'minDistance' in the cell of this preclinical/clinical pair.
    all_df.at[pc_name, cc_name] = m['minDistance']

## Compute Concordance tables

In [None]:
ct = nb_setup.setup_pandas(escape_latex=False)
# Display settings for the concordance tables: show every row and column,
# left-align the column headers, widen the output and print floats with two decimals.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.colheader_justify', 'left'),
    ('display.max_columns', None),
    ('display.width', 200),
    ('display.float_format', '{:.2f}'.format),
):
    ct.set_option(option_name, option_value)

def createGroups(all_df, level):
    """Count, per MedDRA group, how the preclinical findings agree with the clinical ones.

    Every column label of ``all_df`` (a clinical code) is mapped to a group
    name with ``meddra.map2name(code, level)``. Then, for each drug in the
    module-level ``drugs`` list:

    * the drug's preclinical findings are translated to groups through the
      non-``None`` cells of the matching ``all_df`` row; the distances found
      there update the group's ``min_distance`` / ``max_distance``;
    * the drug's clinical findings are mapped to groups directly;
    * every group is counted exactly once as tp, fp, fn or tn, depending on
      whether it occurs in the preclinical and/or clinical groups of the drug.

    Uses the module-level ``drugs``, ``db`` and ``meddra`` objects and shows
    an ipywidgets progress bar while it runs.

    Args:
        all_df: DataFrame indexed by preclinical keys with clinical keys as
            columns; a cell holds a mapping distance or ``None``.
        level: level name passed to ``meddra.map2name`` (the notebook uses
            'soc', 'hlgt', 'hlt' and 'pt').

    Returns:
        dict mapping group name to a dict with the keys ``tp``, ``fp``,
        ``fn``, ``tn``, ``codes`` (number of columns in the group),
        ``min_distance`` and ``max_distance``.
    """
    # The same code is looked up for every drug and every mapped finding, so
    # each code is resolved once per call and then reused.
    # Assumes map2name returns the same name for the same (code, level).
    group_names = {}

    def group_of(code):
        if code not in group_names:
            group_names[code] = meddra.map2name(code, level)
        return group_names[code]

    groups = {}

    # first collect all groups and count the codes per group
    for column in all_df.columns:
        group = group_of(column)
        if group not in groups:
            groups[group] = {'tp': 0, 'fp': 0, 'fn': 0, 'tn': 0, 'codes': 1, 'min_distance': None, 'max_distance': None}
        else:
            groups[group]['codes'] += 1

    progress = w.IntProgress(min=0, max=len(drugs), description='drugs:', bar_style="info") # instantiate the bar
    done_label = w.Label(value="0%")
    layout = w.HBox([progress, done_label])
    display(layout)

    for drug in drugs:
        progress.value += 1
        completed = (progress.value*1.0) / len(drugs)
        done_label.value = '{0:.0%}'.format(completed)

        inchi_group = drug['inchi_group']
        drug_pc_findings = conutils.getAllDrugPreclinicalFindings(db, inchi_group)
        drug_cc_findings = conutils.getAllDrugClinicalFindings(db, inchi_group)

        # groups reached from the preclinical findings of this drug
        drug_pc_groups = set()
        for pc in drug_pc_findings:
            pc_name = conutils.getKey(pc, False)
            if pc_name not in all_df.index:
                continue
            # the non-None cells of the row are the clinical codes mapped to this preclinical code
            for code, value in all_df.loc[pc_name].items():
                if value is None:
                    continue
                code_group = group_of(code)
                drug_pc_groups.add(code_group)
                counts = groups[code_group]
                counts['min_distance'] = conutils.amin(counts['min_distance'], value)
                counts['max_distance'] = conutils.amax(counts['max_distance'], value)

        # groups of the clinical findings of this drug
        drug_cc_groups = {group_of(conutils.getKey(cc, True)) for cc in drug_cc_findings}

        # every group falls in exactly one cell of the confusion matrix for this drug
        for group, counts in groups.items():
            in_pc = group in drug_pc_groups
            in_cc = group in drug_cc_groups
            if in_pc and in_cc:
                counts['tp'] += 1
            elif in_pc:
                counts['fp'] += 1
            elif in_cc:
                counts['fn'] += 1
            else:
                counts['tn'] += 1
    return groups

def displayGroups(df, groups, level, title):
    """Fill a concordance table from the group counts and return it sorted by LR+.

    The columns are written into ``df`` in place (it must be empty or have
    exactly ``len(groups)`` rows); the returned frame is a sorted, rounded copy.

    Args:
        df: DataFrame that receives the columns.
        groups: dict as returned by ``createGroups`` (group name -> counts).
        level: unused; kept so existing callers keep working.
        title: name of the column that holds the group names.

    Returns:
        DataFrame sorted by ``'LR+'`` in descending order with numeric values
        rounded to three decimals.
    """
    stats = list(groups.values())

    df[title] = list(groups)
    df['codes'] = [counts['codes'] for counts in stats]
    # groups without any mapping have no distance; 999 marks them
    df['min_dist'] = [counts['min_distance'] for counts in stats]
    df['min_dist'] = df['min_dist'].fillna(999).astype(int)
    df['max_dist'] = [counts['max_distance'] for counts in stats]
    df['max_dist'] = df['max_dist'].fillna(999).astype(int)
    # Item assignment always creates/overwrites the column; attribute
    # assignment (df.TP = ...) only does so when the column already exists.
    df['TP'] = [counts['tp'] for counts in stats]
    df['FP'] = [counts['fp'] for counts in stats]
    df['FN'] = [counts['fn'] for counts in stats]
    df['TN'] = [counts['tn'] for counts in stats]
    df['Sensitivity'] = [conutils.compute_sensitivity(counts) for counts in stats]
    df['Specificity'] = [conutils.compute_specificity(counts) for counts in stats]
    df['LR+'] = [conutils.compute_lrp(counts) for counts in stats]
    df['LR-'] = [conutils.compute_lrn(counts) for counts in stats]
    df['chi-square'] = [conutils.compute_chisquare(counts) for counts in stats]

    df = df.sort_values(by=['LR+'], ascending=False)
    # round() returns a new frame, so its result has to be the one handed back
    return df.round(3)

In [None]:
# Concordance counts per MedDRA level, keyed by level name.
# createGroups walks over all drugs for every level.
groups = {}
for level in ['soc', 'hlgt', 'hlt', 'pt']:
    groups[level] = createGroups(all_df, level)

In [None]:
# One result table per MedDRA level, keyed by level name.
# NOTE(review): this rebinds `df`, which an earlier cell used for the drug overview DataFrame.
df = {}
for level in ['soc', 'hlgt', 'hlt', 'pt']:
    title = f'MedDRA {level.upper()}'
    # The frame is pre-sized with throw-away random numbers (one row per group);
    # displayGroups overwrites all 13 columns.
    df[level] = ct.DataFrame(np.random.rand(len(groups[level]), 13), columns=[title, 'codes', 'min_dist', 'max_dist', 'TP', 'FP', 'FN', 'TN', 'Sensitivity', 'Specificity', 'LR+', 'LR-', 'chi-square'])
    table = displayGroups(df[level], groups[level], level, title)
    display(table)