# Inspect DDI Train data to test rules

Notebook to test Parser, rules and features for the DDI Train dataset.

> NOTE: The StanfordCoreNLP server must be running in a terminal before using this notebook. Start it with the following command:
```bash
java -mx4g -cp 'resources/stanford-corenlp/*' edu.stanford.nlp.pipeline.StanfordCoreNLPServer
```

In [1]:
# !/usr/bin/python3
from graphviz import Digraph
from nltk.parse.corenlp import CoreNLPDependencyParser
from os import listdir, system, path, makedirs
from xml.dom.minidom import parse
from tqdm import tqdm
import pickle
import pandas as pd

In [2]:
# Directory holding the DDI Train XML files that the parsing cell iterates over.
inputdir = "data/Train"
# Path of a baseline output file; not referenced by the cells visible in this notebook.
outputfile = "data/tmp/task9.2_BASELINE_999.txt"
# Client for the CoreNLP server on localhost:9000 (the server must already be running).
# NOTE(review): not referenced directly by the visible cells — confirm it is still needed.
DependencyParser = CoreNLPDependencyParser(url="http://localhost:9000")
# Project module; the cells below call ddi.parseXML and ddi.analyze.
import DDI_ML as ddi

In [43]:
# Build one (sentence id, sentence text, selected verb node) record for every
# non-empty sentence found in the XML files under `inputdir`.
# `training_data` is reset here, so re-running the cell does not duplicate rows.
training_data = []
files = listdir(inputdir)
for f in tqdm(files):
    # Parse XML file
    sentences = ddi.parseXML(f"{inputdir}/{f}")
    for s in sentences:
        # Get sentence id/text
        sid = s.attributes["id"].value
        stext = s.attributes["text"].value
        if not stext:  # Do not process if sentence is empty
            continue

        # Load the sentence entities into a dictionary keyed by entity id.
        # NOTE: `entities` is not consumed further down in this cell.
        entities = {}
        for e in s.getElementsByTagName("entity"):
            # Named `eid` (not `id`) so the builtin id() is not shadowed for
            # the rest of the notebook session.
            eid = e.attributes["id"].value
            offs = e.attributes["charOffset"].value.split("-")
            text = e.attributes["text"].value
            entities[eid] = {"offset": offs, "text": text}

        # Tokenize, tag, and parse sentence
        analysis = ddi.analyze(stext)

        # Candidate verbs: every node whose tag contains "VB".
        verbs = [
            analysis.nodes[n]
            for n in analysis.nodes
            if "VB" in analysis.nodes[n]["tag"]
        ]
        # Keep the verb with the smallest "head" value. min() returns the first
        # minimal element, matching the first item of a stable sort, and
        # `default=None` covers sentences without any verb.
        verb = min(verbs, key=lambda node: node["head"], default=None)

        training_data.append((sid, stext, verb))

100%|██████████| 629/629 [04:17<00:00,  2.45it/s]


--------------------------------------
--------------------------------------
--------------------------------------

In [45]:
# Load the gold DDI annotations: one pipe-separated record per line with the
# fields sentence | e1 | e2 | is_DDI | DDI_type. All fields are kept as strings,
# so the "no interaction" label stays the literal text "null".
with open("resources/train_goldDDI.txt","r") as fp:
    lines = fp.readlines()
# Strip the line terminator and skip blank lines, which would otherwise turn
# into malformed one-field rows.
ents = [line.rstrip("\n").split("|") for line in lines if line.strip()]

df = pd.DataFrame(ents, columns=["sentence","e1","e2","is_DDI","DDI_type"])
print(df.shape)
# Number of annotated entity pairs per interaction type.
df.groupby("DDI_type").size()

(25089, 5)


DDI_type
advise         707
effect        1525
int            186
mechanism     1118
null         21553
dtype: int64

In [46]:
# Keep only the pairs labelled with an actual interaction type, i.e. drop the
# rows whose DDI_type is the literal string "null".
df_ddi = df.loc[df["DDI_type"].ne("null")]
# Sentence ids of the interacting pairs, as a plain array.
sents = df_ddi["sentence"].values
df_ddi.head()

Unnamed: 0,sentence,e1,e2,is_DDI,DDI_type
9,DDI-DrugBank.d542.s0,DDI-DrugBank.d542.s0.e3,DDI-DrugBank.d542.s0.e4,1,effect
12,DDI-DrugBank.d542.s1,DDI-DrugBank.d542.s1.e1,DDI-DrugBank.d542.s1.e2,1,mechanism
13,DDI-DrugBank.d359.s1,DDI-DrugBank.d359.s1.e0,DDI-DrugBank.d359.s1.e1,1,advise
14,DDI-DrugBank.d359.s1,DDI-DrugBank.d359.s1.e0,DDI-DrugBank.d359.s1.e2,1,advise
16,DDI-DrugBank.d359.s2,DDI-DrugBank.d359.s2.e0,DDI-DrugBank.d359.s2.e1,1,int


In [47]:
# One row per analysed sentence: its id, its raw text, and the lemma of the
# verb node selected for it (None when no verb node was stored).
_data = [
    [sent_id, sent_text, verb_node["lemma"] if verb_node else None]
    for sent_id, sent_text, verb_node in training_data
]
df_verb = pd.DataFrame(_data, columns=["sentence","text","verb"])
df_verb.head()

Unnamed: 0,sentence,text,verb
0,DDI-DrugBank.d542.s0,Interactions for Vitamin B3 (Niacin): Antihype...,potentiate
1,DDI-DrugBank.d542.s1,Aspirin: Concomitant aspirin may decrease the ...,decrease
2,DDI-DrugBank.d542.s2,The clinical relevance of this finding is uncl...,be
3,DDI-DrugBank.d542.s3,Other: Concomitant alcohol or hot drinks may i...,increase
4,DDI-DrugBank.d359.s0,PEGANONE used in combination with other drugs ...,use


In [49]:
# Attach each sentence's text and selected verb to the interacting pairs, then
# order the rows by interaction type.
# NOTE: this rebinds `df`, which previously held the gold-label frame; re-run
# the gold-label cell before re-deriving `df_ddi`.
df = (
    df_ddi.merge(df_verb, on="sentence", how="inner")
    .sort_values(by="DDI_type")
)
df.head()

Unnamed: 0,sentence,e1,e2,is_DDI,DDI_type,text,verb
2505,DDI-DrugBank.d558.s25,DDI-DrugBank.d558.s25.e3,DDI-DrugBank.d558.s25.e11,1,advise,"Inhibitors of this isoenzyme (e.g., macrolide ...",coadminister
1833,DDI-DrugBank.d3.s10,DDI-DrugBank.d3.s10.e0,DDI-DrugBank.d3.s10.e1,1,advise,"Therefore, concomitant use of TORADOL and prob...",contraindicate
1992,DDI-DrugBank.d521.s1,DDI-DrugBank.d521.s1.e0,DDI-DrugBank.d521.s1.e10,1,advise,"Before taking glimepiride, tell your doctor if...",tell
2419,DDI-DrugBank.d410.s2,DDI-DrugBank.d410.s2.e0,DDI-DrugBank.d410.s2.e1,1,advise,Sumatriptan and D.H.E. 45 (dihydroergotamine ...,take
2420,DDI-DrugBank.d410.s2,DDI-DrugBank.d410.s2.e0,DDI-DrugBank.d410.s2.e2,1,advise,Sumatriptan and D.H.E. 45 (dihydroergotamine ...,take


In [59]:
# Count how often each verb co-occurs with each interaction type. The count
# column keeps its default label 0. Rows are ordered by count, then verb, then
# type, all descending.
df_merged = (
    df.groupby(by=["DDI_type","verb"])
    .size()
    .reset_index()
    .sort_values(by=[0,"verb","DDI_type"], ascending=False)
)
# Most frequent verbs for the "int" interaction type.
df_merged.loc[df_merged["DDI_type"].eq("int")].head(20)

Unnamed: 0,DDI_type,verb,0
209,int,interact,93
217,int,suggest,18
206,int,have,17
216,int,report,12
200,int,demonstrate,6
213,int,occur,5
210,int,interfere,5
214,int,pose,4
207,int,identify,2
205,int,find,2
