# Inspect DDI Train data to test rules

Notebook to test Parser, rules and features for the DDI Train dataset.

> NOTE: The StanfordCoreNLP server must be running in a terminal before using this notebook (the parser below connects to `http://localhost:9000`). Start it with the following command:
```bash
java -mx4g -cp 'resources/stanford-corenlp/*' edu.stanford.nlp.pipeline.StanfordCoreNLPServer
```

In [10]:
# !/usr/bin/python3
from graphviz import Digraph
from nltk.parse.corenlp import CoreNLPDependencyParser
from os import listdir, system, path, makedirs
from xml.dom.minidom import parse
from tqdm import tqdm
import pickle
import pandas as pd

In [11]:
# Project helper module; provides parseXML() and analyze() used by later cells.
import DDI_ML as ddi

# Location of the training corpus and of the (baseline) output file.
inputdir = "data/Train"
outputfile = "data/tmp/task9.2_BASELINE_999.txt"

# Client for the CoreNLP server that has to be started beforehand
# (see the command in the notebook's introduction).
DependencyParser = CoreNLPDependencyParser(url="http://localhost:9000")

In [3]:
# Build `training_data`: one (sentence id, sentence text, verb node or None)
# tuple per non-empty sentence found in the XML files under `inputdir`.
training_data = []
# listdir() returns entries in arbitrary order; sort them so that the row
# order of `training_data` is reproducible across runs and machines.
files = sorted(listdir(inputdir))
for f in tqdm(files):
    # Parse XML file
    sentences = ddi.parseXML(f"{inputdir}/{f}")
    for s in sentences:
        # get sentence id/text
        sid = s.attributes["id"].value
        stext = s.attributes["text"].value
        if not stext:  # Do not process if sentence is empty
            continue

        # Load sentence entities into a dictionary keyed by entity id.
        # The loop variable is named `entity_id` (not `id`) so the builtin
        # `id` is not shadowed in the notebook's global namespace.
        # NOTE(review): `entities` is not consumed further in this cell.
        # NOTE(review): split("-") assumes a single "start-end" span per
        # entity; confirm no charOffset holds several spans.
        entities = {}
        ents = s.getElementsByTagName("entity")
        for e in ents:
            entity_id = e.attributes["id"].value
            offs = e.attributes["charOffset"].value.split("-")
            text = e.attributes["text"].value
            entities[entity_id] = {"offset": offs, "text": text}

        # Tokenize, tag, and parse sentence
        analysis = ddi.analyze(stext)

        # Among the nodes whose tag contains "VB", keep the one with the
        # smallest "head" value (first one wins on ties, exactly as the
        # previous stable sort + [0] did); None when there is no such node.
        # Presumably a lower head index means closer to the root - confirm.
        verb = min(
            (analysis.nodes[n] for n in analysis.nodes if "VB" in analysis.nodes[n]["tag"]),
            key=lambda node: node["head"],
            default=None,
        )

        training_data.append((sid, stext, verb))

100%|██████████| 629/629 [02:06<00:00,  4.98it/s]


--------------------------------------
--------------------------------------
--------------------------------------

In [5]:
# Read the gold annotations: one "|"-separated record per line.
with open("resources/train_goldDDI.txt","r") as fp:
    lines = list(fp)

# Drop the line terminator and split every record into its fields.
ents = [raw.rstrip("\n").split("|") for raw in lines]

gold_columns = ["sentence","e1","e2","is_DDI","DDI_type"]
df = pd.DataFrame(ents, columns=gold_columns)
print(df.shape)

# Number of annotated pairs per interaction type (shown as the cell output).
df.groupby("DDI_type").size()

(2703, 5)


DDI_type
advise        119
effect        162
int             2
mechanism     201
null         2219
dtype: int64

In [6]:
# Keep only the pairs whose interaction type is not the literal string "null".
df_ddi = df.loc[df["DDI_type"].ne("null")]

# Sentence ids of the remaining pairs, as a NumPy array.
sents = df_ddi["sentence"].values

df_ddi.head()

Unnamed: 0,sentence,e1,e2,is_DDI,DDI_type
0,DDI-DrugBank.d481.s0,DDI-DrugBank.d481.s0.e0,DDI-DrugBank.d481.s0.e1,1,mechanism
8,DDI-MedLine.d63.s4,DDI-MedLine.d63.s4.e0,DDI-MedLine.d63.s4.e1,1,effect
13,DDI-MedLine.d63.s4,DDI-MedLine.d63.s4.e2,DDI-MedLine.d63.s4.e3,1,effect
14,DDI-MedLine.d63.s5,DDI-MedLine.d63.s5.e0,DDI-MedLine.d63.s5.e1,1,effect
15,DDI-MedLine.d63.s6,DDI-MedLine.d63.s6.e0,DDI-MedLine.d63.s6.e1,1,effect


In [7]:
# One row per analysed sentence: id, text and the lemma of the selected verb
# (None when no verb node was found for the sentence).
_data = [
    [sent_id, sent_text, (verb_node["lemma"] if verb_node else None)]
    for sent_id, sent_text, verb_node in training_data
]
df_verb = pd.DataFrame(_data, columns=["sentence","text","verb"])
df_verb.head()

Unnamed: 0,sentence,text,verb
0,DDI-DrugBank.d10.s0,Interactions between Betaseron and other drugs...,evaluate
1,DDI-DrugBank.d10.s1,Although studies designed to examine drug inte...,note
2,DDI-DrugBank.d10.s2,Betaseron administration to three cancer patie...,lead
3,DDI-MedLine.d69.s0,Differential regulation of tyrosine phosphoryl...,
4,DDI-MedLine.d69.s1,The homodimeric disintegrin contortrostatin wa...,compare


In [9]:
# Attach sentence text and verb lemma to every interacting pair, ordered by
# interaction type.
# Note: this rebinds `df`, which previously held the gold-annotation frame.
df = (
    df_ddi
    .merge(df_verb, on="sentence", how="inner")
    .sort_values(by="DDI_type")
)
df.head()

Unnamed: 0,sentence,e1,e2,is_DDI,DDI_type,text,verb


In [59]:
# Count how often each verb lemma occurs per interaction type. The count ends
# up in a column labelled 0 because the size() Series is unnamed.
# NOTE(review): groupby's default handling of missing keys would leave out
# sentences without a detected verb - confirm that is intended.
df_merged = (
    df.groupby(by=["DDI_type","verb"])
    .size()
    .reset_index()
    .sort_values(by=[0,"verb","DDI_type"], ascending=False)
)

# Most frequent verbs for the "int" interaction type.
df_merged.loc[df_merged["DDI_type"] == "int"].head(20)

Unnamed: 0,DDI_type,verb,0
209,int,interact,93
217,int,suggest,18
206,int,have,17
216,int,report,12
200,int,demonstrate,6
213,int,occur,5
210,int,interfere,5
214,int,pose,4
207,int,identify,2
205,int,find,2
