# Lab AHLT Drug NER

In [11]:
import os
import re
import string
import xml.etree.ElementTree as ET
from collections import Counter

import nltk
from chemdataextractor.nlp.tokenize import ChemWordTokenizer
from nltk import word_tokenize, QuadgramCollocationFinder
from nltk.corpus import stopwords

**XML**:

In [12]:
def parse_xml(file):
    """Parse the XML document given by ``file`` and return its root element.

    ``file`` is handed unchanged to ``xml.etree.ElementTree.parse``.
    """
    return ET.parse(file).getroot()

In [13]:
def get_sentence_info(child):
    """Return the ``id`` and ``text`` attributes of ``child`` as a pair.

    Either value is whatever ``child.get`` yields for that attribute.
    """
    sentence_id = child.get('id')
    sentence_text = child.get('text')
    return sentence_id, sentence_text

**Tokenizers:**

In [14]:
def chem_tokenize(text):
    """Tokenize ``text`` with ``ChemWordTokenizer`` and attach character offsets.

    Returns a list of ``(token, start, end)`` tuples, where ``start`` is the
    first value of the tokenizer's span and ``end`` is the second value of
    the span minus one.
    """
    tokenizer = ChemWordTokenizer()
    words = tokenizer.tokenize(text)
    spans = tokenizer.span_tokenize(text)
    # Pair every span with its token; the stored end is the span end minus 1.
    return [(word, span[0], span[1] - 1) for span, word in zip(spans, words)]


def tokenize(text):
    """Tokenize ``text`` with NLTK and locate each kept token in ``text``.

    Tokens whose first character is ASCII punctuation are dropped. Every
    remaining token is searched for literally (not as a regular expression)
    starting right after the previously located token, so tokens containing
    regex metacharacters are handled and consecutive tokens never overlap.

    Returns a list of ``(token, start, end)`` tuples with an inclusive ``end``.
    Tokens that do not occur literally in the remaining text are skipped.
    """
    tokenized_info = []
    search_from = 0

    for word in word_tokenize(text):
        if not word or word[0] in string.punctuation:
            continue

        # Plain substring search: the token is data, not a pattern.
        start = text.find(word, search_from)
        if start == -1:
            # No literal occurrence left in the text; nothing to anchor the token to.
            continue

        end = start + len(word)
        tokenized_info.append((word, start, end - 1))
        # Resume after the whole token so the next one cannot overlap it.
        search_from = end
    return tokenized_info

In [15]:
def get_external_resources(path='../resources/DrugBank.txt'):
    """Load a ``name|type`` lookup table from a pipe-separated text file.

    Args:
        path: File to read; defaults to the DrugBank resource used by the lab.

    Returns:
        A dict mapping the first field of each line to its second field.
        Lines with fewer than two ``|``-separated fields (e.g. blank lines)
        are ignored.
    """
    resources = {}

    # The context manager guarantees the handle is closed even on errors.
    with open(path, 'r', encoding="utf8") as resource_file:
        for line in resource_file:
            # Strip only the line terminator so a final line without a
            # newline does not lose its last character.
            fields = line.rstrip("\n").split("|")
            if len(fields) < 2:
                continue
            resources[fields[0]] = fields[1]
    return resources

**Extract entities:**

In [16]:
# Function words that are never treated as (part of) an entity.
_NON_ENTITY_WORDS = frozenset({
    'of', 'the', 'and', 'in', 'with', 'to', 'be', 'or', 'is', 'not', 'by', 'for',
    'should', 'on', 'that', 'been', 'have', 'other', 'was', 'when', 'are', 'as', 'were',
    'no', 'has', 'these', 'an', 'this', 'such', 'at', 'from', 'it', 'if', 'there', 'after',
    'which', 'can', 'between', 'during', 'because', 'both', 'than', 'did', 'its', 'but',
    'some', 'who', 'any',
})

# Lowercased tokens that are tagged as "drug" outright.
_KNOWN_DRUGS = frozenset({
    'digoxin', 'warfarin', 'phenytoin', 'theophylline', 'lithium', 'ketoconazole',
    'cimetidine',
    'alcohol', 'cyclosporine', 'erythromycin', 'tricyclic antidepressants', 'aspirin',
    'carbamazepine', 'rifampin', 'amiodarone', 'quinidine', 'phenobarbital', 'indinavir',
    'propranolol', 'methotrexate', 'diltiazem', 'cisapride',
    'ethanol',
})

# Substrings that turn the preceding adjacent entity into a "group" mention.
_GROUP_HEAD_HINTS = ('agent', 'inhibitor', 'blocker', 'drug', 'type', 'medication',
                     'contraceptive', 'anticoagulants')

# Substrings (matched against the lowercased token) that mark a "group".
_GROUP_SUBSTRINGS = ('anticoagulant', 'corticosteroid', 'nsaid', 'antacid', 'contraceptive',
                     'diuretic', 'barbiturate')

# Case-sensitive token endings that mark a "drug".
_DRUG_SUFFIXES = (
    "afil", "asone", "bicin", "bital", "caine", "cillin", "cycline", "azole", "dipine",
    "dronate", "eprazole", "fenac", "floxacin", "gliptin", "glitazone", "iramine", "lamide", "mab",
    "mustine", "mycin", "nacin", "nazole", "olol", "olone", "onide", "oprazole", "parin",
    "phylline", "pramine", "pril", "profen", "ridone", "sartan", "semide", "setron", "statin",
    "tadine", "terol", "thiazide", "tinib", "trel", "tretin", "triptan", "tyline", "vudine",
    "zepam", "zodone", "zolam", "zosin", "ine")

_NUMERIC_RANGE_RE = re.compile(r"^(\d+[\-\.]\d+)$|^(\d+\.\d+\-\d+\.\d+)$")
_LOWER_HYPHEN_RE = re.compile(r"[a-z][\-][a-z]")
_INNER_SYMBOL_RE = re.compile(r"\w[_%()\-]\w")
_UPPER_VOWEL_RE = re.compile("[AEIOU]")


def _new_entity(token, entity_type):
    """Build the entity record for a single ``(text, start, end)`` token."""
    return {'name': token[0],
            'offset': str(token[1]) + "-" + str(token[2]),
            'type': entity_type}


def _append_or_extend(entities, token, previous_token_offset, entity_type):
    """Glue ``token`` onto the last entity if adjacent, else append a new entity.

    Returns the value ``previous_token_offset`` must have afterwards: it is
    left untouched on a merge and set to the token span on an append.
    """
    if entities and previous_token_offset[1] + 2 == token[1]:
        entities[-1]['name'] += " " + token[0]
        entities[-1]['offset'] = str(previous_token_offset[0]) + "-" + str(token[2])
        return previous_token_offset
    entities.append(_new_entity(token, entity_type))
    return (token[1], token[2])


def _type_from_shape(word):
    """Classify a token from its orthographic shape alone, or return ``None``."""
    if '(' in word and len(word) > 1:
        return "drug_n"
    # Plain numeric ranges such as "1-2" or "0.5-1.5" are not entities.
    if _NUMERIC_RANGE_RE.search(word) is None:
        if _LOWER_HYPHEN_RE.search(word):
            return "group"
        if _INNER_SYMBOL_RE.search(word):
            return "drug_n"
    if word.isupper():
        # TODO: if this is the second uppercase word in a row, the type may
        # need to change as well.
        return "brand" if _UPPER_VOWEL_RE.search(word) else "drug_n"
    return None


def _type_from_lexicon(word, lowered):
    """Classify a token from word lists, affixes and its POS tag, or return ``None``."""
    if lowered in _KNOWN_DRUGS:
        return "drug"
    if any(fragment in lowered for fragment in _GROUP_SUBSTRINGS):
        return "group"
    if word.endswith(_DRUG_SUFFIXES):
        return "drug"
    if word.startswith("anti") or "POC" in word:
        return "group"
    if word[:1].isupper() and nltk.pos_tag([word])[0][1][0] == 'N':
        return "brand"
    return None


def extract_entities(token_list, entities_dict, with_resources=False):
    """Tag drug-related entities in a tokenized sentence with hand-written rules.

    Args:
        token_list: Iterable of ``(text, start, end)`` tuples.
        entities_dict: Mapping from entity name to entity type; only
            consulted when ``with_resources`` is true.
        with_resources: When true, dictionary hits are emitted first and
            NLTK English stop words are skipped.

    Returns:
        A list of dicts with keys ``'name'``, ``'offset'`` (``"start-end"``)
        and ``'type'``. A token starting exactly two positions after the end
        of the most recently appended entity is merged into that entity.
    """
    entities = []
    previous_token_offset = (0, 0)
    stop_words = None  # loaded on first use; only needed with resources

    # TODO: review dropping single-letter uppercase tokens such as 'A'.
    for token in token_list:
        word = token[0]
        lowered = word.lower()

        if with_resources:
            resource_key = None
            if lowered in entities_dict:
                resource_key = lowered
            elif word in entities_dict:
                resource_key = word
            if resource_key is not None:
                entities.append(_new_entity(token, entities_dict[resource_key]))
                previous_token_offset = (token[1], token[2])
                # A dictionary hit is final; falling through to the heuristic
                # rules would emit the same token a second time.
                continue
            if stop_words is None:
                stop_words = set(stopwords.words('english'))
            if lowered in stop_words:
                continue

        if lowered in _NON_ENTITY_WORDS:
            continue

        if lowered == "aspirin":
            # Always emitted as its own entity; never merged with a neighbour.
            entities.append(_new_entity(token, "brand"))
            previous_token_offset = (token[1], token[2])
            continue

        entity_type = _type_from_shape(word)
        if entity_type is None:
            adjacent = bool(entities) and previous_token_offset[1] + 2 == token[1]
            if adjacent and any(hint in lowered for hint in _GROUP_HEAD_HINTS):
                # A head noun such as "inhibitors" directly after an entity
                # turns the whole mention into a group.
                # NOTE(review): previous_token_offset is not advanced on a
                # merge, so at most one token is glued onto an entity.
                entities[-1]['name'] += " " + word
                entities[-1]['offset'] = str(previous_token_offset[0]) + "-" + str(token[2])
                entities[-1]['type'] = "group"
                continue
            entity_type = _type_from_lexicon(word, lowered)

        if entity_type is None:
            continue
        previous_token_offset = _append_or_extend(entities, token, previous_token_offset, entity_type)
    return entities

In [17]:
def output_entities(sid, entities, output_file):
    """Write one ``sid|offset|name|type`` line per entity to ``output_file``."""
    for record in entities:
        fields = (sid, record['offset'], record['name'], record['type'])
        output_file.write("|".join(fields) + "\n")

In [18]:
def evaluate(inputdir, outputfile):
    """Run the evaluateNER jar on ``inputdir`` and ``../output/<outputfile>``.

    Returns whatever ``os.system`` returns for the command.
    """
    command = " ".join(("java -jar ../eval/evaluateNER.jar", inputdir, "../output/" + outputfile))
    return os.system(command)

In [22]:
# Name of the predictions file; the cells below write it under ../output/
# and pass it to the evaluator.
output_file_name = "task9.1_out_1.txt"
# Directory whose files are parsed as XML by the cells below.
input_directory = '../data/Devel/'

# Lookup table loaded from ../resources/DrugBank.txt (first field -> second field).
entities_dict = get_external_resources()

**Without resources**

In [23]:
# Tag every sentence of every file in the input directory using only the
# heuristic rules (no dictionary lookup) and write the predictions.
# The context manager closes the file even if parsing or tagging fails.
with open('../output/' + output_file_name, 'w+') as output_file:
    for filename in os.listdir(input_directory):
        root = parse_xml(os.path.join(input_directory, filename))
        for child in root:
            sid, text = get_sentence_info(child)
            token_list = chem_tokenize(text)
            entities = extract_entities(token_list, entities_dict, with_resources=False)
            output_entities(sid, entities, output_file)

# The file is closed (and therefore flushed) before the evaluator reads it.
evaluate(input_directory, output_file_name)  # Shown in console

0

**With resources**

In [24]:
# Tag every sentence of every file in the input directory using the
# dictionary lookup in addition to the heuristic rules, and write the
# predictions. The context manager closes the file even if tagging fails.
with open('../output/' + output_file_name, 'w+') as output_file:
    for filename in os.listdir(input_directory):
        root = parse_xml(os.path.join(input_directory, filename))
        for child in root:
            sid, text = get_sentence_info(child)
            token_list = chem_tokenize(text)
            entities = extract_entities(token_list, entities_dict, with_resources=True)
            output_entities(sid, entities, output_file)

# The file is closed (and therefore flushed) before the evaluator reads it.
evaluate(input_directory, output_file_name)  # Shown in console

0

### Get train info

In [29]:
def get_postag_counts(token_list, entities, previous_tag_dict, entity_tag_dict):
    """Count POS tags of entity tokens and of the tokens right before them.

    For each entity string, its leading fragment (the text before the first
    space, ``-``, ``(`` or ``.``) is looked up in ``token_list``; the last
    token containing it is taken as the entity position.

    Args:
        token_list: Tokens of one sentence.
        entities: Entity strings annotated in that sentence.
        previous_tag_dict: Running count of tags preceding an entity; updated in place.
        entity_tag_dict: Running count of entity tags; updated in place.

    Returns:
        ``(previous_tag_dict, entity_tag_dict)``. Entities with an empty
        leading fragment or without a matching token are skipped, and an
        entity at position 0 contributes no "previous" tag.
    """
    tags = None  # tagged lazily, once per sentence

    for entity in entities:
        head = entity
        for separator in (" ", "-", "(", "."):
            head = head.split(separator)[0]
        if not head:
            # An empty fragment is a substring of every token and would
            # match arbitrarily.
            continue

        entity_index = None
        for i, token in enumerate(token_list):
            if head in token:
                entity_index = i
        if entity_index is None:
            continue

        if tags is None:
            tags = nltk.pos_tag(token_list)

        entity_tag = tags[entity_index][1]
        entity_tag_dict[entity_tag] = entity_tag_dict.get(entity_tag, 0) + 1

        # Index -1 would wrap around to the last token of the sentence.
        if entity_index > 0:
            previous_tag = tags[entity_index - 1][1]
            previous_tag_dict[previous_tag] = previous_tag_dict.get(previous_tag, 0) + 1

    return previous_tag_dict, entity_tag_dict


def get_truth_entities(child):
    """Return two parallel lists for the ``entity`` children of ``child``.

    The first list holds each entity's ``text`` attribute, the second its
    ``type`` attribute, both in document order.
    """
    texts = []
    kinds = []
    for annotation in child.findall('entity'):
        texts.append(annotation.get('text'))
        kinds.append(annotation.get('type'))
    return texts, kinds


def longestCommonPrefix(strs):
    """Return the longest prefix shared by every string in ``strs``.

    An empty collection yields the empty string.
    """
    if not strs:
        return ""
    shared = []
    # Walk the strings column by column; zip stops at the shortest string.
    for column in zip(*strs):
        if len(set(column)) != 1:
            break
        shared.append(column[0])
    return "".join(shared)


def print_truth_patterns(input_directory):
    """Print descriptive statistics about the gold entities under ``input_directory``.

    Every file in the directory is parsed as XML. The report contains, in order:
    the number of multi-word entities per type, entities occurring more than
    four times, entities starting with ``"phe"``, per-prefix type counts
    derived from frequent character quadgrams, POS-tag counts for entity
    tokens and their preceding tokens, and the quadgram frequencies themselves.
    """
    all_entities = []
    all_entities_type = []

    previous_tag_dict = {}
    entity_tag_dict = {}

    multiple_words_by_type = {}

    for filename in os.listdir(input_directory):
        root = parse_xml(os.path.join(input_directory, filename))

        for child in root:
            _, text = get_sentence_info(child)
            token_list = word_tokenize(text)
            entities, entities_type = get_truth_entities(child)

            for entity, entity_type in zip(entities, entities_type):
                if len(entity.split(' ')) > 1:
                    multiple_words_by_type[entity_type] = multiple_words_by_type.get(entity_type, 0) + 1

            all_entities.extend(entities)
            all_entities_type.extend(entities_type)

            # Updates both tag dictionaries in place.
            get_postag_counts(token_list, entities, previous_tag_dict, entity_tag_dict)

    sorted_previous_tag = dict(sorted(previous_tag_dict.items(), key=lambda item: item[1], reverse=True))
    sorted_entity_tag = dict(sorted(entity_tag_dict.items(), key=lambda item: item[1], reverse=True))

    print("MULTIPLE WORDS COUNT:", multiple_words_by_type, "\n")

    cnt = Counter(all_entities)
    print({k: v for k, v in cnt.items() if v > 4})

    print("\nStarts with 'phe' prefix:")
    for entity, entity_type in zip(all_entities, all_entities_type):
        if entity.startswith("phe"):
            print(entity, "-", entity_type)

    # Pad with blank sentinels so that entity starts show up as quadgrams
    # beginning with a space. The padding goes into a separate list:
    # inserting into all_entities would shift it by one position relative to
    # all_entities_type and pair every entity with the wrong type below.
    entities_string = "  ".join(["  ", *all_entities, "  "])

    # The finder iterates over the characters of the string.
    finder = QuadgramCollocationFinder.from_words(entities_string)
    finder.apply_freq_filter(10)

    quadgrams = sorted(finder.ngram_fd.items(), key=lambda tup: tup[1], reverse=True)

    for quadgram in quadgrams:
        if quadgram[0][0] == ' ':
            # A leading space marks the start of an entity: the remaining
            # three characters are a candidate prefix.
            prefix = ''.join(quadgram[0][1:])
            entities_with_prefix = [ent for ent in all_entities if ent.startswith(prefix)]
            entities_type_with_prefix = {}
            for ent, ent_type in zip(all_entities, all_entities_type):
                if ent.startswith(prefix):
                    entities_type_with_prefix[ent_type] = entities_type_with_prefix.get(ent_type, 0) + 1
            if entities_type_with_prefix:
                print("\nLongest Prefix:", longestCommonPrefix(entities_with_prefix), len(entities_with_prefix), "\n")
                print(entities_type_with_prefix, "\n")
    print("PREVIOUS TAGS:\n", sorted_previous_tag, "\n")
    print("ENTITY TAGS:\n", sorted_entity_tag, "\n")
    print("QUADGRAMS:\n", quadgrams, "\n")

# Print the gold-entity statistics for the directory configured above.
print_truth_patterns(input_directory)

MULTIPLE WORDS COUNT: {'group': 237, 'drug': 74, 'drug_n': 9, 'brand': 3} 

{'antibiotics': 5, 'ofloxacin': 5, 'digoxin': 21, 'thiazide diuretics': 15, 'PCP': 15, 'Carbamazepine': 5, 'carbamazepine': 15, 'tricyclic antidepressants': 5, 'KRM-1648': 5, 'rifampicin': 5, 'rifabutin': 8, 'sildenafil': 9, 'amphetamine': 6, 'cocaine': 5, '3H-spiroperidol': 5, 'neuroleptics': 6, 'apomorphine': 7, 'naloxone': 10, 'beta-endorphin': 9, 'morphine': 12, '(+)-NANM': 7, '(-)-NANM': 8, 'gentamicin': 6, 'vitamin D': 19, '1,25(OH)2D3': 7, 'norepinephrine': 15, 'desipramine': 7, 'probenecid': 7, 'ampicillin': 6, 'Probenecid': 5, 'alcohol': 8, 'Ketoconazole': 6, 'indomethacin': 7, 'aspirin': 11, 'cyclosporine': 9, 'erythromycin': 7, 'atorvastatin': 18, 'colestipol': 5, 'cimetidine': 7, 'ethinyl estradiol': 9, 'warfarin': 24, 'HMG-CoA reductase inhibitors': 5, 'ketoconazole': 12, 'nelfinavir': 15, 'azithromycin': 12, 'indinavir': 5, 'theophylline': 13, 'phenytoin': 20, 'baclofen': 6, 'WELLBUTRIN': 7, 'bupr

In [30]:
import json

# Load the gold annotations, grouped by sentence id (first field).
# Each entry is the pair (second-to-last field, last field) of a line.
with open('goldNER.txt', 'r') as file1:
    Lines = file1.readlines()

truth = {}

for line in Lines:
    # Strip the line terminator up front so the last field is clean even
    # when the final line has no trailing newline.
    value = line.rstrip("\n").split("|")
    if len(value) < 2:
        continue
    truth.setdefault(value[0], []).append((value[-2], value[-1]))

# Load the system predictions in the same format.
with open('../output/task9.1_out_2.txt', 'r') as file1:
    Lines = file1.readlines()

output = {}
wrong_entities = []
new_sent = ""
matched_entities = []
missing = []

for line in Lines:
    value = line.rstrip("\n").split("|")
    if len(value) < 2:
        continue
    prediction = (value[-2], value[-1])
    output.setdefault(value[0], []).append(prediction)

    # A new sentence id closes the previous sentence: every gold entity of
    # that sentence that was not predicted is missing.
    if new_sent != value[0] and new_sent != "":
        if new_sent in truth:
            missing += [item for item in truth[new_sent] if item not in matched_entities]
        matched_entities = []
    new_sent = value[0]

    if value[0] in truth and prediction in truth[value[0]]:
        matched_entities.append(prediction)
    else:
        wrong_entities.append(prediction)

# Close the final sentence too; the loop only does so on a sentence change.
if new_sent != "" and new_sent in truth:
    missing += [item for item in truth[new_sent] if item not in matched_entities]
# NOTE(review): gold sentences with no prediction line at all are still not
# reported as missing — confirm whether they should be.

missing_dict = {}
for ent, ent_type in missing:
    missing_dict.setdefault(ent_type, []).append(ent)

wrong_entities_dict = {}
for ent, ent_type in wrong_entities:
    wrong_entities_dict.setdefault(ent_type, []).append(ent)

print("MISSING:\n", json.dumps(missing_dict, indent=4))
print("\n\n")
print("WRONG:\n", json.dumps(wrong_entities_dict, indent=4))

MISSING:
 {
    "group": [
        "fluoroquinolones",
        "selective serotonin reuptake inhibitors",
        "SSRIs",
        "nitrate",
        "neuroleptics",
        "calcium-channel blockers",
        "Slow-channel calcium blockers",
        "radiopharmaceutical",
        "antiestrogen",
        "vitamin-D",
        "immunodepressant",
        "Bacteriostatic Antibiotics",
        "5HT3 Antagonists",
        "5HT3 antagonist class",
        "Antihypertensive Medication",
        "Vasodilators",
        "beta-blocking agent",
        "HMG-CoA reductase inhibitors",
        "HMG-CoA reductase inhibitors",
        "HMG-CoA reductase inhibitor",
        "macrolide products",
        "curare-like compounds",
        "botulinum neurotoxin",
        "botulinum toxin",
        "steroids",
        "Iron Supplements",
        "iron supplement",
        "iron supplements",
        "Azole Antifungals",
        "ANTACID",
        "angiotensin- converting enzyme (ACE) inhibitors",
        "