# Notebook for Development

In [None]:
import os
import sys
import re
import numpy as np
from pycorenlp import StanfordCoreNLP
import nltk
import subprocess
import pandas as pd
from lxml import etree
from jsonrpclib.jsonrpc import ServerProxy

# CoreNLP client; the annotate() calls in this notebook are sent to this local server URL.
nlp = StanfordCoreNLP('http://localhost:9000')

# Locations of the input data and of the external tools invoked later in the notebook.
data_dir = '../data/'
tregex_dir = './stanford-tregex-2018-02-27/'
ctakes_folder = './ctakes/'

# can be extended to batch processing if needed (feed a list of filenames)
#filenames = ['dev.txt']
#filenames = ['3.txt']
# Only filenames[0] is read: the section-parsing loop iterates over range(1).
filenames = ['test_ready.txt']

In [None]:
# Build the negation-term table from the multilingual lexicon, keeping only the
# categories that express negation or pseudo-negation.  The explicit .copy()
# makes the filtered frame independent, so the column assignments below modify
# it directly instead of going through chained indexing on a slice.
neg_list = pd.read_csv(data_dir + 'multilingual_lexicon-en-de-fr-sv.csv', sep=',', header=0)
neg_list = neg_list[['ITEM', 'CATEGORY', 'EN (SV) ACTION']]
neg_list = neg_list[neg_list['CATEGORY'].isin(
    ['definiteNegatedExistence', 'probableNegatedExistence', 'pseudoNegation'])].copy()

# (category, action) -> negation code; any other combination maps to ''.
_NEG_CODES = {
    ('definiteNegatedExistence', 'forward'): 'PREN',
    ('definiteNegatedExistence', 'backward'): 'POST',
    ('definiteNegatedExistence', 'bidirectional'): 'POST',
    ('probableNegatedExistence', 'forward'): 'PREP',
    ('probableNegatedExistence', 'backward'): 'POSP',
    ('probableNegatedExistence', 'bidirectional'): 'POSP',
}


def _neg_code(category, action):
    """Return the negation code for one lexicon row ('' when none applies)."""
    if category == 'pseudoNegation':
        return 'PSEU'
    return _NEG_CODES.get((category, action), '')


def _first_pos_tag(token):
    """Return the NLTK part-of-speech tag of the first sub-token of `token`."""
    return nltk.pos_tag(nltk.word_tokenize(token))[0][1]


# Columns are added in the same order as before so neg_list.txt keeps its layout.
neg_list['NEG'] = [_neg_code(category, action)
                   for category, action in zip(neg_list['CATEGORY'], neg_list['EN (SV) ACTION'])]
neg_list['FIRST_TOKEN'] = [item.split()[0] for item in neg_list['ITEM']]
neg_list['FIRST_POS'] = [_first_pos_tag(token) for token in neg_list['FIRST_TOKEN']]
neg_list['LAST_TOKEN'] = [item.split()[-1] for item in neg_list['ITEM']]
neg_list['LAST_POS'] = [_first_pos_tag(token) for token in neg_list['LAST_TOKEN']]

# `quoting=False` was equal to the default (csv.QUOTE_MINIMAL == 0), so it is omitted.
neg_list.to_csv(data_dir + 'neg_list.txt', sep='\t', index=False)

# use the labeled list (annotated 'type')
# The generated table above is only written to disk; every later cell works
# with the labeled file loaded here.
neg_list = pd.read_csv(data_dir + 'neg_list_complete.txt', sep='\t', header=0)
neg = neg_list['ITEM'].values
# Each term is matched with a trailing space, with and without a leading one.
neg_term = [' ' + item + ' ' for item in neg]
neg_term.extend(item + ' ' for item in neg)

## Section and sentence tokenization

In [None]:
# Display names of the sections of interest.
section_names = [
    'Allergies',
    'Chief Complaint',
    'Major Surgical or Invasive Procedure',
    'History of Present Illness',
    'Past Medical History',
    'Social History',
    'Family History',
    'Brief Hospital Course',
    'Medications on Admission',
    'Discharge Medications',
    'Discharge Diagnosis',
    'Discharge Condition',
    'Discharge Instructions',
]

# Section name -> lemmas that must all appear in a header for it to match.
section_dict = {
    'Allergies': ['allergy'],
    'Brief Hospital Course': ['hospital', 'course'],
    'Chief Complaint': ['chief', 'complaint'],
    'Discharge Condition': ['discharge', 'condition'],
    'Discharge Diagnosis': ['discharge', 'diagnosis'],
    'Discharge Instructions': ['discharge', 'instruction'],
    'Discharge Medications': ['discharge', 'medication'],
    'Family History': ['family', 'history'],
    'History of Present Illness': ['history', 'present', 'illness'],
    'Major Surgical or Invasive Procedure': ['major', 'surgical', 'invasive', 'procedure'],
    'Medications on Admission': ['medication', 'admission'],
    'Past Medical History': ['medical', 'history'],
    'Social History': ['social', 'history'],
}

# Additional sections that are recognised as headers.
other_section_names = [
    'Followup Instructions',
    'Physical Exam',
    'Pertinent Results',
    'Facility',
    'Discharge Disposition',
]
other_section_dict = {
    'Discharge Disposition': ['discharge', 'disposition'],
    'Facility': ['facility'],
    'Followup Instructions': ['followup', 'instruction'],
    'Pertinent Results': ['pertinent', 'result'],
    'Physical Exam': ['physical', 'exam'],
}

# Union of both lemma tables, used when matching header lines.
all_section_dict = dict(section_dict)
all_section_dict.update(other_section_dict)

section_names_list = list(section_dict)

# Sections whose text is joined and sentence-split as a whole; the remaining
# ones (plus the catch-all 'None' bucket) are handled line by line.
section_to_parse = ['History of Present Illness', 'Brief Hospital Course', 'Discharge Instructions']
section_not_to_parse = [name for name in section_names_list if name not in section_to_parse]
section_not_to_parse.append('None')

hard_section_list = [
    'History of Present Illness',
    'Past Medical History',
    'Brief Hospital Course',
    'Discharge Diagnosis',
    'Discharge Instructions',
]
easy_section_list = [name for name in section_names_list if name not in hard_section_list]

In [None]:
def match_section_name(name, section_dict, nlp_parser):
    """Map a raw header string to a known section name.

    The header is lower-cased and lemmatized through `nlp_parser.annotate`;
    the first entry of `section_dict` whose lemmas are all present in the
    first annotated sentence is returned.

    Args:
        name: header text (the part of a line before the first ':').
        section_dict: mapping of section name -> list of required lemmas.
        nlp_parser: object exposing `annotate(text, properties=...)` that
            returns a dict with a 'sentences' list of token dicts.

    Returns:
        The matching section name, or the string 'None' when nothing matches
        or the annotation result cannot be read.
    """
    output = nlp_parser.annotate(name.lower(), properties={
                                              'annotators': 'lemma',
                                              'outputFormat': 'json',
                                              'threads': '4',
                                              'tokenize.options': 'normalizeParentheses=false, normalizeOtherBrackets=false'
                                              })
    try:
        # Only the first sentence is used for matching.
        first_sentence = output['sentences'][0]
        name_lemma = {str(token['lemma']) for token in first_sentence['tokens']}
    except (KeyError, IndexError, TypeError):
        # Missing keys, no sentences, or a non-dict reply: treat as "no match"
        # rather than hiding unrelated errors behind a bare except.
        return 'None'

    for section_name, section_name_lemma in section_dict.items():
        if all(item in name_lemma for item in section_name_lemma):
            return section_name
    return 'None'

In [None]:
def _store_paragraph(sections, paragraph, current_section):
    """File one blank-line-delimited paragraph into `sections`.

    If the first line has the form '<header>: ...' and the header matches a
    known section, the paragraph (minus the header) replaces that section's
    content; otherwise it is appended to `current_section`.

    Returns the name of the section in effect after this paragraph.
    """
    head = paragraph[0]
    if ':' in head:
        parts = head.split(':')
        matched_section_name = match_section_name(parts[0], all_section_dict, nlp)
        if matched_section_name != 'None':
            # Keep any text that follows the header on the same line.  The
            # old `len(parts[1:]) > 1` test only kept it when the line had a
            # second colon, so 'Header: text' lost its text.
            inline_text = ' '.join(parts[1:]).strip()
            body = paragraph[1:]
            sections[matched_section_name] = [inline_text] + body if inline_text else body
            return matched_section_name
    sections[current_section] = sections.get(current_section, []) + paragraph
    return current_section


for idx in range(1):
    # Text that appears before any recognised header goes to the 'None' bucket.
    sections = {'None': []}
    with open(os.path.join(data_dir, filenames[idx]), 'r') as f:
        # The first three lines of the note are skipped.
        for _ in range(3):
            next(f)
        lines_buffer = []
        previous_section_name = 'None'
        for line in f:
            line = line.strip()
            if line:
                if line.lower() == 'attending:':
                    continue
                lines_buffer.append(line)
            elif lines_buffer:
                # A blank line closes the current paragraph.
                previous_section_name = _store_paragraph(sections, lines_buffer, previous_section_name)
                lines_buffer = []
        # Flush the final paragraph: it used to be dropped when the file did
        # not end with a blank line.
        if lines_buffer:
            previous_section_name = _store_paragraph(sections, lines_buffer, previous_section_name)
            lines_buffer = []

In [None]:
# Properties sent with every sentence-splitting request.
_SSPLIT_PROPERTIES = {
    'annotators': 'ssplit',
    'outputFormat': 'json',
    'threads': '4',
    'tokenize.options': 'normalizeParentheses=false, normalizeOtherBrackets=false',
}


def _split_sentences(text):
    """Sentence-split `text` with the CoreNLP client.

    Returns a list with one space-joined token string per sentence (sentences
    consisting of a lone '.' are skipped), or None when the reply cannot be
    read.
    """
    output = nlp.annotate(text, properties=dict(_SSPLIT_PROPERTIES))
    try:
        sents = [[str(token['word']) for token in sent['tokens']] for sent in output['sentences']]
    except (KeyError, TypeError, UnicodeError):
        # Unreadable reply (missing keys / non-dict output) or a token that
        # cannot be converted with str().
        return None
    return [' '.join(sent) for sent in sents if sent != ['.']]


# Sections parsed as running text: join all lines, then split into sentences.
for section_name in section_to_parse:
    if section_name in sections:
        sentences = _split_sentences(' '.join(sections[section_name]))
        if sentences is not None:  # on failure the section is left untouched
            sections[section_name] = sentences

# Remaining sections keep their line structure: each line is tokenized on its own.
for section_name in section_not_to_parse:
    if section_name in sections:
        new_section_content = []
        for text in sections[section_name]:
            sentences = _split_sentences(text)
            if sentences is None:
                # Keep the raw line rather than silently dropping it.
                new_section_content.append(text)
            else:
                new_section_content.append(' '.join(sentences))
        sections[section_name] = new_section_content

In [None]:
# Write the tagged note: every hard section is wrapped in START/END markers and
# each of its items is labeled [NEGATED] or [AFFIRMED].
with open(data_dir + 'tmp', 'w') as f:
    for section_name in hard_section_list:
        # section head tag
        f.write('\n\n\n\n[SECTION-{}-START]'.format(section_name))
        for item in sections.get(section_name, []):
            # String matching against the negation term list; the item is
            # prefixed with one space so that leading 'no ', 'not ', ... are
            # still matched by the space-padded terms.
            padded_item = ' ' + item
            if any(term in padded_item for term in neg_term):
                label = '\t [NEGATED]'
            else:
                label = '\t [AFFIRMED]'
            f.write('\n\n\n\n' + item + label)
        # section end tag; this file is used for concept extraction and sentence parsing
        f.write('\n\n\n\n[SECTION-{}-END]'.format(section_name))

## Concept extraction

In [None]:
def get_cui_spans(xml_filename):
    """Return the unique (cui, tui, preferredText, (begin, end)) tuples of an XML file.

    Elements carrying a `_ref_ontologyConceptArr` attribute provide the
    (begin, end) span; `uima.cas.FSArray` elements link each concept id to
    that reference.  Duplicates are removed, keeping first-seen order.
    """
    tree = etree.parse(xml_filename)

    # concept-array reference -> (begin, end)
    ref_to_span = {}
    for annotation in tree.xpath('*[@_ref_ontologyConceptArr]'):
        begin_end = (int(annotation.get('begin')), int(annotation.get('end')))
        ref_to_span[annotation.get('_ref_ontologyConceptArr')] = begin_end

    # concept id (text of an FSArray child) -> id of the enclosing FSArray
    id_to_ref = {}
    for fs_array in tree.xpath('uima.cas.FSArray'):
        for member in fs_array:
            id_to_ref[member.text] = fs_array.get('_id')

    concepts = tree.xpath('org.apache.ctakes.typesystem.type.refsem.UmlsConcept')

    unique_spans = []
    already_seen = set()
    for concept in concepts:
        concept_span = ref_to_span[id_to_ref[concept.get('_id')]]
        record = (concept.get('cui'), concept.get('tui'), concept.get('preferredText'), concept_span)
        if record in already_seen:
            continue
        already_seen.add(record)
        unique_spans.append(record)
    return unique_spans


def extract_cuis(xml_filename):
    """Return one row per concept found in `xml_filename`, ordered by span.

    Each row is (row_id, begin, end, cui, tui, covered text, preferred text),
    where row_id is the file's base name up to the first '.', begin/end are
    strings, and the covered text is sliced from the Sofa string of the file.
    """
    ordered_spans = sorted(get_cui_spans(xml_filename), key=lambda cs: cs[3])
    row_id = os.path.basename(xml_filename).split('.')[0]
    txt = etree.parse(xml_filename).xpath('uima.cas.Sofa')[0].get('sofaString')

    rows = []
    for cui, tui, preferred, (begin, end) in ordered_spans:
        rows.append((row_id, str(begin), str(end), cui, tui, txt[begin:end], preferred))
    return rows

# keep: 047, 046, 033, 184, 061, 048, 131
# discard: 029, 034, 197, 121, 023, 059, 060, 195, 109, 022, 122, 

# Semantic-type (TUI) codes to keep, grouped by concept category.
d = {
    "ddx": ["T047", "T191"],  # disease/disorder/syndrome
    "ssx": ["T033", "T040", "T046", "T048", "T049", "T184"],  # symptoms/signs
    "med": ["T116", "T123", "T126", "T131"],  # medications
    "dxp": [],  # diagnostic proc
    "txp": ["T061"],  # therapeutic proc
    "lab": [],  # labs
    "ana": ["T017", "T024", "T025"],  # anatomy
}

# Flat list of every TUI listed above, in dictionary order.
tui_list = [tui for tuis in d.values() for tui in tuis]

In [None]:
# def execute(command):
#     process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
#     while True:
#         nextline = process.stdout.readline()
#         if nextline == '' and process.poll() is not None:
#             break
#         sys.stdout.write(nextline)
#         sys.stdout.flush()

#     output = process.communicate()[0]
#     exitCode = process.returncode

#     if (exitCode == 0):
#         return output
#     else:
#         raise ProcessException(command, exitCode, output)
        
# Run the cTAKES pipeline on the tagged note.  Commands are passed as argument
# lists (no shell string building) and the essential steps use check_call, so
# a failed copy/pipeline/move raises CalledProcessError instead of leaving a
# stale or missing tmp.xml to be picked up by the following cells.

# Best-effort cleanup of .DS_Store files; its exit status is ignored.
subprocess.call(['find', '.', '-name', '.DS_Store', '-type', 'f', '-delete', '-print'])

subprocess.check_call(['cp', data_dir + 'tmp', ctakes_folder + 'note_input/'])
try:
    subprocess.check_call(['sh', ctakes_folder + 'bin/pipeline.sh'])
finally:
    # Remove the copied input whether or not the pipeline succeeded.
    os.remove(ctakes_folder + 'note_input/tmp')
subprocess.check_call(['mv', ctakes_folder + 'note_output/tmp.xml', data_dir])

In [None]:
# #subprocess.check_output(['bash', '-c', 'find . -name "' + data_dir + 'tmp" -exec cp {} ' + ctakes_folder + 'note_input/ \;'])
# subprocess.Popen('find . -name ".DS_Store" -type f -delete -print', stdout=subprocess.PIPE, shell=True)
# subprocess.Popen('cp ' + data_dir + 'tmp ' + ctakes_folder + 'note_input/', stdout=subprocess.PIPE, shell=True)
# p = subprocess.Popen("sh " + ctakes_folder + "bin/pipeline.sh", stdout=subprocess.PIPE, shell=True)
# (output, err) = p.communicate()
# subprocess.check_output(['bash', '-c', 'find . -name "' + ctakes_folder + 'note_output/tmp.xml" -exec cp {} ../data/ \;'])
# subprocess.Popen('rm ' + ctakes_folder + 'note_input/tmp', stdout=subprocess.PIPE, shell=True)
# subprocess.Popen('mv ' + ctakes_folder + 'note_output/tmp.xml '+ data_dir, stdout=subprocess.PIPE, shell=True)

In [None]:
#os.system('rm tmp')
# Load the extracted concepts and keep only the semantic types of interest.
# .copy() gives an independent frame so the column writes below are applied
# to `df` itself rather than to a slice of a temporary.
d = extract_cuis(data_dir + 'tmp.xml')
df = pd.DataFrame(d, columns=['fname', 'start', 'end', 'cui', 'tui', 'original', 'preferred'])
df = df[df['tui'].isin(tui_list)].copy()

with open(data_dir + 'tmp', 'r') as f:
    doc = f.read()

# Section name -> (offset just after its START tag, offset of its END tag).
sec_dict = {}
for sec_head in hard_section_list:
    start_tag = '[SECTION-' + sec_head + '-START]'
    end_tag = '[SECTION-' + sec_head + '-END]'
    sec_dict[sec_head] = (doc.index(start_tag) + len(start_tag), doc.index(end_tag))

# Label each concept with the section whose span strictly contains it.
df['section'] = ''
for idx in df.index:
    concept_start = int(df.at[idx, 'start'])
    concept_end = int(df.at[idx, 'end'])
    # .items() works on both Python 2 and 3 (iteritems() is Python-2-only).
    for k, v in sec_dict.items():
        if concept_start > v[0] and concept_end < v[1]:
            df.at[idx, 'section'] = str(k)

In [None]:
# d = [e for e in extract_cuis(data_dir + 'tmp.xml')]
# df = pd.DataFrame(d, columns=['fname', 'start', 'end', 'cui', 'tui', 'original', 'preferred'])
# with open(data_dir + 'tmp', 'r') as f:
#     doc = f.read()

# sec_dict = {}
# for sec_head in hard_section_list:
#     sec_dict[sec_head] = (doc.index('[SECTION-' + sec_head + '-START]') + len('[SECTION-' + sec_head + '-START]'), \
#                           doc.index('[SECTION-' + sec_head + '-END]'))
# sec_dict

# df['section'] = ''
# for idx in df.index:
#     for k, v in sec_dict.iteritems():
#         if int(df['start'][idx]) > v[0] and int(df['end'][idx]) < v[1]:
#             df['section'][idx] = str(k)
# df[df.section != ""].shape

In [None]:
# Character range of every [NEGATED] sentence in `doc`: from the four newlines
# that precede the sentence up to the start of its '[NEGATED]' tag.
# Raw strings avoid the invalid '\[' escapes of the previous literals.
s_neg_start = [hit.start() for hit in re.finditer(r'\n\n\n\n.*\t \[NEGATED\]', doc)]
s_neg_end = [hit.start() for hit in re.finditer(r'\[NEGATED\]', doc)]
s_neg = list(zip(s_neg_start, s_neg_end))  # range of negation in sentence level
neg_range_list = [range(lo, hi) for lo, hi in s_neg]

df['negation'] = 0
df['sent_id'] = 0
df['sent_loc'] = 0
for idx in df.index:
    concept_start = int(df.at[idx, 'start'])
    for i, (lo, hi) in enumerate(s_neg):
        # Bounds comparison instead of membership in a materialized range.
        if lo <= concept_start < hi:
            df.at[idx, 'sent_id'] = i + 1  # sent_id from 1
            # NOTE(review): `lo` is the offset of the first of the four
            # newlines before the sentence, so sent_loc includes that prefix
            # -- confirm this is the intended origin.
            df.at[idx, 'sent_loc'] = concept_start - lo + 1  # sent_loc also start from 1

In [None]:
# Concepts inside a [NEGATED] sentence (df1) and all others (df0).  Copies are
# taken because df1 is modified by later cells; writing into a bare slice of
# `df` is unreliable.
df1 = df[df.sent_id != 0].copy()
df0 = df[df.sent_id == 0].copy()

## Syntactic parsing

In [None]:
class OpenNLP:
    """Thin client for a parser exposed through a JSON-RPC server."""

    def __init__(self, host='localhost', port=8080):
        """Create the proxy for the server at http://<host>:<port>."""
        self.server = ServerProxy("http://%s:%d" % (host, port))

    def parse(self, text):
        """Forward `text` to the server's `parse` method and return its reply."""
        reply = self.server.parse(text)
        return reply

# A separate name is used for the OpenNLP client so that `nlp` keeps referring
# to the CoreNLP client needed by the tokenization cells when they are re-run.
opennlp = OpenNLP()

# preparing sentence for parsing
# l  : negated sentences plus blank lines, in file order
# sl : negated sentences only
l = []
sl = []
with open(data_dir + 'tmp') as fr:
    for sent in fr:
        is_negated = sent.endswith('[NEGATED]\n')
        if is_negated or sent == '\n':
            l.append(sent)
        if is_negated:
            sl.append(sent)

# Loop invariants: terms ordered longest first, and the set of forward terms.
neg_items_longest_first = sorted(neg_list['ITEM'].tolist(), key=len, reverse=True)
forward_items = set(neg_list[neg_list['EN (SV) ACTION'] == 'forward']['ITEM'].tolist())

# remove before/after words!
# For each line keep only the part governed by the longest negation term it
# contains: from the term onwards for forward terms, up to and including the
# term otherwise.  Lines without any term yield ''.
ll = []
for ss in l:
    s = ''
    for nw in neg_items_longest_first:
        term_pos = ss.find(nw)
        if term_pos == -1:
            continue
        if nw in forward_items:
            s = ss[term_pos:]
        else:
            s = ss[:term_pos + len(nw)]
        break
    ll.append(s)


# Parse until the number of trees equals the number of negated sentences.  The
# retry is bounded so that a sentence the parser never handles cannot loop
# forever; later cells pair sl[i] with tree_list[i], so a mismatch is an error.
MAX_PARSE_ATTEMPTS = 10
tree_list = []
parse_attempts = 0
while len(sl) != len(tree_list):  # run until opennlp can parse with correct number of sentences. bug???
    if parse_attempts == MAX_PARSE_ATTEMPTS:
        raise RuntimeError('OpenNLP returned %d trees for %d negated sentences after %d attempts'
                           % (len(tree_list), len(sl), MAX_PARSE_ATTEMPTS))
    parse_attempts += 1
    # opennlp parsing the neg tree
    print('\n--- parse negated part of the sentence ---\n')
    tree_list = []
    with open(data_dir + 'tmp_neg_tree', 'w') as fw:
        for s in ll:
            t = opennlp.parse(s.replace('[NEGATED]', ''))
            if t != '':
                fw.write(t + '\n')
                tree_list.append(t)
    print(len(sl))
    print(len(tree_list))

In [None]:
# using stanford corenlp parsing too slow
import requests
def extract_subtree(text, tregex):
    """Run a Tregex pattern on `text` through the local CoreNLP /tregex endpoint.

    Args:
        text: text posted as the request body.
        tregex: Tregex pattern sent as the `pattern` query parameter.

    Returns:
        The `namedNodes` entry of match '0' of the first sentence, or '' when
        the reply has no sentences or no such match.
    """
    r = requests.post(url="http://localhost:9000/tregex",
                      data=text,
                      params={"pattern": tregex})
    js = r.json()
    sentences = js.get('sentences') or []
    if not sentences:
        # Previously an empty or missing 'sentences' list raised
        # IndexError/KeyError instead of reporting "no match".
        return ''
    first_sentence = sentences[0]
    if first_sentence and '0' in first_sentence and 'namedNodes' in first_sentence['0']:
        return first_sentence['0']['namedNodes']
    return ''


def extract_subtree_treefile(f, tregex):
    """Run tregex.sh with pattern `tregex` on tree file `f`.

    The script is started twice, once as is and once with the `-t` flag; both
    outputs are printed and the `-t` output is returned.
    """
    # NOTE(review): the pattern is interpolated into a shell command line as
    # is; presumably callers pass it already quoted -- verify.
    base_command = tregex_dir + 'tregex.sh ' + tregex + ' ' + f
    tree_process = subprocess.Popen(base_command, stdout=subprocess.PIPE, shell=True)
    text_process = subprocess.Popen(base_command + ' -t', stdout=subprocess.PIPE, shell=True)
    tree = tree_process.communicate()[0]
    output = text_process.communicate()[0]
    print(tree)
    print(output)
    return output


def tregex_tsurgeon(f, pos):
    """Apply the Tregex/Tsurgeon rule `trts[pos]` to the tree file `f`.

    The rule is written to the `ts` script file inside `tregex_dir` (pattern,
    blank line, then one operation per line) and tsurgeon.sh is run twice:
    once as is and once with `-s`.

    Args:
        f: path of the tree file, relative to the notebook directory.
        pos: key into `trts`.

    Returns:
        (ts_out, tree): the `-s` output with bracket labels and newlines
        removed and -LRB-/-RRB- turned back into parentheses, and the raw
        output of the run without `-s`.
    """
    pattern, operations = trts[pos]
    cmd = pattern + '\n\n' + operations.replace(',', '\n')
    # Use tregex_dir rather than a second hard-coded copy of the same path.
    with open(os.path.join(tregex_dir, 'ts'), 'w') as fw:
        fw.write(cmd)

    base_command = 'cd ' + tregex_dir + '; ./tsurgeon.sh -treeFile ../' + f + ' ts'
    # universal_newlines=True yields text output on Python 3 as well, which
    # the string operations below (and callers testing `'SBAR' in tree`) need.
    t = subprocess.Popen(base_command + '; cd ..', stdout=subprocess.PIPE,
                         shell=True, universal_newlines=True)
    p = subprocess.Popen(base_command + ' -s; cd ..', stdout=subprocess.PIPE,
                         shell=True, universal_newlines=True)
    tree, _ = t.communicate()
    output, _ = p.communicate()
    print('constituency tree: ' + output.replace('\n', ''))

    # Drop '(LABEL ' openers, closing parentheses and newlines (raw string; the
    # duplicated '\)' alternative was removed).
    ts_out = re.sub(r'\([A-Z]*\$? |\(-[A-Z]+- |\)|\(, |\(. |\n', '', output)
    ts_out = ts_out.replace('-LRB-', '(').replace('-RRB-', ')')
    return ts_out, tree

In [None]:
# Negation type -> (Tregex pattern, Tsurgeon operations).  The operations are
# comma-separated here and written one per line by tregex_tsurgeon().
trts = {}
# no "jvd|murmurs|deficits" not work, pleural -> vbz?
# trts['NP'] = ('NP=target << DT=neg <<, /no|without/ !> NP >> TOP=t >> S=s',
#               'excise s target,delete neg')

# if np with top node=S???
# trts['NP'] = ('NP=target << DT=neg <<, /no|without/ !> NP >> TOP=t >> S=s',
#               'excise s target,delete neg')
trts['NP'] = ('NP=target << DT=neg <<, /no|without/ !> NP >> TOP=t',
              'delete neg')
# if np with top node=NP
trts['NP-nS'] = ('NP=target <<, /DT|NN|RB/=neg <<, /no|without/ !> NP >> TOP=t',
                 'delete neg')


# denies -> mis pos to nns
trts['NP-denies'] = ('NP=target <<, /denies|deny|denied/=neg >> TOP=t',
                     'delete neg')

# vp only
trts['VP-A'] = ('VP=target << /VBZ|VBD|VB/=neg >> TOP=t',
                'delete neg')
trts['VP-CC'] = ('VP=target <<, /VBZ|VBD|VB/=neg < CC >> TOP=t',
                 'delete neg')
# vp only, 'resolved', add that neg1 part to prevent jvd -> VP, rashes -> VP error pos tagging
# trts['VP-P'] = ('NP=target <<, DT=neg1 <<, /no|negative|not/ $ VP=neg2 >> TOP=t >> S=s',
#               'delete neg1')
trts['VP-P'] = ('VP=vp <<- /free|negative|absent|ruled|out|doubtful|unlikely|excluded|resolved|given/=neg $ NP=head >> TOP=t >> S=s',
                'excise s head')
# this is post, ... is negative
# trts['ADJP-P'] = ('VP=vp < ADJP <<- /negative/=neg $ NP=target >> TOP=t >> S=s',
#                 'delete vp,excise s target')
# (currently the same rule as 'VP-P')
trts['ADJP-P'] = ('VP=vp <<- /free|negative|absent|ruled|out|doubtful|unlikely|excluded|resolved|given/=neg $ NP=head >> TOP=t >> S=s',
                  'excise s head')
# this is ant, negative for ...
# trts['ADJP-A'] = ('PP=head $ JJ=neg < NP=target >> TOP=t > ADJP=s',
#                 'delete neg')
trts['ADJP-A'] = ('PP=head $ /JJ|ADJP|NP/=neg <- NP=target >> TOP=t >> /S|NP/=s',
                  'excise s target')
# not
# trts['ADVP-P'] = ('VP=target <<, /VB*|MD/ $ RB=neg >> TOP=t >> S=s',
#                 'excise s target')
trts['ADVP-P'] = ('VP=head $ RB=neg <<, /VB*|MD/=be >> TOP=t >> S=s',
                  'delete head,delete neg')

# trts['ADVP-A'] = ('VP=target <<, /VB*|MD/ $ RB=neg >> TOP=t >> S=s',
#                 'excise s target')
trts['ADVP-A'] = ('VP=head $ RB=neg <<, /VB*|MD/ >> TOP=t >> S=s',
                  'excise s head')
trts['ADVP-A2'] = ('VP=head << RB=neg <<, /VB*|MD/ << /ADJP|VP/=target >> TOP=t >> S=s',
                   'excise s target')
# remove sbar
trts['ADVP-sbar'] = ('PP=head <<, /of|without/=neg > NP $ NP < NP=target >> TOP=t >> NP=st << SBAR=sbar',
                     'excise st target,delete sbar')
trts['ADVP-advp'] = ('ADVP=advp',
                     'delete advp')
trts['forced-sbar'] = ('SBAR=sbar',
                       'delete sbar')

# remove RB
trts['ADVP-RB'] = ('TOP=target <<, RB=neg',
                   'delete neg')

# sob become this, so need to be after np and vp
# trts['PP'] = ('PP=head <<, /of|without/=neg > NP $ NP < NP=target >> TOP=t >> NP=s',
#               'excise s target')
trts['PP'] = ('PP=head <<, IN=neg1 < NP=target >> TOP=t >> /S|NP|ADJP/=s $ /JJ|NP/=neg2',
              'excise s target')
trts['PP-2'] = ('PP=head << IN=neg <<, /of|without/ >> TOP=t',
                'delete neg')

# Raw strings: these two patterns contain backslashes ('\.' and '\,'), which
# are invalid escape sequences in ordinary string literals.
trts['NP-CC'] = (r'S=s < NP =head<< PP=target << DT=neg <<, /no|without/ < CC=but << but < S=rm < /\.|\,/=punct << SBAR=sbar !> NP > TOP=t',
                 'delete neg,delete sbar,delete punct,delete but,delete rm')
trts['NP-although'] = (r'S=s < NP =head<< PP=target << DT=neg <<, /no|without/ << /although|but/ < /\.|\,/=punct << SBAR=sbar !> NP > TOP=t',
                       'delete neg,delete sbar,delete punct')

In [None]:
from nltk.corpus import stopwords
# English stopword list; note that this rebinds the imported `stopwords` name
# from the corpus reader to the list it returns.
stopwords = stopwords.words('english')
# Tokens that trigger dropping the first token of the tsurgeon output when
# they appear in it (see the '--- remove POS' step of the main loop).
RM_POS = ['NN', 'NNS', 'RB', 'NP', 'ADVP', 'IN']
# Tokens at which the tsurgeon output is cut off (see the '--- remove CP' step).
RM_CP = ['however', 'although', 'but']

In [None]:
from difflib import SequenceMatcher

# Removes '(LABEL ' openers and closing parentheses from a bracketed parse tree.
_TREE_MARKUP_RE = re.compile(r'\([A-Z]*\$? |\(-[A-Z]+- |\)|\(, |\(. ')

# Loop invariants, computed once instead of on every sentence.
neg_items = neg_list['ITEM'].tolist()
neg_items_longest_first = sorted(neg_items, key=len, reverse=True)
first_token_blocklist = neg_items + stopwords


def _shares_edge(matched_string, s):
    """True when the match shares its first two or its last two characters with `s`.

    The index expressions are kept exactly as before; they may raise
    IndexError for very short strings, which the caller relies on.
    """
    return ((matched_string[0] == s[0] and matched_string[1] == s[1]) or
            (matched_string[len(matched_string)-1] == s[len(s)-1] and
             matched_string[len(matched_string)-2] == s[len(s)-2]))


def _matched_term(neg_item, match):
    """Return the part of `neg_item` covered by `match`, without one trailing space."""
    term = neg_item[match.b: match.b + match.size]
    if term.endswith(' '):
        term = term[:-1]
    return term


for i, t in enumerate(tree_list):
    print('sent: ' + str(i))
    print('original: ' + sl[i])

    # get negated part of the sentence
    with open(data_dir + 'ntree_tmp', 'w') as fw:
        fw.write(t)
    s = _TREE_MARKUP_RE.sub('', t)
    print('neg part: ' + s)

    # find what neg term is matched and use its neg type
    try:
        m = ''
        # Reset per sentence: previously a sentence without any match silently
        # reused the term found for the preceding sentence.
        matched_neg_item = None
        # `neg_item` (not `neg`) so the module-level `neg` array is not overwritten.
        for neg_item in neg_items_longest_first:
            match = SequenceMatcher(None, s, neg_item).find_longest_match(0, len(s), 0, len(neg_item))
            matched_string = s[match.a: match.a + match.size]
            try:  # if next char might be different, means partial match
                # NOTE(review): the comparison looks at offsets +1 and +2 past
                # the match, not +0 and +1 -- confirm this is intended.
                if s[match.a + match.size + 1] == neg_item[match.b + match.size + 1] and \
                   s[match.a + match.size + 2] == neg_item[match.b + match.size + 2]:
                    # either match from the beginning or last
                    if len(matched_string) > len(m) and _shares_edge(matched_string, s):
                        m = matched_string
                        matched_neg_item = _matched_term(neg_item, match)
                else:
                    continue
            except IndexError:  # if no next char, means full match
                try:
                    if len(matched_string) > len(m) and _shares_edge(matched_string, s):
                        m = matched_string
                        matched_neg_item = _matched_term(neg_item, match)
                except IndexError:  # match only one char!? rare case
                    if len(matched_string) > len(m) and matched_string[0] == s[0]:
                        m = matched_string
                        matched_neg_item = _matched_term(neg_item, match)

        if matched_neg_item is None:
            print('--- no negation term matched, sentence skipped')
            continue
        print('negated term: ' + matched_neg_item)

        neg_type = neg_list[neg_list.ITEM == matched_neg_item]['TYPE'].values[0]
        print('--- tregex/tsurgeon with negated type: ' + neg_type)

        # run tregex/tsurgeon based on the selected neg type
        ts_out, tree = tregex_tsurgeon(data_dir + 'ntree_tmp', neg_type)

        # deal with corner cases
        if neg_type == 'NP' and ('that' in ts_out):
            print('--- NP with that')
            ts_out, tree = tregex_tsurgeon(data_dir + 'ntree_tmp', 'NP-denies')
        if neg_type == 'NP' and s == ts_out:
            print('--- NP without S node')
            ts_out, tree = tregex_tsurgeon(data_dir + 'ntree_tmp', 'NP-nS')

        if neg_type == 'PP' and any(item in neg_items for item in ts_out.split()):
            print('--- NP without S node')
            ts_out, tree = tregex_tsurgeon(data_dir + 'ntree_tmp', 'NP-nS')

        if neg_type == 'VP-A' and s == ts_out:
            print('--- VP-A remove denies')
            ts_out, tree = tregex_tsurgeon(data_dir + 'ntree_tmp', 'NP-denies')

        # ADVP-A fallbacks: each is tried only while the output is still unchanged.
        if neg_type == 'ADVP-A' and s == ts_out:
            print('--- ADVP-A type 2')
            ts_out, tree = tregex_tsurgeon(data_dir + 'ntree_tmp', 'ADVP-A2')
        if neg_type == 'ADVP-A' and s == ts_out:
            print('--- ADVP-A remove SBAR')
            ts_out, tree = tregex_tsurgeon(data_dir + 'ntree_tmp', 'ADVP-sbar')
        if neg_type == 'ADVP-A' and s == ts_out:  # no longer
            print('--- ADVP-A remove ADVP')
            ts_out, tree = tregex_tsurgeon(data_dir + 'ntree_tmp', 'ADVP-advp')
        if neg_type == 'ADVP-A' and s == ts_out:
            print('--- ADVP-A remove RB')
            ts_out, tree = tregex_tsurgeon(data_dir + 'ntree_tmp', 'ADVP-RB')

        if 'SBAR' in tree:
            print('--- forced remove SBAR')
            ts_out, tree = tregex_tsurgeon(data_dir + 'ntree_tmp', 'forced-sbar')

        if any(item in RM_POS for item in ts_out.split()):
            print('--- remove POS')
            ts_out = ' '.join(ts_out.split()[1:])

        if any(item in RM_CP for item in ts_out.split()):
            print('--- remove CP')
            tokens = ts_out.split()
            # NOTE(review): when several RM_CP words occur, the cut position is
            # that of the last one in RM_CP order (unchanged behavior), not
            # necessarily the earliest in the sentence -- confirm.
            for cp in RM_CP:
                if cp in tokens:
                    cp_loc = tokens.index(cp)
            ts_out = ' '.join(tokens[:cp_loc])

        if ts_out.split()[0] in first_token_blocklist:
            print('--- remove first token f if f in negated list or stopword list')
            ts_out = ' '.join(ts_out.split()[1:])

        print('>> ' + ts_out + '\n')

        # negated place: 1-based character range of ts_out inside the sentence
        found_at = sl[i].find(ts_out)
        if found_at != -1:
            neg_range = (found_at + 1, found_at + len(ts_out))
        else:
            # Fall back to the whole sentence.  The previous fallback used
            # len(sl), the number of negated sentences, as the upper bound.
            neg_range = (0, len(sl[i]))

        print(neg_range)

        # Flag every concept of this sentence located inside the negated range;
        # .loc writes into df1 itself (no chained indexing).
        in_negated_range = ((df1['sent_id'] == i + 1) &
                            (df1['sent_loc'] >= neg_range[0]) &
                            (df1['sent_loc'] < neg_range[1]))
        df1.loc[in_negated_range, 'negation'] = 1

    except Exception as exc:  # need to debug why very few cases don't work
        # Still best-effort, but the skipped sentence and its cause are now reported.
        print('--- sentence ' + str(i) + ' skipped: ' + repr(exc))
        continue

In [None]:
# preserve the longest strings/concepts
# Work on a copy: `df_s = df1` only aliased the frame, so the column changes
# below were also applied to df1.
df_s = df1.copy()
df_s['start'] = df_s['start'].astype(int)
df_s['len'] = df_s['original'].str.len()
# Longest strings first, so that drop_duplicates(keep='first') retains the
# longest concept among those sharing a start (then an end) within a sentence;
# finally restore document order.
df_s = (df_s.sort_values('len', ascending=False)
            .drop_duplicates(['sent_id', 'start'], keep='first')
            .drop_duplicates(['sent_id', 'end'], keep='first')
            .sort_values('start', ascending=True))
df_s

In [None]:
# Remove the row limit when DataFrames are displayed.
pd.set_option('display.max_rows', None)
# Keep only rows with a sentence id (sent_id != 0) and a non-empty section label.
df_ss = df_s[(df_s.sent_id != 0) & (df_s.section != '')]

def print_out_result(df):
    """Print, per section, the preferred concept names with a polarity suffix.

    Args:
        df: DataFrame with 'section', 'preferred' and 'negation' columns.

    For every non-empty section (in sorted order, so the output is
    deterministic) one header line '--- <section> ---' is printed, followed by
    a comma-separated line in which each preferred name carries '(-)' when
    its negation value is 1 and '(+)' otherwise.
    """
    for s in sorted(set(df['section'].values)):
        if s != '':
            subset = df[df['section'] == s][['preferred', 'negation']]
            # Build the labels as a separate array instead of assigning into a
            # slice of `df`.
            labels = np.where(subset['negation'] == 1,
                              subset['preferred'] + '(-)',
                              subset['preferred'] + '(+)')
            # print() call: the former print statement is a syntax error on Python 3.
            print('--- ' + s + ' ---\n' + ', '.join(labels))
    
# Print the per-section summary for the filtered concepts.
print_out_result(df_ss)