# Create dataset with sentences constituent parse trees

Note: Notebook requires coreNLP server is installed and running...

In [1]:
import numpy as np
import pandas as pd
import os
import gc
import re

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

import matplotlib.pyplot as plt
%matplotlib inline  

Using TensorFlow backend.


In [2]:
# we will use the python CoreNLP wrapper py-coreNLP, per recommendation in Stanford CoreNLP official pages
# https://github.com/smilli/py-corenlp

from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')

In [3]:
# file PubMed_20k_RCT.csv created by script01_create_single_dataset
df_all = pd.read_csv('input/PubMed_20k_RCT.csv')
df_train = df_all[df_all['partition']=='train']
df_valid = df_all[df_all['partition']=='dev']
df_test = df_all[df_all['partition']=='test']
pd.set_option('max_colwidth',500)
df_all.head()

Unnamed: 0,partition,abstract_id,seq,text,label
0,train,4293578,0,"To investigate the efficacy of 6 weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at 12 weeks in older adults with moderate to severe knee osteoarthritis ( OA ) .",OBJECTIVE
1,train,4293578,1,A total of 125 patients with primary knee OA were randomized 1:1 ; 63 received 7.5 mg/day of prednisolone and 62 received placebo for 6 weeks .,METHODS
2,train,4293578,2,Outcome measures included pain reduction and improvement in function scores and systemic inflammation markers .,METHODS
3,train,4293578,3,Pain was assessed using the visual analog pain scale ( 0-100 mm ) .,METHODS
4,train,4293578,4,"Secondary outcome measures included the Western Ontario and McMaster Universities Osteoarthritis Index scores , patient global assessment ( PGA ) of the severity of knee OA , and 6-min walk distance ( 6MWD ) .",METHODS


In [4]:
X_train_cnt = df_train.shape[0]
X_valid_cnt = df_valid.shape[0]
X_test_cnt = df_test.shape[0]

X_all = df_all.text.values

print('Train partition size: {}'.format(X_train_cnt))
print('Valid partition size: {}'.format(X_valid_cnt))
print('Test partition size: {}'.format(X_test_cnt))
print('Total dataset size: {}'.format(X_all.shape[0]))

Train partition size: 180040
Valid partition size: 30212
Test partition size: 30135
Total dataset size: 240387


## Vectorization of constituent parse tree

In [5]:
%%time

i = 1

# get dependency parse in bracketted format
def get_deptree(txt):
    global i
    try:
        dep_parse = nlp.annotate(txt, properties={'annotators': 'parse', 'outputFormat': 'json'})
        dep_parse = dep_parse['sentences'][0]['parse'].replace('\n','')
        dep_parse = re.sub(r'\s+',r' ',dep_parse)
        dep_parse = re.sub(r'(\(.*?)\s[\w\-\.\,\;\:]*?\)',r'\1)', dep_parse)
        dep_parse = re.sub(r'\(', r'( ', dep_parse)
        dep_parse = re.sub(r'\)', r' )', dep_parse)
    except:
        i += 1
        dep_parse = '( ROOT )'
    return dep_parse

df_all['deptree'] = df_all['text'].apply(lambda x: get_deptree(x))
print('{} rows could not be parsed'.format(i))

1323 rows could not be parsed
CPU times: user 21min 44s, sys: 3min 32s, total: 25min 17s
Wall time: 23h 30min 26s


In [67]:
def check_voc():

    s = ' '.join(df_all.deptree2.values)
    voc = {token:1 for token in s.split()}
    s = [token for token in s.split()]
    VOC_SIZE = len(voc)
    print('Vocabulary size = {}'.format(VOC_SIZE))
    return voc, s

In [101]:
%%time

def get_deptree2(txt):
    #txt = re.sub(r'\s([\w\-]+[\/\+\-\@])+[\w\-]+\s\)',r'',txt)
    txt = re.sub(r'\-([A-Z]{3})\-', r'\1', txt)
    txt = re.sub(r'\+\d+\s\)', r'', txt)
    txt = re.sub(r'\s([\w\d\-\+\?\=]+[\/\+\-\@\&])+[\w\d\-]*\s\)',r'',txt)
    txt = re.sub(r'\shttp\:.+?\s\)', r'', txt)
    txt = re.sub(r'\sgov\/.+?\s\)', r'', txt)
    txt = re.sub(r'\scom\/.+?\s\)', r'', txt)
    #txt = re.sub(r'\sX\s\)', r'', txt)
    txt = re.sub(r'\s\d+\s\)', r'', txt)
    txt = re.sub(r'\s\+\d+[\.\,]\d+\s\)', r'', txt)
    txt = re.sub(r'\sn\'t\s\)', r'', txt)
    txt = re.sub(r'\s[\+\*\%\<\>\~\|\/\:\!\=\&\#\$\@]\s\)', r'', txt)
    txt = re.sub(r'\s[Hh]ttps\:.+?\s\)', r'', txt)
    txt = re.sub(r'\s[Hh]ttps\:.+?\s\)', r'', txt)
    return txt

df_all['deptree2'] = df_all['deptree'].apply(lambda x: get_deptree2(x))
voc, s = check_voc()

Vocabulary size = 194
CPU times: user 25.1 s, sys: 951 ms, total: 26 s
Wall time: 26.1 s


In [106]:
from collections import Counter
x = Counter(s)
print(x.most_common(100))
voc

[('(', 11328176), (')', 11126934), ('NP', 2503916), ('NN', 1466331), ('PP', 729243), ('IN', 676206), ('VP', 665350), ('JJ', 617825), ('S', 429279), ('NNS', 417422), ('DT', 386647), ('CD', 381243), (',', 250487), ('CC', 248855), ('ROOT', 240387), ('.', 238254), ('VBD', 212804), ('VBN', 188630), ('RRB', 173765), ('LRB', 172978), ('PRN', 158895), ('ADJP', 146848), ('NNP', 122749), ('RB', 113344), ('TO', 99911), ('VB', 81203), ('ADVP', 76275), ('VBG', 69221), ('SBAR', 61444), ('JJR', 48684), ('VBZ', 43157), ('NP-TMP', 42126), ('VBP', 25812), ('QP', 23114), ('MD', 21642), ('WHNP', 18779), ('PRP', 18298), ('PRP$', 9592), ('WDT', 9424), ('WP', 8660), ('EX', 8469), ('RBR', 7284), ('POS', 6029), ('JJS', 4979), ('WRB', 4843), ('WHADVP', 4772), ('FRAG', 4435), ('FW', 4332), ('UCP', 3995), ("''", 3721), ('LS', 3697), ('``', 3564), ("'", 3495), ("'s", 3472), ('CONJP', 2681), ('LST', 2591), (':', 2532), ('NNPS', 2074), ('PRT', 2061), ('RBS', 1992), ('RP', 1841), ('SINV', 1655), ('WHPP', 1526), ('NX'

{'#': 1,
 '#CIF': 1,
 '#ISRCTN': 1,
 '#KCT': 1,
 '#NCT': 1,
 '#NTR': 1,
 '#RECF': 1,
 '$': 1,
 "'": 1,
 "''": 1,
 "'Em": 1,
 "'M": 1,
 "'S": 1,
 "'d": 1,
 "'s": 1,
 '(': 1,
 ')': 1,
 '+.58': 1,
 '+0.010.07': 1,
 '+0.020.29': 1,
 '+0.020.45': 1,
 '+0.050.28': 1,
 '+0.070.29': 1,
 '+0.100.04': 1,
 '+0.11.0': 1,
 '+0.110.20': 1,
 '+0.110.30': 1,
 '+0.140.43': 1,
 '+0.300.45': 1,
 '+0.35.8': 1,
 '+0.472.18': 1,
 '+0.853.7': 1,
 '+20.6829.30': 1,
 '+29.774.1': 1,
 '+36,800.00': 1,
 '+5.343.4': 1,
 '+5.52.0': 1,
 '+5.864.30': 1,
 '+8.440.1': 1,
 ',': 1,
 '.': 1,
 '1': 1,
 '10': 1,
 '11': 1,
 '112': 1,
 '116': 1,
 "12-o'clock": 1,
 '133': 1,
 '16': 1,
 '184': 1,
 '23': 1,
 '24': 1,
 '26': 1,
 '308': 1,
 '31': 1,
 '315': 1,
 '33': 1,
 '34': 1,
 '35': 1,
 '37': 1,
 '41': 1,
 '6': 1,
 "6-o'clock": 1,
 ':': 1,
 ':[': 1,
 '</formula>': 1,
 '<and>': 1,
 '<euro>': 1,
 '<formula>': 1,
 '<span': 1,
 '<span>': 1,
 '>>': 1,
 '?': 1,
 'A$': 1,
 'ADJP': 1,
 'ADVP': 1,
 'AU$': 1,
 'AUD$': 1,
 "B'More": 1,


In [103]:
df_all.to_csv('input/PubMed_20k_RCT_CONSTPARSE.csv', index=False)

In [82]:
df_all.deptree2.values

array([ '( ROOT ( S ( S ( VP ( TO ) ( VP ( VB ) ( NP ( NP ( DT ) ( NN ) ) ( PP ( IN ) ( NP ( NP ( CD ) ( NNS ) ) ( PP ( IN ) ( NP ( JJ ) ( JJ ) ( JJ ) ( NN ) ) ) ) ) ) ( PP ( IN ) ( S ( VP ( VBG ) ( NP ( NP ( NN ) ) ( , ) ( NP ( NN ) ) ( , ) ( CC ) ( NP ( NP ( JJ ) ( JJ ) ( NN ) ) ( PP ( IN ) ( NP ( DT ) ( JJ ) ( NN ) ) ) ) ) ) ) ) ) ) ) ( CC ) ( S ( NP ( IN ) ( DT ) ( NN ) ) ( VP ( MD ) ( VP ( VB ) ( VP ( VBN ) ( PP ( IN ) ( NP ( NP ( CD ) ( NNS ) ) ( PP ( IN ) ( NP ( NP ( JJR ) ( NNS ) ) ( PP ( IN ) ( NP ( NP ( ADJP ( JJ ) ( PP ( TO ) ( NP ( JJ ) ( NN ) ) ) ) ( NN ) ) ( PRN ( ( NP ( NN ) ) ( ) ) ) ) ) ) ) ) ) ) ) ( . ) ) )',
       '( ROOT ( S ( S ( NP ( NP ( DT ) ( NN ) ) ( PP ( IN ) ( NP ( NP ( CD ) ( NNS ) ) ( PP ( IN ) ( NP ( JJ ) ( NN ) ( NN ) ) ) ) ) ) ( VP ( VBD ) ( VP ( VBN ) ( NP ( CD ) ) ) ) ) ( ( S ( NP ( CD ) ) ( VP ( VBD ) ( SBAR ( S ( NP ( NP ( CD ) ( NN ) ( PP ( IN ) ( NP ( NP ( NN ) ) ( CC ) ( NP ( CD ) ) ) ) ) ( VP ( VBD ) ( NP ( NN ) ) ( PP ( IN ) ( NP ( CD ) ( NNS 