# Create dataset with POS TAGs

In [1]:
import numpy as np
import pandas as pd
import os
import gc
import re

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

import spacy
from spacy import displacy

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline  

Using TensorFlow backend.


In [2]:
nlp = spacy.load('en')

In [3]:
# file PubMed_20k_RCT.csv created by script01_create_single_dataset
df_all = pd.read_csv('input/PubMed_20k_RCT.csv')
df_train = df_all[df_all['partition']=='train']
df_valid = df_all[df_all['partition']=='dev']
df_test = df_all[df_all['partition']=='test']
pd.set_option('max_colwidth',500)
df_all.head()

Unnamed: 0,partition,abstract_id,seq,text,label
0,train,4293578,0,"To investigate the efficacy of 6 weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at 12 weeks in older adults with moderate to severe knee osteoarthritis ( OA ) .",OBJECTIVE
1,train,4293578,1,A total of 125 patients with primary knee OA were randomized 1:1 ; 63 received 7.5 mg/day of prednisolone and 62 received placebo for 6 weeks .,METHODS
2,train,4293578,2,Outcome measures included pain reduction and improvement in function scores and systemic inflammation markers .,METHODS
3,train,4293578,3,Pain was assessed using the visual analog pain scale ( 0-100 mm ) .,METHODS
4,train,4293578,4,"Secondary outcome measures included the Western Ontario and McMaster Universities Osteoarthritis Index scores , patient global assessment ( PGA ) of the severity of knee OA , and 6-min walk distance ( 6MWD ) .",METHODS


In [4]:
X_train_cnt = df_train.shape[0]
X_valid_cnt = df_valid.shape[0]
X_test_cnt = df_test.shape[0]

X_all = df_all.text.values

print('Train partition size: {}'.format(X_train_cnt))
print('Valid partition size: {}'.format(X_valid_cnt))
print('Test partition size: {}'.format(X_test_cnt))
print('Total dataset size: {}'.format(X_all.shape[0]))

Train partition size: 180040
Valid partition size: 30212
Test partition size: 30135
Total dataset size: 240387


## Vectorization of pos tag NLP features

In [16]:
%%time

nlp = spacy.load('en')

i = 1

# get pos tag sequence
def get_postags(txt):
    global i
    try:
        doc = nlp(txt)
        postaglist = ' '.join([(token.pos_) for token in doc])
        #print(doc)
        #print(postaglist)
    except:
        i += 1
        postaglist = '*'
    return postaglist

#df_all_head = df_all.head()
#df_all_head['postaglist'] = df_all_head['text'].apply(lambda x: get_postags(x))

df_all['postaglist'] = df_all['text'].apply(lambda x: get_postags(x))
print('{} rows could not be parsed'.format(i))

1 rows could not be parsed
CPU times: user 2h 48min 27s, sys: 3min 36s, total: 2h 52min 3s
Wall time: 43min 45s


In [17]:
df_all.to_csv('input/PubMed_20k_RCT_POS_TAG.csv', index=False)