# Read data and create single dataset

In [1]:
import numpy as np
import pandas as pd
import gc

In [2]:
def get_dataset(filename, partition):
    file = open(filename, 'r')
    dataset = list()
    for line in file:
        if line.find("###") == 0:
            abstract_id = line[4:-1]
            seq = 0
        elif line.strip():
            tab_loc = line.find("\t")
            if partition != 'test':
                label = line[:tab_loc]
            else:
                label = '?'
            text  = line[tab_loc+1:-1]
            dataset.append([partition, abstract_id, seq, text, label])
            seq += 1
        else:
            # finished reading abstract
            pass
    return pd.DataFrame(np.array(dataset), columns=['partition', 'abstract_id', 'seq', 'text', 'label'])

In [3]:
%%time

df_train = get_dataset('input/train.txt', 'train')
df_dev = get_dataset('input/dev.txt', 'dev')
df_test = get_dataset('input/test.txt', 'test')
df = pd.concat((df_train, df_dev, df_test), axis=0)

del df_train, df_dev, df_test
gc.collect()

CPU times: user 5.53 s, sys: 6.42 s, total: 11.9 s
Wall time: 13.1 s


In [4]:
df.to_csv('input/PubMed_20k_RCT.csv', index=None)
df = pd.read_csv('input/PubMed_20k_RCT.csv')

## Make sure database stats reflect expectations

In [5]:
def print_stats(df, partition):
    dataset = df[df['partition']==partition]
    print('Partition: {}'.format(partition))
    abs_count = len(set(df[df['partition']==partition].abstract_id.values))
    print('\tNumber of abstract: {}'.format(abs_count))
    print('\tNumber of sentences: {}'.format(dataset.abstract_id.count()))
    
    
print_stats(df, 'train')
print_stats(df, 'dev')
print_stats(df, 'test')

Partition: train
	Number of abstract: 15000
	Number of sentences: 180040
Partition: dev
	Number of abstract: 2500
	Number of sentences: 30212
Partition: test
	Number of abstract: 2500
	Number of sentences: 30135


In [6]:
pd.set_option('max_colwidth',500)
df.head(50)

Unnamed: 0,partition,abstract_id,seq,text,label
0,train,4293578,0,"To investigate the efficacy of 6 weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at 12 weeks in older adults with moderate to severe knee osteoarthritis ( OA ) .",OBJECTIVE
1,train,4293578,1,A total of 125 patients with primary knee OA were randomized 1:1 ; 63 received 7.5 mg/day of prednisolone and 62 received placebo for 6 weeks .,METHODS
2,train,4293578,2,Outcome measures included pain reduction and improvement in function scores and systemic inflammation markers .,METHODS
3,train,4293578,3,Pain was assessed using the visual analog pain scale ( 0-100 mm ) .,METHODS
4,train,4293578,4,"Secondary outcome measures included the Western Ontario and McMaster Universities Osteoarthritis Index scores , patient global assessment ( PGA ) of the severity of knee OA , and 6-min walk distance ( 6MWD ) .",METHODS
5,train,4293578,5,"Serum levels of interleukin 1 ( IL-1 ) , IL-6 , tumor necrosis factor ( TNF ) - , and high-sensitivity C-reactive protein ( hsCRP ) were measured .",METHODS
6,train,4293578,6,"There was a clinically relevant reduction in the intervention group compared to the placebo group for knee pain , physical function , PGA , and 6MWD at 6 weeks .",RESULTS
7,train,4293578,7,"The mean difference between treatment arms ( 95 % CI ) was 10.9 ( 4.8-18 .0 ) , p < 0.001 ; 9.5 ( 3.7-15 .4 ) , p < 0.05 ; 15.7 ( 5.3-26 .1 ) , p < 0.001 ; and 86.9 ( 29.8-144 .1 ) , p < 0.05 , respectively .",RESULTS
8,train,4293578,8,"Further , there was a clinically relevant reduction in the serum levels of IL-1 , IL-6 , TNF - , and hsCRP at 6 weeks in the intervention group when compared to the placebo group .",RESULTS
9,train,4293578,9,These differences remained significant at 12 weeks .,RESULTS
