In [1]:
import os
import pandas as pd
import numpy as np
import pickle

In [2]:
from os import listdir
from os.path import isfile, join

# Generate annotation dataframe from brat annotation

In [4]:
DEFAULT_OTHER_ANNO = 'O'
STANDOFF_ENTITY_PREFIX = 'T'
STANDOFF_RELATION_PREFIX = 'R'
DATA_DIRECTORY = 'data/example_abstract_and_ann'
ann_data_files = [f for f in listdir(DATA_DIRECTORY) if isfile(join(DATA_DIRECTORY, f)) and f.split('.')[1] == 'ann']

In [5]:
# Get one annotation dataframe
entities = []
relations = []
for file in ann_data_files:
    pmid = file.split('-')[1].split('.')[0]
    with open(join(DATA_DIRECTORY, file), 'r') as document_anno_file:

            lines = document_anno_file.readlines()
            for line in lines:
                standoff_line_0 = line.split('\t')
                standoff_line_1 = standoff_line_0[1].split()
                if standoff_line_0[0][0] == STANDOFF_ENTITY_PREFIX:
                    entity = {}
                    entity['pmid'] = pmid
                    entity['standoff_id'] = int(standoff_line_0[0][1:])
                    #standoff_line_1 = standoff_line_0[1].split()
                    entity['entity_type'] = standoff_line_1[0].capitalize()
                    entity['offset_start'] = int(standoff_line_1[1])
                    entity['offset_end'] = int(standoff_line_1[2])
                    entity['word'] = standoff_line_0[2].split('\n')[0]
                    entities.append(entity)

                elif standoff_line_0[0][0] == STANDOFF_RELATION_PREFIX:
                    relation = {}
                    relation['pmid'] = pmid
                    relation['standoff_id'] = int(standoff_line_0[0][1:])
                    relation['name'] = standoff_line_1[0]
                    relation['standoff_entity1_id'] = int(standoff_line_1[1].split(':')[1][1:])
                    relation['standoff_entity2_id'] = int(standoff_line_1[2].split(':')[1][1:])
                    relations.append(relation)

In [6]:
all_entity_df = pd.DataFrame(entities)
all_relation_df = pd.DataFrame(relations)

In [7]:
all_entity_df.head()

Unnamed: 0,pmid,standoff_id,entity_type,offset_start,offset_end,word
0,32673060,2,Year,389,393,2020
1,32673060,3,Population_info,539,574,"Symptomatic, nonhospitalized adults"
2,32673060,4,Population_info,580,630,laboratory-confirmed COVID-19 or probable COVI...
3,32673060,5,Population_info,635,684,high-risk exposure within 4 days of symptom onset
4,32673060,6,Total_sample_size,1080,1083,423


In [9]:
all_entity_df.to_csv('data/example_abstract_and_ann/all_ann_entity.csv')
#all_relation_df.to_csv('data/example_abstract_and_ann/all_ann_relation.csv')

In [32]:
# Dictionary version
all_entity = {}
all_relation = {}
for file in ann_data_files:
    entities = []
    relations = []
    pmid = file.split('-')[1].split('.')[0]
    with open(join(DATA_DIRECTORY, file), 'r') as document_anno_file:

            lines = document_anno_file.readlines()
            for line in lines:
                standoff_line_0 = line.split('\t')
                standoff_line_1 = standoff_line_0[1].split()
                if standoff_line_0[0][0] == STANDOFF_ENTITY_PREFIX:
                    entity = {}
                    #entity['pmid'] = pmid
                    entity['standoff_id'] = int(standoff_line_0[0][1:])
                    #standoff_line_1 = standoff_line_0[1].split()
                    entity['entity_type'] = standoff_line_1[0].capitalize()
                    entity['offset_start'] = int(standoff_line_1[1])
                    entity['offset_end'] = int(standoff_line_1[2])
                    entity['word'] = standoff_line_0[2].split('\n')[0]
                    entities.append(entity)

                elif standoff_line_0[0][0] == STANDOFF_RELATION_PREFIX:
                    relation = {}
                    relation['standoff_id'] = int(standoff_line_0[0][1:])
                    relation['name'] = standoff_line_1[0]
                    relation['standoff_entity1_id'] = int(standoff_line_1[1].split(':')[1][1:])
                    relation['standoff_entity2_id'] = int(standoff_line_1[2].split(':')[1][1:])
                    relations.append(relation)
    all_entity[pmid] = entities
    all_relation[pmid] = relations
    
with open('data/example_abstract_and_ann/all_ann_entity.pickle', 'wb') as handle:
    pickle.dump(all_entity, handle)

# Focus on sample size entities and convert numbers in text into integer

In [10]:
tt_ss_df = all_entity_df[all_entity_df['entity_type'] == 'Total_sample_size']

We provide two options for transforming the number into integers.

- word2number.w2n: a package to transform word into numbers.
- Our defined processer.

## Option 1: word2number.w2n

In [19]:
from word2number import w2n

In [20]:
def convert_ss_word_to_num(x):
    if x[0].isalpha():
        x_r = x.replace('\u2009',' ').replace('\u2008',' ').replace('\xa0',' ').replace('\u202f',' ')
    else:
        x_r = x.replace(' ','').replace(',','').replace('\u2009','').replace('\u2008','').replace('\xa0','').replace('\u202f','')
    
    try:
        num = int(w2n.word_to_num(x_r))
    except:
        num = None
    return num

In [21]:
tt_ss_df['tt_sample_size'] = tt_ss_df['word'].apply(convert_ss_word_to_num)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [22]:
tt_ss_df

Unnamed: 0,pmid,standoff_id,entity_type,offset_start,offset_end,word,tt_sample_size
4,32673060,6,Total_sample_size,1080,1083,423,423
18,33306283,11,Total_sample_size,665,669,1033,1033
22,32641343,1,Total_sample_size,1129,1132,308,308
31,33085857,4,Total_sample_size,1110,1113,243,243
57,33332779,33,Total_sample_size,911,914,389,389
65,33332778,1,Total_sample_size,1115,1118,275,275
73,33113295,2,Total_sample_size,526,529,452,452
87,32706953,23,Total_sample_size,1207,1210,504,504
91,33301246,3,Total_sample_size,939,945,43448,43448


In [23]:
# Save ann results and generate total sample size labeled results
all_pmid_list = []
for file in ann_data_files:
    pmid = file.split('-')[1].split('.')[0]
    all_pmid_list.append(pmid)
tt_ss_df_save = tt_ss_df[['pmid','tt_sample_size']].merge(pd.DataFrame({'pmid':all_pmid_list}), 
                                                          how='right', on='pmid')
tt_ss_df_save.to_csv('data/example_abstract_and_ann/ann_tt_sample_size.csv')

In [24]:
tt_ss_df_save

Unnamed: 0,pmid,tt_sample_size
0,32673060,423.0
1,33306283,1033.0
2,32641343,308.0
3,33085857,243.0
4,33332779,389.0
5,33332778,275.0
6,33113295,452.0
7,32706953,504.0
8,33301246,43448.0
9,32678530,


## Option 2: Designed number transformer. (the same one used in model training)

In [25]:
from utils import index_numbers

In [26]:
def sse_str_to_num(x):
    try:
        num = int(x)
    except:
        num = x
    return num

In [27]:
tt_ss_df['tt_sample_size2'] = tt_ss_df['word'].apply(index_numbers.NumberTagger().swap) #str
tt_ss_df['tt_sample_size2_num'] = tt_ss_df['tt_sample_size2'].apply(sse_str_to_num)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [28]:
tt_ss_df # the transformed number are the same

Unnamed: 0,pmid,standoff_id,entity_type,offset_start,offset_end,word,tt_sample_size,tt_sample_size2,tt_sample_size2_num
4,32673060,6,Total_sample_size,1080,1083,423,423,423,423
18,33306283,11,Total_sample_size,665,669,1033,1033,1033,1033
22,32641343,1,Total_sample_size,1129,1132,308,308,308,308
31,33085857,4,Total_sample_size,1110,1113,243,243,243,243
57,33332779,33,Total_sample_size,911,914,389,389,389,389
65,33332778,1,Total_sample_size,1115,1118,275,275,275,275
73,33113295,2,Total_sample_size,526,529,452,452,452,452
87,32706953,23,Total_sample_size,1207,1210,504,504,504,504
91,33301246,3,Total_sample_size,939,945,43448,43448,43448,43448


In [None]:
#tt_ss_df_save2 = tt_ss_df[['pmid','tt_sample_size2_num']].merge(pd.DataFrame({'pmid':all_pmid_list}), 
#                                                          how='right', on='pmid')
#tt_ss_df_save2.to_csv('data/example_abstract_and_ann/ann_tt_sample_size2.csv')

# Generate a similar annotated dictionary for loose-match level
- generate a dict, {pmid: {tt_ss: num, poss_tt_ss: [num]}}
- based on the dict generated above, perform loose-levle match lateer

In [35]:
# Example
all_entity['33306283'][0]

{'standoff_id': 7,
 'entity_type': 'Group_size',
 'offset_start': 751,
 'offset_end': 754,
 'word': '518'}

In [37]:
# generate the dictionary
tt_poss_dict = {}
for pmid in all_entity.keys():
    cur_item_ls = all_entity[pmid]
    cur_item_dict = {}
    tt_ss = None
    poss_ss_ls = []
    for ent in cur_item_ls:
        if ent['entity_type'] == 'Total_sample_size':
            tt_ss = ent['word']
            tt_ss = convert_ss_word_to_num(tt_ss)
        if ent['entity_type'] == 'Poss_total_sample':
            num = ent['word']
            num = convert_ss_word_to_num(num)
            poss_ss_ls.append(num)
    
    cur_item_dict['Total_sample_size'] = tt_ss
    if len(poss_ss_ls)>0:
        cur_item_dict['Poss_total_sample'] = poss_ss_ls
    
    tt_poss_dict[pmid] = cur_item_dict

In [38]:
tt_poss_dict['33306283']

{'Total_sample_size': 1033}

In [39]:
with open('data/example_abstract_and_ann/tt_poss_dict.pickle', 'wb') as handle:
    pickle.dump(tt_poss_dict, handle)

# Appendix: Download abstracts from PubMed with a given list of PMID

In [None]:
#from Bio import Entrez
#pmid_to_extract_list = your list of PMID to extract
#pmid_df = a df with columns: ["PMID", "Title"]
#handle = Entrez.efetch(db="pubmed", id=','.join(map(str, pmid_to_extract_list)),
#                       rettype="xml", retmode="text")
#records = Entrez.read(handle)

In [None]:
# Downloaded all the abstracts and generate txt file
#j = 0
#for i in range(len(pmid_to_extract_list)):
#    pmid_ = pmid_to_extract_list[i]
#    title_ = pmid_df[pmid_df['PMID']==pmid_]['Title'].item()
#    pubmed_article_ = records['PubmedArticle'][i]
#    if 'Abstract' in pubmed_article_['MedlineCitation']['Article'].keys():
#        str_list = pubmed_article_['MedlineCitation']['Article']['Abstract']['AbstractText']
#        result_str = 'TITLE: '+title_+'\n'
#        for k in range(len(str_list)):
#            str_item = str_list[k]
#            if len(str_item.attributes)>0:
#                section_str = str_item.attributes['Label']+': '
#                result_str = result_str + section_str + str_item + '\n'
#            else:
#                result_str = result_str + str_item +'\n'
        
#        j = j + 1
#        file_path = 'data/example_abstract_and_ann/'+str(j)+'-'+str(pmid_)+'.txt'
#
#        with open (file_path, 'w') as f:
#            f.write(result_str)