In [3]:
import pandas as pd

# Not Splitted Dataset Creation

In [42]:
def text_preprocess(text):
    text = text.replace('\t', ' ').replace('\n', ' ')
    text_filtered = ''
    for char_index, char in enumerate(text):
        if char_index < (len(text)-1):
            if not(text[char_index + 1] == ' ' and char == ' '):
                text_filtered += char
    return text_filtered.lower()

In [44]:
def get_text(filename, dataset_type):
    with open('../data/trainingdata_v3/' + dataset_type + '/' + filename + '.txt') as file:
        text = file.readlines()
    text = ''.join(text)
    text = text_preprocess(text)
    return text

In [47]:
def build_dataset(dataset_type):
    df = pd.read_csv('../data/trainingdata_v3/datasets/dataset_gpt2_' + dataset_type + '.tsv', sep='\t')
    df = df.sort_values(['filename', 'start'], ignore_index=True)
    df = df.groupby(['filename'], as_index = False).agg({'output' : '\n'.join })
    df['text'] = df.apply( lambda row: get_text(row['filename'], dataset_type), axis=1)
    return df

In [48]:
df_train = build_dataset('train')
df_dev = build_dataset('dev')

In [51]:
df_train.to_csv('../data/trainingdata_v3/datasets/train_dataset_endtoend.tsv', sep='\t')
df_dev.to_csv('../data/trainingdata_v3/datasets/dev_dataset_endtoend.tsv', sep='\t')

# Tokens Length Analysis

In [None]:
df_train = pd.read_csv('../data/trainingdata_v3/datasets/train_dataset_endtoend.tsv', sep='\t')
df_dev = pd.read_csv('../data/trainingdata_v3/datasets/dev_dataset_endtoend.tsv', sep='\t')

In [9]:
import plotly.express as px
from transformers import AutoTokenizer

In [6]:
model_name = 't5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
df_train['tokenizer_length_text'] = df_train.apply(lambda row: len(tokenizer.encode(row['text'])) , axis=1)
df_train['tokenizer_length_output'] = df_train.apply(lambda row: len(tokenizer.encode(row['output'])) , axis=1)

Token indices sequence length is longer than the specified maximum sequence length for this model (654 > 512). Running this sequence through the model will result in indexing errors


In [10]:
fig = px.histogram(df_train, x="tokenizer_length_text")
fig.show()

In [11]:
fig = px.histogram(df_train, x="tokenizer_length_output")
fig.show()

# Splitting Text Data Up to 150 tokens

In [42]:
import pandas as pd
from tqdm import tqdm
from os import walk

In [43]:
def build_context_dataset(folder_path):
  # I create a list with all the filenames .ann and .text
  filenames = next(walk(folder_path), (None, None, []))[2]
  filenames_ann = [filename for filename in filenames if 'ann' in filename]
  filenames_txt = [filename for filename in filenames if 'txt' in filename]
  filenames_ann.sort()
  filenames_txt.sort()
  filenames_ann_txt = list(zip(filenames_ann, filenames_txt))
  # print('filenames_ann_txt sample:', filenames_ann_txt[1])

  df_dataset = pd.DataFrame()
  for index, (filename_ann, filename_txt) in tqdm(enumerate(filenames_ann_txt)):
      df_ann = pd.read_csv(folder_path + filename_ann, sep='\t', names=['entity-event-context', 'classification-type', 'value'])
      
      # I load the clinical note of the corresponding txt file
      with open(folder_path + filename_txt, 'r') as file:
        text = ''.join(file.readlines())

      if len(df_ann) == 0: continue # how to handle empty annotation files?
      df_disp = df_ann[df_ann['entity-event-context'].str.startswith('T')].copy()
      # careful: taking only first start-stop when multiple separated with ';' are present. how to handle multiple locations?
      df_disp[['disposition-type','start','end']] = df_disp['classification-type'].apply(lambda x: x.split(';',1)[0].split(' ')).to_list()
      df_disp['start'] = df_disp['start'].astype(int)
      df_disp['end'] = df_disp['end'].astype(int)
      df_disp = df_disp.drop('classification-type',axis=1)

      # mapping E# to T# (some annotations have different numbers)
      df_map = df_ann[df_ann['entity-event-context'].str.startswith('E')].set_index('entity-event-context')['classification-type'].apply(lambda x: x.split(':')[-1].rstrip(' '))

      # Context data
      df_ctx = pd.DataFrame(df_ann[df_ann['entity-event-context'].str.startswith('A')]['classification-type'].str.split(' ').to_list(), columns=['Category','Event','Value'])
      df_ctx = df_ctx.pivot(index='Event',columns='Category',values='Value')
      df_ctx = df_ctx.join(df_map).reset_index()
      df_ctx = df_disp[df_disp['disposition-type']=='Disposition'].merge(df_ctx, how='left', left_on='entity-event-context', right_on='classification-type').reset_index()
      # df_ctx = df_disp[df_disp.merge(df_ctx, how='left', left_on='entity-event-context', right_on='classification-type').reset_index()
      # Text and meta data
      df_no_disposition = df_disp.loc[df_disp['disposition-type']!='Disposition'].copy()
      if len(df_ctx) > 0:
        df_ctx.loc[:, 'filename'] = filename_ann[:-4]

      if len(df_no_disposition) > 0:
        df_no_disposition.loc[:, 'filename'] = filename_ann[:-4]

      df_dataset = pd.concat([df_dataset,df_ctx, df_no_disposition], ignore_index=True)

  df_dataset['value'] = df_dataset.apply(lambda row: str(row['value']).lower(), axis=1)
  df_dataset.loc[df_dataset['disposition-type']=='Disposition', 'disposition-type'] = 'Change'
  df_dataset.loc[df_dataset['disposition-type']=='NoDisposition', 'disposition-type'] = 'NoChange'
  separator = ', '
  df_dataset['output'] = df_dataset.apply(
    lambda row: 'medication: ' + row['value'] + separator +\
      'disposition: ' + row['disposition-type'] if row['disposition-type'] != 'Change' else\
      'medication: ' + row['value'] + separator + 'disposition: ' + row['disposition-type'] + separator +\
      'action: ' + row['Action'] + separator + 'actor: ' + row['Actor'] + separator +\
      'certainty: ' + row['Certainty'] + separator + 'negation: ' + row['Negation'] + separator +\
      'temporality: ' + row['Temporality'] + '\n',
    axis=1
  )
  df_dataset = df_dataset.sort_values(['filename', 'start'], ignore_index=True)
  return df_dataset


In [44]:
df_train = build_context_dataset('../data/trainingdata_v3/train/')
df_dev = build_context_dataset('../data/trainingdata_v3/dev/')

350it [00:08, 39.32it/s]
50it [00:01, 40.06it/s]


In [235]:
df_train.to_csv('../data/trainingdata_v3/datasets/dataset_context_train.tsv', sep='\t')
df_dev.to_csv('../data/trainingdata_v3/datasets/dataset_context_dev.tsv', sep='\t')

In [172]:
df_train = pd.read_csv('../data/trainingdata_v3/datasets/dataset_context_train.tsv', sep='\t')
df_dev = pd.read_csv('../data/trainingdata_v3/datasets/dataset_context_dev.tsv', sep='\t')

In [45]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [46]:
def text_preprocess(text):
    text = text.replace('\t', ' ').replace('\n', ' ')
    text_filtered = ''
    for char_index, char in enumerate(text):
        if char_index < (len(text)-1):
            if not(text[char_index + 1] == ' ' and char == ' '):
                text_filtered += char
    return text_filtered.lower()

In [47]:
def build_context_splitted_dataset(folder_path):
    filenames = next(walk(folder_path), (None, None, []))[2]
    filenames_ann = [filename for filename in filenames if 'ann' in filename]
    filenames_txt = [filename for filename in filenames if 'txt' in filename]
    filenames_ann.sort()
    filenames_txt.sort()
    filenames_ann_txt = list(zip(filenames_ann, filenames_txt))
    # print('filenames_ann_txt sample:', filenames_ann_txt[1])

    df_dataset = pd.DataFrame()
    df_context = build_context_dataset(folder_path)
    for index, (filename_ann, filename_txt) in tqdm(enumerate(filenames_ann_txt)):
        with open(folder_path + filename_txt, 'r') as file:
            text = ''.join(file.readlines())
        
        text_ids = tokenizer.encode(text)
        text_slices = []
        start_slice_ids = 0
        start_slice_text = 0
        max_lenght = 150
        
        df_entities = df_context.loc[df_context['filename']==filename_txt[:-4]].copy()
        # print(df_entities[['value','output']])
        for end_slice in range(max_lenght, len(text_ids), max_lenght):
            df_row = pd.DataFrame({'filename': [filename_txt[:-4]]})

            text_slice = tokenizer.decode(text_ids[start_slice_ids:end_slice])
            text_slices.append(text_slice)
            df_row['text'] = text_preprocess(text_slice)
            # assert ((~((df_entities['start']>start_slice_text) & (df_entities['end']>=start_slice_text + len(text_slice))) == True).all().item()),\
            #     f'One or more entities were cutted during splitting, start_slice_text: {start_slice_text}, end_slice_text:'+\
            #     f'{start_slice_text + len(text_slice)}, entities: {df_entities[["start", "end", "value"]]}'
            output_slice = '\n'.join(df_entities[(df_entities['start']>start_slice_text) & (df_entities['end']<start_slice_text + len(text_slice))]['output'])
            df_row['output'] = output_slice
            start_slice_text += len(text_slice)
            start_slice_ids = end_slice
            df_dataset = pd.concat([df_dataset, df_row], ignore_index=True)

        df_row = pd.DataFrame({'filename': [filename_txt[:-4]]})
        text_slice = tokenizer.decode(text_ids[start_slice_ids:])
        text_slices.append(text_slice)
        df_row['text'] = text_preprocess(text_slice)
        output_slice = '\n'.join(df_entities[(df_entities['start']>start_slice_text) & (df_entities['end']<start_slice_text + len(text_slice))]['output'])
        df_row['output'] = output_slice
        df_dataset = pd.concat([df_dataset, df_row], ignore_index=True)
    return df_dataset

In [48]:
df_train = build_context_splitted_dataset('../data/trainingdata_v3/train/')
df_dev = build_context_splitted_dataset('../data/trainingdata_v3/dev/')

350it [00:07, 47.24it/s]
3it [00:00, 28.38it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1146 > 1024). Running this sequence through the model will result in indexing errors
350it [00:14, 24.30it/s]
50it [00:00, 51.88it/s]
50it [00:02, 21.56it/s]


In [57]:
df_train.to_csv('../data/trainingdata_v3/datasets/dataset_endtoend_splitted_train.tsv', sep='\t')
df_dev.to_csv('../data/trainingdata_v3/datasets/dataset_endtoend_splitted_dev.tsv', sep='\t')

# Token Length Analysis

In [None]:
print(df_train.loc[3175, 'output'])

In [241]:
df_train = pd.read_csv('../data/trainingdata_v3/datasets/dataset_endtoend_splitted_train.tsv', sep='\t')
df_dev = pd.read_csv('../data/trainingdata_v3/datasets/dataset_endtoend_splitted_dev.tsv', sep='\t')

In [55]:
df_dev['tokenizer_length_text'] = df_dev.apply(lambda row: len(tokenizer.encode(row['text'])) , axis=1)
df_dev['tokenizer_length_output'] = df_dev.apply(lambda row: len(tokenizer.encode(row['output'])) , axis=1)

In [56]:
df_dev.sort_values('tokenizer_length_output')

Unnamed: 0,filename,text,output,tokenizer_length_text,tokenizer_length_output
0,100-01,record date: 2106-02-12 campbell orthopedic a...,,48,0
379,315-02,"record date: 2122-04-23 patient name: valle,i...",,33,0
376,315-01,152/87 mm hg p: 88 bpm resp: 20 rpm sao2: 97%...,,100,0
372,315-01,record date: 2121-03-31 kern medical center p...,,114,0
371,302-04,she will listen to the advice that i gave her...,,71,0
...,...,...,...,...,...
469,350-02,agree with holding clopidogrel. if she does n...,"medication: clopidogrel, disposition: Change, ...",144,231
577,389-02,2 sats 88-92% chest pressure: follow for now. ...,"medication: asa, disposition: Undetermined\nme...",126,252
35,100-04,on integrilin x 24hrs prior. he was started o...,"medication: integrilin, disposition: Change, a...",128,254
101,131-02,s weight loss. 78-9 really not adequate contro...,"medication: cardura, disposition: Change, acti...",141,262


In [50]:
df_train['tokenizer_length_text'] = df_train.apply(lambda row: len(tokenizer.encode(row['text'])) , axis=1)
df_train['tokenizer_length_output'] = df_train.apply(lambda row: len(tokenizer.encode(row['output'])) , axis=1)

In [51]:
df_train.sort_values('tokenizer_length_output')

Unnamed: 0,filename,text,output,tokenizer_length_text,tokenizer_length_output
0,101-01,record date: 2079-05-12 mercy care center mer...,,122,0
2367,281-03,ur-spgr 1.010(t) 1.018 ua-bld negative negati...,,28,0
2368,281-03,.5 ua-prot negative trace ua-urobi negative ne...,,15,0
2369,281-03,ua-nit negative negative ua-wbc negative nega...,,33,0
2370,281-03,is mild to moderate stenosis of the origins o...,,102,0
...,...,...,...,...,...
975,177-04,f/up blood cultures and sputum cultures -- f/...,"medication: atenolol, disposition: NoChange\nm...",134,294
2637,291-05,"inal equivalent. small troponin leak, likely i...","medication: fluid, disposition: NoChange\nmedi...",132,298
503,145-04,floor and elevated her legs above her head wh...,"medication: excedrin, disposition: Change, act...",147,302
3028,322-03,this order it's written to hold nph if tf sto...,"medication: nph, disposition: Change, action: ...",144,304


In [269]:
fig = px.histogram(df_train, x="tokenizer_length_text")
fig.show()

In [270]:
fig = px.histogram(df_train, x="tokenizer_length_output")
fig.show()

# Create endtoend Sentence Level Splitting

In [1]:
import pandas as pd
from tqdm import tqdm
from os import walk
import spacy
# spacy.require_gpu()
nlp = spacy.load("en_core_sci_scibert")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [3]:
def text_preprocess(text):
    text = text.replace('\t', ' ').replace('\n', ' ')
    text_filtered = ''
    for char_index, char in enumerate(text):
        if char_index < (len(text)-1):
            if not(text[char_index + 1] == ' ' and char == ' '):
                text_filtered += char
    return text_filtered.lower()

In [4]:
def build_context_dataset(folder_path):
  # I create a list with all the filenames .ann and .text
  filenames = next(walk(folder_path), (None, None, []))[2]
  filenames_ann = [filename for filename in filenames if 'ann' in filename]
  filenames_txt = [filename for filename in filenames if 'txt' in filename]
  filenames_ann.sort()
  filenames_txt.sort()
  filenames_ann_txt = list(zip(filenames_ann, filenames_txt))
  # print('filenames_ann_txt sample:', filenames_ann_txt[1])

  df_dataset = pd.DataFrame()
  for index, (filename_ann, filename_txt) in tqdm(enumerate(filenames_ann_txt)):
      df_ann = pd.read_csv(folder_path + filename_ann, sep='\t', names=['entity-event-context', 'classification-type', 'value'])
      
      # I load the clinical note of the corresponding txt file
    #   with open(folder_path + filename_txt, 'r') as file:
    #     text = ''.join(file.readlines())

      if len(df_ann) == 0: continue # how to handle empty annotation files?
      df_disp = df_ann[df_ann['entity-event-context'].str.startswith('T')].copy()
      # careful: taking only first start-stop when multiple separated with ';' are present. how to handle multiple locations?
      df_disp[['disposition-type','start','end']] = df_disp['classification-type'].apply(lambda x: x.split(';',1)[0].split(' ')).to_list()
      df_disp['start'] = df_disp['start'].astype(int)
      df_disp['end'] = df_disp['end'].astype(int)
      df_disp = df_disp.drop('classification-type',axis=1)

      # mapping E# to T# (some annotations have different numbers)
      df_map = df_ann[df_ann['entity-event-context'].str.startswith('E')].set_index('entity-event-context')['classification-type'].apply(lambda x: x.split(':')[-1].rstrip(' '))

      # Context data
      df_ctx = pd.DataFrame(df_ann[df_ann['entity-event-context'].str.startswith('A')]['classification-type'].str.split(' ').to_list(), columns=['Category','Event','Value'])
      df_ctx = df_ctx.pivot(index='Event',columns='Category',values='Value')
      df_ctx = df_ctx.join(df_map).reset_index()
      df_ctx = df_disp[df_disp['disposition-type']=='Disposition'].merge(df_ctx, how='left', left_on='entity-event-context', right_on='classification-type').reset_index()
      # df_ctx = df_disp[df_disp.merge(df_ctx, how='left', left_on='entity-event-context', right_on='classification-type').reset_index()
      # Text and meta data
      df_no_disposition = df_disp.loc[df_disp['disposition-type']!='Disposition'].copy()
      if len(df_ctx) > 0:
        df_ctx.loc[:, 'filename'] = filename_ann[:-4]

      if len(df_no_disposition) > 0:
        df_no_disposition.loc[:, 'filename'] = filename_ann[:-4]

      df_dataset = pd.concat([df_dataset,df_ctx, df_no_disposition], ignore_index=True)

  df_dataset['value'] = df_dataset.apply(lambda row: str(row['value']).lower(), axis=1)
  df_dataset.loc[df_dataset['disposition-type']=='Disposition', 'disposition-type'] = 'Change'
  df_dataset.loc[df_dataset['disposition-type']=='NoDisposition', 'disposition-type'] = 'NoChange'
  separator = ', '
  df_dataset['output'] = df_dataset.apply(
    lambda row: 'medication: ' + row['value'] + separator +\
      'disposition: ' + row['disposition-type'] if row['disposition-type'] != 'Change' else\
      'medication: ' + row['value'] + separator + 'disposition: ' + row['disposition-type'] + separator +\
      'action: ' + row['Action'] + separator + 'actor: ' + row['Actor'] + separator +\
      'certainty: ' + row['Certainty'] + separator + 'negation: ' + row['Negation'] + separator +\
      'temporality: ' + row['Temporality'],
    axis=1
  )
  df_dataset = df_dataset.sort_values(['filename', 'start'], ignore_index=True)
  return df_dataset


In [40]:
def build_context_splitted_dataset(folder_path):
    filenames = next(walk(folder_path), (None, None, []))[2]
    filenames_ann = [filename for filename in filenames if 'ann' in filename]
    filenames_txt = [filename for filename in filenames if 'txt' in filename]
    filenames_ann.sort()
    filenames_txt.sort()
    filenames_ann_txt = list(zip(filenames_ann, filenames_txt))
    # print('filenames_ann_txt sample:', filenames_ann_txt[1])

    df_dataset = pd.DataFrame()
    df_context = build_context_dataset(folder_path)
    for index, (filename_ann, filename_txt) in tqdm(enumerate(filenames_ann_txt)):
        with open(folder_path + filename_txt, 'r') as file:
            text = ''.join(file.readlines())
        
        text_ids = tokenizer.encode(text)
        text_slices = []
        start_slice_ids = 0
        start_slice_text = 0
        max_lenght = 150
        
        df_entities = df_context.loc[df_context['filename']==filename_txt[:-4]].copy()
        print(len(text))
        print(df_entities)
        # print(text)
        doc = nlp(text)
        use_paragraph_slice = False
        max_lenght = 100
        text_slices = []
        sents = [sent.text for sent in doc.sents]
        # print(sents)
        text_slice = ''
        start_slice_text = 0
        for sent in sents:
            df_row = pd.DataFrame({'filename': [filename_txt[:-4]]})

            if not use_paragraph_slice:
                text_slice = sent
                text_slices.append(text_slice)
                df_row['text'] = text_preprocess(text_slice)
                print('slice_start:', start_slice_text)
                print('end:', start_slice_text + len(text_slice))
                output_slice = '\n'.join(df_entities[(df_entities['start']>start_slice_text) & (df_entities['end']<start_slice_text + len(text_slice))]['output'])
                df_row['output'] = output_slice
                start_slice_text += len(text_slice)
                df_dataset = pd.concat([df_dataset, df_row], ignore_index=True)
            elif len(tokenizer.encode(text_slice + sent)) > max_lenght:
                text_slices.append(text_slice)
                df_row['text'] = text_preprocess(text_slice)
                output_slice = '\n'.join(df_entities[(df_entities['start']>start_slice_text) & (df_entities['end']<start_slice_text + len(text_slice))]['output'])
                df_row['output'] = output_slice
                start_slice_text += len(text_slice)
                df_dataset = pd.concat([df_dataset, df_row], ignore_index=True)
                text_slice = sent
            else:
                text_slice += sent
        if use_paragraph_slice:
            text_slices.append(text_slice)

        break


        # for end_slice in range(max_lenght, len(text_ids), max_lenght):
        #     df_row = pd.DataFrame({'filename': [filename_txt[:-4]]})

        #     text_slice = tokenizer.decode(text_ids[start_slice_ids:end_slice])
        #     text_slices.append(text_slice)
        #     df_row['text'] = text_preprocess(text_slice)
        #     # assert ((~((df_entities['start']>start_slice_text) & (df_entities['end']>=start_slice_text + len(text_slice))) == True).all().item()),\
        #     #     f'One or more entities were cutted during splitting, start_slice_text: {start_slice_text}, end_slice_text:'+\
        #     #     f'{start_slice_text + len(text_slice)}, entities: {df_entities[["start", "end", "value"]]}'
        #     output_slice = '\n'.join(df_entities[(df_entities['start']>start_slice_text) & (df_entities['end']<start_slice_text + len(text_slice))]['output'])
        #     df_row['output'] = output_slice
        #     start_slice_text += len(text_slice)
        #     start_slice_ids = end_slice
        #     df_dataset = pd.concat([df_dataset, df_row], ignore_index=True)

        df_row = pd.DataFrame({'filename': [filename_txt[:-4]]})
        text_slice = tokenizer.decode(text_ids[start_slice_ids:])
        text_slices.append(text_slice)
        df_row['text'] = text_preprocess(text_slice)
        output_slice = '\n'.join(df_entities[(df_entities['start']>start_slice_text) & (df_entities['end']<start_slice_text + len(text_slice))]['output'])
        df_row['output'] = output_slice
        df_dataset = pd.concat([df_dataset, df_row], ignore_index=True)
    return df_dataset

In [41]:
df_train = build_context_splitted_dataset('../data/trainingdata_v3/train/')
df_dev = build_context_splitted_dataset('../data/trainingdata_v3/dev/')

350it [00:05, 61.72it/s]
0it [00:00, ?it/s]

2452
  index entity-event-context      value disposition-type  start   end Event  \
0   NaN                   T2    lipitor         NoChange    703   710   NaN   
1   NaN                   T3  synthroid         NoChange   2090  2099   NaN   

  classification-type filename Action Actor Certainty Negation Temporality  \
0                 NaN   101-01    NaN   NaN       NaN      NaN         NaN   
1                 NaN   101-01    NaN   NaN       NaN      NaN         NaN   

                                         output  
0    medication: lipitor, disposition: NoChange  
1  medication: synthroid, disposition: NoChange  


0it [00:00, ?it/s]


slice_start: 0
end: 201
slice_start: 201
end: 249
slice_start: 249
end: 302
slice_start: 302
end: 341
slice_start: 341
end: 398
slice_start: 398
end: 447
slice_start: 447
end: 489
slice_start: 489
end: 589
slice_start: 589
end: 642
slice_start: 642
end: 706
slice_start: 706
end: 751
slice_start: 751
end: 783
slice_start: 783
end: 830
slice_start: 830
end: 831
slice_start: 831
end: 895
slice_start: 895
end: 988
slice_start: 988
end: 989
slice_start: 989
end: 1003
slice_start: 1003
end: 1004
slice_start: 1004
end: 1033
slice_start: 1033
end: 1086
slice_start: 1086
end: 1157
slice_start: 1157
end: 1188
slice_start: 1188
end: 1220
slice_start: 1220
end: 1276
slice_start: 1276
end: 1368
slice_start: 1368
end: 1380
slice_start: 1380
end: 1411
slice_start: 1411
end: 1520
slice_start: 1520
end: 1521
slice_start: 1521
end: 1700
slice_start: 1700
end: 1733
slice_start: 1733
end: 1990
slice_start: 1990
end: 1991
slice_start: 1991
end: 2146
slice_start: 2146
end: 2185
slice_start: 2185
end: 2189
s

50it [00:00, 59.13it/s]
0it [00:00, ?it/s]

2199
  index entity-event-context       value disposition-type  start  end Event  \
0   NaN                   T1      prozac         NoChange    821  827   NaN   
1   NaN                   T2    cardizem         NoChange    829  837   NaN   
2   NaN                   T4  glucophage         NoChange    839  849   NaN   
3   NaN                   T3      amaryl         NoChange    854  860   NaN   

  classification-type filename Action Actor Certainty Negation Temporality  \
0                 NaN   100-01    NaN   NaN       NaN      NaN         NaN   
1                 NaN   100-01    NaN   NaN       NaN      NaN         NaN   
2                 NaN   100-01    NaN   NaN       NaN      NaN         NaN   
3                 NaN   100-01    NaN   NaN       NaN      NaN         NaN   

                                          output  
0      medication: prozac, disposition: NoChange  
1    medication: cardizem, disposition: NoChange  
2  medication: glucophage, disposition: NoChange  
3   

0it [00:01, ?it/s]

slice_start: 0
end: 58
slice_start: 58
end: 92
slice_start: 92
end: 182
slice_start: 182
end: 402
slice_start: 402
end: 496
slice_start: 496
end: 561
slice_start: 561
end: 693
slice_start: 693
end: 759
slice_start: 759
end: 805
slice_start: 805
end: 855
slice_start: 855
end: 915
slice_start: 915
end: 936
slice_start: 936
end: 945
slice_start: 945
end: 1000
slice_start: 1000
end: 1113
slice_start: 1113
end: 1204
slice_start: 1204
end: 1255
slice_start: 1255
end: 1289
slice_start: 1289
end: 1358
slice_start: 1358
end: 1401
slice_start: 1401
end: 1428
slice_start: 1428
end: 1482
slice_start: 1482
end: 1483
slice_start: 1483
end: 1537
slice_start: 1537
end: 1574
slice_start: 1574
end: 1671
slice_start: 1671
end: 1729
slice_start: 1729
end: 1799
slice_start: 1799
end: 1804
slice_start: 1804
end: 1836
slice_start: 1836
end: 1924
slice_start: 1924
end: 2057
slice_start: 2057
end: 2079
slice_start: 2079
end: 2180





In [22]:
df_train[df_train['filename']=='101-01'].iloc[34]['text']

'2. hypothyroidism, well controlled at the present time. continue same dose of synthroid. 3. hypertension, well controlled presently. 4. hyperlipidemia.'

In [23]:
df_train[df_train['filename']=='101-01'].iloc[34]['output']

'medication: synthroid, disposition: NoChange'

In [34]:
df_train[df_train['filename']=='101-01']

Unnamed: 0,filename,text,output
0,101-01,record date: 2079-05-12 mercy care center mer...,
1,101-01,she has been in rhode island for six months.,
2,101-01,her complaint is that she has heaviness in the...,
3,101-01,she does not have any palpitations.,
4,101-01,she does not note that this is associated with...,
5,101-01,she does have diaphoresis associated with it.,
6,101-01,she does not have any shortness of breath,
7,101-01,this heaviness has been in her chest on and o...,
8,101-01,she has a history of hypothyroidism and hypert...,
9,101-01,she also has hyperlipidemia and is being trea...,
