In [None]:
%config Completer.use_jedi = False

# Comment

This script works as follows:
- Gets the reference text that contains the examples used to "teach" GPT-3
- Gets the text of a single document and splits it into paragraphs
- Combines the reference text with each paragraph to build the prompt
- Invokes GPT-3 to produce an output for each prompt
- Combines all outputs to get the output for the entire file in the "smart" format
- Converts that output into the brat format

# Import libraries

In [None]:
import math
import os
import openai
import re
import textwrap
import pandas as pd

# Define functions

In [None]:
def findnth(s, subs, n):
    '''
    Return the start index of the n-th (0-based) non-overlapping occurrence
    of subs in s, or -1 when s holds fewer than n + 1 occurrences.
    '''
    # Splitting at most n + 1 times leaves everything after the wanted
    # occurrence in the last piece.
    pieces = s.split(subs, n + 1)
    has_enough_matches = len(pieces) > n + 1
    if not has_enough_matches:
        return -1
    tail = pieces[-1]
    return len(s) - (len(tail) + len(subs))

In [None]:
def get_standard_output(simplified_out, document, remove_until, drug_list, filter_by_drug_list = False):
    '''
    Convert the "simplified" model output (one record per drug) into brat annotations.

    Example input:
    "DRUG: hydrochlorothiazide, DISPOSITION: change, CERTAINTY: certain, ACTOR: physician, ACTION: stop, TEMPORALITY: present, NEGATION: no"

    Example output (<TAB> stands for a tab character, one annotation per line):
    T8<TAB>Disposition 1780 1799<TAB>hydrochlorothiazide
    E8<TAB>Disposition:T8
    A26<TAB>Certainty E8 Certain
    A27<TAB>Actor E8 Physician
    A28<TAB>Action E8 Stop
    A29<TAB>Temporality E8 Present
    A33<TAB>Negation E8 NotNegated

    Parameters:
    - simplified_out: string with the simplified output; records are introduced by "DRUG: "
    - document: string with the text of the whole document, used to locate each drug mention
    - remove_until: number of characters removed from the beginning of the document;
      it is added to every offset to compensate for a removed header
    - drug_list: collection of lowercase drug names; only used when filter_by_drug_list = True
    - filter_by_drug_list: if True, records whose drug name is not in drug_list are dropped

    Returns:
    - string with the brat annotations (T, E and, for "Disposition" events, A lines)

    The k-th record mentioning a given drug name is mapped to the k-th occurrence of that
    name in the document. When the name is not found, findnth returns -1 and the offsets
    are computed from -1.
    '''
    disposition_labels = {
        'change': 'Disposition',
        'no change': 'NoDisposition',
        'undetermined': 'Undetermined',
    }
    negation_labels = {'yes': 'Negated', 'no': 'NotNegated'}

    def field_value(record, key):
        # The value runs up to the next comma or the end of the line, so the
        # last field of a record (which has no trailing comma) is parsed too.
        found = re.search(re.escape(key) + r':\s*([^,\n]*)', record)
        return found.group(1).strip() if found else None

    def canonical_value(attr, raw):
        # Compare ignoring case, spaces, hyphens and underscores, so that
        # e.g. "other change" is recognised as "OtherChange".
        if raw is None:
            return None
        wanted = re.sub(r'[\s_-]+', '', raw).lower()
        for valid in VALID_VALUES[attr]:
            if valid.lower() == wanted:
                return valid
        return None

    records = [chunk for chunk in simplified_out.strip().split('DRUG: ') if chunk.strip()]

    # Filter by external drug list
    if filter_by_drug_list:
        allowed = set(drug_list)
        records = [rec for rec in records if rec.split(',')[0].strip().lower() in allowed]

    occurrences = {}
    entity_count = 0
    attribute_count = 0
    out_lines = []
    for record in records:
        name = record.split(',')[0].strip()
        if not name:
            # A record without a drug name cannot be located in the document.
            continue
        occurrences[name] = occurrences.get(name, 0) + 1
        start_pos = findnth(document, name, occurrences[name] - 1)
        end_pos = start_pos + len(name)
        entity_count += 1

        disposition = field_value(record, 'DISPOSITION')
        disposition_str = disposition_labels.get((disposition or '').lower(), DEFAULT_DISPOSITION)

        out_lines.append(
            f'T{entity_count}\t{disposition_str} '
            f'{start_pos + remove_until} {end_pos + remove_until}\t{name}\n'
        )
        out_lines.append(f'E{entity_count}\t{disposition_str}:T{entity_count}\n')

        # Attributes are only emitted for events with an actual disposition.
        if disposition_str != 'Disposition':
            continue
        for attr in DEFAULT_ATTRIBUTES:
            attribute_count += 1
            raw_value = field_value(record, attr)
            if attr == 'NEGATION' and raw_value is not None:
                raw_value = negation_labels.get(raw_value.lower(), raw_value)
            value = canonical_value(attr, raw_value)
            if value is None:
                # Missing or invalid value: fall back to the configured default.
                default = DEFAULT_ATTRIBUTES[attr]
                value = canonical_value(attr, default) or default.title()
            out_lines.append(f'A{attribute_count}\t{attr.title()} E{entity_count} {value}\n')
    return ''.join(out_lines)

In [None]:
def split_into_paragraphs(doc, simple_mode = True, num_chars = 200):
    '''
    Split a text into paragraphs.
    If simple_mode = True, the text is wrapped with textwrap.wrap into
    strings of at most num_chars characters.
    If simple_mode = False, a smarter splitting should be applied; this is
    still a TODO and an empty list is returned.
    '''
    if not simple_mode:
        # Smart splitting is not implemented yet (TODO)
        return []
    # Simple splitting
    return textwrap.wrap(doc, num_chars)

In [None]:
# Parameters

# Values accepted for each attribute of a "Disposition" event
VALID_VALUES = {
    'CERTAINTY': ['Certain', 'Hypothetical', 'Conditional', 'Unknown'],
    'TEMPORALITY': ['Past', 'Present', 'Future', 'Unknown'],
    'ACTOR': ['Physician', 'Patient', 'Unknown'],
    'ACTION': [
        'Start',
        'Stop',
        'Increase',
        'Decrease',
        'OtherChange',
        'UniqueDose',
        'Unknown',
    ],
    'NEGATION': ['Negated', 'NotNegated'],
}

# Fallback value per attribute: most common values in training set
DEFAULT_ATTRIBUTES = {
    'CERTAINTY': 'Certain',
    'TEMPORALITY': 'Past',
    'ACTOR': 'Physician',
    'ACTION': 'Start',
    'NEGATION': 'NotNegated',
}

# Event type used when the model's disposition is missing or not recognised
DEFAULT_DISPOSITION = 'NoDisposition'

# Folder where the resulting .ann files are written
savepath = 'result_from_GPT3/'

# Get texts

The prompt will be composed of:
1. Examples = sentences followed by their drugs formatted in the "smart" way. Example:

"INPUT:
If Elavil is helpful, may increase to 50 qhs after one week of therapy.
OUTPUT DRUGS WITH DETAILS:
DRUG: Elavil, DISPOSITION: change, CERTAINTY: conditional, ACTOR: physician, ACTION: increase, TEMPORALITY: future, NEGATION: no"

2. "INPUT" + text of single paragraph + "OUTPUT DRUGS WITH DETAILS:"

In [None]:
# Path of the folder with the .txt files to be evaluated.
# An assignment with no value is a SyntaxError, so an explicit placeholder is used.
basepath = ''  # TODO: set before running

# Reference file: path of the file with the "gold" prompt
filename = ''  # TODO: set before running
# The context manager closes the file even if reading fails.
with open(filename, mode='r') as file:
    reference = file.read()

In [None]:
# Collect the full path of every .txt file found directly inside basepath
txt_files = [
    os.path.join(basepath, entry)
    for entry in os.listdir(basepath)
    if entry.endswith('.txt')
]

# Get Drugs list

In [None]:
# Load the drug names: column '0' of drugs.csv is renamed to 'DrugName'
# and the 'Unnamed: 0' column is discarded.
drug_df = (
    pd.read_csv('drugs.csv', sep=',')
    .rename(columns={'0': 'DrugName'})
    .drop(labels='Unnamed: 0', axis=1)
)
# Lowercase the names and expose them as a plain Python list
drug_df = drug_df['DrugName'].str.lower()
drug_list = drug_df.tolist()

# Invoke GPT-3

In [None]:
%%capture
# Credentials are read from the environment so that no secret is stored in the notebook.
# OPENAI_ORGANIZATION is optional; a missing OPENAI_API_KEY raises a KeyError here.
openai.organization = os.environ.get('OPENAI_ORGANIZATION')
openai.api_key = os.environ['OPENAI_API_KEY']

# Issue a request with the configured credentials
openai.Engine.list()

In [None]:
do_slice_text = True # If True, splits text into paragraphs; if False, the whole text is sent as one chunk
do_clean_text = False # If True, remove header and footer #TODO

# Header/footer cleaning is still a TODO: fail with a clear message instead of
# hitting undefined variables further down.
if do_clean_text:
    raise NotImplementedError('Header/footer cleaning is not implemented yet: set do_clean_text = False')

# Make sure the output folder exists before writing any result
if savepath:
    os.makedirs(savepath, exist_ok=True)

for filename in txt_files:
    doc_name = os.path.basename(filename)
    print(f'************************************ Working on {doc_name} ************************************')

    # Read file content
    with open(filename, mode='r') as file:
        input_text = file.read()

    # No cleaning is performed: the text is used as it is
    clean_text = input_text
    remove_until = 0 # Num of chars removed by cutting off header

    # Slice text in smaller chunks
    if do_slice_text:
        text_chunks = split_into_paragraphs(clean_text, simple_mode=True, num_chars=1000)
    else:
        text_chunks = [clean_text]

    response_chunk = []
    for el in text_chunks:
        # Produce prompt text
        prompt_text = reference + '\n\nINPUT:\n\n' + el + '\n\nOUTPUT DRUGS WITH DETAILS:'
        # Invoke GPT-3
        response = openai.Completion.create(
            engine="text-davinci-002",
            prompt=prompt_text,
            temperature=0,
            max_tokens=1000, # TODO Evaluate if there's a "better" value
        )
        response_text = response['choices'][0]['text']
        # If response is "No drugs mentioned", do not append, it messes with the valuation script
        if 'no drugs mentioned' not in response_text.lower():
            response_chunk.append(response_text)
    merged_response = '\n'.join(response_chunk)

    # Convert to brat format (sometimes there are unnecessary new lines in the response).
    # Offsets are computed on clean_text and shifted by remove_until.
    result = get_standard_output(
        merged_response.replace('\n\n\n', '\n'),
        clean_text,
        remove_until,
        drug_list=drug_list,
        filter_by_drug_list=True,
    )

    # Write to file: same base name as the input document, with the .ann extension.
    # splitext only removes the last extension, so dotted names are not truncated.
    ann_name = os.path.splitext(doc_name)[0] + '.ann'
    with open(os.path.join(savepath, ann_name), mode='w') as file:
        file.write(result)