In [3]:
import json

import pandas as pd
from datasets import load_dataset
from datasets import Dataset, DatasetDict
from dotenv import dotenv_values
from transformers import AutoTokenizer

from config import config
from utils.data_generation import DataGenerator
#from utils.data_preprocessor import DataPreprocessor


# The Hugging Face access token lives in the local `.env.base` file so it never
# appears in the notebook itself.
HF_TOKEN = dotenv_values(".env.base")["HF_TOKEN"]
#DATASET_CHEKPOINT = dotenv_values(".env.base")['DATASET_CHEKPOINT']

# Load every split of the dataset from the Hub, authenticating with the token.
hf_e3c = load_dataset(
    "ferrazzipietro/e3c-sentences",
    token=HF_TOKEN,
)

# Tokenizer of the base model configured in `config`.
# NOTE(review): `add_eos_token=True` presumably makes the tokenizer append EOS on encoding;
# DataPreprocessor also writes the EOS string into finetuning prompts - confirm this does
# not end up producing two EOS tokens.
tokenizer = AutoTokenizer.from_pretrained(
    config.BASE_MODEL_CHECKPOINT,
    add_eos_token=True,
)

Downloading readme: 100%|██████████| 2.97k/2.97k [00:00<00:00, 4.54MB/s]
Downloading data: 100%|██████████| 414k/414k [00:00<00:00, 594kB/s]
Downloading data: 100%|██████████| 527k/527k [00:00<00:00, 1.23MB/s]s]
Downloading data: 100%|██████████| 69.8k/69.8k [00:00<00:00, 201kB/s]]
Downloading data: 100%|██████████| 4.45M/4.45M [00:00<00:00, 6.85MB/s]
Downloading data: 100%|██████████| 394k/394k [00:00<00:00, 840kB/s]/s]
Downloading data: 100%|██████████| 545k/545k [00:00<00:00, 984kB/s]/s]
Downloading data: 100%|██████████| 76.9k/76.9k [00:00<00:00, 165kB/s]]
Downloading data: 100%|██████████| 3.86M/3.86M [00:00<00:00, 5.74MB/s]
Downloading data: 100%|██████████| 594k/594k [00:00<00:00, 1.20MB/s]s]
Downloading data: 100%|██████████| 196k/196k [00:00<00:00, 494kB/s]/s]
Downloading data: 100%|██████████| 64.5k/64.5k [00:00<00:00, 183kB/s]s]
Downloading data: 100%|██████████| 2.76M/2.76M [00:00<00:00, 4.59MB/s]]
Downloading data: 100%|██████████| 373k/373k [00:00<00:00, 604kB/s]t/s]
Down

In [None]:
# `hf_e3c` holds one Dataset per split (it is indexed by split name, e.g. 'en.layer1'),
# and that container has no `to_pandas()` method. Convert each split separately and
# stack the frames; the split name is kept as the outer index level.
pd_h3c = pd.concat(
    {split_name: split.to_pandas() for split_name, split in hf_e3c.items()},
    names=["split", "row"],
)
#pd_h3c['entities'] = pd_h3c['entities'].apply(lambda x: ent['text'] for ent in x)

# Empty frame with the columns expected for the finetuning data.
data_ft = pd.DataFrame(columns=['document_id', 'layer', 'prompt', 'answer', 'concatenation', 'original_text'])


In [33]:
# Quick inspection of the tokenizer: its EOS/BOS token strings and its concrete class.
# Only the last expression of a cell is displayed, so the two token lookups
# below produce no visible output; only `type(tokenizer)` is shown.
tokenizer.eos_token
tokenizer.bos_token
type(tokenizer)

transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast

In [104]:
class DataPreprocessor():
    """
    Build instruction prompts for entity extraction.

    Each example is a mapping with a 'sentence' (the text) and 'entities' (a list of
    dictionaries with at least the key 'text' and, when offsets are requested, 'offsets').
    The generated prompt wraps the sentence in an [INST] ... [/INST] instruction and, for
    finetuning, appends the expected answer followed by the tokenizer's EOS token.
    """

    def __init__(self) -> None:
        """Define the templates used for few-shot examples and for the final query."""

        # Template of one few-shot example (instruction + query + answer), with offsets.
        self.one_shot_example = """
[INST]
Extract the entities contained in the text and the offset, i.e. the position of that entity in the string. Extract only entities contained in the text.
{instruction_on_response_format}
Text: <<<{example_query}>>> [/INST]
{example_response}
"""
        # Same as above, but the instruction does not ask for offsets.
        self.one_shot_example_no_offset = """
[INST]
Extract the entities contained in the text. Extract only entities contained in the text.
{instruction_on_response_format}
Text: <<<{example_query}>>> [/INST]
{example_response}
"""

        # Template of the final query. The text is delimited by <<< >>>, exactly like in
        # the few-shot examples, so that examples and query share the same format.
        self.prompt_template = """
[INST]
Extract the entities contained in the text and the offset, i.e. the position of that entity in the string. Extract only entities contained in the text.
{instruction_on_response_format}
Text: <<<{query}>>> [/INST]
"""

        # No literal BOS token here: `_formatting_prompt` prepends the tokenizer's BOS
        # token itself, a hard-coded one would duplicate it.
        self.prompt_template_no_offset = """
[INST]
Extract the entities contained in the text. Extract only entities contained in the text.
{instruction_on_response_format}
Text: <<<{query}>>> [/INST]
"""


    def _formatting_prompt(self, task: str, input: str, instruction_on_response_format: str, n_shots: int, offset: bool, tokenizer=None, output: str = '', list_of_examples: "list[str] | None" = None, list_of_responses: "list[str] | None" = None) -> str:
        """
        Format the input and output into a prompt for the finetuning or the inference

        Args:
            task: the task for which the prompt is generated, either 'finetuning' or 'inference'
            input: the input text
            instruction_on_response_format: the instruction on the response format. E.g. "The response must be a list of dictionaries, where each dictionary contains the keys 'entity' and 'offset'"
            n_shots: the number of examples to provide as few shot prompting
            offset: whether to require the offset in the response
            tokenizer: the tokenizer providing the BOS/EOS token strings. Required for 'finetuning'; when omitted for 'inference' no special token is added
            output: the output text (required for 'finetuning', must be '' for 'inference')
            list_of_examples: the list of examples to provide as few shot prompting
            list_of_responses: the list of responses to provide as few shot prompting

        Returns:
            the formatted prompt

        Raises:
            ValueError: if the task is unknown, if the arguments are inconsistent with the task,
                or if the number of shots, examples and responses do not match
        """
        # None instead of a mutable default: avoids sharing one list between calls.
        list_of_examples = [] if list_of_examples is None else list_of_examples
        list_of_responses = [] if list_of_responses is None else list_of_responses

        if task == 'finetuning':
            if n_shots > 0:
                raise ValueError("The number of shot in generating prompts for the finetuning must be 0")
            if tokenizer is None:
                raise ValueError("The tokenizer must be provided")
            if output == '':
                raise ValueError("The output must be provided when generating prompts for the finetuning")

        elif task == 'inference':
            if output != '':
                raise ValueError("The output must be an empty string when generating prompts for the inference")
        else:
            raise ValueError("The task must be either 'finetuning' or 'inference'")


        if len(list_of_examples) != len(list_of_responses):
            raise ValueError("The number of examples and responses must be the same")
        if n_shots != len(list_of_examples):
            raise ValueError("The number of examples and shots must be the same")
        if n_shots != len(list_of_responses):
            raise ValueError("The number of responses and shots must be the same")

        if offset:
            query_template = self.prompt_template
            one_shot_example = self.one_shot_example
        else:
            query_template = self.prompt_template_no_offset
            one_shot_example = self.one_shot_example_no_offset

        base_prompt = query_template.format(
            instruction_on_response_format=instruction_on_response_format,
            query=input)

        # The lengths were checked above, so zip covers exactly n_shots pairs.
        few_shot_block = ''.join(
            one_shot_example.format(
                instruction_on_response_format=instruction_on_response_format,
                example_query=example_query,
                example_response=example_response)
            for example_query, example_response in zip(list_of_examples, list_of_responses))

        # The tokenizer is optional for inference and a tokenizer may define no BOS/EOS
        # token: fall back to an empty string instead of failing on None.
        bos_token = getattr(tokenizer, 'bos_token', None) or ''
        eos_token = ''
        if task == 'finetuning':
            # Only finetuning prompts contain the answer, hence only they are closed by EOS.
            eos_token = getattr(tokenizer, 'eos_token', None) or ''

        return bos_token + few_shot_block + base_prompt + output + eos_token


    def _format_entities_in_response(self, entities_list: "list[dict]", offset: bool) -> str:
        """
        Format the entities into the string expected as response

        Args:
            entities_list: the entities to format, each a dictionary with the key 'text' and, if offset is True, the key 'offsets'
            offset: whether to require the offset in the response

        Returns:
            the formatted response, e.g. '[{"entity": "collapse", "offset": [2663, 2671]}] '.
            An empty list of entities gives '[] '.
        """
        formatted_entities = []
        for entity in entities_list:
            # json.dumps quotes the text and escapes characters such as '"' and '\',
            # so the response stays parseable; ensure_ascii=False keeps accents readable.
            fields = '"entity": ' + json.dumps(entity['text'], ensure_ascii=False)
            if offset:
                fields += f', "offset": {entity["offsets"]}'
            formatted_entities.append('{' + fields + '}')
        # Joining (instead of stripping a trailing ', ') also handles the empty list.
        # The trailing space separates the answer from what is appended after it.
        return '[' + ', '.join(formatted_entities) + '] '

    def _apply_to_one_example(self, example, task: str, instruction_on_response_format: str, n_shots: int, offset: bool, tokenizer=None, list_of_examples: "list[str] | None" = None, list_of_responses: "list[str] | None" = None) -> dict:
        """
        Apply the data preprocessing to one example

        Args:
            example: the example (data row) to preprocess, with the keys 'sentence' and 'entities'
            task: the task for which the prompt is generated, either 'finetuning' or 'inference'
            instruction_on_response_format: the instruction on the response format. E.g. "The response must be a list of dictionaries, where each dictionary contains the keys 'entity' and 'offset'"
            n_shots: the number of examples to provide as few shot prompting
            offset: whether to require the offset in the response
            tokenizer: the tokenizer to use
            list_of_examples: the list of examples to provide as few shot prompting
            list_of_responses: the list of responses to provide as few shot prompting

        Returns:
            the same example, with the generated prompt stored under the key 'prompt'
        """
        # The expected answer belongs to the prompt only when finetuning; an inference
        # prompt must stop after the instruction, so no output is passed in that case.
        output = ''
        if task == 'finetuning':
            output = self._format_entities_in_response(entities_list=example['entities'], offset=offset)
        prompt = self._formatting_prompt(
            task,
            input=example['sentence'],
            instruction_on_response_format=instruction_on_response_format,
            n_shots=n_shots,
            offset=offset,
            tokenizer=tokenizer,
            output=output,
            list_of_examples=list_of_examples,
            list_of_responses=list_of_responses)
        example['prompt'] = prompt
        return example

    def apply(self, data: "Dataset", task: str = 'finetuning', instruction_on_response_format: str = '', n_shots: int = 0, offset: bool = True, tokenizer=None, list_of_examples: "list[str] | None" = None, list_of_responses: "list[str] | None" = None):
        """
        Apply the data preprocessing to the dataset

        Args:
            data: the dataset to preprocess
            task: the task for which the prompts are generated, either 'finetuning' or 'inference'
            instruction_on_response_format: the instruction on the response format
            n_shots: the number of examples to provide as few shot prompting
            offset: whether to require the offset in the response
            tokenizer: the tokenizer to use (required for 'finetuning')
            list_of_examples: the list of examples to provide as few shot prompting
            list_of_responses: the list of responses to provide as few shot prompting

        Returns:
            the preprocessed dataset, i.e. the result of `data.map`, where every example has the additional key 'prompt'
        """
        # `_apply_to_one_example` works on a single row, hence the non-batched map.
        return data.map(
            self._apply_to_one_example,
            batched=False,
            fn_kwargs={
                'task': task,
                'instruction_on_response_format': instruction_on_response_format,
                'n_shots': n_shots,
                'offset': offset,
                'tokenizer': tokenizer,
                'list_of_examples': list_of_examples,
                'list_of_responses': list_of_responses,
            })


In [96]:
# Sanity check of the response formatter on the entities of row 100 of the 'en.layer1' split:
# print the raw annotation first, then display its formatted version.
sample_entities = hf_e3c['en.layer1']['entities'][100]
print(sample_entities)
data_preprocessor = DataPreprocessor()
data_preprocessor._format_entities_in_response(entities_list=sample_entities, offset=True)

[{'id': '7509', 'offsets': [2663, 2671], 'role': '', 'semantic_type_id': '', 'text': 'collapse', 'type': 'EVENT'}, {'id': '7524', 'offsets': [2702, 2712], 'role': '', 'semantic_type_id': '', 'text': 'dobutamine', 'type': 'EVENT'}, {'id': '7806', 'offsets': [2651, 2671], 'role': '', 'semantic_type_id': 'C0948268', 'text': 'hemodynamic collapse', 'type': 'CLINENTITY'}, {'id': '8051', 'offsets': [2629, 2640], 'role': 'PATIENT', 'semantic_type_id': '', 'text': 'the patient', 'type': 'ACTOR'}, {'id': '8089', 'offsets': [2597, 2627], 'role': '', 'semantic_type_id': '', 'text': 'immediate postoperative period', 'type': 'TIMEX3'}]
[{"entity": "collapse", "offset": [2663, 2671]}, {"entity": "dobutamine", "offset": [2702, 2712]}, {"entity": "hemodynamic collapse", "offset": [2651, 2671]}, {"entity": "the patient", "offset": [2629, 2640]}, {"entity": "immediate postoperative period", "offset": [2597, 2627]}]


'[{"entity": "collapse", "offset": [2663, 2671]}, {"entity": "dobutamine", "offset": [2702, 2712]}, {"entity": "hemodynamic collapse", "offset": [2651, 2671]}, {"entity": "the patient", "offset": [2629, 2640]}, {"entity": "immediate postoperative period", "offset": [2597, 2627]}]'

In [105]:
# Build the finetuning prompt for the first row of the 'en.layer1' split.
# The instruction names the keys "entity" and "offset" because these are the keys
# written by `_format_entities_in_response`; asking for "text" would contradict the answers.
data_preprocessor = DataPreprocessor()
tmp = data_preprocessor._apply_to_one_example(
    hf_e3c['en.layer1'][0],
    task='finetuning',
    instruction_on_response_format='The response must be a list of dictionaries, where each dictionary contains the keys "entity" and "offset"',
    n_shots=0,
    offset=True,
    tokenizer=tokenizer,
    list_of_examples=[],
    list_of_responses=[],
)
tmp

{'sentence': 'Hormonal study and dynamic biochemical tests performed indicated ECS.',
 'entities': [{'id': '1704',
   'offsets': [208, 213],
   'role': '',
   'semantic_type_id': '',
   'text': 'study',
   'type': 'EVENT'},
  {'id': '1719',
   'offsets': [238, 243],
   'role': '',
   'semantic_type_id': '',
   'text': 'tests',
   'type': 'EVENT'},
  {'id': '1734',
   'offsets': [254, 263],
   'role': '',
   'semantic_type_id': '',
   'text': 'indicated',
   'type': 'EVENT'},
  {'id': '1749',
   'offsets': [264, 267],
   'role': '',
   'semantic_type_id': '',
   'text': 'ECS',
   'type': 'EVENT'}],
 'original_text': 'A 46-year-old man with hypertension and dyslipidemia diagnosed 4-months before, as well as new-onset diabetes mellitus unveiled 1-month earlier, was referred to emergency department for hypokalemia. Hormonal study and dynamic biochemical tests performed indicated ECS. Imaging and cytological findings pointed toward a likely primary right parotid malignancy with liver metast

In [62]:
# Hand-written material for prompt experiments: two few-shot examples with their
# expected responses, plus one query (`input`) with its expected answer (`output`).
# Every offset is [start, end) in the corresponding text, i.e. text[start:end] == entity.
first_shot_example = 'We present a case of a 32-year-old woman with a history of gradual enlargement of the anterior neck.'
second_shot_example = 'Patient information: a 9-month-old boy presented to the emergency room with a 3-day history of refusal to bear weight on the right lower extremity and febrile peaks of up to 38.5°C for 24 hours.'
instruction_on_response_format = 'Return the result in a json format.'
first_response = '[{"entity": "present", "offset": [3, 10]}, {"entity": "history", "offset": [48, 55]}, {"entity": "enlargement", "offset": [67, 78]}]'
second_response = '[{"entity": "presented", "offset": [39, 48]}, {"entity": "refusal", "offset": [95, 102]}, {"entity": "bear", "offset": [106, 110]}, {"entity": "peaks", "offset": [159, 164]}]'
# NOTE: `input` shadows the Python builtin; the name is kept because a later cell reads it.
input = "A 46-year-old man with hypertension and dyslipidemia diagnosed 4-months before, as well as new-onset diabetes mellitus unveiled 1-month earlier, was referred to emergency department for hypokalemia"
# Offsets corrected so that they actually point at the entities inside `input`.
output = '[{"entity": "hypertension", "offset": [23, 35]}, {"entity": "dyslipidemia", "offset": [40, 52]}, {"entity": "diabetes mellitus", "offset": [101, 118]}, {"entity": "hypokalemia", "offset": [186, 197]}]'

In [73]:
data_preprocessor = DataPreprocessor()
tmp = data_preprocessor.formatting_prompt(input=input, task='finetuning', instruction_on_response_format=instruction_on_response_format, offset=True, n_shots=0, tokenizer = tokenizer)#, output='', list_of_examples=[first_shot_example, second_shot_example], list_of_responses=[first_response, second_response])
print(tmp)

<s>
[INST]
Extract the entities contained in the text and the offset, i.e. the position of that entity in the string. Extract only entities contained in the text.
Return the result in a json format.
Text: <<A 46-year-old man with hypertension and dyslipidemia diagnosed 4-months before, as well as new-onset diabetes mellitus unveiled 1-month earlier, was referred to emergency department for hypokalemia>>> [/INST]
</s>
