In [1]:
from dotenv import dotenv_values
from datasets import load_dataset, Dataset
from utils.data_preprocessor import DataPreprocessor
from utils.evaluator import Evaluator
from config.finetuning import config
from utils.load_merged_model_tokenizer import load_mergedModel_tokenizer

# Read the Hugging Face access token from the local `.env.base` file.
HF_TOKEN = dotenv_values(".env.base")['HF_TOKEN']

# Single-element lists; only index 0 of each is used in this cell.
adapters_checkpoints = ['ferrazzipietro/Mistral-7B-Instruct-v0.2_adapters_en.layer1__v0.2_wandblog']
base_models = ['mistralai/Mistral-7B-Instruct-v0.2']
splits = ['en.layer1']
# NOTE(review): presumably loads the base model and merges the adapter checkpoint
# into it -- confirm in utils.load_merged_model_tokenizer.
merged_model, tokenizer = load_mergedModel_tokenizer(adapters_checkpoints[0], base_models[0])

# Load the dataset from the Hub and keep only the split named by `splits[0]`.
dataset = load_dataset("ferrazzipietro/e3c-sentences", token=HF_TOKEN)
dataset = dataset[splits[0]]
preprocessor = DataPreprocessor()
dataset = preprocessor.preprocess_data_one_layer(dataset)
# Only the second element of the returned triple is kept (bound to `val_data`).
_, val_data, _ = preprocessor.split_layer_into_train_val_test_(dataset, splits[0])

  from .autonotebook import tqdm as notebook_tqdm


ImportError: cannot import name 'load_model_tokenizer' from 'utils.load_merged_model_tokenizer' (/home/pferrazzi/mistral_finetuning/utils/load_merged_model_tokenizer.py)

In [33]:
from utils.output_cleaner import OutputCleaner
from utils.test_data_processor import TestDataProcessor
import pandas as pd

# Generation settings; both values are also encoded in the backup file name.
max_new_tokens_factor = 6
n_shots_inference = 2

# Derive the language code and the backup path once, so that the path read
# from and the path written to can never drift apart (the read path used to
# hard-code "en" while the write path was computed from `splits[0]`).
language = splits[0].split('.')[0]
backup_path = f"data/test_data_processed/{language}_nShots{n_shots_inference}_maxNewTokensFactor{max_new_tokens_factor}.csv"

# Set to True to reuse previously generated responses instead of running the model.
from_backup_file = False
if from_backup_file:
    # Rebuild the processor around the responses saved by an earlier run.
    tmp_data = pd.read_csv(backup_path)
    tmp_data = Dataset.from_pandas(tmp_data)
    postprocessor = TestDataProcessor(test_data=tmp_data, preprocessor=preprocessor, n_shots_inference=n_shots_inference, language=language, tokenizer=tokenizer)
else:
    # Build prompts and ground truth, generate responses, then save the result
    # to `backup_path` so a later run can skip generation.
    postprocessor = TestDataProcessor(test_data=val_data, preprocessor=preprocessor, n_shots_inference=n_shots_inference, language=language, tokenizer=tokenizer)
    postprocessor.add_inference_prompt_column()
    postprocessor.add_ground_truth_column()
    postprocessor.add_responses_column(model=merged_model, tokenizer=tokenizer, batch_size=24, max_new_tokens_factor=max_new_tokens_factor)
    postprocessor.test_data.to_csv(backup_path, index=False)

generating responses:   4%|▎         | 24/681 [00:46<21:16,  1.94s/it]

In [31]:
import json
import re

class OutputCleaner():
    """
    Turn raw model responses into JSON-parsable strings where possible.

    Each example is expected to carry the raw response under the
    'model_responses' key; the cleaned string is returned under 'model_output'.
    """

    def __init__(self) -> None:
        pass

    def _assess_model_output(self, model_response: str) -> bool:
        """
        Check whether a string can be parsed as JSON.

        Args:
        model_response (str): the candidate string to validate

        return:
        bool: True if `json.loads` accepts the string, False otherwise
        """
        try:
            json.loads(model_response)
        except (ValueError, TypeError):
            # ValueError covers json.JSONDecodeError (malformed JSON);
            # TypeError covers non-string input such as None.
            return False
        return True

    def _clean_model_output(self, example: dict) -> dict:
        """
        Clean the raw response of one example so it can be parsed as JSON.

        Strategies, tried in order:
        1. keep the response unchanged if it already parses as JSON;
        2. extract the first '[{...}]' span and keep it if it parses;
        3. if the square brackets are unbalanced, cut the response after the
           last complete entity (the last '},') and close the list with ']';
        4. otherwise report the row as broken and return it unchanged.

        Args:
        example (dict): a row holding the raw response under 'model_responses'

        return:
        dict: {'model_output': <cleaned or original response string>}
        """
        def has_unclosed_square_brackets(s):
            # True when brackets are unbalanced: either a ']' appears before
            # its matching '[', or at least one '[' is never closed.
            count = 0
            for char in s:
                if char == '[':
                    count += 1
                elif char == ']':
                    count -= 1
                    if count < 0:
                        return True
            return count > 0

        model_output = example['model_responses']
        if self._assess_model_output(model_output):
            return {'model_output': model_output}

        # Non-greedy match: the shortest '[{...}]' span found on a single line.
        match = re.search(r'\[\{(.+?)\}\]', model_output)
        if match is not None:
            candidate = '[{' + match.group(1) + '}]'
            if self._assess_model_output(candidate):
                return {'model_output': candidate}

        if has_unclosed_square_brackets(model_output):
            last_bracket_index = model_output.rfind('},')  # find the last complete entity
            # rfind returns -1 when no complete entity exists. Slicing with
            # that value would reduce the row to a bare ']', so in that case
            # fall through and report the row as broken instead.
            if last_bracket_index != -1:
                return {'model_output': model_output[:last_bracket_index + 1] + ']'}

        # Rows with no usable bracket structure are reported and left as-is.
        print('THIS IS A BROKEN ROW: ', model_output)

        return {'model_output': model_output}

    def apply_cleaning(self, data):
        """
        Apply `_clean_model_output` to every row of `data`.

        Args:
        data: an object exposing `.map(function)`; each row must contain the
              'model_responses' key

        return:
        the result of `data.map`, where every row has been passed through
        `_clean_model_output`
        """
        return data.map(self._clean_model_output)

In [12]:
# Inspect the dataset held by the postprocessor (bare last expression renders its repr).
postprocessor.test_data

Dataset({
    features: ['sentence', 'entities', 'original_text', 'original_id', 'prompt'],
    num_rows: 681
})

In [32]:
# Run every row through OutputCleaner and rebind the processor's dataset to
# the mapped result.
output_cleaner = OutputCleaner()
postprocessor.test_data = output_cleaner.apply_cleaning(postprocessor.test_data)


Map: 100%|██████████| 681/681 [00:00<00:00, 8581.97 examples/s]

THIS IS A BROKEN ROW:  {"entity": "signs"}, {"entity": "compression"}, {"entity": "signs"}, {"entity": "respiratory"}, {"entity": "digestive"}, {"entity": "vascular"}, {"entity": "neurologic"}, {"entity
THIS IS A BROKEN ROW:  {"entity": "The mass"}, {"entity": "the front and the two sides of the neck"}, {"entity": "a 32-year-old woman"}, {"entity": "A 9-month-old boy"}, {"entity": "3-day"},
THIS IS A BROKEN ROW:  {"entity": "The mass"}, {"entity": "firm"}, {"entity": "painless"}, {"entity": "movements"}, {"entity": "swallowing"}, {"entity": "a 32-year-old woman"}, {"entity": "9
THIS IS A BROKEN ROW:  {"entity": "CT"}, {"entity": "revealed"}, {"entity": "thyroid mass"}, {"entity": "plunging"}, {"entity": "the anterior mediastinum"}, {"entity": "a partially calcified thyroid mass"}, {"entity
THIS IS A BROKEN ROW:  {"entity": "surrounded"}, {"entity": "narrowed"}, {"entity": "deviated"}, {"entity": "goiter"}, {"entity": "the lower part of the larynx"}, {"entity": "The trachea"}, {"entity"




In [12]:
# Build the evaluator from the preprocessor and the postprocessor's test data.
evaluator = Evaluator(preprocessor, postprocessor.test_data)
# NOTE(review): `similar_is_equal` with threshold 80 presumably enables fuzzy
# matching between predicted and gold entities -- confirm in utils.evaluator.
results = evaluator.generate_evaluation_table(similar_is_equal=True, similar_is_equal_threshold=80)
# Display the three summary metrics as a tuple.
results['precision'], results['recall'], results['f1']

ORIGINAL model_response:  []
GROUND THRUTH:  ['hypertension', 'dyslipidemia', 'diagnosed', 'mellitus', 'referred', 'hypokalemia', 'new-onset diabetes mellitus', 'A 46-year-old man', '4-months', '1-month']
NEW model_response:  [] 


ORIGINAL model_response:  ['study', 'tests', 'indicated', 'ECS']
GROUND THRUTH:  ['study', 'tests', 'indicated', 'ECS']
NEW model_response:  ['study', 'tests', 'indicated', 'ECS'] 


ORIGINAL model_response:  ['findings', 'pointed', 'malignancy', 'metastases', 'primary right parotid malignancy', 'liver metastases']
GROUND THRUTH:  ['findings', 'pointed', 'primary right parotid malignancy', 'liver metastases']
NEW model_response:  ['findings', 'pointed', 'malignancy', 'metastases', 'primary right parotid malignancy', 'liver metastases'] 


ORIGINAL model_response:  []
GROUND THRUTH:  ['parotidectomy', 'examination', 'confirmed', 'ACC', 'The patient']
NEW model_response:  [] 


ORIGINAL model_response:  []
GROUND THRUTH:  ['hypercortisolism', 'managed']
NEW mo



ORIGINAL model_response:  []
GROUND THRUTH:  ['re-presented', 'flare', 'thrombosis', 'anticoagulation', 'PV', 'deep vein thrombosis', 'vein', 'the patient', 'late September']
NEW model_response:  [] 


ORIGINAL model_response:  []
GROUND THRUTH:  ['Warfarin', 'changed', 'heparin', 'treatment', 'escalated', 'azathioprine', 'switched', 'mycophenolate', 'increased', 'response', 'given', 'courses', 'received', 'induced', 'remission', 'pemphigus', 'he']
NEW model_response:  [] 


ORIGINAL model_response:  []
GROUND THRUTH:  ['decreased', 'anticoagulation', 'continued', 'ensure', 'decreasing', 'suppression', 'allow', 'flare', 'April 2015', 'July 2015']
NEW model_response:  [] 


ORIGINAL model_response:  []
GROUND THRUTH:  ['reported', 'complaints', 'cough', 'pain', 'appetite', 'fever', 'sweats', 'chest pain', 'reduced appetite', 'fever with chills', 'night sweats', 'chest', 'A 43-year-old non-diabetic Indian male', 'two weeks']
NEW model_response:  [] 


ORIGINAL model_response:  []
GROUND 

(0.6993630573248407, 0.14886117136659435, 0.24547283702213277)