In [None]:
#from google.colab import drive
#drive.mount('/gdrive')
#%cd /gdrive/My Drive/NLP_progetto_2024

In [None]:
#!pip install transformers

In [None]:
#!wandb login

In [None]:
#import os
#os.environ["WANDB_DISABLED"] = "true"

In [None]:
import json
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

2024-05-20 10:49:15.368784: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-20 10:49:15.368884: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-20 10:49:15.512255: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## 1) Preprocess Data


In [None]:
# Flatten the json file
def flatten_data(y):
    """Collapse a nested dict/list structure into a single-level dict.

    Keys of the result are the path to each leaf, with the path components
    joined by underscores: dict keys are used verbatim and list items
    contribute their position (``0``, ``1``, ...). Only exact ``dict`` and
    ``list`` instances are descended into; anything else is stored as a leaf.
    Empty dicts and lists contribute no entries. A non-container input is
    returned under the empty-string key.

    Args:
        y: The nested structure to flatten. Dict keys must be strings, since
            they are concatenated into the path.

    Returns:
        dict mapping each underscore-joined path to its leaf value, in
        traversal order.
    """
    def _walk(node, prefix):
        # Yield (path, leaf) pairs depth-first, in the order the leaves appear.
        kind = type(node)
        if kind is dict:
            for key in node:
                yield from _walk(node[key], prefix + key + '_')
        elif kind is list:
            for position, item in enumerate(node):
                yield from _walk(item, prefix + str(position) + '_')
        else:
            # Strip the trailing separator added by the parent level.
            yield prefix[:-1], node

    flat = {}
    for path, leaf in _walk(y, ''):
        flat[path] = leaf
    return flat

In [None]:
# Import json file. The context manager closes the handle as soon as the
# content has been parsed; the encoding matches the one used for the output
# files written at the end of this cell.
with open("/kaggle/input/medical-meadow-wikidoc-medical-flashcards/medical_meadow_wikidoc_medical_flashcards (1).json", encoding='utf-8') as file_json:
    data = json.load(file_json)

# Convert from json to dataframe
df = pd.DataFrame.from_dict(data)

# We drop the column instruction since it's not
# useful for the training
df = df.drop(columns='instruction')

# We split the dataset into training, validation and test:
# 70% for training, then the held-out 30% is split again so that
# 21% of the full data goes to validation and 9% to test
train_data, test_data = train_test_split(df, test_size=0.3, random_state=42)
test_data, val_data = train_test_split(test_data, test_size=0.7, random_state=42)

# We convert to json again, removing the index inserted by the dataframe
# (orient='values' yields one [col0, col1] list per row)
json_dict_train = json.loads(train_data.to_json(orient='values'))
json_dict_val = json.loads(val_data.to_json(orient='values'))
json_dict_test = json.loads(test_data.to_json(orient='values'))

# We add '[Q]' in front of all the questions and '[A]' in front
# of all the answers to be able to differentiate between the
# beginning of the question and the beginning of the answer.
# Assumes the first remaining column is the question and the second the
# answer -- TODO confirm against the column order of the source file.
for pairs in (json_dict_train, json_dict_val, json_dict_test):
    for pair in pairs:
        pair[0] = '[Q]' + pair[0]
        pair[1] = '[A]' + pair[1]

# Flatten the json files
json_dict_val = flatten_data(json_dict_val)
json_dict_train = flatten_data(json_dict_train)
json_dict_test = flatten_data(json_dict_test)

# Save the json files modified in this way to use them later (maybe)
for out_path, payload in (
    ('/kaggle/working/data_train.json', json_dict_train),
    ('/kaggle/working/data_val.json', json_dict_val),
    ('/kaggle/working/data_test.json', json_dict_test),
):
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)

## 2) GPT2 Fine Tuning

In [None]:
# Function to build the dataset
def load_dataset(file_path, tokenizer, block_size = 128):
    """Create a ``TextDataset`` for the file stored at ``file_path``.

    Args:
        file_path: Path of the file to read (here, one of the json files
            written by the preprocessing step).
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Block size handed to ``TextDataset``; defaults to 128.

    Returns:
        The newly built ``TextDataset``.
    """
    return TextDataset(
        file_path=file_path,
        tokenizer=tokenizer,
        block_size=block_size,
    )

In [None]:
# Function to build the data collator. With mlm=False it prepares batches for
# causal (next-token) language modeling, which is how GPT-2 is fine-tuned here
def load_data_collator(tokenizer, mlm = False):
    """Create the ``DataCollatorForLanguageModeling`` used to batch samples.

    Args:
        tokenizer: Tokenizer handed to the collator.
        mlm: Forwarded as the collator's ``mlm`` flag; defaults to ``False``.

    Returns:
        The newly built ``DataCollatorForLanguageModeling``.
    """
    return DataCollatorForLanguageModeling(mlm=mlm, tokenizer=tokenizer)

In [None]:
# Function to manage the training
def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps,val_file_path):
    """Fine-tune a pretrained GPT-2 checkpoint and save it to ``output_dir``.

    The tokenizer and the (not yet fine-tuned) model are written to
    ``output_dir`` before training starts; the fine-tuned model is written to
    the same directory by ``trainer.save_model()`` once training is over.
    Evaluation on the validation set runs once per epoch.

    Args:
        train_file_path: File used to build the training dataset.
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory receiving the tokenizer and the model.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Checkpoint interval. A positive value enables
            step-based checkpointing with that interval; ``0`` (or ``None``)
            disables intermediate checkpoints so the model is only saved by
            the explicit calls in this function.
        val_file_path: File used to build the validation dataset.

    Returns:
        None.
    """
    # We are going to use the tokenizer for GPT2
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    val_dataset = load_dataset(val_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # The Trainer below is not handed the tokenizer, so it is stored
    # explicitly next to the model to make output_dir loadable on its own.
    tokenizer.save_pretrained(output_dir)

    # Import GPT2 for language modeling
    model = GPT2LMHeadModel.from_pretrained(model_name)

    model.save_pretrained(output_dir)

    # Honour save_steps instead of silently ignoring it: only a positive
    # interval turns on intermediate checkpoints.
    if save_steps and save_steps > 0:
        checkpoint_args = {'save_strategy': 'steps', 'save_steps': save_steps}
    else:
        # We want to save only at the end
        checkpoint_args = {'save_strategy': 'no'}

    training_args = TrainingArguments(
        report_to="none",
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Evaluate every epoch
        # NOTE(review): newer transformers releases rename this argument to
        # eval_strategy -- confirm against the installed version.
        evaluation_strategy='epoch',
        # Set mixed precision
        # NOTE(review): presumably requires a GPU runtime -- confirm.
        fp16=True,
        **checkpoint_args,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train()
    trainer.save_model()

In [None]:
# Trainer params

# Pretrained checkpoint to start from
model_name = "gpt2"

# Input files and destination directory
train_file_path = "/kaggle/working/data_train.json"
val_file_path = "/kaggle/working/data_val.json"
output_dir = "/kaggle/working/output"
overwrite_output_dir = False

# Training schedule
num_train_epochs = 10
per_device_train_batch_size = 8

In [None]:
#!pip install accelerate

In [None]:
# Launch the fine-tuning run with the parameters defined above.
# save_steps=0 means no intermediate checkpoints are requested during training.
train(
    # data
    train_file_path=train_file_path,
    val_file_path=val_file_path,
    # model and destination
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    # schedule
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    save_steps=0,
)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,1.7096,1.607048
2,1.5361,1.530275
3,1.4341,1.489915
4,1.3597,1.469549
5,1.2986,1.456332
6,1.2532,1.449715
7,1.2095,1.448546
8,1.1746,1.449344
9,1.1476,1.449584
10,1.129,1.452726


## 3) Test the model

In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [None]:
# Load the model from path
def load_model(model_path):
    """Return the ``GPT2LMHeadModel`` loaded from ``model_path``."""
    return GPT2LMHeadModel.from_pretrained(model_path)

# Load tokenizer from path
def load_tokenizer(tokenizer_path):
    """Return the ``GPT2Tokenizer`` loaded from ``tokenizer_path``."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)

# Generate text given a starting sequence
def generate_text(model_path, sequence, max_length):
    """Generate a continuation of ``sequence`` with the model at ``model_path``.

    The model and the tokenizer are both loaded from ``model_path`` on every
    call. Generation samples (``do_sample=True`` with ``top_k=50`` and
    ``top_p=0.95``), so repeated calls with the same prompt may give
    different text.

    Args:
        model_path: Directory holding both the saved model and tokenizer.
        sequence: Prompt to continue; it is converted to ``str`` first.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
            NOTE(review): per the transformers generation docs this bounds
            prompt tokens plus generated tokens, not the continuation
            alone -- confirm for the installed version.

    Returns:
        The decoded text (special tokens skipped). The same text is also
        printed, as before.
    """
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, which is handed to generate() explicitly instead of leaving it
    # to be inferred.
    encoded = tokenizer(str(sequence), return_tensors='pt')
    final_outputs = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        do_sample=True,
        max_length=max_length,
        # The end-of-sequence token id is reused as the padding id.
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )

    text = tokenizer.decode(final_outputs[0], skip_special_tokens=True)
    print(text)
    return text

In [None]:
# We test the model with a sample question, loading the fine-tuned model
# from the directory it was saved to
model1_path = "/kaggle/working/output"
# Put '[Q]' at the beginning of the question and '[A]' where the answer should start
sequence1 = "[Q] What is the recommended management for a patient with unstable angina and a positive stress test? [A]"

# Length limit handed to generate_text, which forwards it as `max_length`.
# NOTE(review): `max_length` appears to count the prompt tokens as well, so
# this caps prompt + answer together rather than the answer alone -- confirm
# against the transformers generation docs.
max_len = 50

generate_text(model1_path, sequence1, max_len)
# Reference answer printed for a manual comparison with the generated text
print("The correct answer is: The recommended management for a patient with unstable angina and a positive stress test is cardiac catheterization.")

[Q] What is the recommended management for a patient with unstable angina and a positive stress test? [A]The recommended management for a patient with unstable angina and a positive stress test is coronary artery dissection. Antibody therapy is
The correct answer is: The recommended management for a patient with unstable angina and a positive stress test is cardiac catheterization.


## References


*   https://github.com/bnsreenu/python_for_microscopists/blob/master/311_fine_tuning_GPT2.ipynb
*   https://huggingface.co/docs/transformers/model_doc/gpt2

