<a href="https://colab.research.google.com/github/deea-c/thesis_absa/blob/main/Hyperparameter_tunning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive/ so later cells can reach files stored there.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Switch to the project's code directory on Drive and print it as a sanity check.
# Later cells import `help_functions` and read "../pickles/..." — presumably both
# are resolved relative to this directory (confirm the Drive layout).
%cd /content/drive/MyDrive/Colab Notebooks/code
!pwd 

/content/drive/MyDrive/Colab Notebooks/code
/content/drive/MyDrive/Colab Notebooks/code


In [None]:
#To install
!pip install evaluate
!pip install seqeval (need to install this) 
!pip install optuna
!pip install transformers
!pip install seqeval

In [None]:
# Fix the random seed up front for reproducibility. `random_seed` is reused
# further down as the `random_state` of the train/validation split.
from transformers import set_seed
random_seed = 11
set_seed(random_seed)

In [None]:
#Packages
from help_functions import get_annotations_as_dict, align_tokens_and_annotations_bio, compute_metrics
from help_functions import metric
from help_functions import TraingDataset, LabelSet, TrainingExample, label_list, label_set
import pandas as pd 
import numpy as np
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer,EarlyStoppingCallback
from transformers import DataCollatorForTokenClassification
from sklearn.model_selection import train_test_split
import evaluate

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
#Load data
# Read the pickled English train and test sets and pass each through
# get_annotations_as_dict (imported from help_functions).
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that come from a trusted source.
df = get_annotations_as_dict( pd.read_pickle("../pickles/" + "train_English" +".pkl"))
test_df = get_annotations_as_dict( pd.read_pickle("../pickles/" + "test_English" +".pkl"))

In [None]:
# Checkpoint name shared by the tokenizer here and the model/output paths below.
model_name = 'bert-base-multilingual-cased'

# Pre-trained tokenizer for the checkpoint, plus the sentiment label set.
# Note: `label_set` (and `label_list` further down) rebind names that were also
# imported from help_functions.
tokenizer = AutoTokenizer.from_pretrained(model_name)
label_set = LabelSet(labels=["positive", "negative", "neutral"])

# Wrap the annotated train and test data in the project's dataset class.
ds = TraingDataset(
    data=df,
    tokenizer=tokenizer,
    label_set=label_set,
    tokens_per_batch=89,
)
test_ds = TraingDataset(
    data=test_df,
    tokenizer=tokenizer,
    label_set=label_set,
    tokens_per_batch=89,
)

# Hold out 20% of the training examples for validation, seeded for repeatability.
train_ds, val_ds = train_test_split(ds, test_size=0.2, random_state=random_seed)
data_collator = DataCollatorForTokenClassification(tokenizer)

# Show the id -> label mapping the model will be trained with.
label_list = label_set.ids_to_label
print(label_list)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

{0: 'O', 1: 'B-positive', 2: 'I-positive', 3: 'B-negative', 4: 'I-negative', 5: 'B-neutral', 6: 'I-neutral'}


In [None]:
import json

In [None]:

# Training configuration shared by every trial of the hyperparameter search.
# Learning rate, train batch size and weight decay are not set here: they are
# supplied per trial by `my_hp_space`.
training_args = TrainingArguments(
    output_dir=f"search/{model_name}-finetuned",
    logging_dir=f"search/{model_name}-finetuned/log",
    overwrite_output_dir=True,
    # Evaluate, log and checkpoint once per epoch.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    # The "best" checkpoint is the one with the lowest validation loss.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Reuse the notebook-wide seed instead of repeating the literal, so changing
    # `random_seed` keeps training and the data split in sync.
    seed=random_seed,
    save_total_limit=1,
    push_to_hub=False,
    load_best_model_at_end=True,
)

def model_init():
    """Build a token-classification model from the `model_name` checkpoint.

    Handed to `Trainer` through its `model_init` argument. The classification
    head is sized and labelled from the notebook-level `label_set`, and every
    parameter of the base model is explicitly marked as trainable.

    Returns:
        The model returned by `AutoModelForTokenClassification.from_pretrained`.
    """
    classifier = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_set.labels_to_id),
        label2id=label_set.labels_to_id,
        id2label=label_set.ids_to_label,
    )
    # Fine-tune the whole encoder, not just the classification head.
    for weight in classifier.base_model.parameters():
        weight.requires_grad = True
    return classifier
# Trainer used for the search: it receives `model_init` rather than a model
# instance, and stops a run early after 2 evaluations without improvement.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

def my_hp_space(trial):
    """Describe the categorical search space for one trial.

    Args:
        trial: Object exposing `suggest_categorical(name, choices)`
            (an Optuna trial when used with the "optuna" backend).

    Returns:
        dict mapping each tuned hyperparameter name to the value suggested by
        `trial`, in the order learning rate, train batch size, weight decay.
    """
    # Candidate values per hyperparameter; insertion order fixes the order in
    # which suggestions are requested from the trial.
    candidates = {
        'learning_rate': [5e-5, 3e-5, 2e-5],
        'per_device_train_batch_size': [8, 16, 32],
        'weight_decay': [1e-1, 1e-2, 1e-3, 1e-5],
    }
    return {
        name: trial.suggest_categorical(name, choices)
        for name, choices in candidates.items()
    }
def my_objective(metrics):
    """Return the value the search optimises: the "eval_val_f1" entry of `metrics`.

    Args:
        metrics: Mapping of evaluation metric names to values.

    Raises:
        KeyError: if `metrics` has no "eval_val_f1" entry.
    """
    objective_value = metrics["eval_val_f1"]
    return objective_value

# Run the search with the Optuna backend: 30 trials drawn from `my_hp_space`,
# maximising the value returned by `my_objective`.
best_run = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=my_hp_space,
    compute_objective=my_objective,
    n_trials=30,
)




In [None]:
# Persist the hyperparameters of the best run as JSON under ./search/.
with open("./search/modelbest_run.json", "w+") as f:
    json.dump(best_run.hyperparameters, f)