In [1]:
import os
# Redirect the Hugging Face `transformers` download cache to '/mnt/software/'.
# NOTE(review): presumably this must be set before `transformers` is first imported
# (the project modules imported below appear to pull it in) — confirm before reordering.
os.environ['TRANSFORMERS_CACHE'] = '/mnt/software/'
import sys
import gc
# assuming data, models, engine in flicc directory:
# flicc_path is the parent of the current working directory; joining with '' appends a
# trailing path separator. It is put on sys.path so the project modules below resolve,
# and the grid-search cell also passes it as `dataset_url`.
flicc_path = os.path.join(os.path.dirname(os.getcwd()), '')
sys.path.append(flicc_path)
import torch
# Project-local modules, importable only because of the sys.path entry added above.
from data import ClimateDataset
from models import ClassificationModel
from engine import Engine

# Optional: uncomment to silence library warnings in the cell outputs.
# import warnings
# warnings.filterwarnings("ignore")

In [2]:
# Checkpoint identifiers to sweep over, in the order they are trained.
# 'EleutherAI/gpt-j-6B' is deliberately left out of the list for now.
model_checkpoints = [
    'EleutherAI/gpt-neo-1.3B',
    'bigscience/bloom-560m',
    'microsoft/deberta-v2-xlarge',
]

In [3]:
# Accumulator for the sweep: one list per recorded quantity, appended to once per
# training run so that index i in every list refers to the same run.
results = dict(
    test_acc=[],
    test_f1=[],
    eval_acc=[],
    eval_f1=[],
    lr=[],
)

In [4]:
# Grid search: fine-tune every checkpoint in `model_checkpoints` at every learning rate
# and append the returned test/eval accuracy and F1 to `results`.
learning_rates = [1.0e-5, 5.0e-5, 1.0e-4]
for model_checkpoint in model_checkpoints:
    for lr in learning_rates:
        print(f'Grid search {model_checkpoint}, learning rate {lr}')
        # Build a fresh dataset, model and trainer for every combination so that no
        # state is shared between runs.
        data = ClimateDataset(model_to_train=1,
                              model_checkpoint=model_checkpoint,
                              dataset_url=flicc_path,
                              batch_size=32)
        data.setup_dataloaders()
        model = ClassificationModel(model_checkpoint=data.model_checkpoint,
                                    num_labels=data.num_labels)
        trainer = Engine(epochs=30, labels=data.labels)
        trainer.model = model.model
        test_acc, test_f1, eval_acc, eval_f1 = trainer.run(lr=lr,
                                                            wd=0.0,
                                                            train_dataloader=data.train_dataloader,
                                                            eval_dataloader=data.eval_dataloader,
                                                            test_dataloader=data.test_dataloader,
                                                            # accumulation_steps=32,
                                                            early_stop=3)
        # Record which checkpoint produced this row: the learning rates repeat for
        # every checkpoint, so `lr` alone cannot identify a run. `setdefault` keeps
        # this working with a `results` dict that was created without the key.
        results.setdefault('model_checkpoint', []).append(model_checkpoint)
        results['test_acc'].append(test_acc)
        results['test_f1'].append(test_f1)
        results['eval_acc'].append(eval_acc)
        results['eval_f1'].append(eval_f1)
        results['lr'].append(lr)
        print('### '*10)
        print(results)
        print('### '*10)
        # Release GPU memory before the next (possibly larger) model is loaded.
        # Order matters: drop the references, collect any cycles, and only then ask
        # the CUDA caching allocator to return its now-unused blocks. Emptying the
        # cache while the objects are still referenced cannot release their memory.
        del data, model, trainer
        gc.collect()
        torch.cuda.empty_cache()

Grid search EleutherAI/gpt-neo-1.3B, learning rate 1e-05


Downloading (…)okenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Map:   0%|          | 0/1796 [00:00<?, ? examples/s]

Map:   0%|          | 0/256 [00:00<?, ? examples/s]

Map:   0%|          | 0/457 [00:00<?, ? examples/s]

Downloading model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-1.3B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	0.6039	Val Loss:	0.4725	Accuracy:	0.7965	F1:	0.6561 *
2 / 30: Train Loss:	0.2434	Val Loss:	0.5836	Accuracy:	0.7943	F1:	0.6828 *
3 / 30: Train Loss:	0.1067	Val Loss:	0.5921	Accuracy:	0.7549	F1:	0.6627
4 / 30: Train Loss:	0.0569	Val Loss:	1.2122	Accuracy:	0.5711	F1:	0.5643
5 / 30: Train Loss:	0.0873	Val Loss:	0.6382	Accuracy:	0.7330	F1:	0.6765
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
              precision    recall  f1-score   support

       bknow       0.69      0.39      0.49       119
      struct       0.81      0.94      0.87       338

    accuracy                           0.79       457
   macro avg       0.75      0.66      0.68       457
weighted avg       0.78      0.79      0.77       457

test results:
              precision    recall  f1-score   support

       bknow       0.55      0.31      0.40        67
      struct       0.79      0.91      0.85       189

    accuracy                           0.75

Map:   0%|          | 0/256 [00:00<?, ? examples/s]

Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-1.3B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	0.6895	Val Loss:	0.4433	Accuracy:	0.7987	F1:	0.6870 *
2 / 30: Train Loss:	0.4122	Val Loss:	0.4394	Accuracy:	0.8096	F1:	0.7549 *
3 / 30: Train Loss:	0.2080	Val Loss:	0.5369	Accuracy:	0.7943	F1:	0.7151
4 / 30: Train Loss:	0.0816	Val Loss:	0.6084	Accuracy:	0.7877	F1:	0.7106
5 / 30: Train Loss:	0.1022	Val Loss:	0.6202	Accuracy:	0.7965	F1:	0.7208
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
              precision    recall  f1-score   support

       bknow       0.63      0.65      0.64       119
      struct       0.87      0.87      0.87       338

    accuracy                           0.81       457
   macro avg       0.75      0.76      0.75       457
weighted avg       0.81      0.81      0.81       457

test results:
              precision    recall  f1-score   support

       bknow       0.57      0.58      0.58        67
      struct       0.85      0.85      0.85       189

    accuracy                           0.78

Map:   0%|          | 0/457 [00:00<?, ? examples/s]

Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-1.3B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	0.6923	Val Loss:	0.4209	Accuracy:	0.8140	F1:	0.6970 *
2 / 30: Train Loss:	0.3206	Val Loss:	0.4470	Accuracy:	0.8184	F1:	0.7372 *
3 / 30: Train Loss:	0.3018	Val Loss:	1.6330	Accuracy:	0.4420	F1:	0.4411
4 / 30: Train Loss:	0.3428	Val Loss:	0.6431	Accuracy:	0.7287	F1:	0.6891
5 / 30: Train Loss:	0.1495	Val Loss:	0.8439	Accuracy:	0.8074	F1:	0.6956
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
              precision    recall  f1-score   support

       bknow       0.71      0.50      0.59       119
      struct       0.84      0.93      0.88       338

    accuracy                           0.82       457
   macro avg       0.78      0.72      0.74       457
weighted avg       0.81      0.82      0.81       457

test results:
              precision    recall  f1-score   support

       bknow       0.59      0.39      0.47        67
      struct       0.81      0.90      0.85       189

    accuracy                           0.77

Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-1.3B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	1.2489	Val Loss:	0.5963	Accuracy:	0.7396	F1:	0.4252 *
2 / 30: Train Loss:	0.6287	Val Loss:	0.5840	Accuracy:	0.7440	F1:	0.4428 *
3 / 30: Train Loss:	0.4367	Val Loss:	0.9823	Accuracy:	0.7681	F1:	0.5581 *
4 / 30: Train Loss:	0.2146	Val Loss:	0.9252	Accuracy:	0.7571	F1:	0.6144 *
5 / 30: Train Loss:	0.1276	Val Loss:	1.1665	Accuracy:	0.7287	F1:	0.6478 *
6 / 30: Train Loss:	0.1141	Val Loss:	1.1944	Accuracy:	0.6543	F1:	0.6106
7 / 30: Train Loss:	0.1421	Val Loss:	1.2166	Accuracy:	0.6586	F1:	0.6180
8 / 30: Train Loss:	0.1096	Val Loss:	1.4009	Accuracy:	0.7068	F1:	0.6107
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
              precision    recall  f1-score   support

       bknow       0.48      0.48      0.48       119
      struct       0.82      0.82      0.82       338

    accuracy                           0.73       457
   macro avg       0.65      0.65      0.65       457
weighted avg       0.73      0.73      0.73       457


Downloading (…)okenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Map:   0%|          | 0/1796 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/256 [00:00<?, ? examples/s]

Map:   0%|          | 0/457 [00:00<?, ? examples/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	1.2349	Val Loss:	0.4349	Accuracy:	0.7987	F1:	0.7263 *
2 / 30: Train Loss:	0.2699	Val Loss:	0.5054	Accuracy:	0.8337	F1:	0.7635 *
3 / 30: Train Loss:	0.2722	Val Loss:	1.0108	Accuracy:	0.8293	F1:	0.7207
4 / 30: Train Loss:	0.0741	Val Loss:	0.5129	Accuracy:	0.8359	F1:	0.7802 *
5 / 30: Train Loss:	0.0416	Val Loss:	0.7441	Accuracy:	0.8468	F1:	0.7918 *
6 / 30: Train Loss:	0.0183	Val Loss:	0.7027	Accuracy:	0.8206	F1:	0.7786
7 / 30: Train Loss:	0.0092	Val Loss:	0.7016	Accuracy:	0.8359	F1:	0.7864
8 / 30: Train Loss:	0.0139	Val Loss:	0.7043	Accuracy:	0.8490	F1:	0.7874
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
              precision    recall  f1-score   support

       bknow       0.74      0.64      0.68       119
      struct       0.88      0.92      0.90       338

    accuracy                           0.85       457
   macro avg       0.81      0.78      0.79       457
weighted avg       0.84      0.85      0.84       457

t

Map:   0%|          | 0/256 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	2.3121	Val Loss:	0.5016	Accuracy:	0.7593	F1:	0.6288 *
2 / 30: Train Loss:	0.3576	Val Loss:	0.6745	Accuracy:	0.8009	F1:	0.6635 *
3 / 30: Train Loss:	0.5050	Val Loss:	1.1716	Accuracy:	0.8096	F1:	0.7587 *
4 / 30: Train Loss:	0.3210	Val Loss:	1.5493	Accuracy:	0.8009	F1:	0.6352
5 / 30: Train Loss:	0.1811	Val Loss:	0.8372	Accuracy:	0.7834	F1:	0.7132
6 / 30: Train Loss:	0.0505	Val Loss:	1.1329	Accuracy:	0.7877	F1:	0.6971
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
              precision    recall  f1-score   support

       bknow       0.62      0.67      0.65       119
      struct       0.88      0.86      0.87       338

    accuracy                           0.81       457
   macro avg       0.75      0.77      0.76       457
weighted avg       0.81      0.81      0.81       457

test results:
              precision    recall  f1-score   support

       bknow       0.60      0.61      0.61        67
      struct       0.86 

Map:   0%|          | 0/457 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	3.2166	Val Loss:	0.5872	Accuracy:	0.7418	F1:	0.7042 *
2 / 30: Train Loss:	0.4308	Val Loss:	0.5799	Accuracy:	0.7396	F1:	0.7091 *
3 / 30: Train Loss:	0.3143	Val Loss:	1.5433	Accuracy:	0.7877	F1:	0.6024
4 / 30: Train Loss:	0.2822	Val Loss:	0.8559	Accuracy:	0.8074	F1:	0.7553 *
5 / 30: Train Loss:	0.1695	Val Loss:	1.4949	Accuracy:	0.8249	F1:	0.7278
6 / 30: Train Loss:	0.1297	Val Loss:	1.6206	Accuracy:	0.7746	F1:	0.7404
7 / 30: Train Loss:	0.2391	Val Loss:	1.1835	Accuracy:	0.7921	F1:	0.7365
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
              precision    recall  f1-score   support

       bknow       0.62      0.66      0.64       119
      struct       0.88      0.86      0.87       338

    accuracy                           0.81       457
   macro avg       0.75      0.76      0.76       457
weighted avg       0.81      0.81      0.81       457

test results:
              precision    recall  f1-score   support

      

Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	14.5349	Val Loss:	16.6338	Accuracy:	0.7396	F1:	0.4252 *
2 / 30: Train Loss:	6.6603	Val Loss:	1.1317	Accuracy:	0.7396	F1:	0.4252
3 / 30: Train Loss:	0.8375	Val Loss:	0.7811	Accuracy:	0.7396	F1:	0.4252
4 / 30: Train Loss:	0.6024	Val Loss:	0.7398	Accuracy:	0.7702	F1:	0.6322 *
5 / 30: Train Loss:	0.4724	Val Loss:	0.7451	Accuracy:	0.7768	F1:	0.7037 *
6 / 30: Train Loss:	0.2492	Val Loss:	0.7080	Accuracy:	0.7615	F1:	0.7036
7 / 30: Train Loss:	0.1918	Val Loss:	0.9606	Accuracy:	0.7615	F1:	0.6895
8 / 30: Train Loss:	0.1738	Val Loss:	0.8603	Accuracy:	0.7527	F1:	0.6897
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
              precision    recall  f1-score   support

       bknow       0.58      0.54      0.56       119
      struct       0.84      0.86      0.85       338

    accuracy                           0.78       457
   macro avg       0.71      0.70      0.70       457
weighted avg       0.77      0.78      0.77       457

t

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/633 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.45M [00:00<?, ?B/s]

Map:   0%|          | 0/1796 [00:00<?, ? examples/s]

Map:   0%|          | 0/256 [00:00<?, ? examples/s]

Map:   0%|          | 0/457 [00:00<?, ? examples/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.78G [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v2-xlarge and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	0.4862	Val Loss:	0.3669	Accuracy:	0.8228	F1:	0.7654 *
2 / 30: Train Loss:	0.3263	Val Loss:	0.3145	Accuracy:	0.8665	F1:	0.8281 *
3 / 30: Train Loss:	0.1904	Val Loss:	0.3806	Accuracy:	0.8643	F1:	0.8030
4 / 30: Train Loss:	0.1998	Val Loss:	0.4779	Accuracy:	0.8118	F1:	0.7165
5 / 30: Train Loss:	0.5844	Val Loss:	0.5729	Accuracy:	0.7396	F1:	0.4252
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
              precision    recall  f1-score   support

       bknow       0.74      0.76      0.75       119
      struct       0.91      0.91      0.91       338

    accuracy                           0.87       457
   macro avg       0.83      0.83      0.83       457
weighted avg       0.87      0.87      0.87       457

test results:
              precision    recall  f1-score   support

       bknow       0.67      0.66      0.66        67
      struct       0.88      0.88      0.88       189

    accuracy                           0.82

Map:   0%|          | 0/256 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v2-xlarge and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	0.6190	Val Loss:	0.6607	Accuracy:	0.7396	F1:	0.4252 *
2 / 30: Train Loss:	0.5885	Val Loss:	0.6134	Accuracy:	0.7396	F1:	0.4252
3 / 30: Train Loss:	0.5889	Val Loss:	0.6102	Accuracy:	0.7396	F1:	0.4252
4 / 30: Train Loss:	0.5895	Val Loss:	0.5918	Accuracy:	0.7396	F1:	0.4252
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
              precision    recall  f1-score   support

       bknow       0.00      0.00      0.00       119
      struct       0.74      1.00      0.85       338

    accuracy                           0.74       457
   macro avg       0.37      0.50      0.43       457
weighted avg       0.55      0.74      0.63       457

test results:
              precision    recall  f1-score   support

       bknow       0.00      0.00      0.00        67
      struct       0.74      1.00      0.85       189

    accuracy                           0.74       256
   macro avg       0.37      0.50      0.42       256
weighted 

Map:   0%|          | 0/457 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v2-xlarge and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	0.6261	Val Loss:	0.5888	Accuracy:	0.7396	F1:	0.4252 *
2 / 30: Train Loss:	0.5840	Val Loss:	0.5834	Accuracy:	0.7396	F1:	0.4252
3 / 30: Train Loss:	0.5811	Val Loss:	0.5848	Accuracy:	0.7396	F1:	0.4252
4 / 30: Train Loss:	0.5842	Val Loss:	0.5837	Accuracy:	0.7396	F1:	0.4252
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
              precision    recall  f1-score   support

       bknow       0.00      0.00      0.00       119
      struct       0.74      1.00      0.85       338

    accuracy                           0.74       457
   macro avg       0.37      0.50      0.43       457
weighted avg       0.55      0.74      0.63       457

test results:
              precision    recall  f1-score   support

       bknow       0.00      0.00      0.00        67
      struct       0.74      1.00      0.85       189

    accuracy                           0.74       256
   macro avg       0.37      0.50      0.42       256
weighted 

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v2-xlarge and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	0.8361	Val Loss:	0.5845	Accuracy:	0.7396	F1:	0.4252 *
2 / 30: Train Loss:	0.5833	Val Loss:	0.5866	Accuracy:	0.7396	F1:	0.4252
3 / 30: Train Loss:	0.5802	Val Loss:	0.5843	Accuracy:	0.7396	F1:	0.4252
4 / 30: Train Loss:	0.5766	Val Loss:	0.5844	Accuracy:	0.7396	F1:	0.4252
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
              precision    recall  f1-score   support

       bknow       0.00      0.00      0.00       119
      struct       0.74      1.00      0.85       338

    accuracy                           0.74       457
   macro avg       0.37      0.50      0.43       457
weighted avg       0.55      0.74      0.63       457

test results:
              precision    recall  f1-score   support

       bknow       0.00      0.00      0.00        67
      struct       0.74      1.00      0.85       189

    accuracy                           0.74       256
   macro avg       0.37      0.50      0.42       256
weighted 

In [5]:
# Clear the interactive namespace once the sweep is done; -f skips the confirmation prompt.
%reset -f