In [1]:
import os
# Set TRANSFORMERS_CACHE before any of the project modules below are imported.
# NOTE(review): presumably those modules import `transformers`, which needs to
# see this variable at import time — confirm before moving this line.
os.environ['TRANSFORMERS_CACHE'] = '/mnt/software/'
import sys
import gc
# assuming data, models, engine in flicc directory:
# flicc_path is the parent of the current working directory; joining with ''
# appends a trailing path separator. It is also passed as `dataset_url` to
# ClimateDataset in the grid-search cell.
flicc_path = os.path.join(os.path.dirname(os.getcwd()), '')
# Make the `data`, `models` and `engine` modules importable from flicc_path.
sys.path.append(flicc_path)
import torch
from data import ClimateDataset
from models import ClassificationModel
from engine import Engine

# import warnings
# warnings.filterwarnings("ignore")

In [2]:
# Checkpoint identifiers to fine-tune. The grid search iterates over them in
# exactly this order, so the order of the entries determines the run order.
model_checkpoints = [
    'bert-base-uncased',
    'roberta-large',
    'gpt2',
    'bigscience/bloom-560m',
    'facebook/opt-350m',
    'EleutherAI/gpt-neo-1.3B',
    'microsoft/deberta-base',
    'microsoft/deberta-v2-xlarge',
]

In [3]:
# Column-oriented accumulator for the grid search: one list per field, and the
# i-th entry of every list belongs to the i-th (model, learning-rate) run.
# A comprehension is used so that every key gets its own, independent list.
results = {
    field: []
    for field in ('test_acc', 'test_f1', 'eval_acc', 'eval_f1', 'lr', 'model')
}

In [4]:
# Grid search: fine-tune every checkpoint in `model_checkpoints` at every
# learning rate below and append the four metrics returned by `trainer.run`
# (test accuracy/F1, eval accuracy/F1) together with the run's settings to the
# column lists in `results`.
learning_rates = [1.0e-5, 5.0e-5, 1.0e-4]
for model_checkpoint in model_checkpoints:
    for lr in learning_rates:
        print(f'Grid search {model_checkpoint}, learning rate {lr}')

        # Dataset, model and engine are rebuilt from scratch for every run:
        # the dataset is constructed from the checkpoint name, and the model
        # is created from the checkpoint/label count the dataset exposes.
        data = ClimateDataset(model_to_train=4,
                              model_checkpoint=model_checkpoint,
                              dataset_url=flicc_path,
                              batch_size=32)
        data.setup_dataloaders()
        model = ClassificationModel(model_checkpoint=data.model_checkpoint,
                                    num_labels=data.num_labels)
        trainer = Engine(epochs=30, labels=data.labels)
        trainer.model = model.model

        test_acc, test_f1, eval_acc, eval_f1 = trainer.run(
            lr=lr,
            wd=0.0,
            train_dataloader=data.train_dataloader,
            eval_dataloader=data.eval_dataloader,
            test_dataloader=data.test_dataloader,
            # accumulation_steps=32,
            early_stop=3,
        )

        # Keep the six column lists aligned: exactly one append per column.
        results['test_acc'].append(test_acc)
        results['test_f1'].append(test_f1)
        results['eval_acc'].append(eval_acc)
        results['eval_f1'].append(eval_f1)
        results['lr'].append(lr)
        results['model'].append(model_checkpoint)

        # Dump the running results after every run so partial progress is
        # visible in the cell output.
        print('### '*10)
        print(results)
        print('### '*10)

        # Free this run's memory before the next model is loaded. The order
        # matters: torch.cuda.empty_cache() only returns cached blocks that no
        # live tensor occupies, so the references have to be dropped and
        # garbage-collected first. No torch.no_grad() is needed here, since
        # emptying the cache performs no autograd-tracked computation.
        del data, model, trainer
        gc.collect()
        torch.cuda.empty_cache()

Grid search bert-base-uncased, learning rate 1e-05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	2.3806	Val Loss:	2.1792	Accuracy:	0.2976	F1:	0.1813 *
2 / 30: Train Loss:	2.0266	Val Loss:	1.8814	Accuracy:	0.4354	F1:	0.3076 *
3 / 30: Train Loss:	1.7194	Val Loss:	1.6721	Accuracy:	0.4923	F1:	0.3687 *
4 / 30: Train Loss:	1.4581	Val Loss:	1.5409	Accuracy:	0.5405	F1:	0.4116 *
5 / 30: Train Loss:	1.2442	Val Loss:	1.4461	Accuracy:	0.5755	F1:	0.4443 *
6 / 30: Train Loss:	1.0456	Val Loss:	1.3050	Accuracy:	0.5952	F1:	0.4636 *
7 / 30: Train Loss:	0.8667	Val Loss:	1.2323	Accuracy:	0.6258	F1:	0.5165 *
8 / 30: Train Loss:	0.6978	Val Loss:	1.2087	Accuracy:	0.6346	F1:	0.5496 *
9 / 30: Train Loss:	0.5722	Val Loss:	1.1688	Accuracy:	0.6346	F1:	0.5366
10 / 30: Train Loss:	0.4586	Val Loss:	1.1584	Accuracy:	0.6411	F1:	0.5581 *
11 / 30: Train Loss:	0.3750	Val Loss:	1.1686	Accuracy:	0.6346	F1:	0.5640 *
12 / 30: Train Loss:	0.3041	Val Loss:	1.1892	Accuracy:	0.6324	F1:	0.5840 *
13 / 30: Train Loss:	0.2513	Val Loss:	1.2038	Accuracy:	0.6543	F1:	0.6189 *
14 / 30: Train Loss:	0.2072	Val Loss

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	2.0734	Val Loss:	1.5805	Accuracy:	0.5164	F1:	0.3812 *
2 / 30: Train Loss:	1.2720	Val Loss:	1.2624	Accuracy:	0.5886	F1:	0.5053 *
3 / 30: Train Loss:	0.7652	Val Loss:	1.2007	Accuracy:	0.6193	F1:	0.5654 *
4 / 30: Train Loss:	0.4265	Val Loss:	1.2871	Accuracy:	0.6171	F1:	0.5642
5 / 30: Train Loss:	0.2667	Val Loss:	1.4833	Accuracy:	0.6083	F1:	0.5814 *
6 / 30: Train Loss:	0.1595	Val Loss:	1.4199	Accuracy:	0.6368	F1:	0.6064 *
7 / 30: Train Loss:	0.0882	Val Loss:	1.3384	Accuracy:	0.6521	F1:	0.6236 *
8 / 30: Train Loss:	0.0407	Val Loss:	1.4377	Accuracy:	0.6696	F1:	0.6335 *
9 / 30: Train Loss:	0.0269	Val Loss:	1.4970	Accuracy:	0.6543	F1:	0.6271
10 / 30: Train Loss:	0.0185	Val Loss:	1.4649	Accuracy:	0.6652	F1:	0.6306
11 / 30: Train Loss:	0.0177	Val Loss:	1.5012	Accuracy:	0.6521	F1:	0.6136
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
                         precision    recall  f1-score   support

             ad hominem       0.69    

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	1.9084	Val Loss:	1.4854	Accuracy:	0.5295	F1:	0.3998 *
2 / 30: Train Loss:	1.1017	Val Loss:	1.4341	Accuracy:	0.5602	F1:	0.5157 *
3 / 30: Train Loss:	0.6575	Val Loss:	1.2939	Accuracy:	0.6105	F1:	0.5584 *
4 / 30: Train Loss:	0.3754	Val Loss:	1.2815	Accuracy:	0.6433	F1:	0.6067 *
5 / 30: Train Loss:	0.2356	Val Loss:	1.4374	Accuracy:	0.6543	F1:	0.6102 *
6 / 30: Train Loss:	0.1206	Val Loss:	1.5037	Accuracy:	0.6346	F1:	0.6226 *
7 / 30: Train Loss:	0.0652	Val Loss:	1.6122	Accuracy:	0.6368	F1:	0.6123
8 / 30: Train Loss:	0.0393	Val Loss:	1.5840	Accuracy:	0.6433	F1:	0.6311 *
9 / 30: Train Loss:	0.0641	Val Loss:	1.7202	Accuracy:	0.6149	F1:	0.6014
10 / 30: Train Loss:	0.0921	Val Loss:	2.0348	Accuracy:	0.5952	F1:	0.5788
11 / 30: Train Loss:	0.1538	Val Loss:	1.8333	Accuracy:	0.6083	F1:	0.5873
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
                         precision    recall  f1-score   support

             ad hominem       0.68    

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	2.2349	Val Loss:	1.6003	Accuracy:	0.5033	F1:	0.3722 *
2 / 30: Train Loss:	1.3532	Val Loss:	1.1752	Accuracy:	0.6149	F1:	0.5104 *
3 / 30: Train Loss:	0.8966	Val Loss:	0.9958	Accuracy:	0.6805	F1:	0.6482 *
4 / 30: Train Loss:	0.6150	Val Loss:	0.9477	Accuracy:	0.7068	F1:	0.6980 *
5 / 30: Train Loss:	0.4273	Val Loss:	0.9666	Accuracy:	0.7155	F1:	0.7054 *
6 / 30: Train Loss:	0.3208	Val Loss:	1.2816	Accuracy:	0.6718	F1:	0.6485
7 / 30: Train Loss:	0.2258	Val Loss:	1.0949	Accuracy:	0.7287	F1:	0.6992
8 / 30: Train Loss:	0.1275	Val Loss:	1.1484	Accuracy:	0.7287	F1:	0.7112 *
9 / 30: Train Loss:	0.0960	Val Loss:	1.2356	Accuracy:	0.7090	F1:	0.7097
10 / 30: Train Loss:	0.0722	Val Loss:	1.2278	Accuracy:	0.7155	F1:	0.7070
11 / 30: Train Loss:	0.0483	Val Loss:	1.2260	Accuracy:	0.7177	F1:	0.6995
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
                         precision    recall  f1-score   support

             ad hominem       0.79      

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	1.8926	Val Loss:	1.3415	Accuracy:	0.5514	F1:	0.4446 *
2 / 30: Train Loss:	1.1432	Val Loss:	1.1086	Accuracy:	0.6565	F1:	0.6078 *
3 / 30: Train Loss:	0.8291	Val Loss:	1.0573	Accuracy:	0.6915	F1:	0.6545 *
4 / 30: Train Loss:	0.4967	Val Loss:	1.1628	Accuracy:	0.6871	F1:	0.6634 *
5 / 30: Train Loss:	0.3243	Val Loss:	1.1722	Accuracy:	0.6871	F1:	0.6774 *
6 / 30: Train Loss:	0.2365	Val Loss:	1.2276	Accuracy:	0.6718	F1:	0.6496
7 / 30: Train Loss:	0.2223	Val Loss:	1.3473	Accuracy:	0.6783	F1:	0.6610
8 / 30: Train Loss:	0.1503	Val Loss:	1.2293	Accuracy:	0.7133	F1:	0.7068 *
9 / 30: Train Loss:	0.1215	Val Loss:	1.2906	Accuracy:	0.7046	F1:	0.6964
10 / 30: Train Loss:	0.0811	Val Loss:	1.5512	Accuracy:	0.6783	F1:	0.6678
11 / 30: Train Loss:	0.0829	Val Loss:	1.5122	Accuracy:	0.6740	F1:	0.6426
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
                         precision    recall  f1-score   support

             ad hominem       0.85      

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	2.4353	Val Loss:	2.3808	Accuracy:	0.1466	F1:	0.0213 *
2 / 30: Train Loss:	2.4132	Val Loss:	2.3784	Accuracy:	0.1466	F1:	0.0213
3 / 30: Train Loss:	2.4127	Val Loss:	2.3805	Accuracy:	0.1225	F1:	0.0182
4 / 30: Train Loss:	2.4002	Val Loss:	2.3823	Accuracy:	0.1466	F1:	0.0213
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
                         precision    recall  f1-score   support

             ad hominem       0.15      1.00      0.26        67
               anecdote       0.00      0.00      0.00        43
         cherry picking       0.00      0.00      0.00        56
      conspiracy theory       0.00      0.00      0.00        39
           fake experts       0.00      0.00      0.00        12
           false choice       0.00      0.00      0.00        13
      false equivalence       0.00      0.00      0.00        14
impossible expectations       0.00      0.00      0.00        37
      misrepresentation       0.00  

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	3.0621	Val Loss:	2.5606	Accuracy:	0.1269	F1:	0.0794 *
2 / 30: Train Loss:	2.5522	Val Loss:	2.4449	Accuracy:	0.1466	F1:	0.0785
3 / 30: Train Loss:	2.4509	Val Loss:	2.3903	Accuracy:	0.1532	F1:	0.0802 *
4 / 30: Train Loss:	2.4074	Val Loss:	2.3565	Accuracy:	0.1816	F1:	0.0896 *
5 / 30: Train Loss:	2.4024	Val Loss:	2.3291	Accuracy:	0.2101	F1:	0.1032 *
6 / 30: Train Loss:	2.3667	Val Loss:	2.3008	Accuracy:	0.2254	F1:	0.1145 *
7 / 30: Train Loss:	2.3151	Val Loss:	2.2692	Accuracy:	0.2429	F1:	0.1272 *
8 / 30: Train Loss:	2.2843	Val Loss:	2.2361	Accuracy:	0.2385	F1:	0.1324 *
9 / 30: Train Loss:	2.2478	Val Loss:	2.1835	Accuracy:	0.2626	F1:	0.1542 *
10 / 30: Train Loss:	2.1792	Val Loss:	2.1250	Accuracy:	0.3107	F1:	0.2018 *
11 / 30: Train Loss:	2.0999	Val Loss:	2.0360	Accuracy:	0.3239	F1:	0.2205 *
12 / 30: Train Loss:	2.0076	Val Loss:	1.9080	Accuracy:	0.3654	F1:	0.2574 *
13 / 30: Train Loss:	1.8858	Val Loss:	1.8251	Accuracy:	0.3917	F1:	0.2826 *
14 / 30: Train Loss:	1.7937	Val Loss

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	2.6266	Val Loss:	2.3424	Accuracy:	0.2013	F1:	0.0824 *
2 / 30: Train Loss:	2.3336	Val Loss:	2.2754	Accuracy:	0.2276	F1:	0.1327 *
3 / 30: Train Loss:	2.1797	Val Loss:	2.0540	Accuracy:	0.3085	F1:	0.2483 *
4 / 30: Train Loss:	1.7936	Val Loss:	1.7507	Accuracy:	0.4026	F1:	0.3460 *
5 / 30: Train Loss:	1.4829	Val Loss:	1.5700	Accuracy:	0.4595	F1:	0.4094 *
6 / 30: Train Loss:	1.2146	Val Loss:	1.5907	Accuracy:	0.4748	F1:	0.4251 *
7 / 30: Train Loss:	1.0116	Val Loss:	1.6436	Accuracy:	0.4814	F1:	0.4558 *
8 / 30: Train Loss:	0.7988	Val Loss:	1.8927	Accuracy:	0.4902	F1:	0.4747 *
9 / 30: Train Loss:	0.7213	Val Loss:	1.7884	Accuracy:	0.5055	F1:	0.5044 *
10 / 30: Train Loss:	0.4975	Val Loss:	1.8117	Accuracy:	0.5427	F1:	0.5258 *
11 / 30: Train Loss:	0.3648	Val Loss:	2.0342	Accuracy:	0.4967	F1:	0.4913
12 / 30: Train Loss:	0.2508	Val Loss:	1.7090	Accuracy:	0.5667	F1:	0.5528 *
13 / 30: Train Loss:	0.1629	Val Loss:	1.7114	Accuracy:	0.5624	F1:	0.5420
14 / 30: Train Loss:	0.1069	Val Loss:	

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	2.5868	Val Loss:	2.3106	Accuracy:	0.2254	F1:	0.1095 *
2 / 30: Train Loss:	2.2675	Val Loss:	2.2539	Accuracy:	0.2079	F1:	0.1578 *
3 / 30: Train Loss:	1.8849	Val Loss:	1.8168	Accuracy:	0.4136	F1:	0.3592 *
4 / 30: Train Loss:	1.4383	Val Loss:	1.4891	Accuracy:	0.4967	F1:	0.4465 *
5 / 30: Train Loss:	1.0646	Val Loss:	1.4957	Accuracy:	0.5164	F1:	0.4691 *
6 / 30: Train Loss:	0.7738	Val Loss:	1.5035	Accuracy:	0.5602	F1:	0.5453 *
7 / 30: Train Loss:	0.5478	Val Loss:	1.8897	Accuracy:	0.5427	F1:	0.5283
8 / 30: Train Loss:	0.3619	Val Loss:	2.3268	Accuracy:	0.5077	F1:	0.4838
9 / 30: Train Loss:	0.2915	Val Loss:	2.3947	Accuracy:	0.4792	F1:	0.4628
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
                         precision    recall  f1-score   support

             ad hominem       0.79      0.45      0.57        67
               anecdote       0.93      0.65      0.77        43
         cherry picking       0.60      0.55      0.57  

Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	12.2972	Val Loss:	2.2351	Accuracy:	0.3786	F1:	0.3392 *
2 / 30: Train Loss:	1.3793	Val Loss:	1.9890	Accuracy:	0.4508	F1:	0.3930 *
3 / 30: Train Loss:	0.8893	Val Loss:	2.4685	Accuracy:	0.5230	F1:	0.4992 *
4 / 30: Train Loss:	0.4955	Val Loss:	3.0003	Accuracy:	0.4945	F1:	0.4638
5 / 30: Train Loss:	0.2619	Val Loss:	2.7601	Accuracy:	0.5449	F1:	0.5247 *
6 / 30: Train Loss:	0.0903	Val Loss:	2.7333	Accuracy:	0.5558	F1:	0.5433 *
7 / 30: Train Loss:	0.0480	Val Loss:	2.7850	Accuracy:	0.5777	F1:	0.5656 *
8 / 30: Train Loss:	0.0248	Val Loss:	2.5086	Accuracy:	0.5886	F1:	0.5879 *
9 / 30: Train Loss:	0.0193	Val Loss:	2.4669	Accuracy:	0.6193	F1:	0.6104 *
10 / 30: Train Loss:	0.0011	Val Loss:	2.3923	Accuracy:	0.6061	F1:	0.5911
11 / 30: Train Loss:	0.0034	Val Loss:	2.4107	Accuracy:	0.6061	F1:	0.5813
12 / 30: Train Loss:	0.0005	Val Loss:	2.3802	Accuracy:	0.6149	F1:	0.6047
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
                         pre

Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	9.2221	Val Loss:	2.2887	Accuracy:	0.3457	F1:	0.2619 *
2 / 30: Train Loss:	1.4755	Val Loss:	2.1322	Accuracy:	0.4748	F1:	0.4130 *
3 / 30: Train Loss:	0.7979	Val Loss:	2.6738	Accuracy:	0.5164	F1:	0.4612 *
4 / 30: Train Loss:	0.5484	Val Loss:	2.9105	Accuracy:	0.5405	F1:	0.4975 *
5 / 30: Train Loss:	0.2611	Val Loss:	3.1601	Accuracy:	0.5164	F1:	0.4945
6 / 30: Train Loss:	0.1980	Val Loss:	3.2558	Accuracy:	0.5252	F1:	0.5158 *
7 / 30: Train Loss:	0.1809	Val Loss:	2.3788	Accuracy:	0.6324	F1:	0.6269 *
8 / 30: Train Loss:	0.0702	Val Loss:	2.3745	Accuracy:	0.5952	F1:	0.5744
9 / 30: Train Loss:	0.0384	Val Loss:	2.3497	Accuracy:	0.6149	F1:	0.5933
10 / 30: Train Loss:	0.0024	Val Loss:	2.3741	Accuracy:	0.6521	F1:	0.6334 *
11 / 30: Train Loss:	0.0006	Val Loss:	2.3648	Accuracy:	0.6565	F1:	0.6361 *
12 / 30: Train Loss:	0.0001	Val Loss:	2.3564	Accuracy:	0.6543	F1:	0.6366 *
13 / 30: Train Loss:	0.0001	Val Loss:	2.3535	Accuracy:	0.6521	F1:	0.6346
14 / 30: Train Loss:	0.0001	Val Loss:	2.35

Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	11.6967	Val Loss:	2.8080	Accuracy:	0.1422	F1:	0.0351 *
2 / 30: Train Loss:	2.7533	Val Loss:	3.2294	Accuracy:	0.1028	F1:	0.0444 *
3 / 30: Train Loss:	2.2044	Val Loss:	2.6389	Accuracy:	0.2735	F1:	0.2048 *
4 / 30: Train Loss:	1.3879	Val Loss:	2.1067	Accuracy:	0.4814	F1:	0.4497 *
5 / 30: Train Loss:	0.8036	Val Loss:	6.4966	Accuracy:	0.2823	F1:	0.2372
6 / 30: Train Loss:	0.4473	Val Loss:	4.4695	Accuracy:	0.4551	F1:	0.3930
7 / 30: Train Loss:	0.2795	Val Loss:	3.7140	Accuracy:	0.5033	F1:	0.4478
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
                         precision    recall  f1-score   support

             ad hominem       0.59      0.51      0.54        67
               anecdote       0.89      0.58      0.70        43
         cherry picking       0.72      0.41      0.52        56
      conspiracy theory       0.77      0.51      0.62        39
           fake experts       0.22      0.17      0.19        12
        

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	2.5073	Val Loss:	1.9581	Accuracy:	0.2910	F1:	0.1980 *
2 / 30: Train Loss:	2.1399	Val Loss:	2.1875	Accuracy:	0.2604	F1:	0.1461
3 / 30: Train Loss:	2.0063	Val Loss:	2.1887	Accuracy:	0.2429	F1:	0.1349
4 / 30: Train Loss:	2.3225	Val Loss:	2.4007	Accuracy:	0.1794	F1:	0.0925
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
                         precision    recall  f1-score   support

             ad hominem       0.45      0.34      0.39        67
               anecdote       0.62      0.79      0.69        43
         cherry picking       0.54      0.25      0.34        56
      conspiracy theory       0.17      0.67      0.27        39
           fake experts       0.00      0.00      0.00        12
           false choice       0.00      0.00      0.00        13
      false equivalence       0.00      0.00      0.00        14
impossible expectations       0.22      0.65      0.32        37
      misrepresentation       0.18  

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	2.8147	Val Loss:	2.2915	Accuracy:	0.1751	F1:	0.1088 *
2 / 30: Train Loss:	2.4144	Val Loss:	2.7027	Accuracy:	0.1028	F1:	0.0401
3 / 30: Train Loss:	2.4304	Val Loss:	2.3900	Accuracy:	0.1466	F1:	0.0213
4 / 30: Train Loss:	2.4025	Val Loss:	2.3781	Accuracy:	0.1050	F1:	0.0364
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
                         precision    recall  f1-score   support

             ad hominem       0.20      0.03      0.05        67
               anecdote       0.54      0.33      0.41        43
         cherry picking       0.25      0.23      0.24        56
      conspiracy theory       0.13      0.82      0.23        39
           fake experts       0.00      0.00      0.00        12
           false choice       0.00      0.00      0.00        13
      false equivalence       0.00      0.00      0.00        14
impossible expectations       0.19      0.30      0.23        37
      misrepresentation       0.11  

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	2.9144	Val Loss:	2.4279	Accuracy:	0.0832	F1:	0.0128 *
2 / 30: Train Loss:	2.4379	Val Loss:	2.3951	Accuracy:	0.1466	F1:	0.0213 *
3 / 30: Train Loss:	2.4229	Val Loss:	2.3958	Accuracy:	0.1247	F1:	0.0185
4 / 30: Train Loss:	2.4139	Val Loss:	2.3937	Accuracy:	0.1225	F1:	0.0182
5 / 30: Train Loss:	2.4134	Val Loss:	2.3916	Accuracy:	0.1225	F1:	0.0182
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
                         precision    recall  f1-score   support

             ad hominem       0.15      1.00      0.26        67
               anecdote       0.00      0.00      0.00        43
         cherry picking       0.00      0.00      0.00        56
      conspiracy theory       0.00      0.00      0.00        39
           fake experts       0.00      0.00      0.00        12
           false choice       0.00      0.00      0.00        13
      false equivalence       0.00      0.00      0.00        14
impossible expectations    

Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-1.3B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	2.4305	Val Loss:	1.9503	Accuracy:	0.3632	F1:	0.2714 *
2 / 30: Train Loss:	1.1990	Val Loss:	1.7496	Accuracy:	0.4158	F1:	0.3547 *
3 / 30: Train Loss:	0.4360	Val Loss:	1.9091	Accuracy:	0.4376	F1:	0.3923 *
4 / 30: Train Loss:	0.2173	Val Loss:	1.8900	Accuracy:	0.4420	F1:	0.4053 *
5 / 30: Train Loss:	0.0731	Val Loss:	1.9910	Accuracy:	0.4661	F1:	0.4276 *
6 / 30: Train Loss:	0.0305	Val Loss:	1.8254	Accuracy:	0.4792	F1:	0.4418 *
7 / 30: Train Loss:	0.0124	Val Loss:	1.8738	Accuracy:	0.4770	F1:	0.4422 *
8 / 30: Train Loss:	0.0072	Val Loss:	1.8539	Accuracy:	0.4683	F1:	0.4273
9 / 30: Train Loss:	0.0020	Val Loss:	1.8749	Accuracy:	0.4639	F1:	0.4075
10 / 30: Train Loss:	0.0015	Val Loss:	1.8845	Accuracy:	0.4705	F1:	0.4129
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
                         precision    recall  f1-score   support

             ad hominem       0.41      0.69      0.51        67
               anecdote       0.67      0.70  

Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-1.3B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	2.1048	Val Loss:	1.3876	Accuracy:	0.5208	F1:	0.4536 *
2 / 30: Train Loss:	1.0011	Val Loss:	1.5238	Accuracy:	0.5449	F1:	0.5398 *
3 / 30: Train Loss:	0.3424	Val Loss:	1.8086	Accuracy:	0.5624	F1:	0.5324
4 / 30: Train Loss:	0.1723	Val Loss:	1.7508	Accuracy:	0.5842	F1:	0.5740 *
5 / 30: Train Loss:	0.0632	Val Loss:	1.7771	Accuracy:	0.5864	F1:	0.5726
6 / 30: Train Loss:	0.0336	Val Loss:	1.9801	Accuracy:	0.5886	F1:	0.5747 *
7 / 30: Train Loss:	0.0128	Val Loss:	1.7419	Accuracy:	0.5974	F1:	0.5930 *
8 / 30: Train Loss:	0.0010	Val Loss:	1.7660	Accuracy:	0.6018	F1:	0.6029 *
9 / 30: Train Loss:	0.0003	Val Loss:	1.7666	Accuracy:	0.6083	F1:	0.6073 *
10 / 30: Train Loss:	0.0002	Val Loss:	1.7755	Accuracy:	0.6127	F1:	0.6107 *
11 / 30: Train Loss:	0.0002	Val Loss:	1.7841	Accuracy:	0.6127	F1:	0.6101
12 / 30: Train Loss:	0.0002	Val Loss:	1.7922	Accuracy:	0.6105	F1:	0.6100
13 / 30: Train Loss:	0.0001	Val Loss:	1.7997	Accuracy:	0.6127	F1:	0.6114 *
14 / 30: Train Loss:	0.0001	Val Loss:	1.80

Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-1.3B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	2.3253	Val Loss:	1.3558	Accuracy:	0.5470	F1:	0.4867 *
2 / 30: Train Loss:	1.0557	Val Loss:	1.4081	Accuracy:	0.5733	F1:	0.5305 *
3 / 30: Train Loss:	0.2916	Val Loss:	1.8979	Accuracy:	0.5799	F1:	0.5657 *
4 / 30: Train Loss:	0.3797	Val Loss:	1.8018	Accuracy:	0.5274	F1:	0.4964
5 / 30: Train Loss:	0.4702	Val Loss:	2.1181	Accuracy:	0.5602	F1:	0.5621
6 / 30: Train Loss:	0.0981	Val Loss:	2.3121	Accuracy:	0.5886	F1:	0.5848 *
7 / 30: Train Loss:	0.0275	Val Loss:	2.1508	Accuracy:	0.5864	F1:	0.5904 *
8 / 30: Train Loss:	0.0092	Val Loss:	2.1661	Accuracy:	0.6018	F1:	0.6046 *
9 / 30: Train Loss:	0.0003	Val Loss:	2.1413	Accuracy:	0.6018	F1:	0.6091 *
10 / 30: Train Loss:	0.0001	Val Loss:	2.1503	Accuracy:	0.6018	F1:	0.6100 *
11 / 30: Train Loss:	0.0001	Val Loss:	2.1588	Accuracy:	0.6018	F1:	0.6100
12 / 30: Train Loss:	0.0001	Val Loss:	2.1669	Accuracy:	0.6018	F1:	0.6100
13 / 30: Train Loss:	0.0001	Val Loss:	2.1747	Accuracy:	0.6061	F1:	0.6161 *
14 / 30: Train Loss:	0.0001	Val Loss:	2.18

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'pooler.dense.weight', 'pooler.dense.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	2.3660	Val Loss:	2.0162	Accuracy:	0.3479	F1:	0.2233 *
2 / 30: Train Loss:	1.7856	Val Loss:	1.5339	Accuracy:	0.4967	F1:	0.3598 *
3 / 30: Train Loss:	1.3403	Val Loss:	1.3740	Accuracy:	0.5558	F1:	0.4308 *
4 / 30: Train Loss:	1.0307	Val Loss:	1.2275	Accuracy:	0.5952	F1:	0.4759 *
5 / 30: Train Loss:	0.7939	Val Loss:	1.1535	Accuracy:	0.6346	F1:	0.5753 *
6 / 30: Train Loss:	0.6244	Val Loss:	1.1471	Accuracy:	0.6433	F1:	0.6115 *
7 / 30: Train Loss:	0.4842	Val Loss:	1.2059	Accuracy:	0.6346	F1:	0.6076
8 / 30: Train Loss:	0.3819	Val Loss:	1.2881	Accuracy:	0.6389	F1:	0.6108
9 / 30: Train Loss:	0.2976	Val Loss:	1.2358	Accuracy:	0.6652	F1:	0.6583 *
10 / 30: Train Loss:	0.2419	Val Loss:	1.2372	Accuracy:	0.6674	F1:	0.6585 *
11 / 30: Train Loss:	0.1931	Val Loss:	1.2493	Accuracy:	0.6783	F1:	0.6765 *
12 / 30: Train Loss:	0.1533	Val Loss:	1.3459	Accuracy:	0.6586	F1:	0.6496
13 / 30: Train Loss:	0.1115	Val Loss:	1.3537	Accuracy:	0.6630	F1:	0.6578
14 / 30: Train Loss:	0.0767	Val Loss:	1.33

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'pooler.dense.weight', 'pooler.dense.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	1.9577	Val Loss:	1.6383	Accuracy:	0.4726	F1:	0.3842 *
2 / 30: Train Loss:	1.1880	Val Loss:	1.2219	Accuracy:	0.5930	F1:	0.5188 *
3 / 30: Train Loss:	0.7498	Val Loss:	1.2800	Accuracy:	0.6061	F1:	0.5416 *
4 / 30: Train Loss:	0.4882	Val Loss:	1.3398	Accuracy:	0.6280	F1:	0.5853 *
5 / 30: Train Loss:	0.3088	Val Loss:	1.2540	Accuracy:	0.6543	F1:	0.6262 *
6 / 30: Train Loss:	0.2196	Val Loss:	1.3769	Accuracy:	0.6543	F1:	0.6378 *
7 / 30: Train Loss:	0.1764	Val Loss:	1.4553	Accuracy:	0.6608	F1:	0.6395 *
8 / 30: Train Loss:	0.1145	Val Loss:	1.4185	Accuracy:	0.6783	F1:	0.6684 *
9 / 30: Train Loss:	0.0847	Val Loss:	1.7438	Accuracy:	0.6083	F1:	0.5953
10 / 30: Train Loss:	0.1049	Val Loss:	1.4977	Accuracy:	0.6761	F1:	0.6549
11 / 30: Train Loss:	0.0915	Val Loss:	1.6758	Accuracy:	0.6127	F1:	0.6173
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
                         precision    recall  f1-score   support

             ad hominem       0.69  

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'pooler.dense.weight', 'pooler.dense.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	1.9289	Val Loss:	1.6456	Accuracy:	0.4464	F1:	0.3756 *
2 / 30: Train Loss:	1.2828	Val Loss:	1.4576	Accuracy:	0.5711	F1:	0.4716 *
3 / 30: Train Loss:	0.8443	Val Loss:	1.3532	Accuracy:	0.5886	F1:	0.5077 *
4 / 30: Train Loss:	0.5607	Val Loss:	1.6290	Accuracy:	0.5821	F1:	0.5728 *
5 / 30: Train Loss:	0.3970	Val Loss:	1.6619	Accuracy:	0.5974	F1:	0.5865 *
6 / 30: Train Loss:	0.2629	Val Loss:	1.5617	Accuracy:	0.6389	F1:	0.6201 *
7 / 30: Train Loss:	0.2199	Val Loss:	1.6901	Accuracy:	0.6258	F1:	0.5993
8 / 30: Train Loss:	0.1640	Val Loss:	1.5278	Accuracy:	0.6543	F1:	0.6425 *
9 / 30: Train Loss:	0.1366	Val Loss:	1.8228	Accuracy:	0.6236	F1:	0.6050
10 / 30: Train Loss:	0.1692	Val Loss:	1.6195	Accuracy:	0.6236	F1:	0.6048
11 / 30: Train Loss:	0.1723	Val Loss:	1.5422	Accuracy:	0.6565	F1:	0.6417
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
                         precision    recall  f1-score   support

             ad hominem       0.75    

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v2-xlarge and are newly initialized: ['classifier.weight', 'pooler.dense.weight', 'pooler.dense.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	2.0129	Val Loss:	1.2845	Accuracy:	0.5733	F1:	0.4359 *
2 / 30: Train Loss:	1.0848	Val Loss:	0.9379	Accuracy:	0.6937	F1:	0.6550 *
3 / 30: Train Loss:	0.6729	Val Loss:	0.8689	Accuracy:	0.7177	F1:	0.7139 *
4 / 30: Train Loss:	0.4314	Val Loss:	0.8517	Accuracy:	0.7505	F1:	0.7359 *
5 / 30: Train Loss:	0.2699	Val Loss:	0.9026	Accuracy:	0.7593	F1:	0.7468 *
6 / 30: Train Loss:	0.1613	Val Loss:	1.0972	Accuracy:	0.7352	F1:	0.7294
7 / 30: Train Loss:	0.1045	Val Loss:	1.1819	Accuracy:	0.7265	F1:	0.7225
8 / 30: Train Loss:	0.0722	Val Loss:	1.1893	Accuracy:	0.7396	F1:	0.7142
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
                         precision    recall  f1-score   support

             ad hominem       0.81      0.75      0.78        67
               anecdote       0.95      0.88      0.92        43
         cherry picking       0.72      0.73      0.73        56
      conspiracy theory       0.83      0.87      0.85        39


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v2-xlarge and are newly initialized: ['classifier.weight', 'pooler.dense.weight', 'pooler.dense.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	2.0304	Val Loss:	1.4738	Accuracy:	0.5208	F1:	0.3938 *
2 / 30: Train Loss:	1.8828	Val Loss:	2.3789	Accuracy:	0.1225	F1:	0.0182
3 / 30: Train Loss:	2.3911	Val Loss:	2.3776	Accuracy:	0.1466	F1:	0.0213
4 / 30: Train Loss:	2.3942	Val Loss:	2.3752	Accuracy:	0.1466	F1:	0.0213
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
                         precision    recall  f1-score   support

             ad hominem       0.65      0.55      0.60        67
               anecdote       0.79      0.88      0.84        43
         cherry picking       0.37      0.71      0.48        56
      conspiracy theory       0.56      0.92      0.70        39
           fake experts       0.50      0.08      0.14        12
           false choice       0.00      0.00      0.00        13
      false equivalence       0.00      0.00      0.00        14
impossible expectations       0.31      0.65      0.42        37
      misrepresentation       0.90  

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v2-xlarge and are newly initialized: ['classifier.weight', 'pooler.dense.weight', 'pooler.dense.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 / 30: Train Loss:	2.4197	Val Loss:	2.3865	Accuracy:	0.1225	F1:	0.0182 *
2 / 30: Train Loss:	2.3963	Val Loss:	2.3895	Accuracy:	0.1225	F1:	0.0182
3 / 30: Train Loss:	2.3947	Val Loss:	2.3887	Accuracy:	0.1225	F1:	0.0182
4 / 30: Train Loss:	2.3949	Val Loss:	2.3859	Accuracy:	0.1225	F1:	0.0182
No improvement for 3 epochs. Stopping early.
best (higgest macro f1-score) val results:
                         precision    recall  f1-score   support

             ad hominem       0.00      0.00      0.00        67
               anecdote       0.00      0.00      0.00        43
         cherry picking       0.12      1.00      0.22        56
      conspiracy theory       0.00      0.00      0.00        39
           fake experts       0.00      0.00      0.00        12
           false choice       0.00      0.00      0.00        13
      false equivalence       0.00      0.00      0.00        14
impossible expectations       0.00      0.00      0.00        37
      misrepresentation       0.00  

In [5]:
import pandas as pd

In [6]:
# Tabulate the metrics accumulated in `results` (one column per dict key);
# as the cell's last expression, the frame is rendered as the cell output.
pd.DataFrame(data=results)

Unnamed: 0,test_acc,test_f1,eval_acc,eval_f1,lr,model
0,0.628906,0.562975,0.667396,0.635417,1e-05,bert-base-uncased
1,0.65625,0.647461,0.669584,0.633479,5e-05,bert-base-uncased
2,0.621094,0.584376,0.643326,0.631104,0.0001,bert-base-uncased
3,0.699219,0.66367,0.728665,0.711222,1e-05,roberta-large
4,0.6875,0.675537,0.713348,0.706779,5e-05,roberta-large
5,0.144531,0.021047,0.146608,0.02131,0.0001,roberta-large
6,0.472656,0.424396,0.512035,0.48946,1e-05,gpt2
7,0.59375,0.562751,0.586433,0.572095,5e-05,gpt2
8,0.511719,0.474738,0.560175,0.545294,0.0001,gpt2
9,0.574219,0.538595,0.619256,0.610368,1e-05,bigscience/bloom-560m
