### Installation 

In [None]:
! pip install datasets transformers rouge-score nltk
!git clone https://github.com/xmu-xiaoma666/External-Attention-pytorch.git ext_attns

### About dataset 

In [4]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'document', 'id', 'input_ids', 'labels', 'summary'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['attention_mask', 'document', 'id', 'input_ids', 'labels', 'summary'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['attention_mask', 'document', 'id', 'input_ids', 'labels', 'summary'],
        num_rows: 11334
    })
})

In [6]:
print(f'train: {len(tokenized_datasets["train"])}')
print(f'val:   {len(tokenized_datasets["validation"])}')
print(f'test:  {len(tokenized_datasets["test"])}')

train: 204045
val:   11332
test:  11334


### 2L Attentions

|i|attention | used|
|---|--------| ----|
|0  |UFOAttention| ✔|
|1  | AFT_FULL   |x|
|2  | MUSEAttention|✔ |
|3  | EMSA|     x|
|4  |SimplifiedScaledDotProductAttention|✔ |
|5  |ScaledDotProductAttention|✔ |
|6  |ExternalAttention|✔ |

In [1]:
import os
ext_path = os.path.abspath('./ext_attns/')
import sys
sys.path.append(ext_path)
# list
from model.attention.UFOAttention import *
from model.attention.AFT import AFT_FULL
from model.attention.MUSEAttention import MUSEAttention
from model.attention.EMSA import EMSA
from model.attention.SimplifiedSelfAttention import SimplifiedScaledDotProductAttention
from model.attention.SelfAttention import ScaledDotProductAttention
from model.attention.ExternalAttention import ExternalAttention
d_model = 768 
n_head  = 12  
ufo = UFOAttention(d_model=d_model, d_k=d_model//n_head, d_v=d_model//n_head, h=n_head)
aft_full = AFT_FULL(d_model=d_model, n=n_head)
ma = MUSEAttention(d_model=d_model, d_k=d_model//n_head, d_v=d_model//n_head, h=n_head)
emsa = EMSA(d_model=d_model, d_k=d_model//n_head, d_v=d_model//n_head, h=n_head,H=n_head,W=n_head,ratio=2,apply_transform=True)
ssa = SimplifiedScaledDotProductAttention(d_model=d_model, h=n_head)
sa = ScaledDotProductAttention(d_model=d_model, d_k=d_model//n_head, d_v=d_model//n_head, h=n_head)
ea = ExternalAttention(d_model=d_model,S=8) #v

attn_li = [ufo,aft_full,ma,emsa,ssa,sa,ea]

import transformers

# print(transformers.__version__)

model_checkpoint = "t5-small"
model_checkpoint = 'facebook/bart-base'

from datasets import load_dataset, load_metric

raw_datasets = load_dataset("xsum")
metric = load_metric("rouge")

# raw_datasets["train"][0]

import datasets

from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# ?
# with tokenizer.as_target_tokenizer():
#     print(tokenizer(["Hello, this one sentence!", "This is another sentence."]))

if model_checkpoint in ["t5-small", "t5-base", "t5-larg", "t5-3b", "t5-11b"]:
    prefix = "summarize: "
else:
    prefix = ""

max_input_length = 1024
max_target_length = 128

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

batch_size = 3 #16

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

import nltk
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

Using custom data configuration default
Reusing dataset xsum (/home/yp/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/yp/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499/cache-5d732b1c86657ea0.arrow
Loading cached processed dataset at /home/yp/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499/cache-38966a651c72a103.arrow
Loading cached processed dataset at /home/yp/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499/cache-d19c279816291ade.arrow


In [2]:
trvaltest = tokenized_datasets['train'].train_test_split(test_size=0.004,train_size=0.005) # 800,1080
# trvaltest = tokenized_datasets['train'].train_test_split(test_size=0.004,train_size=0.01) # 800,2080
# trvaltest = tokenized_datasets['train'].train_test_split(test_size=0.004,train_size=0.02) # 800,4080
tr = trvaltest['train']
valtest = trvaltest['test'].train_test_split(test_size=0.5,train_size=0.5) # 208, 208
# valtest = trvaltest['test'].train_test_split(test_size=0.5,train_size=0.5) # 408, 408
# valtest = trvaltest['test'].train_test_split(test_size=0.05,train_size=0.05)
val = valtest['train']
te = valtest['test']

del trvaltest
del valtest

Loading cached split indices for dataset at /home/yp/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499/cache-a37fd9f11dd44add.arrow and /home/yp/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499/cache-195107633ef23937.arrow


In [3]:
from torch import nn
evals = {}
for i,attn in enumerate(attn_li[:]):
    not_allowed = ('AFT_FULL','EMSA')
    print(attn)
    if attn.__str__().startswith(not_allowed):
        continue
    
    
    import os
    model_name = attn.__str__().split('(')[0]
    # model_name = model_checkpoint.split("/")[-1] + model_name
    saved_path = os.path.join('./output',model_name)
    
    args = Seq2SeqTrainingArguments(
        saved_path,
        evaluation_strategy = "epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=15,
        predict_with_generate=True,
        fp16=True,
        save_strategy='epoch',
        load_best_model_at_end=True,
    #     push_to_hub=True,
    )

    class tel2lin1(nn.Module):
        def __init__(self):
            super(tel2lin1,self).__init__()
            # self.tel = nn.TransformerEncoderLayer(d_model=768,nhead=3,batch_first=True)
            self.tel = attn
            self.lin = nn.Linear(in_features=768,out_features=50265, bias=False)
            
        def forward(self,x):
            if self.tel.__str__().startswith('ExternalAttention'):
                x = self.tel(x)
            else:
                x = self.tel(x,x,x)
            x = self.lin(x)
            return x
    model.lm_head = tel2lin1()

    trainer = Seq2SeqTrainer(
        model,
        args,
        train_dataset=tr,
        eval_dataset=val,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    
    evals[f'{model_name}'] = trainer.evaluate(eval_dataset=te)

UFOAttention(
  (fc_q): Linear(in_features=768, out_features=768, bias=True)
  (fc_k): Linear(in_features=768, out_features=768, bias=True)
  (fc_v): Linear(in_features=768, out_features=768, bias=True)
  (fc_o): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)


Using amp fp16 backend
The following columns in the training set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, document.
***** Running training *****
  Num examples = 1020
  Num Epochs = 15
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 1
  Total optimization steps = 2550


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,7.253441,0.0,0.0,0.0,0.0,5.9118
2,No log,6.886457,2.9173,0.2198,2.6642,2.6699,9.4534
3,7.255100,6.739688,4.8756,0.3332,4.2949,4.2706,10.2108
4,7.255100,6.627885,10.1412,0.9464,9.0573,9.0356,19.1446
5,7.255100,6.531991,9.3069,1.1117,8.4899,8.5067,20.0
6,5.949700,6.498371,11.9356,1.6367,10.3237,10.2761,19.6985
7,5.949700,6.465492,11.4283,1.4238,9.8823,9.8871,16.5
8,5.949700,6.43223,13.2559,1.6666,11.2642,11.2524,17.576
9,5.559900,6.386285,11.311,1.5103,9.8474,9.8576,14.951
10,5.559900,6.381336,10.8451,1.4282,9.4299,9.4427,14.7574


  args.max_grad_norm,
The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, document.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 6
Saving model checkpoint to ./output/UFOAttention/checkpoint-170
Configuration saved in ./output/UFOAttention/checkpoint-170/config.json
Model weights saved in ./output/UFOAttention/checkpoint-170/pytorch_model.bin
tokenizer config file saved in ./output/UFOAttention/checkpoint-170/tokenizer_config.json
Special tokens file saved in ./output/UFOAttention/checkpoint-170/special_tokens_map.json
Deleting older checkpoint [output/UFOAttention/checkpoint-7] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, document.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 6
Saving model check

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


AFT_FULL(
  (fc_q): Linear(in_features=768, out_features=768, bias=True)
  (fc_k): Linear(in_features=768, out_features=768, bias=True)
  (fc_v): Linear(in_features=768, out_features=768, bias=True)
  (sigmoid): Sigmoid()
)
MUSEAttention(
  (fc_q): Linear(in_features=768, out_features=768, bias=True)
  (fc_k): Linear(in_features=768, out_features=768, bias=True)
  (fc_v): Linear(in_features=768, out_features=768, bias=True)
  (fc_o): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (conv1): Depth_Pointwise_Conv1d(
    (depth_conv): Identity()
    (pointwise_conv): Conv1d(768, 768, kernel_size=(1,), stride=(1,))
  )
  (conv3): Depth_Pointwise_Conv1d(
    (depth_conv): Conv1d(768, 768, kernel_size=(3,), stride=(1,), padding=(1,), groups=768)
    (pointwise_conv): Conv1d(768, 768, kernel_size=(1,), stride=(1,))
  )
  (conv5): Depth_Pointwise_Conv1d(
    (depth_conv): Conv1d(768, 768, kernel_size=(5,), stride=(1,), padding=(2,), groups=768)


Using amp fp16 backend
The following columns in the training set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, document.
***** Running training *****
  Num examples = 1020
  Num Epochs = 15
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 1
  Total optimization steps = 2550


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,7.041088,3.62,0.267,3.4324,3.4194,9.6765
2,No log,6.644437,12.6803,0.5043,10.5927,10.5518,19.9485
3,6.940900,6.484882,10.4912,0.2607,9.3104,9.3077,19.9191
4,6.940900,6.404532,8.3805,0.0607,7.4959,7.4942,19.701
5,6.940900,6.314076,8.0364,0.0918,7.1395,7.124,19.875
6,5.450200,6.303671,7.6486,0.0422,7.1533,7.1333,19.7941
7,5.450200,6.306914,6.8999,0.0,6.4237,6.4007,19.2451
8,5.450200,6.286471,7.0362,0.0136,6.3392,6.331,17.924
9,4.980900,6.262648,7.0518,0.0,6.4358,6.4109,15.9608
10,4.980900,6.267742,6.9302,0.0,6.3704,6.3522,15.7132


The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, document.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 6
Saving model checkpoint to ./output/MUSEAttention/checkpoint-170
Configuration saved in ./output/MUSEAttention/checkpoint-170/config.json
Model weights saved in ./output/MUSEAttention/checkpoint-170/pytorch_model.bin
tokenizer config file saved in ./output/MUSEAttention/checkpoint-170/tokenizer_config.json
Special tokens file saved in ./output/MUSEAttention/checkpoint-170/special_tokens_map.json
Deleting older checkpoint [output/MUSEAttention/checkpoint-7] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, document.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 6
Saving model checkpoint to ./outpu

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


EMSA(
  (fc_q): Linear(in_features=768, out_features=768, bias=True)
  (fc_k): Linear(in_features=768, out_features=768, bias=True)
  (fc_v): Linear(in_features=768, out_features=768, bias=True)
  (fc_o): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (sr): Sequential()
  (sr_conv): Conv2d(768, 768, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=768)
  (sr_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (transform): Sequential(
    (conv): Conv2d(12, 12, kernel_size=(1, 1), stride=(1, 1))
    (softmax): Softmax(dim=-1)
    (in): InstanceNorm2d(12, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
  )
)
SimplifiedScaledDotProductAttention(
  (fc_o): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)


Using amp fp16 backend
The following columns in the training set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, document.
***** Running training *****
  Num examples = 1020
  Num Epochs = 15
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 1
  Total optimization steps = 2550


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,6.857039,0.9006,0.0705,0.8276,0.8303,5.0588
2,No log,6.544621,4.9812,0.3237,4.5622,4.5566,11.8505
3,6.699700,6.492774,7.3269,0.8591,6.4537,6.464,14.2721
4,6.699700,6.518483,7.9678,0.9281,6.7366,6.7278,13.4926
5,6.699700,6.501227,10.4852,1.2745,8.7857,8.7873,15.7475
6,5.267500,6.539152,10.108,1.3462,8.4323,8.448,14.6765
7,5.267500,6.55037,9.8588,1.3107,7.9646,7.9801,14.3162
8,5.267500,6.601139,10.8097,1.3077,9.1113,9.1257,15.5123
9,4.928700,6.559929,10.2109,1.3265,8.5124,8.5567,14.7353
10,4.928700,6.573251,10.3894,1.2337,8.6595,8.6666,15.3382


The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, document.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 6
Saving model checkpoint to ./output/SimplifiedScaledDotProductAttention/checkpoint-170
Configuration saved in ./output/SimplifiedScaledDotProductAttention/checkpoint-170/config.json
Model weights saved in ./output/SimplifiedScaledDotProductAttention/checkpoint-170/pytorch_model.bin
tokenizer config file saved in ./output/SimplifiedScaledDotProductAttention/checkpoint-170/tokenizer_config.json
Special tokens file saved in ./output/SimplifiedScaledDotProductAttention/checkpoint-170/special_tokens_map.json
Deleting older checkpoint [output/SimplifiedScaledDotProductAttention/checkpoint-7] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ign

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


ScaledDotProductAttention(
  (fc_q): Linear(in_features=768, out_features=768, bias=True)
  (fc_k): Linear(in_features=768, out_features=768, bias=True)
  (fc_v): Linear(in_features=768, out_features=768, bias=True)
  (fc_o): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)


Using amp fp16 backend
The following columns in the training set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, document.
***** Running training *****
  Num examples = 1020
  Num Epochs = 15
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 1
  Total optimization steps = 2550


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,7.611351,7.6686,0.0,7.643,7.6761,7.1176
2,No log,7.116733,0.0,0.0,0.0,0.0,4.0098
3,7.327000,7.085515,0.0,0.0,0.0,0.0,4.0
4,7.327000,6.972314,0.0,0.0,0.0,0.0,4.0
5,7.327000,6.967639,0.0,0.0,0.0,0.0,4.0
6,5.834600,7.054784,0.0,0.0,0.0,0.0,4.0
7,5.834600,7.103719,0.0,0.0,0.0,0.0,4.0
8,5.834600,7.127185,0.0,0.0,0.0,0.0,3.6299
9,5.484100,7.137162,0.0,0.0,0.0,0.0,3.0
10,5.484100,7.201315,0.4522,0.0,0.4546,0.4598,12.1887


The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, document.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 6
Saving model checkpoint to ./output/ScaledDotProductAttention/checkpoint-170
Configuration saved in ./output/ScaledDotProductAttention/checkpoint-170/config.json
Model weights saved in ./output/ScaledDotProductAttention/checkpoint-170/pytorch_model.bin
tokenizer config file saved in ./output/ScaledDotProductAttention/checkpoint-170/tokenizer_config.json
Special tokens file saved in ./output/ScaledDotProductAttention/checkpoint-170/special_tokens_map.json
Deleting older checkpoint [output/ScaledDotProductAttention/checkpoint-7] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, document.
***** Running Evaluation *****


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


ExternalAttention(
  (mk): Linear(in_features=768, out_features=8, bias=False)
  (mv): Linear(in_features=8, out_features=768, bias=False)
  (softmax): Softmax(dim=1)
)


Using amp fp16 backend
The following columns in the training set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, document.
***** Running training *****
  Num examples = 1020
  Num Epochs = 15
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 1
  Total optimization steps = 2550


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,10.813046,7.761,0.0,7.7245,7.5304,8.0
2,No log,10.768181,10.7895,0.0,10.6884,10.6971,11.0
3,10.780100,10.683411,10.4293,0.0,10.3333,10.4329,14.0
4,10.780100,10.577523,8.1469,0.0,8.1152,7.5914,11.0
5,10.780100,10.462443,6.0274,0.0,6.0235,6.0384,8.0
6,10.497300,10.346169,6.0274,0.0,6.0235,6.0384,8.0
7,10.497300,10.234465,6.0274,0.0,6.0235,6.0384,8.0
8,10.497300,10.13295,0.0,0.0,0.0,0.0,5.0
9,10.127700,10.041049,0.0,0.0,0.0,0.0,5.0
10,10.127700,9.963591,0.0,0.0,0.0,0.0,5.0


The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, document.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 6
Saving model checkpoint to ./output/ExternalAttention/checkpoint-170
Configuration saved in ./output/ExternalAttention/checkpoint-170/config.json
Model weights saved in ./output/ExternalAttention/checkpoint-170/pytorch_model.bin
tokenizer config file saved in ./output/ExternalAttention/checkpoint-170/tokenizer_config.json
Special tokens file saved in ./output/ExternalAttention/checkpoint-170/special_tokens_map.json
Deleting older checkpoint [output/ExternalAttention/checkpoint-7] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, document.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 6
Saving mod

### Light 2L Attentions 

### 2L Attentions with Skip connection 

### 4L Attentions

### Etc

In [11]:
from torch import nn
import torch,copy
class cons(nn.Module):
    def __init__(self):
        super(cons,self).__init__()
        self.front = nn.Sequential(
            nn.Conv2d(1,32,3),
            nn.MaxPool2d(2),
            nn.BatchNorm2d(32),
            nn.ReLU() ,#batchnorm 2d ?
            nn.Conv2d(32,16,2),
            nn.MaxPool2d(2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Flatten()
        )
        sple = torch.rand(1,1,28,28)
        front_out = self.front(sple).size()[-1]
        self.body = nn.Sequential(
            nn.Linear(front_out,256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(p=0.2),
            nn.Linear(256,128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            # nn.Linear(128,10),
        )
        self.medium = nn.Sequential(
            nn.Linear(128,64),
            nn.Linear(64,128)
        )
        self.macro = [copy.deepcopy(self.medium) for _ in range(3)]
        self.macro = nn.ModuleList(self.macro)
        
        self.fc = nn.Sequential(
            nn.Linear(128,10),
            nn.Softmax(dim=1)
        )
    def forward(self,x):
        x = self.front(x)
        x = self.body(x)
        for i in range(len(self.macro)):
            x = self.macro[i](x)
        x = self.fc(x)
        return x
md = cons()

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


In [2]:
from torch import nn
sli = [nn.Sequential(nn.Linear(128,64),nn.Linear(64,128)),
       nn.Sequential(nn.Linear(128,64),nn.Linear(64,128)),
       nn.Sequential(nn.Linear(128,64),nn.Linear(64,128))]
mli = nn.ModuleList(sli)

In [12]:
from torchinfo import summary as sm
sm(md,(4,1,28,28))

Layer (type:depth-idx)                   Output Shape              Param #
cons                                     --                        --
├─ModuleList: 1-1                        --                        --
├─Sequential: 1-2                        [4, 576]                  --
│    └─Conv2d: 2-1                       [4, 32, 26, 26]           320
│    └─MaxPool2d: 2-2                    [4, 32, 13, 13]           --
│    └─BatchNorm2d: 2-3                  [4, 32, 13, 13]           64
│    └─ReLU: 2-4                         [4, 32, 13, 13]           --
│    └─Conv2d: 2-5                       [4, 16, 12, 12]           2,064
│    └─MaxPool2d: 2-6                    [4, 16, 6, 6]             --
│    └─BatchNorm2d: 2-7                  [4, 16, 6, 6]             32
│    └─ReLU: 2-8                         [4, 16, 6, 6]             --
│    └─Flatten: 2-9                      [4, 576]                  --
├─Sequential: 1-3                        [4, 128]                  --
│    └─Line

In [13]:
md

cons(
  (front): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
    (1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): ReLU()
    (4): Conv2d(32, 16, kernel_size=(2, 2), stride=(1, 1))
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): ReLU()
    (8): Flatten(start_dim=1, end_dim=-1)
  )
  (body): Sequential(
    (0): Linear(in_features=576, out_features=256, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
  )
  (medium): Sequential(
    (

### Eval 

In [8]:
import pickle
with open("attns.pickle","wb") as fw:
    pickle.dump(evals, fw)

In [9]:
evals.update(evals_base)

In [10]:
import pandas as pd
pd.DataFrame(evals).T

Unnamed: 0,eval_loss,eval_rouge1,eval_rouge2,eval_rougeL,eval_rougeLsum,eval_gen_len,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
UFOAttention,6.409752,11.7499,1.408,10.2117,10.2238,15.4963,39.6509,10.315,1.74,15.0
MUSEAttention,6.341923,7.2341,0.0,6.3786,6.3928,16.335,40.2946,10.15,1.712,15.0
SimplifiedScaledDotProductAttention,6.524034,7.5712,0.6479,6.6932,6.724,14.6406,39.1062,10.459,1.764,15.0
ScaledDotProductAttention,7.004627,0.0,0.0,0.0,0.0,4.0,29.5214,13.854,2.337,15.0
ExternalAttention,9.794738,0.0,0.0,0.0,0.0,5.0,29.9376,13.662,2.305,15.0
base,8.350389,0.0,0.0,0.0,0.0,2.0,29.3628,13.929,2.35,15.0


### basic 

In [7]:
from torch import nn
evals_base = {}
for i,attn in enumerate(['base']):

    import os
    model_name = attn
    # model_name = model_checkpoint.split("/")[-1] + model_name
    saved_path = os.path.join('./output',model_name)
    
    args = Seq2SeqTrainingArguments(
        saved_path,
        evaluation_strategy = "epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=15,
        predict_with_generate=True,
        fp16=True,
        save_strategy='epoch',
        load_best_model_at_end=True,
    #     push_to_hub=True,
    )

    trainer = Seq2SeqTrainer(
        model,
        args,
        train_dataset=tr,
        eval_dataset=val,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    
    evals[f'{model_name}'] = trainer.evaluate(eval_dataset=te)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using amp fp16 backend
The following columns in the training set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, document.
***** Running training *****
  Num examples = 1020
  Num Epochs = 15
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 1
  Total optimization steps = 2550


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,9.611067,0.0,0.0,0.0,0.0,2.0
2,No log,9.436999,0.0,0.0,0.0,0.0,2.0
3,9.370300,9.270031,0.0,0.0,0.0,0.0,2.0
4,9.370300,9.115933,0.0,0.0,0.0,0.0,2.0
5,9.370300,8.973625,0.0,0.0,0.0,0.0,2.0
6,8.827900,8.845672,0.0,0.0,0.0,0.0,2.0
7,8.827900,8.732598,0.0,0.0,0.0,0.0,2.0
8,8.827900,8.635551,0.0,0.0,0.0,0.0,2.0
9,8.406700,8.551472,0.0,0.0,0.0,0.0,2.0
10,8.406700,8.482576,0.0,0.0,0.0,0.0,2.0


The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, document.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 6
Saving model checkpoint to ./output/base/checkpoint-170
Configuration saved in ./output/base/checkpoint-170/config.json
Model weights saved in ./output/base/checkpoint-170/pytorch_model.bin
tokenizer config file saved in ./output/base/checkpoint-170/tokenizer_config.json
Special tokens file saved in ./output/base/checkpoint-170/special_tokens_map.json
  args.max_grad_norm,
The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, document.
***** Running Evaluation *****
  Num examples = 408
  Batch size = 6
Saving model checkpoint to ./output/base/checkpoint-340
Configuration saved in ./output/base/checkpoint-340/config.json
Model weights saved in ./out