In [1]:
!cp ../input/feedback-utils/*.py .

In [2]:
# !git clone https://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@github.com/kenkrige/Feedback.git

In [3]:
from feedback_utils import *
import warnings
from tqdm.notebook import tqdm

os.environ['TOKENIZERS_PARALLELISM'] = 'false'
pd.set_option('display.float_format', '{:0.2f}'.format)
warnings.filterwarnings("ignore")

In [4]:
NUM_JOBS = 4 #4

base_models = ['roberta-base',
               'microsoft/deberta-base',
               'xlnet-base-cased',
               'google/electra-base-discriminator',
               'bert-base-cased',
               'bert-base-uncased',
               'microsoft/deberta-v3-base', #'microsoft/mdeberta-v3-base'
              ]
large_models = [model.replace('base', 'large') for model in base_models] + ['bayartsogt/structbert-large','microsoft/deberta-v2-xlarge']

class Args:
    fold = 0
    domain_weights = base_models[1] # '../input/robertadomainweights/roberta-domain' #base_models[6] # domain_weights = '../input/garypc-weights'
    model = base_models[1]
    transformer_lr = 3e-5 # 1e-5
    lr_layer_decay = 0.95
    classifier_lr = 3e-5 # 1e-3
    output = './trainer_test'
    input = '../input/feedback-prize-2021/train'
    longformer_max = 1024 - 2
    shortformer_max = 512 - 64 # 512 - summary_length
    summary_length = 64
    skim_length = 64
    batch_size = 8 # 5
    valid_batch_size = 1
    epochs = 1
    accumulation_steps = 2
    local_attention_len = 64
    global_attention_lead = 64
    global_attention_tail = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
args = Args()

In [5]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(args.model)

os.makedirs(args.output, exist_ok=True)

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/474 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

In [6]:
df = pd.read_csv('../input/feedback-split/train_folds.csv')
predictions = df.predictionstring.str.split()
df['start'] = predictions.map(lambda t: int(t[0]))
df['end'] = predictions.map(lambda t: int(t[-1]) + 1)
train_df = df[df["kfold"] != args.fold].reset_index(drop=True)
valid_df = df[df["kfold"] == args.fold].reset_index(drop=True)

# Debug only
# train_df = train_df[train_df.id.isin(train_df.id.unique()[:1000])]
# valid_df = valid_df[valid_df.id.isin(valid_df.id.unique()[:100])]

In [7]:
import os

import numpy as np
import pandas as pd
import torch
import re
from joblib import Parallel, delayed
from tqdm.auto import tqdm


paragraph_terminator = re.compile(r'\n\n.')
word_terminator = re.compile(r'\s+')

MIN_EDGE_PARAGRAPH = 15 # Tokens
MIN_BODY_PARAGRAPH = 15 # Tokens
MAX_BODY_PARAGRAPH = 512 - 64 # Tokens
# Splitting long paragraphs, look for sentence bounds near ideal position, but don't stray further than this number of tokens.
MAX_SPLIT = 30

def _bounds(terminator, text):
    n = len(text)
    text = text.strip()
    return [0] + [s.span(0)[0] for s in terminator.finditer(text)] + [n]

def get_paragraphs(text, args, word_from_token, sent_token_offsets):
    bounds = _bounds(paragraph_terminator, text)
#     print(bounds)
    starts = bounds[:-1]
    ends = bounds[1:]
    para_word_offsets = np.cumsum([0] + [len(text[starts[i]:ends[i]].strip().split()) for i in range(len(starts))])
    bounds = np.searchsorted(word_from_token, para_word_offsets)
#     print(bounds)
    
    lengths = bounds[1:] - bounds[:-1]
    if len(bounds) > 3:
        # Blend short heading with lead and short salutation with conclusion.
        if lengths[0] < MIN_EDGE_PARAGRAPH: bounds = np.concatenate([bounds[:1],bounds[2:]])
        if lengths[-1] < MIN_EDGE_PARAGRAPH: bounds = np.concatenate([bounds[:-2], bounds[-1:]])
    lengths = bounds[1:] - bounds[:-1]
    # Blend sequential short paragraphs
    shorts = lengths < MIN_BODY_PARAGRAPH
    # Concatenates sequential short paragraphs
    keeps = [True] + list(~(shorts[1:] * shorts[:-1])) + [True]
    # Concatenates all short paragraphs to preceding paragraph.
#     keeps = [True] + list(~shorts[1:]) + [True]
    bounds = bounds[keeps]

    lengths = bounds[1:] - bounds[:-1]
    longs = lengths > args.shortformer_max
#     print(longs)
    longs_ = np.stack([bounds[:-1], bounds[1:]], axis=1)[longs]
    lengths = lengths[longs]
    lengths = lengths // (lengths // args.shortformer_max + 1) + 1
    sliced_bounds = []
    for span, new_para in zip(longs_,lengths):
        candidate_splits = list(range(span[0] + new_para, span[1], new_para))
        nearest_sent_idx = np.searchsorted(sent_token_offsets[:-1,1], list(range(span[0] + new_para, span[1], new_para)))
        possible_splits = sent_token_offsets[nearest_sent_idx]
        shifts = [candidate_splits - possible_splits[:,0], possible_splits[:,1] - candidate_splits, np.ones(len(candidate_splits), dtype=int) * MAX_SPLIT]
        shifts = np.transpose(shifts)
        idxs = np.argmin(shifts, axis=1)
        all_candidates = np.concatenate([possible_splits, np.transpose([candidate_splits])], axis=1)
        extras = np.sum(all_candidates * np.eye(3, dtype=int)[idxs], axis=1)
        sliced_bounds.extend(extras)
    sliced_bounds = sorted(sliced_bounds)
#     print(sliced_bounds)
    sliced_bounds = list(zip(sliced_bounds,np.zeros(len(sliced_bounds),dtype=int)))
    all_bounds = list(zip(bounds,np.ones(len(bounds),dtype=int))) + sliced_bounds
    all_bounds = sorted(all_bounds, key=lambda t:t[0])
    paragraph_types = [b[1] for b in all_bounds]
    paragraph_types = list(zip(paragraph_types[:-1], paragraph_types[1:]))
    all_bounds = [b[0] for b in all_bounds]
#     print(all_bounds)

    return bounds, all_bounds, paragraph_types


def _data_prep_loop(args, tokenizer, sentence_tokenizer, ids=None, df=None):
    if ids is None:
        ids = [f[:-4] for f in os.listdir(args.input)] if df is None else df["id"].unique()
    training_samples = []
    token_from_id = {v:k for k,v in tokenizer.vocab.items()}
    SPACE = token_from_id[tokenizer.encode('it is', add_special_tokens=False)[1]][0]
    try:
        RET = tokenizer.encode('it.\n\nIs', add_special_tokens=False)[-2]
    except:
        RET = -100
    back_tick = re.compile(r'´')
    for idx in tqdm(ids):
        text = open(os.path.join(args.input, idx + ".txt"), 'r').read().strip()
        text = '\n\n'.join([' '.join(p.split()) for p in text.split('\n\n')])
        text = back_tick.sub("'", text)
        text = text.encode('ascii', 'ignore').decode('utf-8')
        encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=False,
        )
        input_ids = encoded_text["input_ids"]
        num_tokens = len(input_ids)
        num_words = len(text.split())

        # This works for roberta / gpt type tokenizers
        # TODO: Check the +1
        word_from_token = np.cumsum([0] + [(token_from_id[input_ids[i]].startswith(SPACE) 
                                     or (input_ids[i - 1] == RET) 
                                     and (input_ids[i] != RET))
                                     for i in range(1, len(input_ids))])

        sent_word_offsets = np.cumsum([len(s.split()) for s in sentence_tokenizer.tokenize(text)])
        sent_token_offsets = np.searchsorted(word_from_token, sent_word_offsets)
        sent_token_offsets = np.transpose(np.stack([np.append([0], sent_token_offsets[:-1]), sent_token_offsets]))

        paragraphs, sliced_paragraphs, para_types = get_paragraphs(text, args, word_from_token, sent_token_offsets)
        
        long_chunks = [0, num_tokens]
        chunk_types = [[1,1]]
        if num_tokens > args.longformer_max:
            long_split = paragraphs[np.searchsorted(paragraphs, args.longformer_max) - 1]
            if num_tokens - long_split > args.longformer_max - args.summary_length:
                long_split = sent_token_offsets[:,0][np.searchsorted(sent_token_offsets[:,0], args.longformer_max) - 1]
            long_chunks = [0, long_split, num_tokens]
            chunk_types = [[1,0],[0,1]]

        sample = {
            "id": idx,
            "input_ids": input_ids,
            "word_from_token": word_from_token,
            "sent_token_offsets": sent_token_offsets,
            "paragraphs": paragraphs,
            "sliced_paragraphs": sliced_paragraphs,
            "para_types": para_types,
            "long_chunks": long_chunks,
            "chunk_types": chunk_types,
            "num_tokens": num_tokens,
        }

        if df is not None:
            token_labels = np.ones(len(input_ids)) * -10
            discourse_types = df.loc[df.id == idx,['class']].to_numpy(dtype=int)
            discourse_word_offsets = df.loc[df.id == idx,['start','end']].to_numpy(dtype=int)
            discourse_token_offsets = np.searchsorted(word_from_token, discourse_word_offsets)
            for i, (start,end) in enumerate(discourse_token_offsets):
                try:
                    token_labels[start] = discourse_types[i]
                    token_labels[start+1:end] = discourse_types[i] + 1
                except:
                    print(idx, start)
            token_labels[token_labels < 0] = 14 # Fix 14 with 'No class'
            sample["input_labels"] = list(token_labels)

        training_samples.append(sample)

    return training_samples

from joblib import Parallel, delayed
import nltk
def parallel_prep_data(args, tokenizer, df=None, num_jobs=4):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    if df is not None:
        df['class'] = df.discourse_type.map(lambda t: id_from_discourse[t] * 2)
        df = df.sort_values(['id','discourse_start']).reset_index()
        ids = df["id"].unique()
    else:
        ids = [f[:-4] for f in os.listdir(args.input)]

    id_splits = np.array_split(ids, num_jobs)
    results = Parallel(n_jobs=num_jobs, backend="multiprocessing")(
        delayed(_data_prep_loop)(args, tokenizer, sent_tokenizer, idx, df) for idx in id_splits)
    training_samples = []
    for result in results:
        training_samples.extend(result)
    return training_samples

In [8]:
%%time
training_samples = parallel_prep_data(args, tokenizer, train_df, num_jobs=NUM_JOBS)
valid_samples = parallel_prep_data(args, tokenizer, valid_df, num_jobs=NUM_JOBS)

Token indices sequence length is longer than the specified maximum sequence length for this model (723 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (560 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (837 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1201 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (598 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th

CPU times: user 24.9 s, sys: 8.45 s, total: 33.4 s
Wall time: 9min 43s


In [9]:
from feedback_datasets import SkimreadDataset
from transformers import Trainer, AutoConfig, AutoModel, AdamW, get_cosine_schedule_with_warmup
from torch import nn

def skim_read(args, samples, tokenizer, inplace=False):
    model = AutoModel.from_pretrained(args.domain_weights).cuda()
    skim_dataset = SkimreadDataset(samples, args.skim_length, tokenizer)

    trainer = Trainer(model=model, data_collator=Collate(tokenizer))
    predictions = trainer.predict(skim_dataset)
    
    if not inplace: return predictions

    for j, sample in enumerate(samples):
        if type(predictions.predictions) == tuple:
            preds = predictions.predictions[0][j]
        else:
            preds = predictions.predictions[j]
        if sample['num_tokens'] < preds.shape[0]:
            preds = preds[:sample['num_tokens']]
        sentiment = np.mean(preds, axis=0)
        scores = []
        for sent in sample['sent_token_offsets']:
            if sent[1] <= preds.shape[0]:
                sentence_sentiment = np.mean(preds[sent[0]:sent[1]], axis=0)
                score = 1 #np.matmul()
                score = np.matmul(sentiment, sentence_sentiment)
                scores.append(score)

        best = 0 if scores == [] else np.argmax(scores)
        sent_ment = sample['sent_token_offsets'][best]
        num_sentences = len(sample['sent_token_offsets'])
        while sent_ment[1] - sent_ment[0] < 15:
            best += 1
            if best >= num_sentences: break
            sent_ment[1] = sample['sent_token_offsets'][best][1]
        sample['global'] = sent_ment
#         sample['sentiment'] = sentiment

In [10]:
# from feedback_datasets import SkimreadDataset
# from feedback_models import skim_read
skim_read(args, valid_samples, tokenizer, inplace=True)

skim_read(args, training_samples, tokenizer, inplace=True)

Downloading:   0%|          | 0.00/533M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
***** Running Prediction *****
  Num examples = 1000
  Batch size = 8


loading configuration file https://huggingface.co/microsoft/deberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e313266bff73867debdfa78c78a9a4966d5e78281ac4ed7048c178b16a37eba7.fb501413b9cef9cef6babdc543bb4153cbec58d52bce077647efba3e3f14ccf3
Model config DebertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "transformers_version": "4.12.5",
  "type_vocab_size": 0,
  "vocab_size": 50265
}

loading weights file https://huggingface.co

In [11]:
# import pickle
# training_samples = pickle.load(open('../input/longform-torch/training_samples.pkl', 'rb'))[:16]
# valid_samples = pickle.load(open('../input/longform-torch/valid_samples.pkl', 'rb'))[:8]

In [12]:
from feedback_datasets import ParagraphTrainingDataset

para_lengths = [len(sample['para_types']) for sample in valid_samples]
valid_sample_mapping = [e for sub in [zip([k] * chunks, range(chunks)) for k, chunks in enumerate(para_lengths)] for e in sub]       
valid_dataset = ParagraphTrainingDataset(valid_samples, args.shortformer_max, tokenizer, valid_sample_mapping)

para_lengths = [len(sample['para_types']) for sample in training_samples]
train_sample_mapping = [e for sub in [zip([k] * chunks, range(chunks)) for k, chunks in enumerate(para_lengths)] for e in sub]       
train_dataset = ParagraphTrainingDataset(training_samples, args.shortformer_max, tokenizer, train_sample_mapping)
num_train_steps = int(len(train_dataset) * args.epochs / args.batch_size / args.accumulation_steps)

len(train_dataset)

75024

In [13]:
predictions = valid_df.predictionstring.str.split()
ground_truth = valid_df[['id','discourse_type', 'start', 'end']]
ground_truth = ground_truth.rename(columns={'discourse_type':'class'})

ground_truth['len'] = ground_truth.end - ground_truth.start

In [14]:
from transformers import AutoConfig, AutoModel
from transformers.modeling_outputs import SequenceClassifierOutput
from torch import nn
import re
from sklearn import metrics

from torch.utils.data import DataLoader

class ShortformerModel(nn.Module):
    def __init__(self, args, 
                 num_train_steps, 
                 attention_window, 
                 num_labels, 
                 steps_per_epoch, 
                 ground_truth=None, 
                 valid_samples=None, 
                 valid_index_map=None,
                 loss_weights=[1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]):
#                  loss_weights=[0.7, 1.0, 0.7, 1.0, 1.2, 1.1, 0.7, 1.1, 0.7, 1.1, 0.7, 1.0, 0.7, 1.0, 1.0]):
#                  loss_weights=[0.5,1.,0.5,1.,3.,2.,1.,2.,1.,2.,0.5,0.6,0.5,1.,0.6]):
        super().__init__()
        self.args = args
        self.num_train_steps = num_train_steps
        self.num_labels = num_labels
        self.steps_per_epoch = steps_per_epoch
        self.step_scheduler_after = "batch"
        self.loss_weights = loss_weights
        self.ground_truth = ground_truth
        self.valid_samples = valid_samples
        self.valid_index_map = valid_index_map
        self.preds = []

        hidden_dropout_prob: float = 0.1
        layer_norm_eps: float = 1e-7
        config = AutoConfig.from_pretrained(args.model)
        config.update(
            {
                "attention_window": attention_window,
                "hidden_dropout_prob": hidden_dropout_prob,
                "layer_norm_eps": layer_norm_eps,
                "add_pooling_layer": False,
                "num_labels": self.num_labels,
            }
        )
        self.transformer = AutoModel.from_pretrained(args.model, config=config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.output = nn.Linear(config.hidden_size + 4, self.num_labels)
        self.avg3 = nn.AvgPool1d(3, stride=1, padding=1)
#         self.inners = torch.tensor([0,1,0,1,0,1,0,1,0,1,0,1,0,1,1], dtype=torch.long)
#         self.begins = torch.tensor([1,0,1,0,1,0,1,0,1,0,1,0,1,0,0], dtype=torch.long)


    def loss(self, outputs, labels, loss_mask):
#         loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(self.loss_weights), reduction='none')
        loss_fct = nn.CrossEntropyLoss(reduction='none')
        return  loss_fct(outputs.view(-1, self.num_labels), 
                         labels.view(-1)).matmul(loss_mask.view(-1).float()) / torch.sum(loss_mask)

    
    def forward(self, input_ids, attention_mask, loss_mask, features, token_type_ids=None, labels=None):

        if token_type_ids:
            transformer_out = self.transformer(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        else:
            transformer_out = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        features = features.unsqueeze(1).repeat(1, transformer_out.last_hidden_state.shape[1], 1)
            
        sequence_output = torch.cat([transformer_out.last_hidden_state,features],dim=-1)
    
        if self.training:
#             sequence_output = self.dropout(sequence_output)
#             logits = self.output(sequence_output)
#             loss = self.loss(logits, labels, loss_mask=loss_mask)
            
            logits1 = self.output(self.dropout1(sequence_output))
            logits2 = self.output(self.dropout2(sequence_output))
            logits3 = self.output(self.dropout3(sequence_output))
            logits = (logits1 + logits2 + logits3) / 3
            loss1 = self.loss(logits1, labels, loss_mask=loss_mask)
            loss2 = self.loss(logits2, labels, loss_mask=loss_mask)
            loss3 = self.loss(logits3, labels, loss_mask=loss_mask)
            loss = (loss1 + loss2 + loss3) / 3

            return SequenceClassifierOutput(loss=loss)

        inner_logits = torch.swapaxes(sequence_output,1,2)
        inner_logits = self.avg3(inner_logits)
        inner_logits = torch.swapaxes(inner_logits,1,2)
        inner_logits = self.output(inner_logits)
        inners = torch.tensor([0,1,0,1,0,1,0,1,0,1,0,1,0,1,1]).to(self.args.device)
        inner_logits = torch.multiply(inner_logits, inners)
        
        begin_logits = self.output(sequence_output)
        begins = torch.tensor([1,0,1,0,1,0,1,0,1,0,1,0,1,0,0]).to(self.args.device)
        begin_logits = torch.multiply(begin_logits, begins)

        logits = torch.add(inner_logits, begin_logits)
        
        if labels is None: return SequenceClassifierOutput(logits=logits)

        loss = self.loss(logits, labels, loss_mask=loss_mask)
        
        return SequenceClassifierOutput(loss=loss, logits=logits) 
    

In [15]:
model = ShortformerModel(
    args=args,
    num_train_steps=num_train_steps, #5
    attention_window=args.local_attention_len,
    num_labels=len(target_id_map) - 1,
    steps_per_epoch=len(train_dataset) / args.batch_size, #100
    ground_truth=ground_truth,
    valid_samples=valid_samples,
    valid_index_map=valid_sample_mapping,
)

loading configuration file https://huggingface.co/microsoft/deberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e313266bff73867debdfa78c78a9a4966d5e78281ac4ed7048c178b16a37eba7.fb501413b9cef9cef6babdc543bb4153cbec58d52bce077647efba3e3f14ccf3
Model config DebertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "transformers_version": "4.12.5",
  "type_vocab_size": 0,
  "vocab_size": 50265
}

loading weights file https://huggingface.co

In [16]:
# weights = torch.load(os.path.join(args.domain_weights, 'pytorch_model.bin'),map_location=torch.device(args.device))
# weights = {k.replace('roberta','transformer'):v for k,v in weights.items()}
# missing = [k for k in model.state_dict() if k not in weights.keys()]
# extra = [k for k in weights.keys() if k not in model.state_dict()]
# for k in extra: weights.pop(k)
# wrong_size = ['transformer.embeddings.position_ids',
#               'transformer.embeddings.position_embeddings.weight']
# for k in missing + wrong_size: weights[k] = model.state_dict()[k]

# model.load_state_dict(weights)

In [17]:
# from feedback_models import FeedbackTrainer

In [18]:
from transformers import Trainer,  AdamW, get_cosine_schedule_with_warmup
from torch.utils.data import DataLoader
class FeedbackTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        print(kwargs.keys())
        self.global_args = kwargs.pop('global_args', None)
        self.tokenizer = tokenizer #kwargs.pop('tokenizer')
        super().__init__(*args, **kwargs)

    def create_optimizer(self):
        layers = re.compile(r'layer.([0-9]+)')
        num_layers = max([int(l) for l in layers.findall(''.join([n for n,p in self.model.transformer.named_parameters()]))]) + 1
        def layer_lr(layer):
            return np.power(self.global_args.lr_layer_decay, num_layers - layer - 1) * self.global_args.transformer_lr    

        # Also try param.requires_grad = False for lower layers
        no_decay = {n:0 for n, p in self.model.transformer.named_parameters() 
                    if any(nd in n for nd in ['bias', 'gamma', 'beta'])}
        p0 = {'params': [p for n, p in self.model.named_parameters() if "transformer" not in n],
              'lr':self.global_args.classifier_lr, "momentum" : 0.99}
        p1 = {"params": [p for n, p in  self.model.transformer.named_parameters() 
                         if not n in no_decay and not 'layer.' in n],
              'weight_decay_rate': 0.01}
        p2 = {"params": [p for n, p in  self.model.transformer.named_parameters() 
                         if n in no_decay and not 'layer.' in n],
              'weight_decay_rate': 0.0}
        params = [p0,p1,p2]
        for layer in range(num_layers):
            p3 = {'params': [p for n, p in self.model.transformer.named_parameters() 
                             if not n in no_decay and f'layer.{str(layer)}.' in n],
                    'weight_decay_rate': 0.01, 'lr': layer_lr(layer)}
            params.append(p3)
            p4 = {'params': [p for n, p in self.model.transformer.named_parameters() 
                             if n in no_decay and f'layer.{str(layer)}.' in n],
                    'weight_decay_rate': 0.0, 'lr': layer_lr(layer)}
            params.append(p4)
        
        self.optimizer = AdamW(params)    

        return  self.optimizer

In [19]:
from feedback_utils import Collate

In [20]:
from transformers import TrainingArguments, EarlyStoppingCallback
train_args = TrainingArguments(
    overwrite_output_dir=True,
    num_train_epochs = args.epochs,
    gradient_accumulation_steps = args.accumulation_steps,
    lr_scheduler_type = 'cosine',
    warmup_ratio  = 0.1,
    output_dir = args.output,
#     bf16 = False,
    tpu_num_cores = 0,
    dataloader_num_workers = 2,
    evaluation_strategy='no', # 'steps', 'epoch'
    save_steps=2345,
#     save_total_limit=3,
#     eval_steps=100,
    logging_steps=100,
    report_to='none',
    per_device_train_batch_size=args.batch_size,
    seed=56,
#     load_best_model_at_end=True,
#     metric_for_best_model='f1',
    logging_first_step=True,
)
trainer = FeedbackTrainer(
    model=model, 
    args=train_args,
    global_args=args,
    tokenizer=tokenizer,
    data_collator=Collate(tokenizer), 
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
#     compute_metrics=compute_metrics,
)


PyTorch: setting up devices


dict_keys(['model', 'args', 'global_args', 'tokenizer', 'data_collator', 'train_dataset', 'eval_dataset'])


In [21]:
trainer.train()

***** Running training *****
  Num examples = 75024
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 4689


Step,Training Loss
1,3.0256
100,2.2188
200,1.2831
300,1.0998
400,0.9616
500,0.9238
600,0.8996
700,0.8225
800,0.7728
900,0.7505


Saving model checkpoint to ./trainer_test/checkpoint-2345
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in ./trainer_test/checkpoint-2345/tokenizer_config.json
Special tokens file saved in ./trainer_test/checkpoint-2345/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=4689, training_loss=0.7562219087287901, metrics={'train_runtime': 2849.7371, 'train_samples_per_second': 26.327, 'train_steps_per_second': 1.645, 'total_flos': 0.0, 'train_loss': 0.7562219087287901, 'epoch': 1.0})

In [22]:
predictions = trainer.predict(valid_dataset)

***** Running Prediction *****
  Num examples = 5342
  Batch size = 8


In [23]:
submission = get_submission_ken([predictions.predictions], 
                   valid_samples, 
                   valid_sample_mapping, 
                   chunk_type='sliced_paragraphs')
epoch_score, board = feedback_score(submission, ground_truth)
print(f'{board}\n\nLB Score: {epoch_score:0.3f}\n') #0.325 deberta 3 cleaned. 0.30 deberta-3 original. 0.321 roberta og. 0.387 deberta og


                        TP    FP    FN  Prec  Recall   F1
class                                                    
Claim                 1799  2219  1412  0.45    0.56 0.50
Concluding Statement   726   320   138  0.69    0.84 0.76
Counterclaim           169   175   212  0.49    0.44 0.47
Evidence              2138  1490   789  0.59    0.73 0.65
Lead                   495   292   101  0.63    0.83 0.72
Position               649   459   339  0.59    0.66 0.62
Rebuttal                94   127   182  0.43    0.34 0.38

LB Score: 0.584



In [24]:
# 0.318 deberta 3 cleaned
# 0.383 deberta cleaned
# 0.332 roberta cleaned
# FAIL ids electra cleaned
# 0.339 xlnet cleaned
# 0.296 roberta cleaned, domain pretrained, better losses ... ?