In [1]:
import torch
import numpy as np
import random
import pandas as pd
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (LongformerConfig, LongformerModel, LongformerTokenizerFast, AutoConfig, AutoModel,
                          AutoTokenizer)
from torch.cuda.amp import autocast, GradScaler
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score

In [2]:
class Config:
    random_seed = 42  # 随机数种子
    n_epoch = 1  # 训练轮数(训练更多轮次更好)
    verbose_steps = 1000

    model_name = 'allenai/longformer-base-4096'  # 选择'allenai/longformer-large-4096'更好
    max_length = 1024  # 句子最大长度(选择1600更好)
    lr = 4e-5
    train_batch_size = 4
    valid_batch_size = 4

In [3]:
def set_seed(seed):
    """PyTorch随机数种子设置大全"""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)  # CPU上设置随机种子
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)  # 当前GPU上设置随机种子
        # A bool that, if True, causes cuDNN to only use deterministic convolution algorithms.
        torch.backends.cudnn.deterministic = True
        # torch.cuda.manual_seed_all(seed) # 所有GPU上设置随机种子


set_seed(Config.random_seed)

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
IGNORE_INDEX = -100  # special tokens(如'[CLS]', '[SEP]', '[PAD]')的labels ids设置为-100
NON_LABEL = -1  # special tokens(如'[CLS]', '[SEP]', '[PAD]')的word ids设置为-1(默认为None)

OUTPUT_LABELS = ['O',
                 'B-Lead', 'I-Lead',
                 'B-Position', 'I-Position',
                 'B-Claim', 'I-Claim',
                 'B-Counterclaim', 'I-Counterclaim',
                 'B-Rebuttal', 'I-Rebuttal',
                 'B-Evidence', 'I-Evidence',
                 'B-Concluding Statement', 'I-Concluding Statement']
LABELS_TO_IDS = {v: k for k, v in enumerate(OUTPUT_LABELS)}
IDS_TO_LABELS = {k: v for k, v in enumerate(OUTPUT_LABELS)}
LABELS_TO_IDS

{'O': 0,
 'B-Lead': 1,
 'I-Lead': 2,
 'B-Position': 3,
 'I-Position': 4,
 'B-Claim': 5,
 'I-Claim': 6,
 'B-Counterclaim': 7,
 'I-Counterclaim': 8,
 'B-Rebuttal': 9,
 'I-Rebuttal': 10,
 'B-Evidence': 11,
 'I-Evidence': 12,
 'B-Concluding Statement': 13,
 'I-Concluding Statement': 14}

In [6]:
# 超参数,可通过optuna包学习可得
MIN_THRESH = {"I-Lead": 9, "I-Position": 5, "I-Evidence": 14, "I-Claim": 3,
              "I-Concluding Statement": 11, "I-Counterclaim": 6, "I-Rebuttal": 4}  # 最小单词长度固定为 3

# 超参数,可通过optuna包学习可得
PROB_THRESH = {"I-Lead": 0.7, "I-Position": 0.55, "I-Evidence": 0.65, "I-Claim": 0.55,
               "I-Concluding Statement": 0.7, "I-Counterclaim": 0.5, "I-Rebuttal": 0.55}

In [7]:
df_alltrain = pd.read_csv('datasets/train.csv')
df_alltrain.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


In [8]:
alltrain_texts = pd.read_pickle(f'data/alltrain_texts.pkl')
# alltrain_texts = alltrain_texts.sample(frac=0.2, replace=False, random_state=Config.random_seed + 1).reset_index(drop=True)  # 使用1%的数据进行debug
alltrain_texts.head()

Unnamed: 0,id,text,text_split,entities,fold
0,0000D23A521A,"Some people belive that the so called ""face"" o...","[Some, people, belive, that, the, so, called, ...","[B-Position, I-Position, I-Position, I-Positio...",0.0
1,00066EA9880D,Driverless cars are exaclty what you would exp...,"[Driverless, cars, are, exaclty, what, you, wo...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea...",4.0
2,000E6DE9E817,Dear: Principal\n\nI am arguing against the po...,"[Dear:, Principal, I, am, arguing, against, th...","[O, O, B-Position, I-Position, I-Position, I-P...",3.0
3,001552828BD0,Would you be able to give your car up? Having ...,"[Would, you, be, able, to, give, your, car, up...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea...",0.0
4,0016926B079C,I think that students would benefit from learn...,"[I, think, that, students, would, benefit, fro...","[B-Position, I-Position, I-Position, I-Positio...",4.0


In [9]:
class FeedbackPrizeDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, has_labels):
        self.len = dataframe.shape[0]
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.has_labels = has_labels

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        text = self.data.text[index]
        encoding = self.tokenizer(
            text.split(),
            is_split_into_words=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len)
        # # Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
        word_ids = encoding.word_ids()

        # targets
        if self.has_labels:
            word_labels = self.data.entities[index]
            labels_ids = []
            for word_idx in word_ids:
                if word_idx is None:  # 此时token为special tokens
                    labels_ids.append(IGNORE_INDEX)
                else:
                    labels_ids.append(LABELS_TO_IDS[word_labels[word_idx]])
            encoding['labels'] = labels_ids

        # convert to torch.tensor
        item = {k: torch.tensor(v) for k, v in encoding.items()}
        word_ids2 = [w if w is not None else NON_LABEL for w in word_ids]
        item['word_ids'] = torch.tensor(word_ids2)
        return item

In [10]:
class FeedbackModel(nn.Module):
    def __init__(self):
        super(FeedbackModel, self).__init__()
        if Config.model_name.find('longformer') != -1:
            # longformer不能使用AutoConfig,AutoModel导入
            model_config = LongformerConfig.from_pretrained(Config.model_name)
            self.pretrained = LongformerModel.from_pretrained(Config.model_name, config=model_config)
        else:
            model_config = AutoConfig.from_pretrained(Config.model_name)
            self.pretrained = AutoModel.from_pretrained(Config.model_name, config=model_config)

        self.model_config = model_config
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.head = nn.Linear(model_config.hidden_size, len(LABELS_TO_IDS))

    def forward(self, input_ids, mask):
        # x.shape=[batch_size, sequence_length, hidden_size]
        x = self.pretrained(input_ids, mask)[0]
        logits1 = self.head(self.dropout1(x))
        logits2 = self.head(self.dropout2(x))
        logits3 = self.head(self.dropout3(x))
        logits4 = self.head(self.dropout4(x))
        logits5 = self.head(self.dropout5(x))
        # logits.shape=[batch_size, sequence_length, len(LABELS_TO_IDS)]
        logits = (logits1 + logits2 + logits3 + logits4 + logits5) / 5
        return logits

In [11]:
def build_tokenizer_model():
    if Config.model_name.find('longformer') != -1:
        # 必须使用快速标记器(word_ids()方法所要求)
        tokenizer = LongformerTokenizerFast.from_pretrained(Config.model_name, add_prefix_space=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(Config.model_name, add_prefix_space=True)
    model = FeedbackModel()
    return model, tokenizer

In [12]:
def active_logits(raw_logits, word_ids):
    # word_ids.shape=[batch_size * seq_length]
    word_ids = word_ids.view(-1)
    # activate_mask.shape=[batch_size * seq_length, len(LABELS_TO_IDS)]
    active_mask = word_ids.unsqueeze(1).expand(word_ids.shape[0], len(LABELS_TO_IDS))
    active_mask = active_mask != NON_LABEL
    # active_logits.shape=[batch_size * sequence_length, len(LABELS_TO_IDS)]
    active_logits = raw_logits.view(-1, len(LABELS_TO_IDS))
    active_logits = torch.masked_select(active_logits, active_mask)  # 返回一个新的一维tensor
    # active_logits.shape=[?, len(LABELS_TO_IDS)]
    active_logits = active_logits.view(-1, len(LABELS_TO_IDS))
    return active_logits


def active_labels(labels):
    # active_mask.shape=[batch_size * seq_length]
    active_mask = labels.view(-1) != IGNORE_INDEX
    # active_labels.shape=[?]
    active_labels = torch.masked_select(labels.view(-1), active_mask)
    return active_labels

In [13]:
def train(model, dl_train, optimizer, epoch, criterion):
    model.train()
    scaler = GradScaler()

    train_loss = 0.0
    train_accuracy = 0.0
    for batch_idx, batch in enumerate(tqdm(dl_train), start=1):
        # ids.shape=[batch_size, seq_length]
        ids = batch['input_ids'].to(device, dtype=torch.long)
        # mask.shape=[batch_size, seq_length]
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        # raw_labels.shape=[batch_size, seq_length]
        raw_labels = batch['labels'].to(device, dtype=torch.long)
        # word_ids.shape=[batch_size, seq_length]
        word_ids = batch['word_ids'].to(device, dtype=torch.long)

        optimizer.zero_grad()

        # 混合精度运行
        with autocast():
            # raw_logits.shape=[batch_size, sequence_length, len(LABELS_TO_IDS)]
            raw_logits = model(input_ids=ids, mask=mask)

        logits = active_logits(raw_logits, word_ids)
        labels = active_labels(raw_labels)
        sf_logits = torch.softmax(logits, dim=-1)
        preds = torch.argmax(sf_logits, dim=-1)  # 最大值索引

        loss = criterion(logits, labels)
        train_loss += loss.item()
        train_accuracy += accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        if batch_idx % Config.verbose_steps == 0:
            loss_step = train_loss / batch_idx
            print(f'Training loss after {batch_idx:04d} training steps: {loss_step}')

    epoch_loss = train_loss / batch_idx
    epoch_accuracy = train_accuracy / batch_idx
    print('| end of epoch {:5d} | training loss {:8.5f} | training accuracy {:8.5f} |'.format(epoch,
                                                                                              epoch_loss,
                                                                                              epoch_accuracy))

In [14]:
def inference(model, dl, criterion):
    model.eval()

    valid_loss = 0.0
    valid_accuracy = 0.0
    all_logits = None  # 验证数据集预测结果
    with torch.no_grad():
        for batch_idx, batch in enumerate(tqdm(dl), start=1):
            # ids.shape=[batch_size, seq_length]
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            word_ids = batch['word_ids'].to(device, dtype=torch.long)
            raw_labels = batch['labels'].to(device, dtype=torch.long)

            # raw_logits.shape=[batch_size, sequence_length, len(LABELS_TO_IDS)]
            raw_logits = model(input_ids=ids, mask=mask)

            logits = active_logits(raw_logits, word_ids)
            labels = active_labels(raw_labels)
            sf_logits = torch.softmax(logits, dim=-1)
            preds = torch.argmax(sf_logits, dim=1)  # 最大值索引

            loss = criterion(logits, labels)
            valid_loss += loss.item()
            valid_accuracy += accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())

            sf_raw_logits = torch.softmax(raw_logits, dim=-1)
            if batch_idx == 1:
                all_logits = sf_raw_logits.cpu().numpy()
            else:
                all_logits = np.append(all_logits, sf_raw_logits.cpu().numpy(), axis=0)

    epoch_loss = valid_loss / batch_idx  # 每个epoch验证数据集平均损失
    epoch_accuracy = valid_accuracy / batch_idx  # 每个epoch验证数据集平均准确率
    # all_logits.shape=[len(all_data), sequence_length, len(LABELS_TO_IDS)]
    return all_logits, epoch_loss, epoch_accuracy

In [15]:
def preds_class_prob(all_logits, dl):
    print("predict target class and its probabilty")
    len_sample = all_logits.shape[0]

    final_predictions = []  # 所有句子每个单词的标签预测
    final_predictions_score = []  # 所有句子每个单词标签预测的概率
    for batch_idx, batch in enumerate(tqdm(dl), start=0):
        for minibatch_idx in range(Config.valid_batch_size):
            predictions = []  # 单个句子每个单词的标签预测
            predictions_prob = []  # 单个句子每个单词标签预测的概率

            sample_idx = int(batch_idx * Config.valid_batch_size + minibatch_idx)
            if sample_idx > len_sample - 1:
                break  # 最后一批数据可能不能被batch_size整除

            # word_ids.shape=[sequence_length]
            word_ids = batch['word_ids'][minibatch_idx].numpy()

            # pred_class_id.shape=[sequence_length]
            pred_class_id = np.argmax(all_logits[sample_idx], axis=1)
            pred_class_labels = [IDS_TO_LABELS[i] for i in pred_class_id]
            # pred_score.shape=[sequence_length]
            pred_score = np.max(all_logits[sample_idx], axis=1)

            prev_word_idx = -1
            for idx, word_idx in enumerate(word_ids):
                if word_idx == -1:
                    pass
                elif word_idx != prev_word_idx:  # 英文可以用更小的词片段来组成更大的词(wordPiece或BPE分词),这里取第一个词片段的label作为整个词的label
                    predictions.append(pred_class_labels[idx])
                    predictions_prob.append(pred_score[idx])
                    prev_word_idx = word_idx

            final_predictions.append(predictions)
            final_predictions_score.append(predictions_prob)
    return final_predictions, final_predictions_score

In [16]:
def post_process_pred(df, all_preds, all_preds_prob):
    final_preds = []
    for i in range(len(df)):
        idx = df.id.values[i]
        pred = all_preds[i]
        pred_prob = all_preds_prob[i]

        j = 0
        while j < len(pred):
            cls = pred[j]
            if cls == 'O':
                j += 1
            else:
                # 合并具有相同标签的连续预测step 1===>'B-???????'替换为'I-???????'
                cls = cls.replace('B', 'I')

            end = j + 1
            while end < len(pred) and pred[end] == cls:
                end += 1

            if cls != 'O' and cls != '':
                avg_score = np.mean(pred_prob[j:end])  # 该段论证(discourse)的平均概率
                if end - j > MIN_THRESH[cls] and avg_score > PROB_THRESH[cls]:  # 只保留平均概率高于阈值且大于固定单词长度的预测
                    # 合并具有相同标签的连续预测step 2===>
                    # 最终合并后的标签为:"Lead"/"Position"/"Evidence"/"Claim"/"Concluding Statement"/"Counterclaim"/"I-Rebuttal"
                    final_preds.append((idx, cls.replace('I-', ''), ' '.join(map(str, list(range(j, end))))))
            j = end

    df_pred = pd.DataFrame(final_preds, columns=['id', 'class', 'predictionstring'])
    return df_pred

In [17]:
def calc_overlap(row):
    """
    calculate the overlap between prediction and ground truth
    """
    set_pred = set(row.predictionstring_pred.split(' '))
    set_gt = set(row.predictionstring_gt.split(' '))
    # length of each end intersection
    len_pred = len(set_pred)
    len_gt = len(set_gt)
    intersection = len(set_gt.intersection(set_pred))
    overlap_1 = intersection / len_gt
    overlap_2 = intersection / len_pred
    return [overlap_1, overlap_2]


def score_feedback_comp(pred_df, gt_df):
    """
    A function that scores for the kaggle
        Student Writing Competition
        
    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation
    """
    gt_df = gt_df[['id', 'discourse_type', 'predictionstring']].reset_index(drop=True).copy()
    pred_df = pred_df[['id', 'class', 'predictionstring']].reset_index(drop=True).copy()
    gt_df['gt_id'] = gt_df.index
    pred_df['pred_id'] = pred_df.index
    joined = pred_df.merge(
        gt_df,
        left_on=['id', 'class'],
        right_on=['id', 'discourse_type'],
        how='outer',
        suffixes=['_pred', '_gt'])
    joined['predictionstring_gt'] = joined['predictionstring_gt'].fillna(' ')
    joined['predictionstring_pred'] = joined['predictionstring_pred'].fillna(' ')
    joined['overlaps'] = joined.apply(calc_overlap, axis=1)
    # overlap over 0.5: true positive
    # If nultiple overlaps exists, the higher is taken.
    joined['overlap1'] = joined['overlaps'].apply(lambda x: eval(str(x))[0])
    joined['overlap2'] = joined['overlaps'].apply(lambda x: eval(str(x))[1])

    joined['potential_TP'] = (joined['overlap1'] >= 0.5) & (joined['overlap2'] >= 0.5)
    joined['max_overlap'] = joined[['overlap1', 'overlap2']].max(axis=1)
    tp_pred_ids = joined.query('potential_TP').sort_values('max_overlap', ascending=False).groupby(
        ['id', 'predictionstring_gt']).first()['pred_id'].values

    fp_pred_ids = [p for p in joined['pred_id'].unique() if p not in tp_pred_ids]
    matched_gt_ids = joined.query('potential_TP')['gt_id'].unique()
    unmatched_gt_ids = [c for c in joined['gt_id'].unique() if c not in matched_gt_ids]

    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)
    macro_f1_score = TP / (TP + 1 / 2 * (FP + FN))
    return macro_f1_score

In [18]:
def evaluate(model, df_val, df_val_eval, dl_val, criterion):
    # logits.shape=[len(all_data), sequence_length, len(LABELS_TO_IDS)]
    logits, valid_loss, valid_acc = inference(model, dl_val, criterion)
    all_preds, all_preds_prob = preds_class_prob(logits, dl_val)
    oof = post_process_pred(df_val, all_preds, all_preds_prob)

    f1score = []
    classes = ['Lead', 'Position', 'Claim', 'Counterclaim', 'Rebuttal', 'Evidence', 'Concluding Statement']
    print(f"Validation F1 scores")

    for c in classes:
        # 计算不同分类的f1 score
        pred_df = oof.loc[oof['class'] == c].copy()
        gt_df = df_val_eval.loc[df_val_eval['discourse_type'] == c].copy()
        f1 = score_feedback_comp(pred_df, gt_df)
        print(f' * {c:<10}: {f1:4f}')
        f1score.append(f1)
    f1avg = np.mean(f1score)  # 平均f1 score
    print(f'Overall Validation avg F1: {f1avg:.4f} val_loss:{valid_loss:.4f} val_accuracy:{valid_acc:.4f}')
    return valid_loss, oof

In [19]:
oof = pd.DataFrame()

for i_fold in range(5):
    print(f'===================== fold {i_fold} training =====================')

    model, tokenizer = build_tokenizer_model()
    model = model.to(device)
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=Config.lr)
    criterion = nn.CrossEntropyLoss()

    df_train = alltrain_texts[alltrain_texts['fold'] != i_fold].reset_index(drop=True)
    ds_train = FeedbackPrizeDataset(df_train, tokenizer, Config.max_length, True)
    dl_train = DataLoader(ds_train, batch_size=Config.train_batch_size, shuffle=True)

    df_val = alltrain_texts[alltrain_texts['fold'] == i_fold].reset_index(drop=True)
    ds_val = FeedbackPrizeDataset(df_val, tokenizer, Config.max_length, True)
    dl_val = DataLoader(ds_val, batch_size=Config.valid_batch_size, shuffle=False)

    val_idlist = df_val['id'].unique().tolist()
    df_val_eval = df_alltrain.query('id==@val_idlist').reset_index(drop=True)
    best_val_loss = np.inf

    _oof_fold_best = pd.DataFrame()
    for epoch in range(1, Config.n_epoch + 1):
        train(model, dl_train, optimizer, epoch, criterion)
        valid_loss, _oof = evaluate(model, df_val, df_val_eval, dl_val, criterion)
        if valid_loss < best_val_loss:
            best_val_loss = valid_loss
            _oof_fold_best = _oof
            _oof_fold_best['fold'] = i_fold
            # torch.save(model.state_dict(), 'longformer' + str(i_fold) + '.bin')  # 保存最优模型的状态字典

    oof = pd.concat([oof, _oof_fold_best])



Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/3119 [00:00<?, ?it/s]

Training loss after 1000 training steps: 0.91301611328125
Training loss after 2000 training steps: 0.826619384765625
Training loss after 3000 training steps: 0.78606591796875
| end of epoch     1 | training loss  0.78356 | training accuracy  0.74206 |


  0%|          | 0/780 [00:00<?, ?it/s]

predict target class and its probabilty


  0%|          | 0/780 [00:00<?, ?it/s]

Validation F1 scores
 * Lead      : 0.791221
 * Position  : 0.617256
 * Claim     : 0.533475
 * Counterclaim: 0.406780
 * Rebuttal  : 0.264730
 * Evidence  : 0.636029
 * Concluding Statement: 0.792843
Overall Validation avg F1: 0.5775 val_loss:0.6991 val_accuracy:0.7648


Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/3119 [00:00<?, ?it/s]

Training loss after 1000 training steps: 0.924780517578125
Training loss after 2000 training steps: 0.828940673828125
Training loss after 3000 training steps: 0.7849332682291666
| end of epoch     1 | training loss  0.78202 | training accuracy  0.74304 |


  0%|          | 0/780 [00:00<?, ?it/s]

predict target class and its probabilty


  0%|          | 0/780 [00:00<?, ?it/s]

Validation F1 scores
 * Lead      : 0.761088
 * Position  : 0.642587
 * Claim     : 0.499107
 * Counterclaim: 0.448140
 * Rebuttal  : 0.249302
 * Evidence  : 0.625435
 * Concluding Statement: 0.702150
Overall Validation avg F1: 0.5611 val_loss:0.6856 val_accuracy:0.7715


Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/3119 [00:00<?, ?it/s]

Training loss after 1000 training steps: 0.896685791015625
Training loss after 2000 training steps: 0.819468994140625
Training loss after 3000 training steps: 0.7864876302083333
| end of epoch     1 | training loss  0.78339 | training accuracy  0.74286 |


  0%|          | 0/780 [00:00<?, ?it/s]

predict target class and its probabilty


  0%|          | 0/780 [00:00<?, ?it/s]

Validation F1 scores
 * Lead      : 0.794230
 * Position  : 0.621461
 * Claim     : 0.524358
 * Counterclaim: 0.426836
 * Rebuttal  : 0.292152
 * Evidence  : 0.654976
 * Concluding Statement: 0.751546
Overall Validation avg F1: 0.5808 val_loss:0.6668 val_accuracy:0.7754


Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/3119 [00:00<?, ?it/s]

Training loss after 1000 training steps: 0.909854248046875
Training loss after 2000 training steps: 0.8289337158203125
Training loss after 3000 training steps: 0.7885320638020833
| end of epoch     1 | training loss  0.78435 | training accuracy  0.74276 |


  0%|          | 0/780 [00:00<?, ?it/s]

predict target class and its probabilty


  0%|          | 0/780 [00:00<?, ?it/s]

Validation F1 scores
 * Lead      : 0.799893
 * Position  : 0.618356
 * Claim     : 0.509941
 * Counterclaim: 0.439113
 * Rebuttal  : 0.298028
 * Evidence  : 0.660642
 * Concluding Statement: 0.763449
Overall Validation avg F1: 0.5842 val_loss:0.6638 val_accuracy:0.7760


Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/3119 [00:00<?, ?it/s]

Training loss after 1000 training steps: 0.9483232421875
Training loss after 2000 training steps: 0.8452591552734375
Training loss after 3000 training steps: 0.799323486328125
| end of epoch     1 | training loss  0.79472 | training accuracy  0.73887 |


  0%|          | 0/780 [00:00<?, ?it/s]

predict target class and its probabilty


  0%|          | 0/780 [00:00<?, ?it/s]

Validation F1 scores
 * Lead      : 0.783731
 * Position  : 0.642378
 * Claim     : 0.504736
 * Counterclaim: 0.451000
 * Rebuttal  : 0.339085
 * Evidence  : 0.641245
 * Concluding Statement: 0.783657
Overall Validation avg F1: 0.5923 val_loss:0.6717 val_accuracy:0.7703


In [21]:
oof

Unnamed: 0,id,class,predictionstring,fold
0,0000D23A521A,Lead,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...,0
1,0000D23A521A,Claim,51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 6...,0
2,0000D23A521A,Evidence,84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 9...,0
3,0000D23A521A,Counterclaim,117 118 119 120 121 122 123 124 125 126 127 12...,0
4,0000D23A521A,Rebuttal,134 135 136 137 138 139,0
...,...,...,...,...
20620,FFF868E06176,Claim,168 169 170 171 172 173 174,4
20621,FFF868E06176,Claim,177 178 179 180 181 182 183 184 185 186 187 18...,4
20622,FFF868E06176,Evidence,191 192 193 194 195 196 197 198 199 200 201 20...,4
20623,FFF868E06176,Evidence,270 271 272 273 274 275 276 277 278 279 280 28...,4
