In [2]:
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np
import random
import pandas as pd
import torch.nn as nn
import torch.utils.data as Data
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
import copy
import math
import torch.nn.functional as F
import joblib

In [3]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy and PyTorch.
    """
    # Python's and NumPy's global generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch seed, followed by the current GPU's generator when CUDA is present.
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)
    # torch.cuda.manual_seed_all(seed) would seed every GPU rather than only the current one.


seed = 42  # fixed seed so that reruns draw the same random numbers
set_seed(seed)  # seed Python, NumPy and PyTorch before anything random happens

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device  # display the selected device

device(type='cuda')

In [5]:
# Load the test split from CSV and preview the first rows.
df_test = pd.read_csv("../../datasets/test.csv")
df_test.head()

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...
2,0df072751,,,It was a bright and cheerful scene that greete...
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...


In [6]:
class MyDataset(Data.Dataset):
    """Dataset wrapper around a DataFrame of excerpts and, optionally, targets."""

    def __init__(self, df, have_target=True):
        """Keep a reference to the frame and remember whether targets are returned.

        Args:
            df: DataFrame with an ``excerpt`` column and, when ``have_target`` is
                true, a ``target`` column.
            have_target: Whether ``__getitem__`` should also return the target.
        """
        self.have_target = have_target
        self.dataset = df

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.dataset)

    def __getitem__(self, i):
        """Return ``(text, target)``, or the 1-tuple ``(text,)``, for row position ``i``."""
        row = self.dataset.iloc[i]
        text = row['excerpt']
        if not self.have_target:
            return (text,)
        return text, row['target']


# Wrap the test frame; have_target=False makes every item a 1-tuple (text,).
data_test = MyDataset(df_test, have_target=False)

for text, in data_test:
    # Iterating the dataset goes through __getitem__; show the first excerpt only.
    print(text)
    break

My hope lay in Jack's promise that he would keep a bright light burning in the upper story to guide me on my course. On a clear night this light was visible from the village, but somehow or other I failed to take into account the state of the weather. The air was full of eddying flakes, which would render the headlight of a locomotive invisible a hundred yards distant. Strange that this important fact never occurred to me until I was fully a fourth of a mile from the village. Then, after looking in vain for the beacon light, the danger of my situation struck me, and I halted.
"I am certain to go wrong," I said to myself.
"It is out of my power to follow a direct course without something to serve as a compass. I will go back to the village and wait till morning."


In [7]:
model_name = 'albert-xxlarge-v2'

# The tokenizer is loaded by model name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(tokenizer.model_input_names)

# The encoder weights are loaded from a local directory rather than from `model_name`.
pretrained = AutoModel.from_pretrained('../继续训练预训练模型_albert-xxlarge-v2/save_model/')
print(pretrained.num_parameters())  # number of parameters

Downloading:   0%|          | 0.00/710 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/742k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

['input_ids', 'token_type_ids', 'attention_mask']


Some weights of the model checkpoint at ../继续训练预训练模型_albert-xxlarge-v2/save_model/ were not used when initializing AlbertModel: ['predictions.dense.bias', 'predictions.decoder.weight', 'predictions.bias', 'predictions.decoder.bias', 'predictions.LayerNorm.weight', 'predictions.dense.weight', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertModel were not initialized from the model checkpoint at ../继续训练预训练模型_albert-xxlarge-v2/save_model/ and are newly initialized: ['albert.pooler.bias', 'albert.pooler.weight']
You should probably TRAIN

222595584


In [8]:
def get_collate_fn(tokenizer, max_len=256):
    """Build a DataLoader ``collate_fn`` bound to ``tokenizer`` and ``max_len`` via a closure.

    Args:
        tokenizer: Callable used to batch-encode the texts; it is invoked with
            truncation, ``padding='max_length'`` and ``return_tensors='pt'``.
        max_len: Length every encoded sequence is truncated/padded to.

    Returns:
        A function mapping a list of samples to
        ``(input_ids, attention_mask, token_type_ids)``, with a float32 target
        tensor appended when the samples carry a second element.
    """

    def collate_fn(data):
        texts = [sample[0] for sample in data]

        # Batch-encode all sentences in one call.
        encoded = tokenizer(text=texts,
                            truncation=True,
                            padding='max_length',
                            max_length=max_len,
                            return_token_type_ids=True,
                            return_attention_mask=True,
                            return_tensors='pt')

        batch = (encoded['input_ids'], encoded['attention_mask'], encoded['token_type_ids'])
        # Samples of length 1 carry no target (inference data).
        if len(data[0]) == 1:
            return batch
        targets = torch.tensor([sample[1] for sample in data], dtype=torch.float32)
        return batch + (targets,)

    return collate_fn


# Test loader: batches of 8 in fixed order (shuffle=False), collated with the tokenizer.
dataloader_test = torch.utils.data.DataLoader(dataset=data_test, batch_size=8, collate_fn=get_collate_fn(tokenizer),
                                              shuffle=False)

# Inspect the first collated batch only.
for input_ids, attention_mask, token_type_ids in dataloader_test:
    print(input_ids)
    print(attention_mask)
    print(token_type_ids)
    break

tensor([[    2,    51,  1376,  ...,     0,     0,     0],
        [    2,   107, 12007,  ...,     0,     0,     0],
        [    2,    32,    23,  ...,     0,     0,     0],
        ...,
        [    2,   121, 16254,  ...,     0,     0,     0],
        [    2,    20,  3271,  ...,     0,     0,     0],
        [    2,  4820,    58,  ...,     0,     0,     0]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])


In [9]:
class MyModel(nn.Module):
    """Regression head on top of a pretrained encoder.

    The encoder's last hidden state is averaged over the sequence dimension,
    layer-normalised and projected to a single value per sample.
    """

    def __init__(self, pretrained, hidden_size=None):
        """Wrap ``pretrained`` and build the head.

        Args:
            pretrained: Encoder module whose output exposes ``"last_hidden_state"``.
            hidden_size: Width of the encoder's hidden states. When omitted it is
                read from ``pretrained.config.hidden_size``; if that is not
                available it falls back to 4096 (the albert-xxlarge-v2 hidden size).
        """
        super().__init__()
        self.pretrained = pretrained
        if hidden_size is None:
            config = getattr(pretrained, 'config', None)
            hidden_size = getattr(config, 'hidden_size', 4096)
        self.norm = nn.LayerNorm(hidden_size)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return one prediction per sample, shaped ``(batch, 1)``."""
        emb = self.pretrained(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[
            "last_hidden_state"]
        # Plain mean over dim=1: every sequence position contributes, the
        # attention mask is not applied to the pooling itself.
        emb = torch.mean(emb, dim=1)
        output = self.norm(emb)
        output = self.linear(output)
        return output

In [10]:
def get_parameters(model,
                   pretrained_lr=2e-5,
                   custom_lr=1e-4):
    """Split the model's parameters into two optimizer groups with separate learning rates.

    Args:
        model: Module whose ``named_parameters()`` are partitioned.
        pretrained_lr: Learning rate for parameters whose name contains ``'pretrained'``.
        custom_lr: Learning rate for all remaining (task-specific) parameters.

    Returns:
        A two-element list of ``{'params': [...], 'lr': ...}`` dicts: the
        pretrained group first, the custom group second.
    """
    backbone_params, head_params = [], []
    for name, param in model.named_parameters():
        bucket = backbone_params if 'pretrained' in name else head_params
        bucket.append(param)

    return [
        {'params': backbone_params, 'lr': pretrained_lr},  # pretrained encoder layers
        {'params': head_params, 'lr': custom_lr},  # layers added for the downstream task
    ]

In [11]:
def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5):
    """
    Build a learning-rate schedule with linear warmup followed by cosine decay.

    The multiplier rises linearly from 0 to 1 over the warmup steps, then follows
    a cosine curve (clamped at 0 from below) for the remaining training steps.

    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer whose learning rate is scheduled.
        num_warmup_steps (`int`):
            Number of steps in the warmup phase.
        num_training_steps (`int`):
            Total number of training steps.
        num_cycles (`float`, *optional*, defaults to 0.5):
            Number of cosine waves; the default is a half wave, i.e. a single
            decrease from the maximum down to 0.
    Return:
        `torch.optim.lr_scheduler.LambdaLR` driven by the multiplier described above.
    """
    warmup_span = float(max(1, num_warmup_steps))
    decay_span = float(max(1, num_training_steps - num_warmup_steps))

    def lr_lambda(current_step):
        if current_step >= num_warmup_steps:
            # Cosine phase: progress runs from 0 to 1 across the post-warmup steps.
            progress = float(current_step - num_warmup_steps) / decay_span
            cosine = math.cos(math.pi * float(num_cycles) * 2.0 * progress)
            return max(0.0, 0.5 * (1.0 + cosine))
        # Warmup phase: linear ramp.
        return float(current_step) / warmup_span

    return LambdaLR(optimizer, lr_lambda)

### 随机step次进行一次模型验证

In [12]:
# Model validation
def evaluate(model, dataloader, device):
    """Return the mean of the per-batch RMSEs of ``model`` over ``dataloader``.

    The model is switched to eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards, so calling this in the
    middle of training does not leave the model stuck in eval mode.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments.
        dataloader: Iterable yielding
            ``(input_ids, attention_mask, token_type_ids, target)`` batches.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        Python float: the average over batches of ``sqrt(mse(pred, target))``.
        Note this is a mean of batch-level RMSEs, not the RMSE over all samples.
    """
    was_training = model.training
    model.eval()

    batch_rmse = []
    try:
        with torch.no_grad():
            for input_ids, attention_mask, token_type_ids, target in dataloader:
                # Move inputs to the target device; the target stays where it is
                # and the prediction is brought back to the CPU for the loss.
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                token_type_ids = token_type_ids.to(device)
                out = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
                loss = torch.sqrt(F.mse_loss(out.cpu().reshape(-1), target, reduction='mean'))
                batch_rmse.append(loss.item())
    finally:
        # Restore whichever mode the caller had (train or eval).
        model.train(was_training)

    return torch.mean(torch.tensor(batch_rmse)).item()  # mean RMSE across batches


# Validate once every random.randint(8, 20) training steps
def train_and_evaluate(model, dataloader_train, dataloader_val, criterion, optimizer, scheduler_lr, epochs, device):
    """Train ``model`` and keep the weights with the lowest validation RMSE.

    Within every epoch a validation pass is run at randomly spaced steps (the
    gap between two validations is drawn from ``random.randint(8, 20)``).
    Whenever the validation RMSE improves, a deep copy of the model's state
    dict is stored and a progress line is printed.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments.
        dataloader_train: Iterable of
            ``(input_ids, attention_mask, token_type_ids, target)`` training batches.
        dataloader_val: Validation data handed to ``evaluate``.
        criterion: Loss applied to the flattened predictions and the targets.
        optimizer: Optimizer stepped once per batch.
        scheduler_lr: Learning-rate scheduler stepped once per batch.
        epochs: Number of passes over ``dataloader_train``.
        device: Device the batches are moved to.

    Returns:
        ``(best_valid_rmse, best_model_state_dict)`` where the second item is a
        one-element list holding the best state dict (``[None]`` if no
        validation ever ran).
    """
    model.train()

    best_valid_rmse = 1e7  # lowest validation RMSE seen so far
    best_model_state_dict = [None]  # one-element list holding the best state dict
    for epoch in range(1, epochs + 1):
        val_idx = random.randint(8, 20)
        for idx, (input_ids, attention_mask, token_type_ids, target) in enumerate(dataloader_train):
            # Move the batch to the training device.
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            target = target.to(device)

            optimizer.zero_grad()
            out = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            out = out.reshape(-1)

            loss = criterion(out, target)  # loss of this step

            loss.backward()
            optimizer.step()
            scheduler_lr.step()

            if idx == val_idx:
                val_idx += random.randint(8, 20)
                val_rmse = evaluate(model, dataloader_val, device)
                # Validation puts the model in eval mode; switch back so the
                # remaining training steps run in train mode (dropout etc.).
                model.train()
                if val_rmse < best_valid_rmse:
                    best_valid_rmse = val_rmse
                    # The state dict must be deep-copied, otherwise later
                    # optimizer steps would keep mutating the stored tensors.
                    best_model_state_dict[0] = copy.deepcopy(model.state_dict())
                    print('| end of epoch {:5d} | step: {:5d} | valild rmse {:8.5f} |'.format(epoch, idx, val_rmse))

    return best_valid_rmse, best_model_state_dict

In [13]:
# Load the training data that already carries a `fold` column (stratified 5-fold split).
data_oof = pd.read_csv("../../data/df_train_oof.csv", index_col=0)
data_oof.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,fold
0,b51730f9c,,,"Alice looked at the jury-box, and saw that, in...",-0.432678,0.487498,0
1,4d403fd57,https://en.wikipedia.org/wiki/Artificial_intel...,CC BY-SA 3.0,Artificial intelligence (AI) is intelligence e...,-1.161746,0.458396,2
2,0f789ee41,,,A gruff squire on horseback with shiny top boo...,-2.367914,0.519369,4
3,87f96eb79,,,But that hadn't helped Washington.\nThe Americ...,-0.842596,0.466193,0
4,b9cca6661,,,The principal business of the people of this c...,-0.748452,0.433,4


In [14]:
fold_num_list = range(5)

best_valid_rmse_lst = []  # best validation RMSE reached in each cross-validation fold

# Train one model per fold: rows whose `fold` differs are used for training,
# rows of the current fold for validation.
for fold in fold_num_list:
    print('*' * 40 + str(fold) + '*' * 40)

    train_data = data_oof[data_oof['fold'] != fold]  # training split
    val_data = data_oof[data_oof['fold'] == fold]  # validation split
    dataloader_train = torch.utils.data.DataLoader(dataset=MyDataset(train_data), batch_size=8,
                                                   collate_fn=get_collate_fn(tokenizer), shuffle=True)
    dataloader_val = torch.utils.data.DataLoader(dataset=MyDataset(val_data),
                                                 # The test set has only 7 rows, so validate in batches of 7: the mean of per-batch RMSEs, (\sqrt{(n1 + n2)/2} + \sqrt{(n3 + n4)/2} + \sqrt{(n5 + n6)/2}) / 3, is not equal to the overall RMSE \sqrt{(n1 + n2 + n3 + n4 + n5 + n6)/6}
                                                 batch_size=7,
                                                 collate_fn=get_collate_fn(tokenizer),
                                                 shuffle=False, drop_last=True)

    # **************************************************************************************
    albert_xxlarge_v2 = MyModel(copy.deepcopy(pretrained))  # must deep-copy (pretrained is updated during training), otherwise labels leak across folds
    albert_xxlarge_v2 = albert_xxlarge_v2.to(device)

    loss_mse = nn.MSELoss()

    parameters = get_parameters(albert_xxlarge_v2, 2e-5, 1e-4)
    # Optimizer
    optimizer_adamw = optim.AdamW(parameters)
    # No warmup steps; total scheduler steps = batches per epoch * 5, which must
    # match the epoch count (5) passed to train_and_evaluate below.
    scheduler_lr = get_cosine_schedule_with_warmup(optimizer_adamw, 0, len(dataloader_train) * 5)
    # **************************************************************************************

    bvr, bmsd = train_and_evaluate(albert_xxlarge_v2, dataloader_train, dataloader_val, loss_mse, optimizer_adamw,
                                   scheduler_lr, 5, device)

    best_valid_rmse_lst.append(bvr)
    torch.save(bmsd, 'model' + str(fold) + '.bin')  # save the best model's state dict (wrapped in a one-element list)

****************************************0****************************************
| end of epoch     1 | step:    18 | valild rmse  0.80530 |
| end of epoch     1 | step:    27 | valild rmse  0.61621 |
| end of epoch     1 | step:    35 | valild rmse  0.60319 |
| end of epoch     1 | step:    54 | valild rmse  0.60014 |
| end of epoch     1 | step:    77 | valild rmse  0.58998 |
| end of epoch     1 | step:    98 | valild rmse  0.56372 |
| end of epoch     1 | step:   117 | valild rmse  0.53360 |
| end of epoch     1 | step:   144 | valild rmse  0.50319 |
| end of epoch     1 | step:   235 | valild rmse  0.50028 |
| end of epoch     2 | step:    72 | valild rmse  0.49407 |
| end of epoch     2 | step:   121 | valild rmse  0.49014 |
| end of epoch     2 | step:   132 | valild rmse  0.48111 |
| end of epoch     2 | step:   196 | valild rmse  0.47319 |
| end of epoch     2 | step:   204 | valild rmse  0.47230 |
| end of epoch     3 | step:    35 | valild rmse  0.47162 |
| end of epoch    

In [15]:
# Model prediction
def predict(model, dataloader, device):
    """Run ``model`` over ``dataloader`` in eval mode and return all outputs stacked on the CPU.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments.
        dataloader: Iterable of batches whose first three elements are
            ``input_ids``, ``attention_mask`` and ``token_type_ids``; any
            further elements (e.g. targets) are ignored.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        Tensor with the per-batch outputs concatenated along dim 0.
    """
    model.eval()

    batch_outputs = []
    with torch.no_grad():
        for batch in dataloader:
            # Only the first three entries are model inputs.
            ids, mask, type_ids, *_ = batch
            preds = model(input_ids=ids.to(device),
                          attention_mask=mask.to(device),
                          token_type_ids=type_ids.to(device))
            batch_outputs.append(preds.cpu())
    return torch.cat(batch_outputs, dim=0)

In [16]:
# Empty (0, 1) accumulators that the per-fold predictions are concatenated onto row-wise.
all_tr_predict = torch.tensor([]).reshape((0, 1))
all_te_predict = torch.tensor([]).reshape((0, 1))

for fold in fold_num_list:
    # Held-out rows of this fold, in their stored order (no shuffling).
    val_d = data_oof[data_oof['fold'] == fold]
    dl_val = torch.utils.data.DataLoader(dataset=MyDataset(val_d), batch_size=8, collate_fn=get_collate_fn(tokenizer))
    # **************************************************************************************
    albert_xxlarge_v2 = MyModel(copy.deepcopy(pretrained))  # must deep-copy (pretrained is updated during training), otherwise labels leak across folds
    best_state_dict = torch.load('model' + str(fold) + '.bin')
    albert_xxlarge_v2.load_state_dict(best_state_dict[0])  # load the state dict (the checkpoint is a one-element list)
    albert_xxlarge_v2 = albert_xxlarge_v2.to(device)
    # **************************************************************************************
    # NOTE(review): rows are appended fold by fold, so all_tr_predict follows fold
    # order rather than data_oof's row order — confirm downstream code re-aligns them.
    predict_result_tr = predict(albert_xxlarge_v2, dl_val, device)
    all_tr_predict = torch.cat([all_tr_predict, predict_result_tr])

    # NOTE(review): test predictions of every fold are stacked, not averaged, so the
    # result has len(test) * n_folds rows — confirm this is what the consumer expects.
    predict_result_te = predict(albert_xxlarge_v2, dataloader_test, device)
    all_te_predict = torch.cat([all_te_predict, predict_result_te])

In [17]:
# Sanity-check the shapes of the stacked validation-fold and test predictions.
all_tr_predict.shape, all_te_predict.shape

(torch.Size([2834, 1]), torch.Size([35, 1]))

In [18]:
# Persist the stacked validation-fold predictions as a NumPy array.
joblib.dump(all_tr_predict.numpy(), 'albert_xxlarge_v2_tr.pkl')

['albert_xxlarge_v2_tr.pkl']

In [19]:
# Persist the stacked test predictions as a NumPy array.
joblib.dump(all_te_predict.numpy(), 'albert_xxlarge_v2_te.pkl')

['albert_xxlarge_v2_te.pkl']