In [1]:
import sys
sys.path.append('../')
from util import load_data
data_path='../data/sst2_shuffled.tsv.1'
train_data,test_data,categories=load_data.load_sentence_polarity(data_path=data_path)

In [2]:
# 这里新的模型transformer
# 先确定pre train模型的名称，所确定的tokenize
# 加载预训练模型，因为这里是英文数据集，需要用在英文上的预训练模型：bert-base-uncased
# uncased指该预训练模型对应的词表不区分字母的大小写
# 详情可了解：https://huggingface.co/bert-base-uncased
pretrained_model_name = 'bert-base-uncased'
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel
from tqdm import tqdm
import os
import time
from transformers import BertTokenizer
from transformers import logging

In [3]:
# 编写好制作数据集的方式，先定义dataset、后面定义dataloader
class BertDataset(Dataset):
    def __init__(self,dataset):
        self.dataset=dataset
        self.data_size=len(dataset)
    
    def __len__(self):
        return self.data_size
    
    def __getitem__(self, index):
        return self.dataset[index]

def coffate_fn(examples):
    inputs,targets=[],[]
    for polar,sent in examples:
        inputs.append(sent)
        targets.append(int(polar))
    # 这里的tokenizer是后面提供好pretrain model之后的API
    inputs = tokenizer(inputs,
                       padding=True,
                       truncation=True,
                       return_tensors="pt",
                       max_length=512)
    targets = torch.tensor(targets)
    return inputs,targets

pretrained_model_name = 'bert-base-uncased'
# 加载预训练模型对应的tokenizer
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
batch_size=32
train_dataset=BertDataset(train_data)
test_dataset=BertDataset(test_data)
train_dataloader=DataLoader(train_dataset,batch_size=batch_size,collate_fn=coffate_fn,shuffle=True)
test_dataloader=DataLoader(test_dataset,batch_size=batch_size,collate_fn=coffate_fn,shuffle=True)


In [4]:
for batch in train_dataloader:
    print(batch)
    break

({'input_ids': tensor([[  101,  1996, 16655,  ...,     0,     0,     0],
        [  101,  1037,  4276,  ...,     0,     0,     0],
        [  101, 23734, 21566,  ...,     0,     0,     0],
        ...,
        [  101, 13463,  1996,  ...,     0,     0,     0],
        [  101,  1037,  2422,  ...,     0,     0,     0],
        [  101, 17958,  2007,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}, tensor([0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
        0, 1, 0, 1, 0, 1, 0, 0]))


In [5]:
# 之后是定义模型的名称
class BertSST2Model(nn.Module):
    def __init__(self,class_size,pretrained_model_name=pretrained_model_name) -> None:
        super(BertSST2Model,self).__init__()
        # 记载hugging face的bertmodel
        # bertmodel的最终输出维度默认为768
        # 对其进行调整输入的维度调整
        self.bert=BertModel.from_pretrained(pretrained_model_name,return_dict=True)
        # 修改最后一个线性层
        self.classifier=nn.Linear(768,class_size)
    
    def forward(self,inputs):
        """
        前向推理的过程
        inputs 处理好的数据 shape=batchsize*max_len

        """
        input_ids,input_tyi,input_attn_mask=inputs['input_ids'],inputs['token_type_ids'],inputs['attention_mask']
        # TODO 如何实现
        output=self.bert(input_ids,input_tyi,input_attn_mask)
        categories_numberic=self.classifier(output.pooler_output)
        return categories_numberic

def save_pretrained(model, path):
    # 保存模型，先利用os模块创建文件夹，后利用torch.save()写入模型文件
    os.makedirs(path, exist_ok=True)
    torch.save(model, os.path.join(path, 'model.pth'))

In [6]:
# 定义超参数
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size=64
num_epoch=200
check_step=20
learning_rate=1e-5
model=BertSST2Model(class_size=2)
model.to(device)
optimizer=Adam(model.parameters(),learning_rate)
celoss=nn.CrossEntropyLoss()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# 训练过程
# 记录当前训练时间，用以记录日志和存储
timestamp = time.strftime("%m_%d_%H_%M", time.localtime())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.train()
for epoch in range(1,num_epoch+1):
    total_loss=0
    for batch in tqdm(train_dataloader,desc=f'Training epoch {epoch}'):
        inputs,targets=[x.to(device) for x in batch]
        optimizer.zero_grad()
        bert_output=model(inputs)
        loss=celoss(bert_output,targets)
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
    #测试过程
    # acc统计模型在测试数据上分类结果中的正确个数
    acc = 0
    for batch in tqdm(test_dataloader, desc=f"Testing"):
        inputs, targets = [x.to(device) for x in batch]
        # with torch.no_grad(): 为固定写法，
        # 这个代码块中的全部有关tensor的操作都不产生梯度。目的是节省时间和空间，不加也没事
        with torch.no_grad():
            bert_output = model(inputs)
            """
            .argmax()用于取出一个tensor向量中的最大值对应的下表序号，dim指定了维度
            假设 bert_output为3*2的tensor：
            tensor
            [
                [3.2,1.1],
                [0.4,0.6],
                [-0.1,0.2]
            ]
            则 bert_output.argmax(dim=1) 的结果为：tensor[0,1,1]
            """
            acc += (bert_output.argmax(dim=1) == targets).sum().item()
    #输出在测试集上的准确率
    print(f"Acc: {acc / len(test_dataloader):.2f}")
    if epoch % check_step == 0:
        # 保存模型
        checkpoints_dirname = "bert_sst2_" + timestamp
        os.makedirs(checkpoints_dirname, exist_ok=True)
        save_pretrained(model,
                        checkpoints_dirname + '/checkpoints-{}/'.format(epoch))

Training epoch 1: 100%|██████████| 250/250 [00:37<00:00,  6.68it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 18.37it/s]


Acc: 19.03


Training epoch 2: 100%|██████████| 250/250 [00:37<00:00,  6.67it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.90it/s]


Acc: 21.79


Training epoch 3: 100%|██████████| 250/250 [00:37<00:00,  6.61it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.77it/s]


Acc: 21.78


Training epoch 4: 100%|██████████| 250/250 [00:37<00:00,  6.60it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.90it/s]


Acc: 22.25


Training epoch 5: 100%|██████████| 250/250 [00:38<00:00,  6.54it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.62it/s]


Acc: 22.49


Training epoch 6: 100%|██████████| 250/250 [00:38<00:00,  6.54it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.48it/s]


Acc: 22.13


Training epoch 7: 100%|██████████| 250/250 [00:38<00:00,  6.47it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.55it/s]


Acc: 21.97


Training epoch 8: 100%|██████████| 250/250 [00:38<00:00,  6.48it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.61it/s]


Acc: 22.65


Training epoch 9: 100%|██████████| 250/250 [00:38<00:00,  6.49it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.74it/s]


Acc: 22.86


Training epoch 10: 100%|██████████| 250/250 [00:38<00:00,  6.49it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.42it/s]


Acc: 22.67


Training epoch 11: 100%|██████████| 250/250 [00:38<00:00,  6.51it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.33it/s]


Acc: 22.43


Training epoch 12: 100%|██████████| 250/250 [00:38<00:00,  6.50it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.71it/s]


Acc: 22.70


Training epoch 13: 100%|██████████| 250/250 [00:38<00:00,  6.50it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.50it/s]


Acc: 22.27


Training epoch 14: 100%|██████████| 250/250 [00:38<00:00,  6.48it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.78it/s]


Acc: 23.05


Training epoch 15: 100%|██████████| 250/250 [00:38<00:00,  6.52it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.64it/s]


Acc: 21.14


Training epoch 16: 100%|██████████| 250/250 [00:38<00:00,  6.51it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.82it/s]


Acc: 23.00


Training epoch 17: 100%|██████████| 250/250 [00:38<00:00,  6.54it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.54it/s]


Acc: 23.24


Training epoch 18: 100%|██████████| 250/250 [00:38<00:00,  6.52it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.57it/s]


Acc: 21.21


Training epoch 19: 100%|██████████| 250/250 [00:38<00:00,  6.54it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.50it/s]


Acc: 22.67


Training epoch 20: 100%|██████████| 250/250 [00:38<00:00,  6.49it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.72it/s]


Acc: 22.51


Training epoch 21: 100%|██████████| 250/250 [00:38<00:00,  6.52it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.73it/s]


Acc: 23.11


Training epoch 22: 100%|██████████| 250/250 [00:38<00:00,  6.48it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.55it/s]


Acc: 23.35


Training epoch 23: 100%|██████████| 250/250 [00:38<00:00,  6.52it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.35it/s]


Acc: 21.67


Training epoch 24: 100%|██████████| 250/250 [00:38<00:00,  6.52it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.84it/s]


Acc: 21.25


Training epoch 25: 100%|██████████| 250/250 [00:38<00:00,  6.53it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.28it/s]


Acc: 22.70


Training epoch 26: 100%|██████████| 250/250 [00:38<00:00,  6.51it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.80it/s]


Acc: 22.79


Training epoch 27: 100%|██████████| 250/250 [00:38<00:00,  6.52it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.72it/s]


Acc: 23.11


Training epoch 28: 100%|██████████| 250/250 [00:38<00:00,  6.53it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.65it/s]


Acc: 23.38


Training epoch 29: 100%|██████████| 250/250 [00:38<00:00,  6.58it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.77it/s]


Acc: 22.14


Training epoch 30: 100%|██████████| 250/250 [00:38<00:00,  6.53it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.52it/s]


Acc: 22.70


Training epoch 31: 100%|██████████| 250/250 [00:38<00:00,  6.51it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.60it/s]


Acc: 22.52


Training epoch 32: 100%|██████████| 250/250 [00:38<00:00,  6.51it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.59it/s]


Acc: 22.40


Training epoch 33: 100%|██████████| 250/250 [00:38<00:00,  6.47it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.72it/s]


Acc: 22.97


Training epoch 34: 100%|██████████| 250/250 [00:38<00:00,  6.48it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.43it/s]


Acc: 22.57


Training epoch 35: 100%|██████████| 250/250 [00:38<00:00,  6.53it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 18.01it/s]


Acc: 22.78


Training epoch 36: 100%|██████████| 250/250 [00:38<00:00,  6.52it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.65it/s]


Acc: 22.59


Training epoch 37: 100%|██████████| 250/250 [00:38<00:00,  6.54it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.57it/s]


Acc: 22.76


Training epoch 38: 100%|██████████| 250/250 [00:38<00:00,  6.52it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.60it/s]


Acc: 22.63


Training epoch 39: 100%|██████████| 250/250 [00:38<00:00,  6.52it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.85it/s]


Acc: 23.19


Training epoch 40: 100%|██████████| 250/250 [00:38<00:00,  6.51it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.82it/s]


Acc: 22.65


Training epoch 41: 100%|██████████| 250/250 [00:38<00:00,  6.55it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.73it/s]


Acc: 23.21


Training epoch 42: 100%|██████████| 250/250 [00:38<00:00,  6.48it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.41it/s]


Acc: 22.78


Training epoch 43: 100%|██████████| 250/250 [00:38<00:00,  6.48it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.73it/s]


Acc: 23.08


Training epoch 44: 100%|██████████| 250/250 [00:38<00:00,  6.51it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.68it/s]


Acc: 23.00


Training epoch 45: 100%|██████████| 250/250 [00:38<00:00,  6.51it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.68it/s]


Acc: 23.10


Training epoch 46: 100%|██████████| 250/250 [00:38<00:00,  6.49it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.58it/s]


Acc: 20.63


Training epoch 47: 100%|██████████| 250/250 [00:38<00:00,  6.47it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.40it/s]


Acc: 23.19


Training epoch 48: 100%|██████████| 250/250 [00:38<00:00,  6.48it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.60it/s]


Acc: 22.43


Training epoch 49: 100%|██████████| 250/250 [00:38<00:00,  6.53it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.51it/s]


Acc: 23.25


Training epoch 50: 100%|██████████| 250/250 [00:38<00:00,  6.53it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.65it/s]


Acc: 21.98


Training epoch 51: 100%|██████████| 250/250 [00:38<00:00,  6.48it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.85it/s]


Acc: 23.00


Training epoch 52: 100%|██████████| 250/250 [00:38<00:00,  6.54it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.87it/s]


Acc: 22.98


Training epoch 53: 100%|██████████| 250/250 [00:38<00:00,  6.54it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.93it/s]


Acc: 22.67


Training epoch 54: 100%|██████████| 250/250 [00:38<00:00,  6.53it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.48it/s]


Acc: 22.86


Training epoch 55: 100%|██████████| 250/250 [00:38<00:00,  6.51it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.92it/s]


Acc: 23.25


Training epoch 56: 100%|██████████| 250/250 [00:38<00:00,  6.51it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.83it/s]


Acc: 23.06


Training epoch 57: 100%|██████████| 250/250 [00:38<00:00,  6.48it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.37it/s]


Acc: 22.89


Training epoch 58: 100%|██████████| 250/250 [00:38<00:00,  6.51it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.62it/s]


Acc: 22.94


Training epoch 59: 100%|██████████| 250/250 [00:38<00:00,  6.52it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.68it/s]


Acc: 22.63


Training epoch 60: 100%|██████████| 250/250 [00:38<00:00,  6.51it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.61it/s]


Acc: 22.94


Training epoch 61: 100%|██████████| 250/250 [00:39<00:00,  6.34it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.64it/s]


Acc: 22.68


Training epoch 62: 100%|██████████| 250/250 [00:38<00:00,  6.42it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.43it/s]


Acc: 23.16


Training epoch 63: 100%|██████████| 250/250 [00:38<00:00,  6.46it/s]
Testing: 100%|██████████| 63/63 [00:03<00:00, 17.29it/s]


Acc: 23.44


Training epoch 64:  40%|████      | 101/250 [00:15<00:22,  6.62it/s]

: 

: 

In [None]:
model  = torch.load('../src/bert_sst2_04_05_20_50/checkpoints-200/model.pth')
model.eval()
test='why you are so nerd'
test=tokenizer(test,padding=True,
                    truncation=True,
                    return_tensors="pt",
                    max_length=512)
test.to(device)
if model(test).argmax(-1).item()==1:
    print('This is a negative sentence')
else:
    print('This is a positive sentence')

This is a negative sentence
