In [66]:
import warnings
from datetime import datetime
import time 
import torch
import os
from transformers import BertModel,BertConfig,BertModel,BertTokenizerFast,get_cosine_schedule_with_warmup,BertForMaskedLM
import pandas  as pd
import torch 
import torch.nn as nn 
import torch.optim as optim 
import torch.utils.data as Data
from torch.utils.tensorboard import SummaryWriter 
 
# hyperparameters 
EPOCH=200
RANDOM_SEED=2022 
TRAIN_BATCH_SIZE=32  #小批训练， 批大小增大时需要提升学习率  https://zhuanlan.zhihu.com/p/413656738
TEST_BATCH_SIZE=96   #大批测试
EVAL_PERIOD=20
MODEL_NAME="bert-base-uncased"  # bert-base-chinese
DATA_PATH="dataset/twitter_sentiment/"
MASK_POS=3  # "it was [mask]" 中 [mask] 位置
train_file="twitter-2013train-A.tsv"
dev_file="twitter-2013dev-A.tsv"
test_file="twitter-2013test-A.tsv"
 
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [67]:
writer = SummaryWriter('./tb_log')

pd.options.display.max_columns = None
pd.options.display.max_rows = None

prefix = 'It was [mask]. '
 
class Bert_Model(nn.Module):
    def __init__(self,  bert_path ,config_file ):
        super(Bert_Model, self).__init__()
        self.bert = BertForMaskedLM.from_pretrained(bert_path,config=config_file)  # 加载预训练模型权重
 
 
    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids, attention_mask, token_type_ids) #masked LM 输出的是 mask的值 对应的ids的概率 ，输出 会是词表大小，里面是概率 
        logit = outputs[0]  # 池化后的输出 [bs, config.hidden_size]

        return logit 

In [68]:
#构建数据集
class MyDataSet(Data.Dataset):
    def __init__(self, sen , mask , typ ,label ):
        super(MyDataSet, self).__init__()
        self.sen = torch.tensor(sen,dtype=torch.long)
        self.mask = torch.tensor(mask,dtype=torch.long)
        self.typ =torch.tensor( typ,dtype=torch.long)
        self.label = torch.tensor(label,dtype=torch.long)
 
    def __len__(self):
        return self.sen.shape[0]
 
    def __getitem__(self, idx):
        return self.sen[idx], self.mask[idx],self.typ[idx],self.label[idx]
#load  data
   
def load_data(tsvpath):
    data=pd.read_csv(tsvpath,sep="\t",header=None,names=["sn","polarity","text"])
    data=data[data["polarity"] != "neutral"]
    yy=data["polarity"].replace({"negative":0,"positive":1,"neutral":2})
    return data.values[:,2:3].tolist(),yy.tolist() #data.values[:,1:2].tolist()
 
tokenizer=BertTokenizerFast.from_pretrained(MODEL_NAME)
config=BertConfig.from_pretrained(MODEL_NAME)
model=Bert_Model(bert_path=MODEL_NAME,config_file=config).to(device)
 
pos_id=tokenizer.convert_tokens_to_ids("good") #9005
neg_id=tokenizer.convert_tokens_to_ids("bad")  #12139

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [70]:
# get the data and label 
def ProcessData(filepath):
    x_train,y_train=load_data(DATA_PATH+os.sep+filepath)
    #x_train,x_test,y_train,y_test=train_test_split(StrongData,StrongLabel,test_size=0.3, random_state=42)
 
    Inputid=[]
    Labelid=[]
    typeid=[]
    attenmask=[]
 
    for i in range(len(x_train)):
 
        text_ = prefix+x_train[i][0]
 
        encode_dict = tokenizer.encode_plus(text_,max_length=60,padding="max_length",truncation=True)
        input_ids=encode_dict["input_ids"]
        type_ids=encode_dict["token_type_ids"]
        atten_mask=encode_dict["attention_mask"]
        labelid,inputid= input_ids[:],input_ids[:]
        if y_train[i] == 0:
            labelid[MASK_POS] = neg_id
            labelid[:MASK_POS] = [-1]* len(labelid[:MASK_POS]) 
            labelid[MASK_POS+1:] = [-1] * len(labelid[MASK_POS+1:])
            inputid[MASK_POS] = tokenizer.mask_token_id
        else:
            labelid[MASK_POS] = pos_id
            labelid[:MASK_POS] = [-1]* len(labelid[:MASK_POS]) 
            labelid[MASK_POS+1:] = [-1] * len(labelid[MASK_POS+1:])
            inputid[MASK_POS] = tokenizer.mask_token_id
 
        Labelid.append(labelid)
        Inputid.append(inputid)
        typeid.append(type_ids)
        attenmask.append(atten_mask)
 
    return Inputid,Labelid,typeid,attenmask
 

Inputid_train,Labelid_train,typeids_train,inputnmask_train=ProcessData(train_file)
Inputid_dev,Labelid_dev,typeids_dev,inputnmask_dev=ProcessData(dev_file)
Inputid_test,Labelid_test,typeids_test,inputnmask_test=ProcessData(test_file)

train_dataset = Data.DataLoader(MyDataSet(Inputid_train,  inputnmask_train , typeids_train , Labelid_train), TRAIN_BATCH_SIZE, True)
valid_dataset = Data.DataLoader(MyDataSet(Inputid_dev,  inputnmask_dev , typeids_dev , Labelid_dev), TRAIN_BATCH_SIZE, True)
test_dataset = Data.DataLoader(MyDataSet(Inputid_test,  inputnmask_test , typeids_test , Labelid_test), TEST_BATCH_SIZE, True)
 
train_data_num=len(Inputid_train)
test_data_num=len(Inputid_test)
#print("hello!")

In [71]:
len(train_dataset), len(train_dataset.dataset)

(160, 5098)

In [72]:
optimizer = AdamW(model.parameters(),lr=2e-5,weight_decay=1e-4)  #使用Adam优化器
loss_func = nn.CrossEntropyLoss(ignore_index=-1)
EPOCH = 200
# schedule = get_cosine_schedule_with_warmup(optimizer,num_warmup_steps=len(train_dataset),num_training_steps=EPOCH*len(train_dataset))
print("正在训练中。。。")
totaltime=0
for epoch in range(10):
 
    starttime_train=datetime.now()
 
    start =time.time()
    correct=0
    train_loss_sum=0
    model.train()
 
    for idx,(ids,att_mask,type,y) in enumerate(train_dataset):
        ids,att_mask,type,y = ids.to(device),att_mask.to(device),type.to(device),y.to(device)
        out_train = model(ids,att_mask,type)
       #print(out_train.view(-1, tokenizer.vocab_size).shape, y.view(-1).shape)
        loss = loss_func(out_train.view(-1, tokenizer.vocab_size),y.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # schedule.step()
        train_loss_sum += loss.item()
       
        if( idx+1)% EVAL_PERIOD == 0:
            print("Epoch {:04d} | Step {:06d}/{:06d} | Loss {:.4f} | Time {:.0f}".format(
                epoch + 1, idx + 1, len(train_dataset), train_loss_sum / (idx + 1), time.time() - start))
            writer.add_scalar('loss/train_loss', train_loss_sum / (idx + 1), epoch)
 
        truelabel=y[:,MASK_POS]
        out_train_mask=out_train[:,MASK_POS,:]
        predicted=torch.max(out_train_mask,1)[1]
        correct += (predicted == truelabel).sum()
        correct = float(correct)
    
    acc =float(correct /train_data_num)
 
    eval_loss_sum=0.0
    model.eval()
    correct_test=0
    with torch.no_grad():
        for ids, att, tpe, y in test_dataset:
            ids, att, tpe, y = ids.to(device), att.to(device), tpe.to(device), y.to(device)
            out_test = model(ids , att , tpe)
            loss_eval = loss_func(out_test.view(-1, tokenizer.vocab_size), y.view(-1))
            eval_loss_sum += loss_eval.item()
            ttruelabel = y[:, MASK_POS]
            tout_train_mask = out_test[:, MASK_POS, :]
            predicted_test = torch.max(tout_train_mask.data, 1)[1]
            correct_test += (predicted_test == ttruelabel).sum()
            correct_test = float(correct_test)
    acc_test = float(correct_test / test_data_num)
 
    if epoch % 1 == 0:
        out = ("epoch {}, train_loss {},  train_acc {} , eval_loss {} ,acc_test {}"
               .format(epoch + 1, train_loss_sum / (len(train_dataset)), acc, eval_loss_sum / (len(test_dataset)),
                acc_test))
        writer.add_scalar('loss/test_loss', train_loss_sum / (idx + 1), epoch)
        print(out)
    end=time.time()
 
    print("epoch {} duration:".format(epoch+1),end-start)
    totaltime+=end-start
 
print("total training time: ",totaltime)

正在训练中。。。
Epoch 0001 | Step 000020/000160 | Loss 1.2969 | Time 4
Epoch 0001 | Step 000040/000160 | Loss 0.8165 | Time 9
Epoch 0001 | Step 000060/000160 | Loss 0.6466 | Time 13
Epoch 0001 | Step 000080/000160 | Loss 0.5754 | Time 17
Epoch 0001 | Step 000100/000160 | Loss 0.5138 | Time 22
Epoch 0001 | Step 000120/000160 | Loss 0.4688 | Time 26
Epoch 0001 | Step 000140/000160 | Loss 0.4432 | Time 30
Epoch 0001 | Step 000160/000160 | Loss 0.4131 | Time 35
epoch 1, train_loss 0.4131312052253634,  train_acc 0.8587681443703413 , eval_loss 0.2061903913590041 ,acc_test 0.9228121927236972
epoch 1 duration: 39.09117555618286
Epoch 0002 | Step 000020/000160 | Loss 0.1164 | Time 4
Epoch 0002 | Step 000040/000160 | Loss 0.1335 | Time 9
Epoch 0002 | Step 000060/000160 | Loss 0.1306 | Time 13
Epoch 0002 | Step 000080/000160 | Loss 0.1292 | Time 17
Epoch 0002 | Step 000100/000160 | Loss 0.1295 | Time 22
Epoch 0002 | Step 000120/000160 | Loss 0.1349 | Time 26
Epoch 0002 | Step 000140/000160 | Loss 0.1368

In [56]:
train_df = pd.read_csv(DATA_PATH + train_file, sep="\t",header=None,names=["sn","polarity","text"])
train_df=train_df[train_df["polarity"] != "neutral"]
train_df['polarity'] = train_df['polarity'].map({'negative': 0, 'positive': 1, 'neutral': 1})

val_df = pd.read_csv(DATA_PATH + dev_file, sep="\t",header=None,names=["sn","polarity","text"])
val_df=val_df[val_df["polarity"] != "neutral"]
val_df['polarity'] = val_df['polarity'].map({'negative': 0, 'positive': 1, 'neutral': 1})

test_df = pd.read_csv(DATA_PATH + test_file, sep="\t",header=None,names=["sn","polarity","text"])
test_df=test_df[test_df["polarity"] != "neutral"]
test_df['polarity'] = test_df['polarity'].map({'negative': 0, 'positive': 1, 'neutral': 1})

In [57]:
train_df.shape

(5098, 3)

In [58]:
train_encoding = tokenizer(train_df['text'].tolist(), truncation=True, padding=True, max_length=55)
test_encoding = tokenizer(test_df['text'].tolist(), truncation=True, padding=True, max_length=55)

In [59]:
# 数据集读取
class NewsDataset(Data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    # 读取单个样本
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item
    
    def __len__(self):
        return len(self.labels)

train_dataset = NewsDataset(train_encoding, train_df['polarity'].tolist())
test_dataset = NewsDataset(test_encoding, test_df['polarity'].tolist())

In [60]:
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 单个读取到批量读取
train_loader = Data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = Data.DataLoader(test_dataset, batch_size=32, shuffle=True)

optim = AdamW(model.parameters(),lr=2e-5,weight_decay=1e-4)  #使用Adam优化器


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [61]:
import numpy as np
# 精度计算
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# 训练函数
def train():
    model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    for batch in train_loader:
        # 正向传播
        optim.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        total_train_loss += loss.item()
        
        # 反向梯度信息
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # 参数更新
        optim.step()

        iter_num += 1
        if(iter_num % 100==0):
            print("epoth: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, iter_num, loss.item(), iter_num/total_iter*100))
        
    print("Epoch: %d, Average training loss: %.4f"%(epoch, total_train_loss/len(train_loader)))
    
def validation():
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    for batch in test_dataloader:
        with torch.no_grad():
            # 正常传播
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        
        loss = outputs[0]
        logits = outputs[1]

        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        
    avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average testing loss: %.4f"%(total_eval_loss/len(test_dataloader)))
    print("-------------------------------")
    

for epoch in range(4):
    print("------------Epoch: %d ----------------" % epoch)
    train()
    validation()

------------Epoch: 0 ----------------
epoth: 0, iter_num: 100, loss: 0.3529, 62.50%
Epoch: 0, Average training loss: 0.3573
Accuracy: 0.9156
Average testing loss: 0.2111
-------------------------------
------------Epoch: 1 ----------------
epoth: 1, iter_num: 100, loss: 0.1082, 62.50%
Epoch: 1, Average training loss: 0.1582
Accuracy: 0.8989
Average testing loss: 0.2469
-------------------------------
------------Epoch: 2 ----------------
epoth: 2, iter_num: 100, loss: 0.1826, 62.50%
Epoch: 2, Average training loss: 0.0745
Accuracy: 0.9192
Average testing loss: 0.2933
-------------------------------
------------Epoch: 3 ----------------
epoth: 3, iter_num: 100, loss: 0.0127, 62.50%
Epoch: 3, Average training loss: 0.0375
Accuracy: 0.9264
Average testing loss: 0.3523
-------------------------------
