In [1]:
import torch
import time
import torch.nn as nn
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the Weibo sentiment corpus and report the overall / per-label row counts.
pd_all = pd.read_csv('weibo_senti_100k.csv')
print('评论数目（总体）：%d' % pd_all.shape[0])
print('评论数目（正向）：%d' % pd_all[pd_all.label==1].shape[0])
print('评论数目（负向）：%d' % pd_all[pd_all.label==0].shape[0])
# Shuffle with a fixed seed: later cells slice the first rows of this frame,
# so an unseeded shuffle would pick a different subset on every run.
pd_all = sklearn.utils.shuffle(pd_all, random_state=42)
pd_all.head()

评论数目（总体）：119988
评论数目（正向）：59993
评论数目（负向）：59995


Unnamed: 0,label,review
81001,0,人生第一罚。。。[泪]
74189,0,#深夜发吃#我饿啊，好饿！[泪][泪][泪]
92629,0,下次在去一下甲米更漂亮！//@酒红冰蓝:回复@sab_xiao:是！真不想回去了！可惜，只能...
71261,0,回复@胡小薇的天空:下次成功就是。//@胡小薇的天空: @梦想家艺行天下:忙了一下午，最后还...
73448,0,[心]安全回来!!!!!//@早安兔子: [心] //@马不苦shine:加油，小英雄们！安...


In [3]:
# Keep only the first 10,000 rows of the shuffled frame, then hold out 10%
# of them as the test split (seeded so the split itself is repeatable).
subset = pd_all.iloc[0:10000, :]
X = list(subset.review)
Y = list(subset.label)
x, x_test, y, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=42,
)

In [4]:
def preprocess_data(tokenizer, data, max_length=200):
    """Tokenize sentences into fixed-length tensors.

    Args:
        tokenizer: Object exposing ``encode_plus`` whose result supports
            ``.get('input_ids')``, ``.get('token_type_ids')`` and
            ``.get('attention_mask')``.
        data: Iterable of sentences to encode.
        max_length: Length every sequence is padded / truncated to.
            Defaults to 200, the value that used to be hard-coded.

    Returns:
        Tuple ``(input_ids, token_type_ids, attention_masks)`` of long
        tensors, each reshaped to ``(-1, max_length)``; an empty ``data``
        therefore yields tensors of shape ``(0, max_length)``.
    """
    input_ids = []
    token_type_ids = []
    attention_masks = []
    for sent in data:
        encoded_sent = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )
        input_ids.append(encoded_sent.get('input_ids'))
        token_type_ids.append(encoded_sent.get('token_type_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    def _to_tensor(rows):
        # Explicit dtype + reshape keep the result a 2-D long tensor even when
        # `rows` is empty (torch.tensor([]) alone would be 1-D float).
        return torch.tensor(rows, dtype=torch.long).reshape(-1, max_length)

    return _to_tensor(input_ids), _to_tensor(token_type_ids), _to_tensor(attention_masks)

In [5]:
# Pretrained tokenizer, loaded from the 'bert' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert')

# Hyperparameters
epochs = 2
lr = 5e-5
batch_size = 32

# Device: use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Encode the training split and wrap its labels in a tensor
train_input, train_token_type, train_mask = preprocess_data(tokenizer, x)
train_label = torch.tensor(y)

# Encode the held-out split the same way
test_input, test_token_type, test_mask = preprocess_data(tokenizer, x_test)
test_label = torch.tensor(y_test)

In [6]:
# Training batches are reshuffled every epoch.
train_data = TensorDataset(train_input, train_token_type, train_mask, train_label)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
# Evaluation gains nothing from shuffling; a fixed order keeps the batch
# composition (and therefore any per-batch metric) identical across runs.
test_data = TensorDataset(test_input, test_token_type, test_mask, test_label)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [7]:
# Classifier
class BertClassifier(nn.Module):
    """BERT encoder followed by a two-layer feed-forward classification head.

    Args:
        pretrained_path: Checkpoint name or path handed to
            ``BertModel.from_pretrained``. Defaults to ``'bert'``.
        hidden_size: Width of the head's hidden layer. Defaults to 128.
        num_classes: Number of output logits. Defaults to 2.
    """

    def __init__(self, pretrained_path='bert', hidden_size=128, num_classes=2):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_path)
        # Take the encoder width from the loaded config instead of assuming
        # 768, so checkpoints of other sizes work without editing the class.
        encoder_dim = self.bert.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Linear(encoder_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_classes),
        )

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Run the encoder and classify each sequence.

        Returns:
            The classification head's output (logits) for every sequence
            in the batch.
        """
        outputs = self.bert(input_ids=input_ids,
                            token_type_ids=token_type_ids,
                            attention_mask=attention_mask)
        # outputs[0] holds the per-token hidden states; keep only position 0
        # of each sequence (presumably the [CLS] token) as its summary vector.
        last_hidden_state_cls = outputs[0][:, 0, :]
        out = self.classifier(last_hidden_state_cls)
        return out

In [8]:
def train(model, train_dataloader, test_dataloader=None, epochs=2, evaluation=False):
    """Fine-tune ``model`` for ``epochs`` passes over ``train_dataloader``.

    Uses the notebook-level ``device``, ``loss_fn``, ``optimizer`` and
    ``scheduler`` objects, and calls ``evaluate`` after each epoch when
    ``evaluation`` is true.

    Args:
        model: Module called as ``model(input_ids, token_type_ids, attention_mask)``.
        train_dataloader: Sized iterable yielding
            ``(input_ids, token_type_ids, attention_mask, labels)`` tensor batches.
        test_dataloader: Loader passed to ``evaluate``; required when
            ``evaluation`` is true.
        epochs: Number of passes over the training data.
        evaluation: Whether to evaluate on ``test_dataloader`` after each epoch.

    Raises:
        ValueError: If ``evaluation`` is true but no ``test_dataloader`` is given.
    """
    # Fail fast: otherwise the problem only surfaces after a full training epoch.
    if evaluation and test_dataloader is None:
        raise ValueError("evaluation=True requires a test_dataloader")

    num_batches = len(train_dataloader)
    for epoch_i in range(epochs):
        t0_epoch, t0_batch = time.time(), time.time()
        total_loss, batch_loss, batch_counts = 0, 0, 0

        model.train()
        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            b_input_ids, b_token_type_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            model.zero_grad()
            logits = model(b_input_ids, b_token_type_ids, b_attn_mask)
            # Cast labels the same way evaluate() does.
            loss = loss_fn(logits, b_labels.long())
            step_loss = loss.item()
            batch_loss += step_loss
            total_loss += step_loss
            loss.backward()
            # Clip the global gradient norm to 1.0 before the optimizer update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            # Report the running loss every 10 steps and on the final batch.
            if (step % 10 == 0 and step != 0) or (step == num_batches - 1):
                time_elapsed = time.time() - t0_batch
                
                print(f"epoch ={epoch_i + 1:^4} train_loss ={batch_loss / batch_counts:^8.2f} time ={time_elapsed:^6.2f}")

                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()
        # An empty loader has no loss to average; report NaN instead of dividing by zero.
        avg_train_loss = total_loss / num_batches if num_batches else float("nan")
        if evaluation:  
            test_loss, test_accuracy = evaluate(model, test_dataloader)
            time_elapsed = time.time() - t0_epoch

            print(f"epoch = {epoch_i + 1:^4} avg_train_loss ={avg_train_loss:^8.2f} test loss ={test_loss:^8.2f} test_accuracy={test_accuracy:^8.2f}% time ={time_elapsed:^6.2f}")
            print("-" * 80)
        print("\n")


In [9]:
def evaluate(model, test_dataloader):
    """Compute the average loss and accuracy of ``model`` over a loader.

    Uses the notebook-level ``device`` and ``loss_fn``. Metrics are
    aggregated per sample rather than per batch, so a smaller final batch
    does not carry the same weight as a full one.

    Args:
        model: Module called as ``model(input_ids, token_type_ids, attention_mask)``.
        test_dataloader: Iterable yielding
            ``(input_ids, token_type_ids, attention_mask, labels)`` tensor batches.

    Returns:
        Tuple ``(val_loss, val_accuracy)`` where the accuracy is a percentage.
        Both are NaN when the loader yields no samples.
    """
    model.eval()
    loss_sum, correct, seen = 0.0, 0, 0
    for batch in test_dataloader:
        b_input_ids, b_token_type_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            logits = model(b_input_ids, b_token_type_ids, b_attn_mask)
            loss = loss_fn(logits, b_labels.long())
        batch_size = b_labels.size(0)
        # Assumes loss_fn returns the batch mean (the nn.CrossEntropyLoss
        # default); scaling by the batch size recovers the per-sample sum.
        loss_sum += loss.item() * batch_size
        preds = torch.argmax(logits, dim=1).flatten()
        correct += (preds == b_labels).sum().item()
        seen += batch_size

    if seen == 0:
        return float("nan"), float("nan")

    val_loss = loss_sum / seen
    val_accuracy = correct / seen * 100
    return val_loss, val_accuracy

In [None]:
bert_classifier = BertClassifier()
print("Total paramerters in networks: {}  ".format(sum(p.numel() for p in bert_classifier.parameters())))
bert_classifier.to(device)
# Use PyTorch's AdamW: the transformers implementation is deprecated.
# weight_decay=0.0 matches the default of the transformers version
# (torch's own default is 0.01).
optimizer = torch.optim.AdamW(bert_classifier.parameters(),
                              lr=lr,
                              eps=1e-8,
                              weight_decay=0.0)
# The linear schedule decays over every optimizer step of the whole run,
# so it must be sized with the same epoch count that train() receives.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,  # Default value
                                            num_training_steps=total_steps)
loss_fn = nn.CrossEntropyLoss() 
# Train
train(bert_classifier, train_dataloader, test_dataloader, epochs=epochs, evaluation=True)

Some weights of the model checkpoint at bert were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Total paramerters in networks: 102366338  
epoch = 1   train_loss =  0.47   time =302.81
