<a href="https://colab.research.google.com/github/cshmzin/nlp-code/blob/main/Bert_%E7%9C%9F%E5%81%87%E6%96%B0%E9%97%BB%E5%88%86%E7%B1%BB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd

# 文件地址：https://www.kaggle.com/c/fake-news-pair-classification-challenge/data
# 模型形式：BERT + Linear Classifier
df_train = pd.read_csv("train.csv")

#除空
empty_title = ((df_train['title2_zh'].isnull()) \
              | (df_train['title1_zh'].isnull()) \
              | (df_train['title2_zh'] == '') \
              | (df_train['title2_zh'] == '0'))
df_train = df_train[~empty_title]

# 去除过长的样本
MAX_LENGTH = 30
df_train = df_train[~(df_train.title1_zh.apply(lambda x : len(x)) > MAX_LENGTH)]
df_train = df_train[~(df_train.title2_zh.apply(lambda x : len(x)) > MAX_LENGTH)]

# 只用 1% 的训练集，看看bert的强大
SAMPLE_FRAC = 0.01
df_train = df_train.sample(frac=SAMPLE_FRAC, random_state=6666)

# 去除没用的列
df_train = df_train.reset_index()
df_train = df_train.loc[:, ['title1_zh', 'title2_zh', 'label']]
df_train.columns = ['text_a', 'text_b', 'label']

# 将结果另存成 tsv 供 PyTorch 使用
df_train.to_csv("train.tsv", sep="\t", index=False)

print("训练样本数量：", len(df_train))
df_train.head()

训练样本数量： 2657


Unnamed: 0,text_a,text_b,label
0,晚上吃苹果就成毒苹果了吗,早上吃金苹果，晚上吃毒苹果，苹果真不能晚上吃吗？,agreed
1,吃酱油会变黑？伤口会留疤？,经常吃酱油会变黑，这件事终于有答案了！,agreed
2,加湿器加自来水堪比雾霾,华为金立OPPO：我们手机明年要涨价！网友：有小米就够了,unrelated
3,有谁希望丧尸病毒爆发，外星人入侵，世界巨,丧尸病毒爆发之后 逃上一个小岛是否是一个最好的方案,unrelated
4,山药好吃又营养，这么做还能补充维C、降血压,常吃这3种食物，把血液垃圾清理的一干二净，还能降低血压,unrelated


In [6]:
df_train.label.value_counts() / len(df_train)

unrelated    0.675574
agreed       0.292811
disagreed    0.031615
Name: label, dtype: float64

In [7]:
df_test = pd.read_csv("test.csv")
df_test = df_test.loc[:, ["title1_zh", "title2_zh", "id"]]
df_test.columns = ["text_a", "text_b", "Id"]
df_test.to_csv("test.tsv", sep="\t", index=False)

print("预测样本数：", len(df_test))
df_test.head()

预测样本数： 80126


Unnamed: 0,text_a,text_b,Id
0,萨拉赫人气爆棚!埃及总统大选未参选获百万选票 现任总统压力山大,辟谣！里昂官方否认费基尔加盟利物浦，难道是价格没谈拢？,321187
1,萨达姆被捕后告诫美国的一句话，发人深思,10大最让美国人相信的荒诞谣言，如蜥蜴人掌控着美国,321190
2,萨达姆此项计划没有此国破坏的话，美国还会对伊拉克发动战争吗,萨达姆被捕后告诫美国的一句话，发人深思,321189
3,萨达姆被捕后告诫美国的一句话，发人深思,被绞刑处死的萨达姆是替身？他的此男人举动击破替身谣言！,321193
4,萨达姆被捕后告诫美国的一句话，发人深思,中国川贝枇杷膏在美国受到热捧？纯属谣言！,321191


In [10]:
from torch.utils.data import Dataset
!pip install transformers tqdm boto3 requests regex -q
from transformers import BertTokenizer
!pip install pysnooper -q
import pysnooper

PRETRAINED_MODEL_NAME = "bert-base-chinese"  
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

class FakeNewsDataset(Dataset):
    def __init__(self, mode, tokenizer):
        assert mode in ["train", "test"]
        self.mode = mode
        #iterator=True pd读取大文件方法
        self.df = pd.read_csv(mode + ".tsv", sep="\t").fillna("")
        self.len = len(self.df)
        self.label_map = {'agreed': 0, 'disagreed': 1, 'unrelated': 2}
        self.tokenizer = tokenizer  #使用 BERT tokenizer

    #@pysnooper.snoop()  # 加入以了解所有转换过程
    def __getitem__(self, idx):
        if self.mode == "test":
            text_a, text_b = self.df.iloc[idx, :2].values
            label_tensor = None
        else:
            text_a, text_b, label = self.df.iloc[idx, :].values
            label_id = self.label_map[label]
            label_tensor = torch.tensor(label_id)
            
        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(text_a)
        word_pieces += tokens_a + ["[SEP]"]
        len_a = len(word_pieces)
        
        tokens_b = self.tokenizer.tokenize(text_b)
        word_pieces += tokens_b + ["[SEP]"]
        len_b = len(word_pieces) - len_a
        
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        segments_tensor = torch.tensor([0] * len_a + [1] * len_b,dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len
    
trainset = FakeNewsDataset("train", tokenizer=tokenizer)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




In [12]:
import torch
sample_idx = 0
text_a, text_b, label = trainset.df.iloc[sample_idx].values

tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = "".join(tokens)
print(combined_text)

[CLS]晚上吃苹果就成毒苹果了吗[SEP]早上吃金苹果，晚上吃毒苹果，苹果真不能晚上吃吗？[SEP]


In [13]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pad 到同一序列长度
    tokens_tensors = pad_sequence(tokens_tensors,batch_first=True)
    segments_tensors = pad_sequence(segments_tensors,batch_first=True)
    
    # attention masks，将 tokens_tensors 不为 zero padding 的位置设为1
    masks_tensors = torch.zeros(tokens_tensors.shape,dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

BATCH_SIZE = 64
trainloader = DataLoader(trainset,batch_size=BATCH_SIZE,collate_fn=create_mini_batch)

In [15]:
from transformers import BertForSequenceClassification
from IPython.display import clear_output
PRETRAINED_MODEL_NAME = "bert-base-chinese"
NUM_LABELS = 3

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

clear_output()
#model.config



```
# 此内容为代码格式

class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config, num_labels=2, ...):
        super(BertForSequenceClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config, ...)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
          ...

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, ...):
        outputs = self.bert(input_ids, token_type_ids, attention_mask, ...)
        ...
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        elif self.output_attentions:
            return all_attentions, logits
        return logit
```



In [16]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
      
    with torch.no_grad():
        for data in dataloader:
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors,token_type_ids=segments_tensors,attention_mask=masks_tensors)
            
            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
            
            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions
    
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)

device: cuda:0
classification acc: 0.26270229582235605


In [17]:
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
EPOCHS = 6
for epoch in range(EPOCHS):
    
    running_loss = 0.0
    for data in trainloader:
        
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]
        optimizer.zero_grad()
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)

        loss = outputs[0]
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))

[epoch 1] loss: 28.292, acc: 0.830
[epoch 2] loss: 18.815, acc: 0.884
[epoch 3] loss: 14.057, acc: 0.925
[epoch 4] loss: 9.776, acc: 0.943
[epoch 5] loss: 6.425, acc: 0.971
[epoch 6] loss: 4.782, acc: 0.973


In [18]:
testset = FakeNewsDataset("test", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=256, 
                        collate_fn=create_mini_batch)

predictions = get_predictions(model, testloader)
index_map = {v: k for k, v in testset.label_map.items()}

df = pd.DataFrame({"Category": predictions.tolist()})
df['Category'] = df.Category.apply(lambda x: index_map[x])
df_pred = pd.concat([testset.df.loc[:, ["Id"]], df.loc[:, 'Category']], axis=1)
df_pred.to_csv('bert_1_prec_training_samples.csv', index=False)
df_pred.head()

Unnamed: 0,Id,Category
0,321187,unrelated
1,321190,unrelated
2,321189,unrelated
3,321193,unrelated
4,321191,unrelated


In [19]:
predictions = get_predictions(model, trainloader)
df = pd.DataFrame({"predicted": predictions.tolist()})
df['predicted'] = df.predicted.apply(lambda x: index_map[x])
df1 = pd.concat([trainset.df, df.loc[:, 'predicted']], axis=1)
disagreed_tp = ((df1.label == 'disagreed') & \
                (df1.label == df1.predicted) & \
                (df1.text_a.apply(lambda x: True if len(x) < 10 else False)))
df1[disagreed_tp].head()

Unnamed: 0,text_a,text_b,label,predicted
1605,李天一已被安排出国,李天一即将提前出狱？官方辟谣：仍在服刑！,disagreed,disagreed
2571,沈阳两名女子偷孩子,两名女子偷孩子 沈阳网警辟谣：假的！,disagreed,disagreed


In [44]:
text_a = "李天一已被安排出国"
text_b = "李天一即将提前出狱？官方：是的！"
word_pieces = ["[CLS]"]
tokens_a = tokenizer.tokenize(text_a)
word_pieces += tokens_a + ["[SEP]"]
len_a = len(word_pieces)
        
tokens_b = tokenizer.tokenize(text_b)
word_pieces += tokens_b + ["[SEP]"]
len_b = len(word_pieces) - len_a

ids = tokenizer.convert_tokens_to_ids(word_pieces)
tokens_tensor = torch.tensor(ids).unsqueeze(0)
     
segments_tensor = torch.tensor([0] * len_a + [1] * len_b,dtype=torch.long).unsqueeze(0)

masks_tensors = torch.zeros(tokens_tensor.shape,dtype=torch.long)
masks_tensors = masks_tensors.masked_fill(tokens_tensor != 0, 1).unsqueeze(0)

outputs = model(input_ids=tokens_tensor.to(device),token_type_ids=segments_tensor.to(device),attention_mask=masks_tensors.to(device))
logits = outputs[0]
_, pred = torch.max(logits.data, 1)
label_map = {0:'agreed', 1: 'disagreed', 2: 'unrelated'}

print(outputs)
print(label_map[pred.cpu().tolist()[0]])




(tensor([[ 1.5578, -0.0229, -0.4285]], device='cuda:0', grad_fn=<AddmmBackward>),)
agreed
