In [1]:
import torch
import pandas as pd
from tqdm.notebook import tqdm

# Load the test set. Because column names are supplied explicitly, pandas
# treats every row of the file as data (no header row is consumed).
data_frame = pd.read_csv(
    "../data/test_data1_2type.csv",
    index_col=False,
    names=['category', 'text'],
)

In [3]:
# Distinct values of the `category` column, in order of first appearance.
possible_categories = data_frame['category'].unique()
possible_categories

array([1, 5], dtype=int64)

In [4]:
# Label mapping: each raw category value -> its position in possible_categories.
label_dict = {
    category: position
    for position, category in enumerate(possible_categories)
}
label_dict

{1: 0, 5: 1}

In [5]:
# Pin the mapping explicitly (category 5 -> label 1, category 1 -> label 0) so it
# does not depend on the order in which categories happen to appear in the file.
# NOTE(review): presumably this must match the mapping used at training time - confirm.
label_dict.update({5: 1, 1: 0})

In [6]:
# Translate raw category values into the integer labels defined by label_dict.
data_frame['label'] = data_frame['category'].replace(label_dict)
data_frame.head(n=10)

Unnamed: 0,category,text,label
0,1,还好吧，不喜欢里面的中国元素。,0
1,1,名气很高，但真实情况是？失望！,0
2,1,不喜欢阿三,0
3,1,我还以为英国的电影呢我是看一群帅哥的面才看完这部电影的。电影的故事不外乎成人世界压抑天性，正...,0
4,1,讲真，美国真是好人多，中国真是坏人多，这种人在中国太多了，估计一大堆做假的各种领导学者在中国...,0
5,1,这部电影对于我个人非常有纪念意义，它令我惊觉自己的改变。那种宁愿守着旧梦死去也不愿意走向迷茫...,0
6,1,就是个小时长的日剧嘛。拉大提琴的几场尤其造作尤其傻，这种半调子剧本、台词、烂表演、配这种业余...,0
7,1,很低俗，猥琐，，，周星驰就爱这个是吧，我几乎要放弃看他的作品了。只有黄圣依美是真美，但目的呢...,0
8,1,就靠各类动物的体型差和一路不停的耍宝来制造萌点吧。食肉动物的本性本来就是野蛮：树獭一点也不好...,0
9,1,昏昏欲睡。以为会对自己的刑法理论有一定的冲击，但失望了，讲的不就是罪刑法定原则吗？需要拍的那...,0


In [7]:
# Keep only the text and label columns, dropping rows with a missing value.
selected_col = ['text', 'label']
sub_df = data_frame[selected_col].dropna()
# Optional: subsample for a quicker run, e.g. sub_df = sub_df.sample(frac=0.5)
sub_df

Unnamed: 0,text,label
0,还好吧，不喜欢里面的中国元素。,0
1,名气很高，但真实情况是？失望！,0
2,不喜欢阿三,0
3,我还以为英国的电影呢我是看一群帅哥的面才看完这部电影的。电影的故事不外乎成人世界压抑天性，正...,0
4,讲真，美国真是好人多，中国真是坏人多，这种人在中国太多了，估计一大堆做假的各种领导学者在中国...,0
...,...,...
1595,很悲伤的故事，爸爸的爱让人感动又心痛！为什么可爱的孩童受到伤害后还得不到守护！,1
1596,太好看了！今年最喜欢的一部电影！还好憋着没去看网络版一直等着上映，大银幕看那美轮美奂的画面真...,1
1597,当Joker决心暗杀Thomas Wayne时，我们发现了反向的双子杀手的结构：缺失的父亲之...,1
1598,有生之年能够大荧幕四刷，太幸福了。人生哪有什么机缘巧合，全是辜负和错过。以前不喜欢刘嘉玲，但...,1


## 加载Tokenizer和Encode the Data

In [8]:
# Path to the downloaded pre-trained model files
BERT_PATH = '../Models/bert-base-chinese'

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

# Load the tokenizer from the local pre-trained model directory.
tokenizer = BertTokenizer.from_pretrained(BERT_PATH)

# Convert the raw strings into fixed-length token-id tensors.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# `truncation=True` makes explicit the truncation that was previously applied
# only implicitly (with a warning) whenever `max_length` was given.
encoded_data_test = tokenizer.batch_encode_plus(
    sub_df.text.values.tolist(),   # plain list of strings
    add_special_tokens=True,       # add BERT's special tokens marking where a sentence starts/ends
    return_attention_mask=True,    # mask tells the model which positions are real tokens vs. padding
    padding='max_length',          # pad every sequence up to max_length so all rows share one shape
    truncation=True,               # cut sequences longer than max_length
    max_length=256,
    return_tensors='pt',           # 'pt' = return PyTorch tensors
)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(sub_df.label.values)

# One dataset row = (input_ids, attention_mask, label).
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

print("len(dataset_test):", len(dataset_test))




Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


len(dataset_test): 1600


## 加载预训练模型

In [9]:
# Display the label mapping; its length is used as num_labels when the classifier is built.
label_dict

{1: 0, 5: 1}

In [10]:
from transformers import BertForSequenceClassification

# Build the sequence classifier on top of the pre-trained encoder.
model = BertForSequenceClassification.from_pretrained(
    BERT_PATH,
    num_labels=len(label_dict),    # one output per entry in the label mapping
    output_hidden_states=False,    # do not return the model's hidden states
    output_attentions=False,       # do not return attention weights
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../Models/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
import random
import numpy as np

# Seed every random number generator used in the notebook with the same value:
# Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices).
seed_val = 17
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [12]:
# Move the model to the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)
print(device)

cuda


## 创建DataLoader

In [13]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 48
# Evaluation does not benefit from shuffling. Iterating sequentially keeps the
# predictions returned by evaluate() in the same order as the rows of the dataset,
# so they can be joined back to the source texts for error analysis.
dataloader_test = DataLoader(dataset_test,
                             sampler=SequentialSampler(dataset_test),
                             batch_size=batch_size)

## Define the Performance Metrics

In [14]:
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import f1_score,accuracy_score, recall_score, precision_score

def f1_score_func(preds, labels):
    """Return the weighted F1 score.

    Args:
        preds: 2-D array of per-class scores; the predicted class of each row
            is the argmax along axis 1.
        labels: array of true class indices (flattened before scoring).

    Returns:
        The value of ``f1_score(..., average='weighted')``.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    return f1_score(expected, predicted, average='weighted')

def accuracy_per_class(preds, labels):
    """Print, for every class present in ``labels``, the fraction predicted correctly.

    Args:
        preds: 2-D array of per-class scores; the predicted class of each row
            is the argmax along axis 1.
        labels: array of true class indices.

    Class indices are translated back to the original category values through
    the module-level ``label_dict`` before printing.
    """
    index_to_category = dict((idx, category) for category, idx in label_dict.items())

    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()

    for class_idx in np.unique(expected):
        in_class = expected == class_idx
        # Rows of this class that were predicted as this class.
        hits = int(np.count_nonzero(predicted[in_class] == class_idx))
        total = len(expected[in_class])
        print(f'Class: {index_to_category[class_idx]}')
        print(f'Accuracy: {hits/total}\n')

def accuracy_score_func(preds, labels):  # accuracy
    """Print the number of samples, the number of misclassified samples and the accuracy.

    Args:
        preds: 2-D array of per-class scores; the predicted class of each row
            is the argmax along axis 1.
        labels: array of true class indices.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    total = predicted.size
    n_correct = (predicted == expected).sum()
    print("total label num:", total)
    print("error num:", total - n_correct)
    print("准确率（accuracy）:", accuracy_score(expected, predicted))

def recall_score_func(preds, labels):   # recall
    """Print the recall of the argmax predictions against the true labels.

    ``recall_score`` is called with its default arguments.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    recall = recall_score(expected, predicted)
    print("召回率（recall）:", recall)

def precision_score_func(preds, labels):    # precision
    """Print the precision of the argmax predictions against the true labels.

    ``precision_score`` is called with its default arguments.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    precision = precision_score(expected, predicted)
    print("精确率（precision）:", precision)

def evaluate(dataloader_val):
    """Run the module-level ``model`` over every batch of ``dataloader_val``.

    Each batch is indexed as (input_ids, attention_mask, labels) and moved to
    the module-level ``device`` before the forward pass.

    Returns:
        A tuple ``(average_loss, predictions, true_vals)`` where
        ``average_loss`` is the summed batch loss divided by the number of
        batches, ``predictions`` is the concatenation of all batch logits and
        ``true_vals`` the concatenation of all batch labels (NumPy arrays).
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for raw_batch in dataloader_val:
        moved = tuple(tensor.to(device) for tensor in raw_batch)
        model_inputs = dict(
            input_ids=moved[0],
            attention_mask=moved[1],
            labels=moved[2],
        )

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(**model_inputs)

        # With labels supplied, the first output is the loss and the second the logits.
        batch_loss, batch_logits = outputs[0], outputs[1]
        running_loss += batch_loss.item()

        logit_chunks.append(batch_logits.detach().cpu().numpy())
        label_chunks.append(model_inputs['labels'].cpu().numpy())

    mean_loss = running_loss / len(dataloader_val)

    all_logits = np.concatenate(logit_chunks, axis=0)
    all_labels = np.concatenate(label_chunks, axis=0)

    return mean_loss, all_logits, all_labels

## 加载训练好的权重

In [15]:
save_path='./bert_checkpoint'

# Fine-tuned weights to evaluate.
# Alternative checkpoint: f'{save_path}/bert-base-chinese_2type_20000num_epoch_10.model'
checkpoint_file = f'{save_path}/bert-base-chinese_2type_39200_epoch_6.model'

# map_location=device remaps the stored tensors onto whichever device was selected
# earlier, so a checkpoint saved on a GPU also loads on a CPU-only machine.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
model.load_state_dict(torch.load(checkpoint_file, map_location=device))

<All keys matched successfully>

## 输入单个文本进行预测分类

In [20]:
def _model_device(model):
    """Return the device holding ``model``'s parameters (CPU if it has none)."""
    try:
        return next(model.parameters()).device
    except StopIteration:
        return torch.device('cpu')


def _predict_class_id(model, tokenizer, text, max_length=256):
    """Encode ``text``, run one forward pass and return the argmax class index.

    Args:
        model: sequence-classification model called as
            ``model(input_ids=..., attention_mask=...)``.
        tokenizer: tokenizer exposing ``encode_plus``.
        text: the string to classify.
        max_length: sequences are padded/truncated to exactly this many tokens.

    Returns:
        The predicted class index as a Python ``int``.
    """
    encoded = tokenizer.encode_plus(
        text=text,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',   # replaces the deprecated pad_to_max_length=True
        truncation=True,        # explicit, instead of relying on the implicit default
        max_length=max_length,
        return_tensors='pt',
    )

    # Send the inputs to wherever the model lives rather than relying on a global.
    target_device = _model_device(model)
    inputs = {
        'input_ids': encoded['input_ids'].to(target_device),
        'attention_mask': encoded['attention_mask'].to(target_device),
    }

    # Make sure dropout etc. are disabled even if the model was left in train mode.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Without labels the logits are the first output; support both the
    # attribute-style and the tuple-style return value.
    logits = outputs.logits if hasattr(outputs, 'logits') else outputs[0]
    return int(torch.argmax(logits, dim=-1).reshape(-1)[0].item())


def predict_single_example(model, tokenizer, text, max_length=256):
    """Classify a single text and print the predicted class index.

    Args:
        model: sequence-classification model.
        tokenizer: tokenizer matching the model.
        text: the string to classify.
        max_length: token length the input is padded/truncated to.
    """
    predicted_class = _predict_class_id(model, tokenizer, text, max_length=max_length)

    print("predicted_class:",predicted_class)

def predict_single_example_2(model, tokenizer, text, max_length=256):
    """Classify a single text and print the predicted label index.

    Same prediction as ``predict_single_example``; only the printed prefix differs.

    Args:
        model: sequence-classification model.
        tokenizer: tokenizer matching the model.
        text: the string to classify.
        max_length: token length the input is padded/truncated to.
    """
    preds_flat = _predict_class_id(model, tokenizer, text, max_length=max_length)
    print("prediction for label:", preds_flat)

        
# Example usage: classify one hand-written review and print the predicted label index.
# text_to_predict = "这个剧算是我2023年最大的惊喜，本以为翻拍剧会不及韩国人那么会营造氛围感，结果这个剧令我相当意外，不愧是文艺片演员来执导。作为春夜剧粉，客观来说，我觉得张晚意孙怡以及一众配角要比原版的好看。BGM和男主的声音，都太好听了。一见钟情时无声无息，但已经暗潮汹涌。第一场雪是你，心动是你，纠结是你，无法自控忍不住想靠近的也是你。看得我很想谈恋爱。。。"
text_to_predict = "对爆米花商业电影真的一点感觉都没有啦"
# predict_single_example(model, tokenizer, text_to_predict)
predict_single_example_2(model, tokenizer, text_to_predict)


prediction for label: 0


## 测试集上评估

In [21]:
# Run the model over the whole test set; the average loss is not needed here.
_, predictions, true_vals = evaluate(dataloader_test)

# Each of these prints its own report for the same predictions/labels.
for report in (accuracy_per_class, accuracy_score_func, recall_score_func, precision_score_func):
    report(predictions, true_vals)

val_f1 = f1_score_func(predictions, true_vals)
print("f1 score:", val_f1)

Class: 1
Accuracy: 0.86125

Class: 5
Accuracy: 0.93375

total label num: 1600
error num: 164
准确率（accuracy）: 0.8975
召回率（recall）: 0.93375
精确率（precision）: 0.8706293706293706
f1 score: 0.8973651313679383
