In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Toy sentiment corpus: six short sentences with binary labels
# (1 = positive, 0 = negative).
texts = [
    "this is a positive example",
    "this is a negative example",
    "this is also positive",
    "this is also negative",
    "this is a good example",
    "this is a bad example",
]
labels = [1, 0, 1, 0, 1, 0]

# Split texts and labels into training and validation subsets. The fixed
# random_state keeps the split identical across runs; with six samples,
# test_size=0.2 puts two of them in the validation subset.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Show the resulting split as the cell output.
train_texts, train_labels, val_texts, val_labels

(['this is a bad example',
  'this is also positive',
  'this is a good example',
  'this is also negative'],
 [0, 1, 1, 0],
 ['this is a positive example', 'this is a negative example'],
 [1, 0])

In [3]:
# TF-IDF features: learn the vocabulary and IDF weights from the training
# texts only, then project both splits with the fitted vectorizer.
# NOTE(review): X_train / X_val are not consumed by any later cell visible
# here; the BERT pipeline tokenizes the raw texts itself. Confirm whether
# they are still needed.
vectorizer = TfidfVectorizer().fit(train_texts)
X_train = vectorizer.transform(train_texts)
X_val = vectorizer.transform(val_texts)

# Label encoding: learn the class mapping from the training labels and apply
# the same mapping to both splits.
label_encoder = LabelEncoder().fit(train_labels)
encoded_train_labels = label_encoder.transform(train_labels)
encoded_val_labels = label_encoder.transform(val_labels)

In [4]:
# Custom dataset that tokenizes raw text on access.
class CustomDataset(Dataset):
    """Map-style dataset that pairs raw texts with labels and tokenizes lazily.

    Each item is tokenized when it is requested, padded/truncated to exactly
    ``max_len`` tokens, and returned as a dict of ``torch.long`` tensors.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer class ids, aligned with
            ``texts`` by position.
        tokenizer: Callable tokenizer; it is invoked as
            ``tokenizer(text, add_special_tokens=..., max_length=...,
            padding=..., truncation=...)`` and must return a mapping with
            ``'input_ids'`` and ``'attention_mask'`` entries.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return it with its label.

        Returns:
            dict with keys ``'ids'`` (token ids), ``'mask'`` (attention mask)
            and ``'labels'`` (class id), all ``torch.long`` tensors.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        # Call the tokenizer directly: `encode_plus` is deprecated in favour
        # of `__call__`, and the explicit `None` text pair is not needed for
        # single-sentence inputs. Token type ids are not requested because
        # they were never returned from this method.
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )

        return {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [5]:
# Load the pretrained tokenizer that belongs to the `bert-base-uncased`
# checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    clean_up_tokenization_spaces=False,
)

In [6]:
# Wrap each split (raw texts + encoded labels) in a CustomDataset that shares
# the same tokenizer.
train_dataset = CustomDataset(
    texts=train_texts,
    labels=encoded_train_labels,
    tokenizer=tokenizer,
)
val_dataset = CustomDataset(
    texts=val_texts,
    labels=encoded_val_labels,
    tokenizer=tokenizer,
)

In [7]:
# Batch both splits two samples at a time; only the training loader shuffles.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=2,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=2,
    shuffle=False,
)

In [8]:
# Seed PyTorch's global RNG before building the model. The classification
# head is newly (randomly) initialised and the training DataLoader shuffles,
# so without a seed the results change on every Restart & Run All.
torch.manual_seed(42)

# Load pretrained BERT with a 2-class sequence-classification head.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Optimizer: AdamW over every model parameter with a learning rate of 5e-5.
optimizer = AdamW(
    model.parameters(),
    lr=5e-5,
)

In [10]:
# Train the model: switch to training mode, then make 3 passes over the
# training batches (3 epochs only, as a demonstration).
model.train()
for epoch in range(3):
    for batch in train_loader:
        # Reset gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass; because labels are supplied, the output carries a loss.
        outputs = model(
            input_ids=batch['ids'],
            attention_mask=batch['mask'],
            labels=batch['labels'],
        )
        loss = outputs.loss

        # Backward pass followed by the parameter update.
        loss.backward()
        optimizer.step()

In [11]:
# Validate the model: count how many validation samples are classified
# correctly.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in val_loader:
        # Only the logits are needed here, so labels are not passed to the
        # model; this avoids computing a loss that is never read.
        outputs = model(batch['ids'], attention_mask=batch['mask'])
        # argmax over the class dimension already yields one prediction per
        # sample, so no extra flatten is required.
        preds = torch.argmax(outputs.logits, dim=1)
        correct += (preds == batch['labels']).sum().item()
        total += batch['labels'].size(0)

# Show the raw counts as the cell output.
correct, total

(2, 2)

In [12]:
# Validation accuracy: fraction of validation samples whose predicted class
# matched the label, using the counts accumulated by the validation loop.
val_accuracy = correct / total
print(f'Validation accuracy: {val_accuracy:.2f}')

Validation accuracy: 1.00
