In [None]:
import os

import numpy as np
import pandas as pd
import torch
import torchtext
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
from transformers import BertModel
from transformers import BertTokenizer

# 数据处理

In [None]:
class MyDataset(Dataset):
    """Eagerly tokenized title-classification dataset loaded from a CSV file.

    The CSV is read with pandas and must contain a ``title`` column (the text
    to classify) and a ``label`` column. Rows containing missing values are
    dropped, at most ``max_samples`` of the remaining rows are kept (drawn at
    random), and every title is tokenized once in ``__init__``.

    Args:
        dataset_path: Path of the CSV file to load.
        tokenizer: Callable following the Hugging Face tokenizer call
            convention (``tokenizer(text, padding=..., truncation=...,
            max_length=..., return_tensors='pt')``).
        max_samples: Upper bound on the number of rows kept. It is capped at
            the number of available rows, so files smaller than the bound
            load instead of raising. ``None`` keeps every row in file order.
        max_length: Token length every title is padded or truncated to.
        random_state: Seed forwarded to ``DataFrame.sample``. ``None`` (the
            default) leaves the draw non-deterministic.

    Attributes:
        labels: pandas Series with the label of each kept row.
        n_classes: Number of distinct labels among the kept rows.
        texts: List with one tokenizer output per kept row.
    """

    def __init__(self, dataset_path, tokenizer, max_samples=100_000,
                 max_length=32, random_state=None):
        df = pd.read_csv(dataset_path).dropna()
        if max_samples is not None:
            # DataFrame.sample raises ValueError when asked for more rows than
            # exist (replace=False), so never request more than are available.
            df = df.sample(n=min(max_samples, len(df)), random_state=random_state)
        # Fresh 0..n-1 index so that __getitem__ can look labels up by position.
        df = df.reset_index(drop=True)

        self.labels = df['label']
        self.n_classes = df['label'].nunique()
        # truncation=True guarantees every encoding has exactly `max_length`
        # tokens; padding alone leaves over-long titles longer than that, and
        # tensors of unequal length cannot be stacked into a batch.
        self.texts = [
            tokenizer(title, padding='max_length', truncation=True,
                      max_length=max_length, return_tensors='pt')
            for title in tqdm(df['title'])
        ]

    def __len__(self):
        """Return the number of kept rows."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(tokenizer_output, label)`` for row ``idx``; the label is a 0-d NumPy array."""
        return self.texts[idx], np.array(self.labels[idx])

In [None]:
# Location of the pretrained bert-base-chinese checkpoint. Set the
# BERT_MODEL_PATH environment variable to point somewhere else; the default
# is the original local directory.
model_path = os.environ.get('BERT_MODEL_PATH', '/Users/gechengze/project/models/bert-base-chinese/')
# Tokenizer loaded from the same checkpoint directory.
tokenizer = BertTokenizer.from_pretrained(model_path)

# Directory holding the THUCNews CSV splits (override with THUCNEWS_DIR).
data_dir = os.environ.get('THUCNEWS_DIR', '../data/THUCNews')

# Build the training and validation datasets; the DataLoaders that batch
# them are created in the training cell.
train_dataset = MyDataset(os.path.join(data_dir, 'train.csv'), tokenizer)

valid_dataset = MyDataset(os.path.join(data_dir, 'valid.csv'), tokenizer)

# 构造模型

In [None]:
class BertClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    ``forward`` returns raw, unnormalised class scores (logits). That is the
    input ``nn.CrossEntropyLoss`` expects; no activation is applied to the
    head's output, because clamping logits at zero (ReLU) removes the
    gradient for every class whose score is negative.

    Args:
        n_classes: Number of output classes.
        dropout: Dropout probability applied to BERT's pooled output.
        pretrained_path: Directory or model id passed to
            ``BertModel.from_pretrained``. When ``None`` the module-level
            ``model_path`` is used.
    """

    def __init__(self, n_classes, dropout=0.5, pretrained_path=None):
        super().__init__()
        if pretrained_path is None:
            # Backward-compatible default: the checkpoint path defined at notebook level.
            pretrained_path = model_path
        self.bert = BertModel.from_pretrained(pretrained_path)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the loaded checkpoint instead of hard-coding 768.
        self.linear = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, atention_mask):
        """Compute class logits for a batch.

        Args:
            input_ids: Token id tensor passed to BERT as its first argument.
            atention_mask: Attention mask passed to BERT. The misspelled
                parameter name is kept so existing keyword callers still work.

        Returns:
            The linear head's output for BERT's pooled representation,
            with ``n_classes`` scores per example.
        """
        # With return_dict=False BERT returns a tuple; only the pooled output is used.
        _, pooled_output = self.bert(input_ids, attention_mask=atention_mask, return_dict=False)
        return self.linear(self.dropout(pooled_output))
    
# Instantiate the classifier with one output per distinct label counted in the training dataset.
model = BertClassifier(n_classes=train_dataset.n_classes)

# 训练模型

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The training loop moves every batch to `device`, so the model's weights must
# live there as well; otherwise the forward pass fails with a device mismatch
# on CUDA machines. Done before the optimizer is built from the parameters.
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=1e-6)

In [None]:
BATCH_SIZE = 64

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

# NOTE(review): drop_last=True also discards the final partial validation
# batch, so up to BATCH_SIZE - 1 validation samples are never seen — confirm
# this is intended.
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True)

# Put the model in training mode explicitly so dropout is active even when an
# earlier cell left it in eval mode and this cell is re-run.
model.train()

# One pass over the training data; the progress bar shows the latest batch's
# loss and accuracy instead of printing one line per step.
progress = tqdm(train_dataloader, desc='train')
for train_input, train_label in progress:
    # Drop the singleton dimension at position 1 that batching per-sample
    # encodings leaves behind before feeding the tensors to the model.
    input_ids = train_input['input_ids'].squeeze(1).to(device)
    attention_mask = train_input['attention_mask'].squeeze(1).to(device)
    train_label = train_label.to(device)

    output = model(input_ids, attention_mask)
    loss = criterion(output, train_label)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Fraction of this batch classified correctly.
    batch_acc = (output.argmax(dim=1) == train_label).float().mean().item()
    progress.set_postfix(loss=format(loss.item(), '.4f'), acc=format(batch_acc, '.4f'))