In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torchvision
import torchtext
import torch.nn.functional as F
import torch.optim as optim
from collections import defaultdict

from datasets import Dataset, DatasetDict

In [3]:
print(torch.__version__)
print(torch.backends.cudnn.version())
print(torchvision.__version__)
print(torchtext.__version__)

2.0.0+cu118
8700
0.15.1+cu118
0.15.1


In [4]:
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [5]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
plt.rcParams['figure.figsize'] = [12, 6]

In [9]:
# 训练相关参数
MODEL_NAME = 'pretrain_models/xlnet-base-cased'
MAX_LENGTH = 512
BATCH_SIZE = 32
LOSS_TYPE = 'ce_loss'  # 自定义参数 focal_loss dsc_loss ce_loss
WEIGHT_DECAY = 0.01
LEARNING_RATE = 5e-6
EPOCHS = 3
warmup_steps = 100
label_smoothing_factor = 0.0

In [7]:
df = pd.read_csv(f'./data/citation_sentiment_corpus_new.csv')
df.head()

Unnamed: 0,Source_Paper_ID,Target_Paper_ID,Sentiment,Citation_Text,Sentence_Length
0,A00-1043,A00-2024,0,We analyzed a set of articles and identified s...,486
1,H05-1033,A00-2024,0,Table 3: Example compressions Compression AvgL...,349
2,I05-2009,A00-2024,0,5.3 Related works and discussion Our two-step ...,159
3,I05-2009,A00-2024,0,(1999) proposed a summarization system based o...,368
4,I05-2009,A00-2024,0,We found that the deletion of lead parts did n...,125


In [11]:
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split

# 分割数据集：训练集80%，验证集10%，测试集10%
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['Citation_Text'].tolist(), df['Sentiment'].tolist(), test_size=0.4,
    stratify=df['Sentiment'], random_state=42)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, stratify=temp_labels, random_state=42)


class MyDataset(Dataset):
    """
    重构数据集类，使其能够返回字典格式的数据，有标签
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

id2label={0:"Neutral", 1:"Positive", 2:"Negative"}
label2id={"Neutral":0, "Positive":1, "Negative":2}
model = AutoModel.from_pretrained(MODEL_NAME, num_labels=3, id2label=id2label, label2id=label2id).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_dataset = MyDataset(tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt', max_length=512), train_labels)
test_dataset = MyDataset(tokenizer(test_texts, truncation=True, padding=True, return_tensors='pt', max_length=512), test_labels)
val_dataset = MyDataset(tokenizer(val_texts, truncation=True, padding=True, return_tensors='pt', max_length=512), val_labels)

print(f"Train Dataset Size: {len(train_dataset)}")
print(f"Test Dataset Size: {len(test_dataset)}")
print(f"Val Dataset Size: {len(val_dataset)}")

Train Dataset Size: 5219
Test Dataset Size: 1740
Val Dataset Size: 1740


In [30]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
                                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
                                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay':0.0}
]
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

total_steps = len(train_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

In [36]:
from sklearn import metrics

def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    acc = 0
    counter = 0
  
    for d in data_loader:
        input_ids = d["input_ids"].reshape(BATCH_SIZE, MAX_LENGTH).to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)
        
        outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=targets)
        loss = outputs[0]
        logits = outputs[1]

        # preds = preds.cpu().detach().numpy()
        _, prediction = torch.max(outputs[1], dim=1)
        targets = targets.cpu().detach().numpy()
        prediction = prediction.cpu().detach().numpy()
        accuracy = metrics.accuracy_score(targets, prediction)

        acc += accuracy
        losses.append(loss.item())
        
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        counter = counter + 1

    return acc / counter, np.mean(losses)

In [39]:
def eval_model(model, data_loader, device, n_examples):
    model = model.eval()
    losses = []
    acc = 0
    counter = 0
  
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].reshape(4,512).to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["labels"].to(device)
            
            outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels = targets)
            loss = outputs[0]
            logits = outputs[1]

            _, prediction = torch.max(outputs[1], dim=1)
            targets = targets.cpu().detach().numpy()
            prediction = prediction.cpu().detach().numpy()
            accuracy = metrics.accuracy_score(targets, prediction)

            acc += accuracy
            losses.append(loss.item())
            counter += 1

    return acc / counter, np.mean(losses)

In [40]:
from tqdm import *

history = defaultdict(list)
best_accuracy = 0

for epoch in tqdm(range(EPOCHS)):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_loader,     
        optimizer, 
        device, 
        scheduler, 
        len(train_dataset)
    )

    print(f'Train loss {train_loss} Train accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        val_loader, 
        device, 
        len(val_dataset)
    )

    print(f'Val loss {val_loss} Val accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    if val_acc > best_accuracy:
        torch.save(model.state_dict(), '/content/drive/My Drive/NLP/Sentiment Analysis Series/models/xlnet_model.bin')
        best_accuracy = val_acc

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1/3
----------





RuntimeError: shape '[32, 512]' is invalid for input of size 14688