In [95]:
import torch
import numpy as np
from gensim.models import KeyedVectors

In [96]:
# 加载词向量模型
word_vector_dict = KeyedVectors.load_word2vec_format(
    "./Dataset/wiki_word2vec_50.bin", binary=True
)

# 确定词向量的维度
vector_size = word_vector_dict.vector_size

In [97]:
# 从文件中读取数据
def load_data(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        lines = file.readlines()

    sentences = []
    labels = []

    for line in lines:
        parts = line.strip().split()
        label = torch.tensor(float(parts[0]))
        words = parts[1:]

        word_vectors = []

        for word in words:
            if word in word_vector_dict:
                word_vector = word_vector_dict[word]
                word_vectors.append(word_vector)

        word_vectors = np.array(word_vectors)

        if len(word_vectors) > 0:
            sentence_vector = torch.tensor(word_vectors).view(-1, vector_size)
            sentences.append(sentence_vector)
            labels.append(label)
        else:
            print(f"Empty sentence: {line}")

    return {
        "sentences": sentences,
        "labels": labels,
    }

In [98]:
# 加载数据
train_data = load_data("./Dataset/train.txt")
valid_data = load_data("./Dataset/validation.txt")
test_data = load_data("./Dataset/test.txt")

Empty sentence: 0	鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆



In [99]:
# 将句子填充到相同长度
def unify_columns(matrix, unified_length):
    col_length, row_length = matrix.size()
    if col_length > unified_length:
        new_matrix = matrix[:unified_length, :]
    elif col_length < unified_length:
        padding = torch.zeros(unified_length - col_length, row_length)
        new_matrix = torch.cat((padding, matrix), dim=0)
    else:
        new_matrix = matrix
    return new_matrix

In [100]:
from torch.utils.data import Dataset


# 定义数据集类
class SentimentDataset(Dataset):
    def __init__(self, sentences, labels, sentences_len):
        self.sentences = sentences
        self.labels = labels
        self.sentence_len = sentences_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        sentence = self.sentences[index].float()
        label = self.labels[index]
        # 填充句子到最大长度
        output_matrix = unify_columns(sentence, self.sentence_len).requires_grad_()
        return output_matrix, label

In [101]:
# 创建Dataset对象
SENTENCE_LEN = 64
train_dataset = SentimentDataset(
    train_data["sentences"], train_data["labels"], sentences_len=SENTENCE_LEN
)
valid_dataset = SentimentDataset(
    valid_data["sentences"], valid_data["labels"], sentences_len=SENTENCE_LEN
)
test_dataset = SentimentDataset(
    test_data["sentences"], test_data["labels"], sentences_len=SENTENCE_LEN
)

print("训练集大小:", len(train_dataset))
print("验证集大小:", len(valid_dataset))
print("测试集大小:", len(test_dataset))

训练集大小: 19997
验证集大小: 5629
测试集大小: 369


In [102]:
from torch.utils.data import DataLoader

# 创建 DataLoader
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [103]:
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
import torch.nn.functional as F

In [108]:
# RNN模型
class RNNSentimentClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, num_layers, dropout):
        super(RNNSentimentClassifier, self).__init__()

        # LSTM层
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            dropout=dropout,
            batch_first=True,
        )

        # 初始化LSTM层的权重和偏置
        for name, param in self.lstm.named_parameters():
            if 'weight_ih' in name:
                init.orthogonal_(param.data)
            elif 'weight_hh' in name:
                init.orthogonal_(param.data)
            elif 'bias' in name:
                init.zeros_(param.data)

        # Dropout层
        self.dropout = nn.Dropout(dropout)

        # 全连接层
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

        # 初始化全连接层的权重和偏置
        init.orthogonal_(self.fc.weight)
        init.zeros_(self.fc.bias)

    def forward(self, text):

        # LSTM层的输出
        lstm_output, (hidden, cell) = self.lstm(text)

        # 使用最后一个时间步的输出
        last_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)

        # 应用全连接层
        logits = self.fc(last_hidden)

        # 使用sigmoid激活函数
        probs = F.sigmoid(logits)

        return probs

In [109]:
# 超参数
EMBEDDING_DIM = vector_size
HIDDEN_DIM = 128
OUTPUT_DIM = 1
NUM_LAYERS = 2
DROPOUT = 0.5

# 初始化模型
model = RNNSentimentClassifier(
    EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, NUM_LAYERS, DROPOUT
)

# 查看模型结构
print(model)

# 定义损失函数和优化器
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())

RNNSentimentClassifier(
  (lstm): LSTM(50, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)


In [110]:
# 定义准确率计算函数
def accuracy(logits, labels):
    labels = labels.view(-1, 1)
    y_pred = logits >= 0.5
    y = labels == 1
    correct_predictions = (y_pred == y).float()
    correct_predictions = correct_predictions.view(-1)
    accuracy = correct_predictions.sum() / len(labels)
    return accuracy

def train_epoch(model, data_loader, loss_function, optimizer):
    model.train()
    total_loss = 0
    total_accuracy = 0
    for sentences, labels in data_loader:
        optimizer.zero_grad()
        logits = model(sentences)
        loss = loss_function(logits, labels.unsqueeze(1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        batch_accuracy = accuracy(logits, labels)
        total_accuracy += batch_accuracy.item()

    average_loss = total_loss / len(data_loader)
    average_accuracy = total_accuracy / len(data_loader)
    return average_loss, average_accuracy


def valid_epoch(model, data_loader, loss_function):
    model.eval()
    total_loss = 0
    total_accuracy = 0
    with torch.no_grad():
        for sentences, labels in data_loader:
            logits = model(sentences)
            loss = loss_function(logits, labels.unsqueeze(1))
            total_loss += loss.item()
            batch_accuracy = accuracy(logits, labels)
            total_accuracy += batch_accuracy.item()

    average_loss = total_loss / len(data_loader)
    average_accuracy = total_accuracy / len(data_loader)
    return average_loss, average_accuracy


num_epochs = 24
for epoch in range(num_epochs):
    train_loss, train_accuracy = train_epoch(
        model, train_loader, loss_function, optimizer
    )
    valid_loss, valid_accuracy = valid_epoch(model, valid_loader, loss_function)
    print(
        f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}, "
        f"Train Accuracy: {train_accuracy:.4f}, Valid Accuracy: {valid_accuracy:.4f}"
    )

Epoch 1/24, Train Loss: 0.6932, Valid Loss: 0.6898, Train Accuracy: 0.5008, Valid Accuracy: 0.5014
Epoch 2/24, Train Loss: 0.6766, Valid Loss: 0.6797, Train Accuracy: 0.5705, Valid Accuracy: 0.5523
Epoch 3/24, Train Loss: 0.6839, Valid Loss: 0.6898, Train Accuracy: 0.5350, Valid Accuracy: 0.5100
Epoch 4/24, Train Loss: 0.6895, Valid Loss: 0.6894, Train Accuracy: 0.5110, Valid Accuracy: 0.5111
Epoch 5/24, Train Loss: 0.6891, Valid Loss: 0.6890, Train Accuracy: 0.5122, Valid Accuracy: 0.5127
Epoch 6/24, Train Loss: 0.6829, Valid Loss: 0.6823, Train Accuracy: 0.5521, Valid Accuracy: 0.6090
Epoch 7/24, Train Loss: 0.6771, Valid Loss: 0.6483, Train Accuracy: 0.5681, Valid Accuracy: 0.6390
Epoch 8/24, Train Loss: 0.6439, Valid Loss: 0.6221, Train Accuracy: 0.6771, Valid Accuracy: 0.7395
Epoch 9/24, Train Loss: 0.6267, Valid Loss: 0.6269, Train Accuracy: 0.7200, Valid Accuracy: 0.7574
Epoch 10/24, Train Loss: 0.6169, Valid Loss: 0.6161, Train Accuracy: 0.7500, Valid Accuracy: 0.7328
Epoch 11/

In [None]:
# 测试循环以评估模型性能
def test_epoch(model, data_loader, loss_function):
    model.eval()
    total_loss = 0
    total_accuracy = 0
    with torch.no_grad():
        for sentences, labels in data_loader:
            logits = model(sentences)
            loss = loss_function(logits, labels.unsqueeze(1))
            total_loss += loss.item()
            batch_accuracy = accuracy(logits, labels)
            total_accuracy += batch_accuracy.item()

    average_loss = total_loss / len(data_loader)
    average_accuracy = total_accuracy / len(data_loader)
    return average_loss, average_accuracy


# 在训练完成后，评估模型在测试集上的性能
test_loss, test_accuracy = test_epoch(model, test_loader, loss_function)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

Test Loss: 0.6151, Test Accuracy: 0.7975


In [111]:
# 保存模型
torch.save(model.state_dict(), "rnn_sentiment_model.pth")

In [112]:
def precision_score(probs, labels):
    probs = probs.view(-1)
    y_true = list(labels == 1)
    for i in range(len(y_true)):
        y_true[i] = y_true[i].item()
    print(y_true)
    y_pred = list(probs >= 0.5)
    for i in range(len(y_pred)):
        y_pred[i] = y_pred[i].item()
    print(y_pred)
    true_positives = sum([1 for y, p in zip(y_true, y_pred) if y == True and p == True])
    false_positives = sum(
        [1 for y, p in zip(y_true, y_pred) if y == False and p == True]
    )
    precision = true_positives / (true_positives + false_positives)
    return precision


def recall_score(probs, labels):
    probs = probs.view(-1)
    y_true = list(labels == 1)
    for i in range(len(y_true)):
        y_true[i] = y_true[i].item()

    y_pred = list(probs >= 0.5)
    for i in range(len(y_pred)):
        y_pred[i] = y_pred[i].item()

    true_positives = sum([1 for y, p in zip(y_true, y_pred) if y == True and p == True])
    false_negatives = sum(
        [1 for y, p in zip(y_true, y_pred) if y == True and p == False]
    )
    recall = true_positives / (true_positives + false_negatives)
    return recall

In [122]:
# 加载模型
model.load_state_dict(torch.load("rnn_sentiment_model.pth"))

# 设置模型为评估模式
model.eval()

RNNSentimentClassifier(
  (lstm): LSTM(50, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [123]:
# test
test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False)

total_loss = 0
total_accuracy = 0
total_correct = 0
total_samples = 0
total_precision = 0
total_recall = 0
with torch.no_grad():  # 关闭梯度计算
    for sentences, labels in test_loader:
        probs = model(sentences)

        batch_loss = loss_function(probs, labels.unsqueeze(1))
        total_loss += batch_loss.item()

        batch_accuracy = accuracy(probs, labels)
        total_accuracy += batch_accuracy.item()

        # 计算精确率、召回率和F-score
        batch_precision = precision_score(probs, labels)
        batch_recall = recall_score(probs, labels)

        total_precision += batch_precision
        total_recall += batch_recall

average_loss = total_loss / len(test_loader)
average_accuracy = total_accuracy / len(test_loader)
average_precision = total_precision / len(test_loader)
average_recall = total_recall / len(test_loader)
average_f1 = 2 * ((average_precision * average_recall) / (average_precision + average_recall))

print(f"Average Loss: {average_loss:.4f}")
print(f"Average Accuracy: {average_accuracy:.4f}")
print(f"Average Precision: {average_precision:.4f}")
print(f"Average Recall: {average_recall:.4f}")
print(f"Average F1 Score: {average_f1:.4f}")

[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, Tru