In [58]:
import torch
import numpy as np
from gensim.models import KeyedVectors

In [59]:
# 加载词向量模型
word_vector_dict = KeyedVectors.load_word2vec_format(
    "./Dataset/wiki_word2vec_50.bin", binary=True
)

# 确定词向量的维度
vector_size = word_vector_dict.vector_size

In [60]:
# 从文件中读取数据
def load_data(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        lines = file.readlines()

    sentences = []
    labels = []

    for line in lines:
        parts = line.strip().split()
        label = torch.tensor(float(parts[0]))
        words = parts[1:]

        word_vectors = []

        for word in words:
            if word in word_vector_dict:
                word_vector = word_vector_dict[word]
                word_vectors.append(word_vector)

        word_vectors = np.array(word_vectors)

        if len(word_vectors) > 0:
            sentence_vector = torch.tensor(word_vectors).view(-1, vector_size)
            sentences.append(sentence_vector)
            labels.append(label)
        else:
            print(f"Empty sentence: {line}")

    return {
        "sentences": sentences,
        "labels": labels,
    }

In [61]:
# 加载数据
train_data = load_data("./Dataset/train.txt")
valid_data = load_data("./Dataset/validation.txt")
test_data = load_data("./Dataset/test.txt")

Empty sentence: 0	鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆 鸟爆



In [62]:
# 将句子填充到相同长度
def unify_columns(matrix, unified_length):
    col_length, row_length = matrix.size()
    if col_length > unified_length:
        new_matrix = matrix[:unified_length, :]
    elif col_length < unified_length:
        padding = torch.zeros(unified_length - col_length, row_length)
        new_matrix = torch.cat((matrix, padding), dim=0)
    else:
        new_matrix = matrix
    return new_matrix

In [63]:
from torch.utils.data import Dataset


# 定义数据集类
class SentimentDataset(Dataset):
    def __init__(self, sentences, labels, sentences_len):
        self.sentences = sentences
        self.labels = labels
        self.sentence_len = sentences_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        sentence = self.sentences[index].float()
        label = self.labels[index]
        output_matrix = unify_columns(sentence, self.sentence_len).view(-1).requires_grad_()
        return output_matrix, label

In [64]:
# 创建Dataset对象
SENTENCE_LEN = 64
train_dataset = SentimentDataset(
    train_data["sentences"], train_data["labels"], sentences_len=SENTENCE_LEN
)
valid_dataset = SentimentDataset(
    valid_data["sentences"], valid_data["labels"], sentences_len=SENTENCE_LEN
)
test_dataset = SentimentDataset(
    test_data["sentences"], test_data["labels"], sentences_len=SENTENCE_LEN
)

print("训练集大小:", len(train_dataset))
print("验证集大小:", len(valid_dataset))
print("测试集大小:", len(test_dataset))

训练集大小: 19997
验证集大小: 5629
测试集大小: 369


In [65]:
from torch.utils.data import DataLoader

# 创建DataLoader
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [66]:
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
import torch.nn.functional as F

In [67]:
class MLPSentimentClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout):
        super(MLPSentimentClassifier, self).__init__()

        # 输入层
        self.input_layer = nn.Linear(input_dim, hidden_dim)

        # 隐藏层
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(hidden_dim, hidden_dim) for _ in range(3)]  # 假设有3个隐藏层
        )

        # Dropout层
        self.dropout = nn.Dropout(dropout)

        # 输出层
        self.output_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        # 通过输入层
        x = F.relu(self.input_layer(text))

        # 通过隐藏层
        for layer in self.hidden_layers:
            x = F.relu(layer(x))

        # 通过Dropout层
        x = self.dropout(x)

        # 通过输出层
        logits = self.output_layer(x)

        # 使用sigmoid激活函数
        probs = F.sigmoid(logits)

        return probs

In [68]:
# 超参数
INPUT_DIM = vector_size * SENTENCE_LEN
HIDDEN_DIM = 256
OUTPUT_DIM = 1
DROPOUT = 0.5

# 初始化模型
model = MLPSentimentClassifier(
    INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT
)

# 查看模型结构
print(model)

# 定义损失函数和优化器
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())

MLPSentimentClassifier(
  (input_layer): Linear(in_features=3200, out_features=256, bias=True)
  (hidden_layers): ModuleList(
    (0-2): 3 x Linear(in_features=256, out_features=256, bias=True)
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (output_layer): Linear(in_features=256, out_features=1, bias=True)
)


In [69]:
# 定义准确率计算函数
def accuracy(logits, labels):
    labels = labels.view(-1, 1)
    y_pred = logits >= 0.5
    y = labels == 1
    correct_predictions = (y_pred == y).float()
    correct_predictions = correct_predictions.view(-1)
    accuracy = correct_predictions.sum() / len(labels)
    return accuracy


def train_epoch(model, data_loader, loss_function, optimizer):
    model.train()
    total_loss = 0
    total_accuracy = 0
    for sentences, labels in data_loader:
        optimizer.zero_grad()
        logits = model(sentences)
        loss = loss_function(logits, labels.unsqueeze(1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        batch_accuracy = accuracy(logits, labels)
        total_accuracy += batch_accuracy.item()

    average_loss = total_loss / len(data_loader)
    average_accuracy = total_accuracy / len(data_loader)
    return average_loss, average_accuracy


def valid_epoch(model, data_loader, loss_function):
    model.eval()
    total_loss = 0
    total_accuracy = 0
    with torch.no_grad():
        for sentences, labels in data_loader:
            logits = model(sentences)
            loss = loss_function(logits, labels.unsqueeze(1))
            total_loss += loss.item()
            batch_accuracy = accuracy(logits, labels)
            total_accuracy += batch_accuracy.item()

    average_loss = total_loss / len(data_loader)
    average_accuracy = total_accuracy / len(data_loader)
    return average_loss, average_accuracy


# 开始训练，包括准确率的打印
num_epochs = 16
for epoch in range(num_epochs):
    train_loss, train_accuracy = train_epoch(
        model, train_loader, loss_function, optimizer
    )
    valid_loss, valid_accuracy = valid_epoch(model, valid_loader, loss_function)
    print(
        f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}, "
        f"Train Accuracy: {train_accuracy:.4f}, Valid Accuracy: {valid_accuracy:.4f}"
    )

Epoch 1/16, Train Loss: 0.6465, Valid Loss: 0.6405, Train Accuracy: 0.6660, Valid Accuracy: 0.6569
Epoch 2/16, Train Loss: 0.6187, Valid Loss: 0.6289, Train Accuracy: 0.7374, Valid Accuracy: 0.7371
Epoch 3/16, Train Loss: 0.6114, Valid Loss: 0.6254, Train Accuracy: 0.7566, Valid Accuracy: 0.7114
Epoch 4/16, Train Loss: 0.6082, Valid Loss: 0.6290, Train Accuracy: 0.7637, Valid Accuracy: 0.7425
Epoch 5/16, Train Loss: 0.6017, Valid Loss: 0.6377, Train Accuracy: 0.7795, Valid Accuracy: 0.7419
Epoch 6/16, Train Loss: 0.5987, Valid Loss: 0.6265, Train Accuracy: 0.7852, Valid Accuracy: 0.7160
Epoch 7/16, Train Loss: 0.5954, Valid Loss: 0.6307, Train Accuracy: 0.7930, Valid Accuracy: 0.7373
Epoch 8/16, Train Loss: 0.6133, Valid Loss: 0.6398, Train Accuracy: 0.7560, Valid Accuracy: 0.7291
Epoch 9/16, Train Loss: 0.5972, Valid Loss: 0.6271, Train Accuracy: 0.7911, Valid Accuracy: 0.7362
Epoch 10/16, Train Loss: 0.5898, Valid Loss: 0.6308, Train Accuracy: 0.8075, Valid Accuracy: 0.7009
Epoch 11/

In [70]:
# 测试循环以评估模型性能
def test_epoch(model, data_loader, loss_function):
    model.eval()
    total_loss = 0
    total_accuracy = 0
    with torch.no_grad():
        for sentences, labels in data_loader:
            logits = model(sentences)
            loss = loss_function(logits, labels.unsqueeze(1))
            total_loss += loss.item()
            batch_accuracy = accuracy(logits, labels)
            total_accuracy += batch_accuracy.item()

    average_loss = total_loss / len(data_loader)
    average_accuracy = total_accuracy / len(data_loader)
    return average_loss, average_accuracy


# 在训练完成后，评估模型在测试集上的性能
test_loss, test_accuracy = test_epoch(model, test_loader, loss_function)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

Test Loss: 0.6369, Test Accuracy: 0.7099


In [71]:
# 保存模型
torch.save(model.state_dict(), "mlp_sentiment_model.pth")

In [72]:
def precision_score(probs, labels):
    probs = probs.view(-1)
    y_true = list(labels == 1)
    for i in range(len(y_true)):
        y_true[i] = y_true[i].item()
    print(y_true)
    y_pred = list(probs >= 0.5)
    for i in range(len(y_pred)):
        y_pred[i] = y_pred[i].item()
    print(y_pred)
    true_positives = sum([1 for y, p in zip(y_true, y_pred) if y == True and p == True])
    false_positives = sum(
        [1 for y, p in zip(y_true, y_pred) if y == False and p == True]
    )
    precision = true_positives / (true_positives + false_positives)
    return precision


def recall_score(probs, labels):
    probs = probs.view(-1)
    y_true = list(labels == 1)
    for i in range(len(y_true)):
        y_true[i] = y_true[i].item()

    y_pred = list(probs >= 0.5)
    for i in range(len(y_pred)):
        y_pred[i] = y_pred[i].item()

    true_positives = sum([1 for y, p in zip(y_true, y_pred) if y == True and p == True])
    false_negatives = sum(
        [1 for y, p in zip(y_true, y_pred) if y == True and p == False]
    )
    recall = true_positives / (true_positives + false_negatives)
    return recall

In [73]:
# 加载模型
model.load_state_dict(torch.load("mlp_sentiment_model.pth"))

# 设置模型为评估模式
model.eval()

MLPSentimentClassifier(
  (input_layer): Linear(in_features=3200, out_features=256, bias=True)
  (hidden_layers): ModuleList(
    (0-2): 3 x Linear(in_features=256, out_features=256, bias=True)
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (output_layer): Linear(in_features=256, out_features=1, bias=True)
)

In [74]:
# test
test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False)

total_loss = 0
total_accuracy = 0
total_correct = 0
total_samples = 0
total_precision = 0
total_recall = 0
with torch.no_grad():  # 关闭梯度计算
    for sentences, labels in test_loader:
        probs = model(sentences)

        batch_loss = loss_function(probs, labels.unsqueeze(1))
        total_loss += batch_loss.item()

        batch_accuracy = accuracy(probs, labels)
        total_accuracy += batch_accuracy.item()

        # 计算精确率、召回率和F-score
        batch_precision = precision_score(probs, labels)
        batch_recall = recall_score(probs, labels)

        total_precision += batch_precision
        total_recall += batch_recall

average_loss = total_loss / len(test_loader)
average_accuracy = total_accuracy / len(test_loader)
average_precision = total_precision / len(test_loader)
average_recall = total_recall / len(test_loader)
average_f1 = 2 * ((average_precision * average_recall) / (average_precision + average_recall))

print(f"Average Loss: {average_loss:.4f}")
print(f"Average Accuracy: {average_accuracy:.4f}")
print(f"Average Precision: {average_precision:.4f}")
print(f"Average Recall: {average_recall:.4f}")
print(f"Average F1 Score: {average_f1:.4f}")

[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, Tru