In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import contractions


In [3]:
# 训练Word2Vec模型
w2v_model = Word2Vec(sentences=data['cleaned_text'], vector_size=300, window=5, min_count=1, workers=4)

# 定义数据集类 w2v_model
class TextDataset(Dataset):
    def __init__(self, texts, labels, word2vec):
        self.labels = labels.reset_index(drop=True)
        self.texts = texts.reset_index(drop=True)
        self.word2vec = word2vec

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        vectors = [self.word2vec.wv[word] for word in text if word in self.word2vec.wv]
        vectors_np = np.array(vectors, dtype=np.float32)  # 将列表转换为numpy数组
        vectors_tensor = torch.from_numpy(vectors_np)  # 从numpy数组创建tensor
        if len(vectors_tensor) > 200:
            vectors_tensor = vectors_tensor[:200]
        else:
            padding_size = 200 - len(vectors_tensor)
            padding = torch.zeros(padding_size, 300)  # 修改这里的维度为300，以匹配Word2Vec的输出
            vectors_tensor = torch.cat((vectors_tensor, padding), dim=0)
        return vectors_tensor, torch.tensor(label, dtype=torch.float)  # 注意: 对于BCEWithLogitsLoss，标签也应为float

In [11]:
# def load_glove_vectors(glove_file):
#     """加载GloVe词向量"""
#     print(f"Loading GloVe vectors from file: {glove_file}")
#     word2vec = {}
#     with open(glove_file, 'r', encoding='utf-8') as f:
#         for line in f:
#             parts = line.split()
#             word = parts[0]
#             vector = np.array(parts[1:], dtype=np.float32)
#             word2vec[word] = vector
#     return word2vec

# # 假设您的GloVe向量文件路径如下
# glove_path = r'/Users/wilsonlee/Library/Mobile Documents/com~apple~CloudDocs/01哥德堡大学/Course by Periods/02_2023_Fall_P2/LT2114 - Practical Natural Language Processing/Lecture Notes/glove.twitter.27B/glove.twitter.27B.200d.txt'
# glove_vectors = load_glove_vectors(glove_path)


# # 定义数据集类 GloVe
# class TextDataset(Dataset):
#     def __init__(self, texts, labels, word2vec):
#         self.labels = labels.reset_index(drop=True)
#         self.texts = texts.reset_index(drop=True)
#         self.word2vec = word2vec

#     def __len__(self):
#         return len(self.texts)

#     def __getitem__(self, idx):
#         text = self.texts[idx]  # 这里text应该是一个包含单词的列表
#         label = self.labels[idx]
#         vectors = [self.word2vec[word] for word in text if word in self.word2vec]
#         vectors_tensor = torch.tensor(vectors, dtype=torch.float)
#         if len(vectors_tensor) > 200:
#             vectors_tensor = vectors_tensor[:200]
#         else:
#             padding_size = 200 - len(vectors_tensor)
#             padding = torch.zeros(padding_size, 200)  # 确保这里的维度匹配GloVe向量的大小
#             vectors_tensor = torch.cat((vectors_tensor, padding), dim=0)
#         return vectors_tensor, torch.tensor(label, dtype=torch.float)

In [6]:
    

# 定义RNN模型
class RNNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        # 初始化隐藏状态
        h0 = torch.zeros(1, x.size(0), hidden_dim)
        # 前向传播
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :]) # 取最后一个时间步
        return out

# 定义LSTM模型
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        # 初始化隐藏状态和细胞状态
        h0 = torch.zeros(1, x.size(0), hidden_dim)
        c0 = torch.zeros(1, x.size(0), hidden_dim)
        # 前向传播
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :]) # 取最后一个时间步
        return out
    
class BiLSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2, bidirectional=True, dropout_rate=0.5, device='cpu'):
        super(BiLSTMModel, self).__init__()
        self.device = torch.device(device)
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional
        
        # 定义LSTM层
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=bidirectional, dropout=dropout_rate if num_layers > 1 else 0).to(self.device)
        
        # 定义Dropout层
        self.dropout = nn.Dropout(dropout_rate)
        
        # 定义全连接层，如果是双向，则维度是隐藏维度的两倍
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim).to(self.device)
        
    def forward(self, x):
        # 初始化隐藏状态和细胞状态
        h0, c0 = self.init_hidden(x.size(0))
        
        # 前向传播
        out, _ = self.lstm(x, (h0, c0))
        
        # 应用Dropout
        out = self.dropout(out[:, -1, :])  # 取最后一个时间步
        
        # 通过全连接层
        out = self.fc(out)
        return out
    
    def init_hidden(self, batch_size):
        # 生成初始隐藏状态和细胞状态
        num_directions = 2 if self.bidirectional else 1
        h0 = torch.zeros(self.num_layers * num_directions, batch_size, self.hidden_dim).to(self.device)
        c0 = torch.zeros(self.num_layers * num_directions, batch_size, self.hidden_dim).to(self.device)
        return h0, c0

def train_model(model, optimizer, loss_function, train_loader, epochs=10):
    model.train()
    train_losses = []
    for epoch in range(epochs):
        total_loss = 0
        for vectors, labels in train_loader:
            vectors, labels = vectors.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(vectors)
            outputs = outputs.squeeze()  # 使用squeeze()方法调整模型输出的尺寸
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        train_losses.append(avg_loss)
        print(f'Epoch {epoch+1}, Loss: {avg_loss}')
    return train_losses


def evaluate_model(model, validation_loader, threshold=0.5):
    model.eval()
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for vectors, labels in validation_loader:
            vectors, labels = vectors.to(device), labels.to(device)
            outputs = model(vectors).squeeze()  # 调整模型输出尺寸
            predictions = torch.sigmoid(outputs) > threshold  # 将sigmoid激活后的输出转换为二进制预测
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(all_labels, all_predictions)
    print(f'Validation Accuracy: {accuracy}')
    return accuracy

# 准备数据集
train_data, validation_data = train_test_split(data, test_size=0.2, random_state=42)

# Creating TextDataset instances for training and validation
train_dataset = TextDataset(train_data['cleaned_text'], train_data['label'], w2v_model)
validation_dataset = TextDataset(validation_data['cleaned_text'], validation_data['label'], w2v_model)
# 使用GloVe词向量初始化训练和验证数据集
# train_dataset = TextDataset(train_data['cleaned_text'], train_data['label'], glove_vectors)
# validation_dataset = TextDataset(validation_data['cleaned_text'], validation_data['label'], glove_vectors)


# Creating DataLoader instances for training and validation
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=32, shuffle=False)

# Hyperparameters
input_dim = 300  
hidden_dim = 256
output_dim = 1  # 对于二分类任务，确保这里为1
bidirectional = True
dropout_rate = 0.5

# Check if GPU is available and move models to GPU if it is
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 实例化模型
rnn_model = RNNModel(input_dim, hidden_dim, output_dim).to(device)
lstm_model = LSTMModel(input_dim, hidden_dim, output_dim).to(device)
bilstm_model = BiLSTMModel(input_dim, hidden_dim, output_dim, bidirectional, dropout_rate).to(device)

# 定义损失函数和优化器
loss_function = nn.BCEWithLogitsLoss()

rnn_optimizer = optim.Adam(rnn_model.parameters(), lr=0.001)
lstm_optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)
bilstm_optimizer = optim.Adam(bilstm_model.parameters(), lr=0.001)


# 训练和评估模型
# 训练RNN模型
print("Training RNN model...")
rnn_losses = train_model(rnn_model, rnn_optimizer, loss_function, train_loader, epochs=10)

# 评估RNN模型
print("Evaluating RNN model...")
rnn_accuracy = evaluate_model(rnn_model, validation_loader)

# 训练LSTM模型
print("Training LSTM model...")
lstm_losses = train_model(lstm_model, lstm_optimizer, loss_function, train_loader, epochs=10)

# 评估LSTM模型
print("Evaluating LSTM model...")
lstm_accuracy = evaluate_model(lstm_model, validation_loader)

# 训练BiLSTM模型
print("Training BiLSTM model...")
bilstm_losses = train_model(bilstm_model, bilstm_optimizer, loss_function, train_loader, epochs=10)

# 评估BiLSTM模型
print("Evaluating BiLSTM model...")
bilstm_accuracy = evaluate_model(bilstm_model, validation_loader)

# 绘制训练损失
plt.figure(figsize=(10, 5))
plt.plot(rnn_losses, label='RNN Loss')
plt.plot(lstm_losses, label='LSTM Loss')
plt.plot(bilstm_losses, label='BiLSTM Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Losses')
plt.legend()
plt.show()

# 打印准确率
print(f'RNN Validation Accuracy: {rnn_accuracy}')
print(f'LSTM Validation Accuracy: {lstm_accuracy}')
print(f'BiLSTM Validation Accuracy: {bilstm_accuracy}')

Training RNN model...
Epoch 1, Loss: 0.6989728875230976
Epoch 2, Loss: 0.6961120840251102
Epoch 3, Loss: 0.6971137715781511
Epoch 4, Loss: 0.6980976008327167
Epoch 5, Loss: 0.696090811014372
Epoch 6, Loss: 0.6965836261484314
Epoch 7, Loss: 0.6972623024710131
Epoch 8, Loss: 0.6956991138088733
Epoch 9, Loss: 0.6955934151184038
Epoch 10, Loss: 0.6968284995715085
Evaluating RNN model...
Validation Accuracy: 0.5063904349618635
Training LSTM model...
Epoch 1, Loss: 0.6930777265921371
Epoch 2, Loss: 0.6929294142660186
Epoch 3, Loss: 0.692973527304908
Epoch 4, Loss: 0.6928684537015665
Epoch 5, Loss: 0.6928841976812823
Epoch 6, Loss: 0.6928773963146728
Epoch 7, Loss: 0.6928525570985805
Epoch 8, Loss: 0.6928390090070474
Epoch 9, Loss: 0.6928000762653272
Epoch 10, Loss: 0.6927532335990535
Evaluating LSTM model...
Validation Accuracy: 0.5063904349618635
Training BiLSTM model...


TypeError: lstm() received an invalid combination of arguments - got (Tensor, tuple, list, bool, bool, float, bool, float, bool), but expected one of:
 * (Tensor data, Tensor batch_sizes, tuple of Tensors hx, tuple of Tensors params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional)
      didn't match because some of the arguments have invalid types: (Tensor, !tuple of (Tensor, Tensor)!, !list of [Parameter, Parameter, Parameter, Parameter, Parameter, Parameter, Parameter, Parameter]!, !bool!, bool, !float!, !bool!, !float!, bool)
 * (Tensor input, tuple of Tensors hx, tuple of Tensors params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first)
      didn't match because some of the arguments have invalid types: (Tensor, !tuple of (Tensor, Tensor)!, !list of [Parameter, Parameter, Parameter, Parameter, Parameter, Parameter, Parameter, Parameter]!, bool, !bool!, float, bool, !float!, bool)
