## 1. 环境准备

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re
from bs4 import BeautifulSoup
from collections import Counter
from nltk.corpus import stopwords

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader

## 2. 定义参数

In [6]:
file_path = "/content/labeledTrainData.tsv"

embedding_size = 256 #嵌入词向量的维度
kernel_size = [3,4,5] #卷积尺寸
num_filters = 100 #每个卷积的数目，即输出的通道数
num_classes = 2 #类别数

hidden_size = 512  # rnn的隐状态单元维度
dropout_rate = 0.2  # RNN的dropout参数
num_layers = 2  # rnn层数


batch_size = 256 #加载数据的批量大小
shuffle = True #加载数据时是否打乱
validation_split = 0.2 # 划分验证集的比例
num_workers = 1 # 处理器个数
max_text_len = 200 #将每条文本处理成相同的最大长度
vocab_size = 5000 # 取词频前5000的词

epochs = 1000 # 训练次数
lr = 0.1 # 学习率
do_validation = True # 是否在训练的时候，做验证

device = torch.device(("cuda:0" if torch.cuda.is_available() else "cpu"))
print(device)

cuda:0


## 3. 数据预处理，并划分数据集 

In [7]:
def text_clean(text):
    eng_stopwords = stopwords.words('english')
    text = BeautifulSoup(text, 'html.parser').get_text()  # 去除html标签
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # 去除标点
    words = text.lower().split()  # 全部转成小写，然后按空格分词
    words = [w for w in words if w not in eng_stopwords]  # 去除停用词
    return ' '.join(words)  # 重组成新的句子


def get_word2id(sentences, vocab_size):
    word_list = " ".join(sentences).split()
    if vocab_size > len(set(word_list)):
        vocab = list(set(word_list))
    else:
        counter = Counter(word_list).most_common(vocab_size-1)
        vocab, _ = list(zip(*counter))

    word2id = {w: i+1 for i, w in enumerate(vocab)}
    word2id['<UNK>'] = 0  # 未知词

    return word2id


def token2id(text, max_text_len, word2id):
    token2id = [
        word2id[w] if w in word2id else word2id['<UNK>'] for w in text.split()
    ]

    if len(token2id) >= max_text_len:
        token2id = token2id[:max_text_len]
    else:
        token2id = token2id + [word2id['<UNK>']] * \
            (max_text_len - len(token2id))

    return token2id


def valid_split(data, label, split):
    n_samples = data.shape[0]

    idx_full = np.arange(n_samples)
    np.random.shuffle(idx_full)
    
    if isinstance(split, int):
        assert split > 0
        assert split < n_samples
        len_valid = split
    else:
        len_valid = int(n_samples * split)

    valid_idx = idx_full[:len_valid]
    train_idx = idx_full[len_valid:]

    train_data = data[train_idx]
    train_label = label[train_idx]

    valid_data = data[valid_idx]
    valid_label = label[valid_idx]

    return train_data, train_label, valid_data, valid_label


df = pd.read_csv(file_path, sep='\t', escapechar='\\')
df['clean_review'] = df['review'].apply(text_clean)
word2id = get_word2id(df['clean_review'].tolist(), vocab_size)

data = df['clean_review'].apply(token2id, args=(max_text_len, word2id))
data = data.values
label = df['sentiment'].values

train_data, train_label, valid_data, valid_label = valid_split(data,label,split=validation_split)

## 3. 定义Dataset 和 DataLoader

In [8]:
class ImdbDataset(Dataset):
    def __init__(self, data, label):
        self.data = data
        self.label = label

    def __getitem__(self, index):
        return torch.LongTensor(self.data[index]), torch.tensor(self.label[index])

    def __len__(self):
        return self.data.shape[0]


train_dataset = ImdbDataset(train_data, train_label)
train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, num_workers=num_workers)

valid_dataset = ImdbDataset(valid_data, valid_label)
valid_dataloader = DataLoader(
    valid_dataset, batch_size=batch_size, num_workers=num_workers)


## 4. 定义模型 

In [9]:
class TextCNN(nn.Module):
    def __init__(self,
                 vocab_size,
                 embedding_size,
                 max_text_len,
                 kernel_size,
                 num_filters,
                 num_classes):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.conv = nn.ModuleList([
            nn.Sequential(
                # [batch_size, num_filters, max_text_len-h+1]
                nn.Conv1d(in_channels=embedding_size,
                          out_channels=num_filters, kernel_size=h),
                nn.BatchNorm1d(num_features=num_filters),
                nn.ReLU(),
                # [batch_size, num_filters*1]
                nn.MaxPool1d(kernel_size=max_text_len-h+1)
            )
            for h in kernel_size
        ])

        self.fc = nn.Linear(in_features=num_filters *
                            len(kernel_size), out_features=num_classes)
        self.dropout = nn.Dropout(0.5)
        # 分类
        self.sm = nn.Softmax(dim=1)

    def forward(self, x):
        # [batch_size, max_text_len]
        embed_x = self.embedding(x)  # [batch_size,max_text_len,embedding_size]

        # [batch_size,embedding_size,max_text_len]
        embed_x = embed_x.permute(0, 2, 1)

        # out[i]: [batch_size, num_filters*1】
        out = [conv(embed_x) for conv in self.conv]

        # 拼接不同尺寸的卷积核运算出来的结果
        # [batch_size, num_filters * len(filter_size)]
        out = torch.cat(out, dim=1)
        out = out.view(-1, out.shape[1])

        out = self.fc(out)

        out = self.dropout(out)
        out = self.sm(out)

        return out


class TextRNN(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, num_layers, dropout_rate, num_classes):
        super(TextRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)

        self.rnn = nn.RNN(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout_rate,
            batch_first=True,
            bidirectional=True)
        self.fc = self.fc = nn.Linear(
            in_features=hidden_size * 2,
            out_features=num_classes)

        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        out = self.embedding(x)  # [batch_size,max_text_len,embedding_size]
        out, _ = self.rnn(out)  # [batch_size, max_text_len, hidden_size*2]
        out = self.fc(out[:, -1, :])  # [batch_size,max_text_len,num_classes]
        out = self.softmax(out)  # [batch_size, num_classess]

        return out


# model = TextCNN(vocab_size, embedding_size, max_text_len,
#                 kernel_size, num_filters, num_classes)

model = TextRNN(vocab_size, embedding_size, hidden_size,
                num_layers, dropout_rate, num_classes)


model.to(device)

TextRNN(
  (embedding): Embedding(5000, 256)
  (rnn): RNN(256, 512, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=1024, out_features=2, bias=True)
  (softmax): Softmax(dim=1)
)

## 5.定义训练过程

In [10]:
def train(epochs, model, train_dataloader, valid_dataloader, do_validation, device, lr):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for batch_idx, (data, target) in enumerate(train_dataloader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            train_loss += loss.item()
            loss.backward()

            optimizer.step()
            print('Train Epoch:{}[{}/{}({:.0f}%)]\tLoss:{:.6f}'.format(
                epoch, batch_idx * len(data), len(train_dataloader.dataset),
                100. * batch_idx / len(train_dataloader), loss.item()))

        train_loss /= len(train_dataloader)

        if do_validation:
            model.eval()
            valid_loss = 0.0
            valid_correct = 0
            with torch.no_grad():
                for batch_idx, (data, target) in enumerate(valid_dataloader):
                    data, target = data.to(device), target.to(device)
                    output = model(data)
                    loss = criterion(output,target)
                    valid_loss += loss.item()
                    pred = output.argmax(dim=1, keepdim=True)
                    valid_correct += pred.eq(target.view_as(pred)).sum().item()

            valid_loss /= len(valid_dataloader)

            print('\nValid set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
                valid_loss, valid_correct, len(valid_dataloader.dataset),
                100. * valid_correct / len(valid_dataloader.dataset)))

##  6.训练

In [None]:
train(epochs, model, train_dataloader,
      valid_dataloader, do_validation, device, lr)


Valid set: Average loss: 0.8181, Accuracy: 2469/5000 (49%)

