# 1.数据源

In [1]:
import os
import jieba
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F

In [2]:
#读取词表
def read_words_list(path):
    with open(path) as f:
        lines = f.readlines()
    
    strlist = []
    for l in lines:
        if '#' != l[0] and '' != l.strip():
            l = l.strip()
            strlist.append(l)
    return strlist

#去除常用词
def remove_stop_words(text, stop_words):
    #保存过滤词数量的字典
    swords_cnt = {}
        
    while "  " in text: #去掉多余空格
        text = text.replace('  ', ' ')
    for key, words in stop_words.items():
        swords_cnt[key] = np.zeros(len(words)) #创建向量
        for i,stpwd in enumerate(words):
            if (stpwd) in text:
                text = text.replace(' '+stpwd+' ', ' ')
#                 swords_cnt[key][i] += text.count(stpwd)
                swords_cnt[key][i] += 1
    return text, swords_cnt

In [3]:
def cut_sentence(sentence):
    return [token for token in jieba.lcut(sentence) if token not in stop_words]

In [4]:
stop_words_path = 'hit_stopwords.txt'
stop_words = read_words_list(stop_words_path)

建立词表

In [5]:
#tokenize=cut_sentence，sequence=True表示输入的是一个sequence类型的数据
ao3_text = torchtext.data.Field(sequential=True,lower=True,tokenize=cut_sentence, fix_length=200)
#LabelField对象，sequential=False，标签不是dtype=torch.int64标签转化成整形
ao3_label = torchtext.data.LabelField(sequential=False, dtype=torch.int64)

In [6]:
#这里主要是告诉torchtext需要处理哪些数据，这些数据存放在哪里，TabularDataset是一个处理scv/tsv的常用类
train_dataset,test_dataset = torchtext.data.TabularDataset.splits(
      path='train',  #文件存放路径
      format='csv',   #文件格式
      skip_header=False,  #不跳过表头
      train='train.csv',  
      test='test.csv',    
      fields=[('label',ao3_label),('content',ao3_text)] # 定义数据对应的表头
  )

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.617 seconds.
Prefix dict has been built succesfully.


In [7]:
pretrained_name = 'sgns.sogou.word' # 预训练词向量文件名
pretrained_path = './word_embedding' #预训练词向量存放路径
vectors = torchtext.vocab.Vectors(name=pretrained_name, cache=pretrained_path)

In [8]:
ao3_text.build_vocab(train_dataset, test_dataset, vectors=vectors)
ao3_label.build_vocab(train_dataset, test_dataset)

生成迭代器

In [9]:
# torchtext.data.BucketIterator.splits 
#BucketIterator能将样本长度接近的句子尽量放在同一个batch里面
#句子长度差距过大，就会给短句加入过多的无意义的<pad>
#句子长度相近的在一个batch里面的话，能够避免这个问题
train_iter,test_iter = torchtext.data.BucketIterator.splits(
        (train_dataset, test_dataset),#需要生成迭代器的数据集
        batch_sizes=(128, 128), # 每个迭代器分别以多少样本为一个batch
        sort_key=lambda x: len(x.content) #按什么顺序来排列batch，这里是以句子的长度，就是上面说的把句子长度相近的放在同一个batch里面
        )

TextCNN 建模

In [10]:
class TextCNN(nn.Module):
    def __init__(self,class_num, # 分类数
                 filter_sizes, # 卷积核的长也就是滑动窗口的长 
                 filter_num,   # 卷积核的数量 
                 vocabulary_size, # 词表的大小
                 embedding_dimension, # 词向量的维度
                 vectors, # 词向量
                 dropout): # dropout率
        super(TextCNN, self).__init__() # 继承nn.Module

        chanel_num = 1  # 通道数，也就是一篇文章一个样本只相当于一个feature map

        self.embedding = nn.Embedding(vocabulary_size, embedding_dimension) # 嵌入层 
        self.embedding = self.embedding.from_pretrained(vectors) #嵌入层加载预训练词向量

        self.convs = nn.ModuleList(
            [nn.Conv2d(chanel_num, filter_num, (fsz, embedding_dimension)) for fsz in filter_sizes])  # 卷积层
        self.dropout = nn.Dropout(dropout) # dropout
        self.fc = nn.Linear(len(filter_sizes) * filter_num, class_num) #全连接层

    def forward(self, x):
        # x维度[句子长度,一个batch中所包含的样本数] 例:[3451,128]
        x = self.embedding(x) # #经过嵌入层之后x的维度，[句子长度,一个batch中所包含的样本数,词向量维度] 例：[3451,128,300]
        x = x.permute(1,0,2) # permute函数将样本数和句子长度换一下位置，[一个batch中所包含的样本数,句子长度,词向量维度] 例：[128,3451,300]
        x = x.unsqueeze(1) # # conv2d需要输入的是一个四维数据，所以新增一维feature map数 unsqueeze(1)表示在第一维处新增一维，[一个batch中所包含的样本数,一个样本中的feature map数，句子长度,词向量维度] 例：[128,1,3451,300]
        x = [conv(x) for conv in self.convs] # 与卷积核进行卷积，输出是[一个batch中所包含的样本数,卷积核数，句子长度-卷积核size+1,1]维数据,因为有[3,4,5]三张size类型的卷积核所以用列表表达式 例：[[128,16,3459,1],[128,16,3458,1],[128,16,3457,1]]
        x = [sub_x.squeeze(3) for sub_x in x]#squeeze(3)判断第三维是否是1，如果是则压缩，如不是则保持原样 例：[[128,16,3459],[128,16,3458],[128,16,3457]]
        x = [F.relu(sub_x) for sub_x in x] # ReLU激活函数激活，不改变x维度 
        x = [F.max_pool1d(sub_x,sub_x.size(2)) for sub_x in x] # 池化层，根据之前说的原理，max_pool1d要取出每一个滑动窗口生成的矩阵的最大值，因此在第二维上取最大值 例：[[128,16,1],[128,16,1],[128,16,1]]
        x = [sub_x.squeeze(2) for sub_x in x] # 判断第二维是否为1，若是则压缩 例：[[128,16],[128,16],[128,16]]
        x = torch.cat(x, 1) # 进行拼接，例：[128,48]
        x = self.dropout(x) # 去除掉一些神经元防止过拟合，注意dropout之后x的维度依旧是[128,48]，并不是说我dropout的概率是0.5，去除了一半的神经元维度就变成了[128,24]，而是把x中的一些神经元的数据根据概率全部变成了0，维度依旧是[128,48]
        logits = self.fc(x) # 全接连层 例：输入x是[128,48] 输出logits是[128,10]
        return logits

In [11]:
class_num = len(ao3_label.vocab) # 类别数目
filter_size = [3,4,5]  # 卷积核种类数 
filter_num=16   # 卷积核数量
vocab_size = len(ao3_text.vocab) # 词表大小
embedding_dim = ao3_text.vocab.vectors.size()[-1] # 词向量维度
vectors = ao3_text.vocab.vectors # 词向量
dropout=0.5 
learning_rate = 0.001  # 学习率
epochs = 5   # 迭代次数
save_dir = './model' # 模型保存路径
steps_show = 10   # 每10步查看一次训练集loss和mini batch里的准确率
steps_eval = 200  # 每100步测试一下验证集的准确率
early_stopping = 1000  # 若发现当前验证集的准确率在1000步训练之后不再提高 一直小于best_acc,则提前停止训练

textcnn_model = TextCNN(class_num=class_num,
        filter_sizes=filter_size,
        filter_num=filter_num,
        vocabulary_size=vocab_size,
        embedding_dimension=embedding_dim,
        vectors=vectors,
        dropout=dropout)

定义train函数

In [12]:
def train(train_iter, dev_iter, model):

    if torch.cuda.is_available(): # 判断是否有GPU，如果有把模型放在GPU上训练，速度质的飞跃
        model.cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) # 梯度下降优化器，采用Adam
    steps = 0
    best_acc = 0
    last_step = 0
    model.train()
    for epoch in range(1, epochs + 1): 
        for batch in train_iter:
            feature, target = batch.content, batch.label
            if torch.cuda.is_available(): # 如果有GPU将特征更新放在GPU上
                  feature,target = feature.cuda(),target.cuda() 
            optimizer.zero_grad() # 将梯度初始化为0，每个batch都是独立训练地，因为每训练一个batch都需要将梯度归零
            logits = model(feature)
            loss = F.cross_entropy(logits, target) # 计算损失函数 采用交叉熵损失函数
            loss.backward()  # 反向传播
            optimizer.step() # 放在loss.backward()后进行参数的更新
            steps += 1 
            if steps % steps_show == 0: # 每训练多少步计算一次准确率，我这边是1，可以自己修改
                corrects = (torch.max(logits, 1)[1].view(target.size()).data == target.data).sum() # logits是[128,10],torch.max(logits, 1)也就是选出第一维中概率最大的值，输出为[128,1],torch.max(logits, 1)[1]相当于把每一个样本的预测输出取出来，然后通过view(target.size())平铺成和target一样的size (128,),然后把与target中相同的求和，统计预测正确的数量
                train_acc = 100.0 * corrects / batch.batch_size # 计算每个mini batch中的准确率
                print('steps:{} - loss: {:.6f}  acc:{:.4f}'.format(
                  steps,
                  loss.item(),
                  train_acc))
                
            if steps % steps_eval == 0: # 每训练steps_eval步进行一次验证
#                 save(model,save_dir, steps)
                dev_acc = dev_eval(dev_iter,model)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    last_step = steps
                    print('Saving best model, acc: {:.4f}%\n'.format(best_acc))
                    save(model,save_dir, steps)
                else:
                    if steps - last_step >= early_stopping:
                        print('\n提前停止于 {} steps, acc: {:.4f}%'.format(last_step, best_acc))
                        raise KeyboardInterrupt

In [13]:
def dev_eval(dev_iter,model):
    model.eval()
    corrects, avg_loss = 0, 0
    for batch in dev_iter:
        feature, target = batch.content, batch.label
        if torch.cuda.is_available():
            feature, target = feature.cuda(), target.cuda()
        logits = model(feature)
        sorc = torch.argmax(logits,dim=1)
        loss = F.cross_entropy(logits, target)
        avg_loss += loss.item()
        corrects += (torch.max(logits, 1)
                    [1].view(target.size()).data == target.data).sum()
    size = len(dev_iter.dataset)
    avg_loss /= size
    accuracy = 100.0 * corrects / size
    print('\nEvaluation - loss: {:.6f}  acc: {:.4f}%({}/{}) \n'.format(avg_loss,
                                                                      accuracy,
                                                                      corrects,
                                                                      size))
    return accuracy

In [14]:
# 定义模型保存函数
def save(model, save_dir, steps):
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    save_path = 'bestmodel_steps{}.pt'.format(steps)
    save_bestmodel_path = os.path.join(save_dir, save_path)
    torch.save(model.state_dict(), save_bestmodel_path)

In [16]:
#开始训练
train(train_iter,test_iter,textcnn_model)

steps:10 - loss: 0.288333  acc:92.1875
steps:20 - loss: 0.188884  acc:92.9688
steps:30 - loss: 0.200837  acc:92.9688
steps:40 - loss: 0.159954  acc:95.3125
steps:50 - loss: 0.225896  acc:92.1875
steps:60 - loss: 0.228430  acc:92.9688
steps:70 - loss: 0.123649  acc:95.3125
steps:80 - loss: 0.128664  acc:92.9688
steps:90 - loss: 0.082676  acc:95.3125
steps:100 - loss: 0.124748  acc:93.7500
steps:110 - loss: 0.071438  acc:96.0938
steps:120 - loss: 0.102413  acc:95.3125
steps:130 - loss: 0.086127  acc:96.8750
steps:140 - loss: 0.155415  acc:92.9688
steps:150 - loss: 0.090518  acc:96.8750
steps:160 - loss: 0.044919  acc:98.4375
steps:170 - loss: 0.127971  acc:96.8750
steps:180 - loss: 0.138537  acc:95.3125
steps:190 - loss: 0.101599  acc:96.8750
steps:200 - loss: 0.060814  acc:97.6562

Evaluation - loss: 0.000696  acc: 96.6422%(2504/2591) 

Saving best model, acc: 96.6422%

steps:210 - loss: 0.032192  acc:99.2188
steps:220 - loss: 0.046644  acc:97.6562
steps:230 - loss: 0.023588  acc:99.218

KeyboardInterrupt: 

In [17]:
dev_eval(test_iter,textcnn_model)


Evaluation - loss: 0.000702  acc: 97.2983%(2521/2591) 



tensor(97.2983)

保存词表

In [18]:
text_field_path = "./word_embedding/ao3_text.field"
label_field_path = "./word_embedding/ao3_label.field"
torch.save(ao3_text ,text_field_path)
torch.save(ao3_label ,label_field_path)