# Deep Continuous Bag of Words (Deep CBOW) Text Classifier

The code below implements a continuous bag of words text classifier.
- We tokenize the text, create a vocabulary and encode each piece of text in the dataset
- We create embeddings for inputs and sum them together
- The resulting vector is fed to a hidden neural network, which generates a new vector that is multiplied by a weight matrix
- We then add the bias and obtain scores
- A softmax is applied to the scores to generate probabilities, which are used for the final classification

The code used in this notebook was inspired by code from the [official repo](https://github.com/neubig/nn4nlp-code) used in the [CMU Neural Networks for NLP class](http://www.phontron.com/class/nn4nlp2021/schedule.html) by [Graham Neubig](http://www.phontron.com/index.php). 

![img txt](https://github.com/dair-ai/ML-Notebooks/blob/main/img/deep_cbow.png?raw=true)

In [1]:
import torch#安装torch库，支持在图形处理单元上计算张量
import random#安装random库，用于产生各种分布的伪随机数序列
import torch.nn as nn#加载神经网络常用模块

In [None]:
'''取消注释符下载数据
%%capture#隐藏单元格党的输出

#下载文件
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/dev.txt
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/test.txt
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/train.txt

#创建数据文件夹
!mkdir data data/classes
!cp dev.txt data/classes
!cp test.txt data/classes
!cp train.txt data/classes
'''

## Read and Process the Data

In [2]:
# Read a "label ||| text" dataset file into a list of field lists.
def read_data(filename):
    """Load a labelled text file.

    Every non-blank line is lower-cased, stripped of surrounding whitespace
    and split on the literal separator ``' ||| '``.

    Args:
        filename: Path of the UTF-8 encoded text file to read.

    Returns:
        A list with one entry per non-blank line. Each entry is the list of
        fields produced by the split (``[label, text]`` for a line that
        contains the separator once).
    """
    data = []
    # Explicit encoding so the result does not depend on the platform's
    # default locale encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.lower().strip()
            if not line:
                # A blank line would yield a one-field row ('') and break
                # callers that index the text field, so it is skipped.
                continue
            data.append(line.split(' ||| '))
    return data

train_data = read_data('data/classes/train.txt')  # parsed rows of the training split
test_data = read_data('data/classes/test.txt')  # parsed rows of the test split

word_to_index = {}  # dict mapping each word to an integer index
word_to_index["<unk>"] = len(word_to_index)  # the dict is empty here, so "<unk>" gets index 0
tag_to_index = {}  # dict mapping each tag (field 0 of a row) to an integer index

def create_dict(data, check_unk=False):
    """Populate the module-level ``word_to_index`` and ``tag_to_index`` dicts.

    Args:
        data: Iterable of rows where ``row[0]`` is the tag and ``row[1]`` is
            the space-separated text.
        check_unk: When equal to ``False`` every unseen word receives a fresh
            index (the current size of ``word_to_index``). Otherwise every
            unseen word is stored with the index of the ``"<unk>"`` entry.

    Unseen tags always receive a fresh index, regardless of ``check_unk``.
    """
    assign_new_index = check_unk == False
    for record in data:
        for token in record[1].split(" "):
            if token in word_to_index:
                continue  # already known, keep its existing index
            if assign_new_index:
                word_to_index[token] = len(word_to_index)
            else:
                # Unknown word: alias it to the shared "<unk>" index.
                word_to_index[token] = word_to_index["<unk>"]

        label = record[0]
        if label not in tag_to_index:
            tag_to_index[label] = len(tag_to_index)

create_dict(train_data)  # grow the word and tag dictionaries from the training rows
create_dict(test_data, check_unk=True)  # test rows: unseen words are stored with the "<unk>" index

def create_tensor(data):
    """Yield each row encoded as ``(word_indices, tag_index)``.

    Args:
        data: Iterable of rows where ``row[0]`` is the tag and ``row[1]`` is
            the space-separated text.

    Yields:
        A tuple of (list of ``word_to_index`` values for the words of the
        text, ``tag_to_index`` value of the tag). Plain Python objects are
        produced; no torch tensor is built here.

    Raises:
        KeyError: If a word or tag is missing from the lookup dicts.
    """
    for record in data:
        token_ids = []
        for token in record[1].split(" "):
            token_ids.append(word_to_index[token])
        yield token_ids, tag_to_index[record[0]]

train_data = list(create_tensor(train_data))  # encode training rows as (word-index list, tag index) pairs
test_data = list(create_tensor(test_data))  # same encoding for the test rows

number_of_words = len(word_to_index)  # number of entries in the word dictionary
number_of_tags = len(tag_to_index)  # number of distinct tags

## Model

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU when CUDA is available, otherwise the CPU

class DeepCBoW(nn.Module):
    """Deep continuous bag-of-words classifier.

    The word embeddings of a sentence are summed, passed through
    ``num_layers`` tanh-activated linear layers and projected to one score
    per tag.

    Args:
        nwords: Number of rows in the embedding table.
        ntags: Number of output scores (one per tag).
        hidden_size: Output width of every hidden linear layer.
        num_layers: Number of hidden linear layers. With ``0`` the summed
            embedding is fed straight to the output layer.
        emb_size: Dimensionality of the word embeddings.
    """

    def __init__(self, nwords, ntags, hidden_size, num_layers, emb_size):
        super().__init__()

        self.num_layers = num_layers

        # Layers. The first hidden layer consumes the embedding sum; every
        # later one consumes the previous hidden layer's output.
        self.embedding = nn.Embedding(nwords, emb_size)
        in_features = emb_size
        hidden_layers = []
        for _ in range(num_layers):
            hidden_layers.append(nn.Linear(in_features, hidden_size))
            in_features = hidden_size
        self.linears = nn.ModuleList(hidden_layers)

        # Xavier-uniform initialisation of the embedding and hidden weights.
        nn.init.xavier_uniform_(self.embedding.weight)
        for layer in self.linears:
            nn.init.xavier_uniform_(layer.weight)

        # `in_features` is hidden_size when at least one hidden layer exists
        # and emb_size otherwise, so the output layer always matches the
        # width of the vector it receives.
        self.output_layer = nn.Linear(in_features, ntags)

    def forward(self, x):
        """Return the tag scores for one sentence.

        Args:
            x: Tensor of word indices; the training loop in this file passes
                a 1-D integer tensor holding one sentence.

        Returns:
            Tensor of shape ``(1, ntags)`` with unnormalised scores.
        """
        emb = self.embedding(x)  # seq x emb_size
        h = torch.sum(emb, dim=0).view(1, -1)  # summed embedding as (1, emb_size)
        for layer in self.linears:
            h = torch.tanh(layer(h))
        return self.output_layer(h)  # 1 x ntags
# Model hyper-parameters
HIDDEN_SIZE = 64  # output width of every hidden linear layer
NUM_LAYERS = 2  # number of hidden linear layers
EMB_SIZE = 64  # dimensionality of the word embeddings
model = DeepCBoW(number_of_words, number_of_tags, HIDDEN_SIZE, NUM_LAYERS, EMB_SIZE).to(device)  # build the model and move it to the selected device
criterion = nn.CrossEntropyLoss()  # loss applied to the raw tag scores
optimizer = torch.optim.Adam(model.parameters())  # Adam over all model parameters, default settings
# Tensor type used by the training loop to cast word/tag indices: a 64-bit
# integer tensor on the CPU.
# NOTE(review): this name shadows the builtin `type`; the training loop reads
# it, so a rename has to change both places together.
type = torch.LongTensor

if torch.cuda.is_available():  # a CUDA device is usable
    model.to(device)  # the model was already moved above; this repeats that call
    type = torch.cuda.LongTensor  # 64-bit integer tensor type on the CUDA device

## Model Training

In [4]:
# Train the Deep CBoW model for 10 epochs, one sentence per optimisation
# step, and measure accuracy on the test split after every epoch.
for epoch in range(10):
    model.train()  # training mode
    random.shuffle(train_data)  # visit the sentences in a new order each epoch
    total_loss = 0.0  # summed per-sentence loss of this epoch
    train_correct = 0  # sentences whose prediction matched the tag
    for sentence, tag in train_data:
        sentence = torch.tensor(sentence).type(type)  # word indices of the sentence
        target = torch.tensor([tag]).type(type)  # tag index wrapped as a 1-element batch
        output = model(sentence)  # 1 x ntags scores
        # The prediction is read before the parameter update; detach() keeps
        # it out of the autograd graph.
        predicted = torch.argmax(output.detach()).item()

        loss = criterion(output, target)
        total_loss += loss.item()

        optimizer.zero_grad()  # clear gradients left from the previous step
        loss.backward()  # back-propagate
        optimizer.step()  # update the parameters

        if predicted == tag:
            train_correct += 1

    # Evaluate on the test split.
    model.eval()  # evaluation mode
    test_correct = 0
    # No gradients are needed for evaluation, so skip building autograd graphs.
    with torch.no_grad():
        for sentence, tag in test_data:
            sentence = torch.tensor(sentence).type(type)
            output = model(sentence)
            predicted = torch.argmax(output).item()
            if predicted == tag:
                test_correct += 1

    # Report mean training loss per sentence and train/test accuracy.
    log = f'epoch: {epoch+1} | ' \
        f'train loss/sent: {total_loss/len(train_data):.4f} | ' \
        f'train accuracy: {train_correct/len(train_data):.4f} | ' \
        f'test accuracy: {test_correct/len(test_data):.4f}'

    print(log)

epoch: 1 | train loss/sent: 1.4293 | train accuracy: 0.3765 | test accuracy: 0.3941
epoch: 2 | train loss/sent: 1.0343 | train accuracy: 0.5729 | test accuracy: 0.4127
epoch: 3 | train loss/sent: 0.6565 | train accuracy: 0.7583 | test accuracy: 0.3801
epoch: 4 | train loss/sent: 0.4013 | train accuracy: 0.8586 | test accuracy: 0.3783
epoch: 5 | train loss/sent: 0.2659 | train accuracy: 0.9079 | test accuracy: 0.3959
epoch: 6 | train loss/sent: 0.1747 | train accuracy: 0.9419 | test accuracy: 0.3787
epoch: 7 | train loss/sent: 0.1257 | train accuracy: 0.9573 | test accuracy: 0.3805
epoch: 8 | train loss/sent: 0.0860 | train accuracy: 0.9702 | test accuracy: 0.3719
epoch: 9 | train loss/sent: 0.0652 | train accuracy: 0.9768 | test accuracy: 0.3747
epoch: 10 | train loss/sent: 0.0434 | train accuracy: 0.9860 | test accuracy: 0.3887


Bad pipe message: %s [b'I7{\xddYY9\x10\xe5', b"\xee\x8a\xf0\xff\xe6\x1a\xd2\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x00", b'\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00']
Bad pipe message: %s [b'\xe1\x05', b'\xb0\x87g\xc6U\xd5G\xa2.\xd2\xf7\x05\x9fL\x00\x00\xa6\xc0,\xc0', b'\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V']
Bad pipe message: %s [b"\xc0$\xc0(\x00k\x00j\xc0s\xc0w\x00\xc4\x00\xc3\xc0#\xc0'\x00g\x00@\xc0r\xc0v\x00\xbe\x00\xbd\xc0\n\xc0\x14\x009\x008\x00\x88\x00\x87\xc0\t\xc0\x13\x003\x002\x00\x9a\x00\x99