# Bag of Words Text Classifier

The code below implements a simple bag of words text classifier.
- We tokenize the text, create a vocabulary and encode each piece of text in the dataset
- The embedding lookup extracts an embedding vector for each token of the tokenized input
- The embedding vectors are added together with a bias vector
- The resulting vector is referred to as the scores
- A softmax is applied to the scores to generate probabilities, which are used for the classification task

The code used in this notebook was inspired by code from the [official repo](https://github.com/neubig/nn4nlp-code) used in the [CMU Neural Networks for NLP class](http://www.phontron.com/class/nn4nlp2021/schedule.html) by [Graham Neubig](http://www.phontron.com/index.php). 

![img txt](../img/bow.png?raw=true)

In [4]:
import torch#安装torch库，支持在图形处理单元上计算张量
import random#安装random库，用于产生各种分布的伪随机数序列
import torch.nn as nn#加载神经网络常用模块

### Download the Data

In [2]:
%%capture#可以隐藏单元格党的输出

#下载文件
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/dev.txt
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/test.txt
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/train.txt
#创建数据文件夹
!mkdir data data/classes
!cp dev.txt data/classes
!cp test.txt data/classes
!cp train.txt data/classes

### Read the Data

In [5]:
def read_data(filename):
    """Read a ``label ||| text`` file into a list of split lines.

    Every line is lower-cased, stripped of surrounding whitespace and split
    on the literal separator ``' ||| '``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per non-blank line. Each entry is the list of
        fields produced by the split (``[label, text]`` for a line that
        contains the separator exactly once).
    """
    data = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.lower().strip()
            if not line:
                # A blank line would otherwise become [''], an entry with a
                # label slot but no text field.
                continue
            data.append(line.split(' ||| '))
    return data

# Load the raw training and test splits from disk.
train_data = read_data(filename='data/classes/train.txt')
test_data = read_data(filename='data/classes/test.txt')

### Construct the Vocab and Datasets

In [6]:
# Vocabulary: maps each word to an integer index. Index 0 is reserved for
# the unknown-word token "<unk>".
word_to_index = {"<unk>": 0}
# Maps each label string to an integer index.
tag_to_index = {}


def create_dict(data, check_unk=False):
    """Update the module-level word and tag dictionaries from ``data``.

    Args:
        data: Iterable of entries where ``entry[0]`` is the tag and
            ``entry[1]`` is the sentence; the sentence is split on single
            spaces.
        check_unk: When false, every unseen word is assigned a fresh index.
            When true, unseen words are stored as aliases of the ``"<unk>"``
            index instead of receiving an index of their own.

    Note:
        Aliased words are still stored as keys, so after a
        ``check_unk=True`` pass ``len(word_to_index)`` can be larger than the
        number of distinct indices in use.
    """
    unk_index = word_to_index["<unk>"]
    for line in data:
        for word in line[1].split(" "):
            if word in word_to_index:
                continue
            # Fresh index while building the vocabulary, <unk> alias otherwise.
            word_to_index[word] = unk_index if check_unk else len(word_to_index)

        # Unseen tags always receive a fresh index, whatever check_unk is.
        if line[0] not in tag_to_index:
            tag_to_index[line[0]] = len(tag_to_index)

# Build the dictionaries from the training split first, then process the test
# split with unseen words mapped onto the <unk> index.
create_dict(data=train_data, check_unk=False)
create_dict(data=test_data, check_unk=True)

def create_tensor(data):
    """Yield a ``(word_indices, tag_index)`` pair for every entry in ``data``.

    Despite the name, the yielded values are a plain list of ints and an int,
    not tensors.

    Args:
        data: Iterable of entries where ``entry[0]`` is the tag and
            ``entry[1]`` is the space-separated sentence.

    Yields:
        Tuples ``(word_indices, tag_index)``. Words that are missing from
        ``word_to_index`` map to the ``"<unk>"`` index instead of raising
        ``KeyError``.

    Raises:
        KeyError: If a tag is missing from ``tag_to_index``.
    """
    unk_index = word_to_index["<unk>"]
    for line in data:
        word_ids = [word_to_index.get(word, unk_index) for word in line[1].split(" ")]
        yield word_ids, tag_to_index[line[0]]

# Materialize the encoded splits as lists of (word_indices, tag_index) pairs.
train_data = list(create_tensor(train_data))
test_data = list(create_tensor(test_data))

# Size the vocabulary by the largest index in use rather than by
# len(word_to_index): words stored as aliases of <unk> add keys without adding
# indices, so the key count would reserve embedding rows that are never
# looked up.
number_of_words = max(word_to_index.values()) + 1
number_of_tags = len(tag_to_index)  # number of distinct labels

### Model

In [7]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

class BoW(torch.nn.Module):
    """Bag-of-words classifier: summed per-word score vectors plus a bias.

    Args:
        nwords: Number of rows in the embedding table (vocabulary size).
        ntags: Number of output classes. It is also the embedding width, so
            each word's embedding is directly a vector of class scores.
    """

    def __init__(self, nwords, ntags):
        super().__init__()
        self.embedding = nn.Embedding(nwords, ntags)
        # Xavier/Glorot uniform initialization of the embedding table.
        nn.init.xavier_uniform_(self.embedding.weight)

        # Register the bias as a Parameter: a plain tensor attribute is not
        # returned by model.parameters(), so an optimizer built from it would
        # never update the bias; it would also be skipped by .to(device) and
        # left out of state_dict().
        self.bias = nn.Parameter(torch.zeros(ntags))

    def forward(self, x):
        """Return the class scores for one sentence.

        Args:
            x: 1-D integer tensor holding the word indices of one sentence.

        Returns:
            Tensor of shape ``(1, ntags)`` with the unnormalized scores.
        """
        emb = self.embedding(x)  # (seq_len, ntags)
        out = torch.sum(emb, dim=0) + self.bias  # (ntags,)
        return out.view(1, -1)  # (1, ntags)

### Pretest the Model

In [8]:
# Helper that encodes a sentence with the word_to_index dictionary.
def sentence_to_tensor(sentence):
    """Convert a space-separated sentence into a 1-D tensor of word indices.

    Args:
        sentence: Text whose words are separated by single spaces.

    Returns:
        A 64-bit integer tensor with one index per word. Words that are not
        in ``word_to_index`` map to the ``"<unk>"`` index instead of raising
        ``KeyError``.
    """
    unk_index = word_to_index["<unk>"]
    word_ids = [word_to_index.get(word, unk_index) for word in sentence.split(" ")]
    return torch.tensor(word_ids, dtype=torch.long)

# Smoke-test sentence_to_tensor and an untrained model on a single sentence.
# The tensor is moved with .to(device) so the builtin name `type` is not
# rebound to a legacy tensor-type class.
out = sentence_to_tensor("i love dogs").to(device)
test_model = BoW(number_of_words, number_of_tags).to(device)
test_model(out)  # last expression of the cell, so the notebook shows the scores

tensor([[ 0.0124,  0.0164, -0.0182, -0.0014, -0.0120]], device='cuda:0',
       grad_fn=<ViewBackward0>)

### Train the Model

In [9]:
# Set up the model, loss function and optimizer used for training.
model = BoW(number_of_words, number_of_tags).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Tensor type used for word and tag indices: a 64-bit integer tensor, placed
# on the GPU when CUDA is available.
if torch.cuda.is_available():
    model.to(device)
    type = torch.cuda.LongTensor
else:
    type = torch.LongTensor

def train_bow(model, optimizer, criterion, train_data, eval_data=None, num_epochs=10):
    """Train a bag-of-words model and print metrics after every epoch.

    Each epoch shuffles ``train_data`` in place, performs one optimizer step
    per sentence, then measures accuracy on the evaluation set and prints a
    one-line summary.

    Args:
        model: Module called with a 1-D tensor of word indices; its output is
            passed to ``criterion`` and to ``torch.argmax``.
        optimizer: Optimizer stepped once per training sentence.
        criterion: Loss called as ``criterion(output, target)`` where
            ``target`` is a 1-element integer tensor holding the tag index.
        train_data: Mutable list of ``(word_indices, tag_index)`` pairs.
        eval_data: Evaluation pairs in the same format. Defaults to the
            module-level ``test_data``.
        num_epochs: Number of passes over ``train_data``.
    """
    if eval_data is None:
        eval_data = test_data

    # Build inputs on the device the model lives on rather than relying on a
    # global tensor type; a parameter-free model falls back to the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    for ITER in range(num_epochs):
        model.train()
        random.shuffle(train_data)
        total_loss = 0.0
        train_correct = 0
        for sentence, tag in train_data:
            sentence = torch.tensor(sentence, dtype=torch.long, device=device)
            target = torch.tensor([tag], dtype=torch.long, device=device)
            output = model(sentence)
            # Prediction is taken before this sentence's parameter update.
            predicted = torch.argmax(output.detach()).item()

            loss = criterion(output, target)
            total_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if predicted == tag:
                train_correct += 1

        # Evaluate without recording autograd graphs.
        model.eval()
        test_correct = 0
        with torch.no_grad():
            for sentence, tag in eval_data:
                sentence = torch.tensor(sentence, dtype=torch.long, device=device)
                output = model(sentence)
                predicted = torch.argmax(output).item()
                if predicted == tag:
                    test_correct += 1

        # Guard the averages against empty datasets.
        n_train = max(len(train_data), 1)
        n_eval = max(len(eval_data), 1)
        log = f'ITER: {ITER+1} | ' \
            f'train loss/sent: {total_loss/n_train:.4f} | ' \
            f'train accuracy: {train_correct/n_train:.4f} | ' \
            f'test accuracy: {test_correct/n_eval:.4f}'
        print(log)

# Run the training loop on the encoded training split.
train_bow(model=model, optimizer=optimizer, criterion=criterion, train_data=train_data)

ITER: 1 | train loss/sent: 1.4733 | train accuracy: 0.3631 | test accuracy: 0.4009
ITER: 2 | train loss/sent: 1.1216 | train accuracy: 0.6040 | test accuracy: 0.4118
ITER: 3 | train loss/sent: 0.9123 | train accuracy: 0.7117 | test accuracy: 0.4154
ITER: 4 | train loss/sent: 0.7688 | train accuracy: 0.7664 | test accuracy: 0.4140
ITER: 5 | train loss/sent: 0.6631 | train accuracy: 0.8065 | test accuracy: 0.4068
ITER: 6 | train loss/sent: 0.5814 | train accuracy: 0.8324 | test accuracy: 0.4059
ITER: 7 | train loss/sent: 0.5171 | train accuracy: 0.8507 | test accuracy: 0.4077
ITER: 8 | train loss/sent: 0.4640 | train accuracy: 0.8695 | test accuracy: 0.4036
ITER: 9 | train loss/sent: 0.4191 | train accuracy: 0.8830 | test accuracy: 0.3991
ITER: 10 | train loss/sent: 0.3818 | train accuracy: 0.8929 | test accuracy: 0.3964
