## Neural Language Models
Status of Notebook: Work in Progress

Reference: https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf

Dynet Version: https://github.com/neubig/nn4nlp-code/blob/master/02-lm/nn-lm.py

In [3]:
import torch#安装torch库，支持在图形处理单元上计算张量
import random#安装random库，用于产生各种分布的伪随机数序列
import torch#安装torch库，支持在图形处理单元上计算张量
import torch.nn as nn#加载神经网络常用模块
import math#加载函数库
import time#加载时间元组
import numpy as np#加载numpy科学计算库

### Download the Data

In [6]:
#取消注释就可以下载数据集
#!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/ptb/test.txt
#!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/ptb/train.txt
#!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/ptb/valid.txt

### Process the Data

In [4]:
#函数读取数据，处理每一行并按“|||”分割列
def read_data(filename):#函数读取数据，处理每一行
    data = []#初始化数据列表
    with open(filename, "r") as f:#读取filename文件到f中
        for line in f:#逐行打印f文件
            line = line.strip().split(" ")#将读取数据的所有字符小写，并去除两端的空格或者制表符、换行符等无效字符
            data.append(line)#将读取的数据添加到数据列表中
    return data#返回读取后的数据列表

#读取数据
train_data = read_data('data/ptb/train.txt')#保存到训练数据集
val_data = read_data('data/ptb/valid.txt')#保存到测试数据集

#创建单词和标签索引以及特殊标记
word_to_index = {}#创建单词索引列表
index_to_word = {}#创建标签索引列表
word_to_index["<s>"] = len(word_to_index)
index_to_word[len(word_to_index)-1] = "<s>"
word_to_index["<unk>"] = len(word_to_index)#添加<UNK>到字典
index_to_word[len(word_to_index)-1] = "<unk>"

def create_dict(data, check_unk=False):#根据数据创建单词到索引字典和标记到索引字典
    for line in data:#逐行迭代
        for word in line:
            if check_unk == False:
                if word not in word_to_index:#如果word不在字典里
                    word_to_index[word] = len(word_to_index)#添加word到字典中
                    index_to_word[len(word_to_index)-1] = word#添加索引
            
            #无效，因为<unk>已附带数据
            #应在未处理＜unk＞的情况下处理数据
            else: 
                if word not in word_to_index:#如果word不在字典里
                    word_to_index[word] = word_to_index["<unk>"]#添加<UNK>到字典
                    index_to_word[len(word_to_index)-1] = word#添加索引

create_dict(train_data)#创建训练数据字典
create_dict(val_data, check_unk=True)#创建测试数据字典

def create_tensor(data):#根据数据创建单词和标记张量
    for line in data:
        yield([word_to_index[word] for word in line])

train_data = list(create_tensor(train_data))#创建训练数据张量
val_data = list(create_tensor(val_data))#创建测试数据张量

number_of_words = len(word_to_index)#单词个数

In our implementation we are using batched training. There are a few differences from the original implementation found [here](https://github.com/neubig/nn4nlp-code/blob/master/02-lm/loglin-lm.py). 

### Define the Model

In [5]:
#定义模型
device = 'cuda' if torch.cuda.is_available() else 'cpu'#检测当前计算机是否支持使用cuda，如果支持则将device变量设置为cuda，否则设置为cpu

N = 2#n-gram的长度
EMB_SIZE = 128#嵌入的大小
HID_SIZE = 128#隐藏层的大小

class NeuralLM(nn.Module):#构建神经语言模型模型
    def __init__(self, number_of_words, ngram_length, EMB_SIZE, HID_SIZE):
        super(NeuralLM, self).__init__()#初始化

        self.embedding = nn.Embedding(number_of_words, EMB_SIZE)#嵌入层
        self.hidden = nn.Linear(EMB_SIZE * ngram_length, HID_SIZE)#隐藏层
        self.output = nn.Linear(HID_SIZE, number_of_words)#输出层

    def forward(self, x):#计算分数
        embs = self.embedding(x)#Size:[batch_size x num_hist x emb_size]
        embs = embs.view(embs.size(0), -1)#Size:[batch_size x (num_hist*emb_size)]
        h = torch.nn.functional.tanh(self.hidden(embs))#Size:[batch_size x hid_size]
        scores = self.output(h)#Size:batch_size x num_words
        return scores

### Model Settings and Functions

In [12]:
model = NeuralLM(number_of_words, N, EMB_SIZE, HID_SIZE)#加载神经语言模型模型
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)#加载神经网络训练优化器，学习率为0.1
criterion = torch.nn.CrossEntropyLoss()#加载损失函数

if torch.cuda.is_available():#如果可以使用cuda
    model.to(device)#模型加载到相应设备中

def calc_sent_loss(sent):#计算句子损失的函数
    S = word_to_index["<s>"]

    hist = [S] * N#起始历史等于句末符号
    
    #收集所有目标和历史记录
    all_targets = []#初始化目标队列
    all_histories = []#初始化历史记录
    
    #逐步完成句子，包括句尾标记
    for next_word in sent + [S]:
        all_histories.append(list(hist))#读取的数据加入历史记录队列
        all_targets.append(next_word)#下一个数据加入目标队列
        hist = hist[1:] + [next_word]

    logits = model(torch.LongTensor(all_histories).to(device))#logits是转换成概率之前的值，是下一步通常被投给softmax的向量
    loss = criterion(logits, torch.LongTensor(all_targets).to(device))#计算损失

    return loss

MAX_LEN = 100#定义句子最大长度100
def generate_sent():#生成句子的函数
    S = word_to_index["<s>"]#<s>添加到字典中
    hist = [S] * N
    sent = []#初始化列表
    while True:
        logits = model(torch.LongTensor([hist]).to(device))#将模型加载到相应的设备中，64位整型
        p = torch.nn.functional.softmax(logits) # 1 x number_of_words
        next_word = p.multinomial(num_samples=1).item()#抽取样本，为每行切片绘制的独立样本数为1
        if next_word == S or len(sent) == MAX_LEN:
            break
        sent.append(next_word)#添加到样本中
        hist = hist[1:] + [next_word]
    return sent

### Train the Model

In [14]:
#开始训练
for ITER in range (10):#10轮次
    random.shuffle(train_data)#训练

    model.train()#模型训练
    train_words, train_loss = 0, 0.0#训练单词总数，训练损失
    for sent_id, sent in enumerate(train_data):#更改为所有的train_data
        
        my_loss = calc_sent_loss(sent)#计算句子损失
        
        train_loss += my_loss.item()#将得到的句子损失加到训练损失中
        train_words += len(sent)#训练单词总数+1

        optimizer.zero_grad()#把梯度置零
        my_loss.backward()#反向传播，计算分数
        optimizer.step()#更新模型参数
        #这三个函数的作用是将梯度归零，然后反向传播计算得到每个参数的梯度值，最后通过梯度下降执行一步参数更新。
        if (sent_id+1) % 5000 == 0:
            print("--finished %r sentences" % (sent_id+1))
    print("iter %r: train loss/word=%.4f, ppl=%.4f" % (ITER, train_loss/train_words, math.exp(train_loss/train_words)))

    #评价
    model.eval()#切换评估模式
    dev_words, dev_loss = 0, 0.0#评估单词总数，评估损失
    start = time.time()#记录开始时间
    for sent_id, sent in enumerate(val_data):
        my_loss = calc_sent_loss(sent)#记录句子损失
        dev_loss += my_loss.item()#将得到的句子损失加到评估损失中
        dev_words += len(sent)#评估单词总数+1
    print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start))

    #生成几个句子
    for _ in range(5):
        sent = generate_sent()#生成句子
        print(" ".join([index_to_word[x] for x in sent]))

--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 0: train loss/word=4.1802, ppl=65.3775
iter 0: dev loss/word=4.4128, ppl=82.4961, time=1.26s
in constitution physics which could counting suspect include be on
dealers manufacturers plans commissions
in constitution physics which could counting suspect include be on behalf he declares
in constitution physics which could counting suspect include be and which and an for was designed on themes of weakness jobs n't be and which <unk> developed the sale such from about other objectives
N have in prolonged damage




--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 1: train loss/word=4.4307, ppl=83.9873
iter 1: dev loss/word=4.5315, ppl=92.8970, time=1.27s
two <unk> hours <unk> relations clark new an index big were medicine more 'm bank N in october this fall
two this said its consumer puts for democratic futures the ringers out N note a day to get this affairs
this time
two this said its consumer puts for democratic futures the ringers out N note a top outstanding bank for $ candidates savings <unk> relationship in shopping for declared futures the ringers out N note a day to get this said its consumer puts to highlight this fall
this time the
--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
-

KeyboardInterrupt: 