# 数据预处理
- 将原始txt处理成每行的json格式

In [2]:
import json

In [3]:
#统一全角转半角
def strQ2B(ustring):
    cur_list = []
    for s in ustring:
        rstring = ""
        for uchar in s:
            inside_code = ord(uchar)
            if inside_code == 12288:  # 全角空格直接转换
                inside_code = 32
            elif (inside_code >= 65281 and inside_code <= 65374):  # 全角字符（除空格）根据关系转化
                inside_code -= 65248
            rstring += chr(inside_code)
        cur_list.append(rstring)
    return ''.join(cur_list)


#转换特殊字符词组及标点符号
trans_punctuations = {'don\'t':"do not",
                      '"':'',
                      ';':''
                      }
def process_data(strs):
    for key in trans_punctuations:
        strs = strs.replace(key, trans_punctuations[key])
    return strQ2B(strs)


# 读取原始数据
raw_data = []
with open('Andersen Fairy Tales.txt', 'r') as f:
    for x in f:
        x = x.strip().lower()
        if x: raw_data.append(process_data(x))

#保留长度大于1的句子
raw_data = [x for x in raw_data if len(x.split(' '))>1]


#保存数据
with open("./corpus.json","w") as f:
    json.dump(raw_data, f, indent=4)

# 模型构建

In [4]:
import json
import os
from collections import  Counter
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.nn.functional import cross_entropy
from torch.autograd import Variable

## 配置文件

In [5]:
# 配置文件
config ={
    "elmo": {
            "activation": "relu",
            "filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
            "n_highway": 2, 
            "word_dim": 300,
            "char_dim": 50,
            "max_char_token": 50,
            "min_count":5,
            "max_length":256,
            "output_dim":150,
            "units":256,
            "n_layers":2,
        },
    "batch_size":32,
    "epochs":50,
    "lr":0.00001,
}

# 保存路径
model_save_path="./elmo_model"

## 数据集构建

In [6]:
# 读取语料
with open("./corpus.json") as f:
    corpus = json.load(f)
    corpus = corpus[:1000] # 测试

In [7]:
# 检测是否有可用GPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('device: ' + str(device))

device: cuda


In [8]:
#分词器
class Tokenizer:
    def __init__(self, word2id,ch2id):
        self.word2id = word2id
        self.ch2id = ch2id
        self.id2word = {i: word for word, i in word2id.items()}
        self.id2ch = {i: char for char, i in ch2id.items()}
    
    def tokenize(self,text,max_length=512,max_char=50):
        oov_id, pad_id = self.word2id.get("<oov>"), self.word2id.get("<pad>")
        w = torch.LongTensor(max_length).fill_(pad_id)
        words = text.lower().split()
        for i, wi in enumerate(words[:max_length]):
            w[i] = self.word2id.get(wi, oov_id)
        oov_id, pad_id = self.ch2id.get("<oov>"), self.ch2id.get("<pad>")
        c = torch.LongTensor(max_length,max_char).fill_(pad_id)
        for i, wi in enumerate(words[:max_length]):
            for j,wij in enumerate(wi[:max_char]):
                c[i][j]=self.ch2id.get(wij, oov_id)
        return w , c , len(words[:max_length])

    def save(self,path):
        try:
            os.mkdir(path)
        except:
            pass
        tok ={
            "word2id":self.word2id,
            "ch2id":self.ch2id
        }
        with open(f"{path}/tokenizer.json","w") as f:
            json.dump(tok,f,indent=4)
            
            
# 从语料中构建
def from_corpus(corpus,min_count=5):
    word_count = Counter()
    for sentence in corpus:
        word_count.update(sentence.split())
    word_count = list(word_count.items())
    word_count.sort(key=lambda x: x[1], reverse=True)
    for i, (word, count) in enumerate(word_count):
        if count < min_count:
            break
    vocab = word_count[:i]
    vocab = [v[0] for v in vocab]
    word_lexicon = {}
    for special_word in ['<oov>', '<pad>']:
        if special_word not in word_lexicon:
            word_lexicon[special_word] = len(word_lexicon)
    for word in vocab:
        if word not in word_lexicon:
            word_lexicon[word] = len(word_lexicon)
    char_lexicon = {}
    for special_char in ['<oov>', '<pad>']:
        if special_char not in char_lexicon:
            char_lexicon[special_char] = len(char_lexicon)
    for sentence in corpus:
        for word in sentence.split():
            for ch in word:
                if ch not in char_lexicon:
                    char_lexicon[ch] = len(char_lexicon)
    return Tokenizer(word_lexicon,char_lexicon)


# 从checkpoint中构建
def from_file(path):
    with open(f"{path}/tokenizer.json") as f:
        d = json.load(f)
    return Tokenizer(d["word2id"],d["ch2id"])

In [9]:
# 初始化分词器
tokenizer = from_corpus(corpus, config["elmo"]["min_count"])

In [10]:
# ELMO数据集生成器
class ELMoDataSet(Dataset):
    def __init__(self,corpus,tokenizer):
        self.corpus=corpus
        self.tokenizer=tokenizer
        
    def __getitem__(self, idx):
        text = self.corpus[idx]
        w,c,i= self.tokenizer.tokenize(text,max_length=config["elmo"]["max_length"],max_char=config["elmo"]["max_char_token"])
        return w,c,i
   
    def __len__(self):
        return len(self.corpus)

# 初始化数据集生成器
data = ELMoDataSet(corpus,tokenizer)

In [11]:
# 初始化Pytorch框架的数据生成器
data_loader = DataLoader(data, batch_size=config["batch_size"])

## 模型初始化

In [12]:
# Based upon https://gist.github.com/Redchards/65f1a6f758a1a5c5efb56f83933c3f6e
# Original Paper https://arxiv.org/abs/1505.00387
# 我们用残差网络替代HighWay
class HighWay(nn.Module):
    def __init__(self, input_dim, num_layers=1,activation= nn.functional.relu):
        super(HighWay, self).__init__()
        self._input_dim = input_dim
        self._layers = nn.ModuleList([nn.Linear(input_dim, input_dim * 2) for _ in range(num_layers)])
        self._activation = activation
        for layer in self._layers:
            layer.bias[input_dim:].data.fill_(1)
            
    def forward(self, inputs):
        current_input = inputs
        for layer in self._layers:
            projected_input = layer(current_input)
            linear_part = current_input
            nonlinear_part = projected_input[:, (0 * self._input_dim):(1 * self._input_dim)]
            gate = projected_input[:, (1 * self._input_dim):(2 * self._input_dim)]
            nonlinear_part = self._activation(nonlinear_part)
            gate = torch.sigmoid(gate)
            current_input = gate * linear_part + (1 - gate) * nonlinear_part
        return current_input

In [15]:
class ELMo(nn.Module):
    def __init__(self,tokenizer,config):
        super(ELMo, self).__init__()
        self.config=config
        self.tokenizer = tokenizer
        self.word_embedding = nn.Embedding(len(tokenizer.word2id),config["elmo"]["word_dim"],padding_idx=tokenizer.word2id.get("<pad>"))
        self.char_embedding = nn.Embedding(len(tokenizer.ch2id),config["elmo"]["char_dim"],padding_idx=tokenizer.ch2id.get("<pad>"))
        self.output_dim = config["elmo"]["output_dim"]
        activation = config["elmo"]["activation"]
        if activation=="relu":
            self.act = nn.ReLU()
        elif activation=="tanh":
            self.act=nn.Tanh()
        self.emb_dim = config["elmo"]["word_dim"]
        self.convolutions = []
        filters = config["elmo"]["filters"]
        char_dim = config["elmo"]["char_dim"]
        for i, (width, num) in enumerate(filters):
            conv = nn.Conv1d(in_channels=char_dim,
                             out_channels=num,
                             kernel_size=width,
                             bias=True
                             )
            self.convolutions.append(conv)
        self.convolutions = nn.ModuleList(self.convolutions)
        self.n_filters = sum(f[1] for f in filters)
        self.n_highway = config["elmo"]["n_highway"]
        self.highways = HighWay(self.n_filters, self.n_highway, activation=self.act)
        self.emb_dim += self.n_filters
        self.projection = nn.Linear(self.emb_dim, self.output_dim, bias=True)
        self.f=[nn.LSTM(input_size = config["elmo"]["output_dim"], hidden_size = config["elmo"]["units"], batch_first=True)]
        self.b=[nn.LSTM(input_size = config["elmo"]["output_dim"], hidden_size = config["elmo"]["units"], batch_first=True)]
        for _ in range(config["elmo"]["n_layers"]-1):
            self.f.append(nn.LSTM(input_size = config["elmo"]["units"], hidden_size = config["elmo"]["units"], batch_first=True))
            self.b.append(nn.LSTM(input_size = config["elmo"]["units"], hidden_size = config["elmo"]["units"], batch_first=True))
        self.f = nn.ModuleList(self.f)
        self.b = nn.ModuleList(self.b)
        self.ln = nn.Linear(in_features=config["elmo"]["units"], out_features=len(tokenizer.word2id))
        
    def forward(self, word_inp, chars_inp):
        embs = []
        batch_size, seq_len = word_inp.size(0), word_inp.size(1)
        word_emb = self.word_embedding(Variable(word_inp))
        embs.append(word_emb)
        chars_inp = chars_inp.view(batch_size * seq_len, -1)
        char_emb = self.char_embedding(Variable(chars_inp))
        char_emb = char_emb.transpose(1, 2)
        convs = []
        for i in range(len(self.convolutions)):
            convolved = self.convolutions[i](char_emb)
            convolved, _ = torch.max(convolved, dim=-1)
            convolved = self.act(convolved)
            convs.append(convolved)
        char_emb = torch.cat(convs, dim=-1)
        char_emb = self.highways(char_emb)
        embs.append(char_emb.view(batch_size, -1, self.n_filters))
        token_embedding = torch.cat(embs, dim=2)
        embeddings = self.projection(token_embedding)
        fs = [embeddings]         
        bs = [embeddings]
        for fl,bl in zip(self.f,self.b):
            o_f,_ = fl(fs[-1])
            fs.append(o_f)
            o_b,_ = bl(torch.flip(bs[-1],dims=[1,]))
            bs.append(torch.flip(o_b,dims=(1,)))
        return fs,bs
    
    def save_model(self,path):
        try:
            os.mkdir(path)
        except:
            pass
        torch.save(self.state_dict(),f'{path}/model.pt')
        with open(f"{path}/config.json","w") as f:
            json.dump(self.config,f,indent=4)
        self.tokenizer.save(path)
        
    @classmethod
    def from_checkpoint(cls,path,device):
        with open(f"{path}/config.json") as f:
            config = json.load(f)
        tokenizer = Tokenizer.from_file(path)
        model = cls(tokenizer,config)
        model.load_state_dict(torch.load(f'{path}/model.pt',map_location=device))
        return model

In [16]:
model = ELMo(tokenizer,config)
model.to(device)

ELMo(
  (word_embedding): Embedding(1122, 300, padding_idx=1)
  (char_embedding): Embedding(39, 50, padding_idx=1)
  (act): ReLU()
  (convolutions): ModuleList(
    (0): Conv1d(50, 32, kernel_size=(1,), stride=(1,))
    (1): Conv1d(50, 32, kernel_size=(2,), stride=(1,))
    (2): Conv1d(50, 64, kernel_size=(3,), stride=(1,))
    (3): Conv1d(50, 128, kernel_size=(4,), stride=(1,))
    (4): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
    (5): Conv1d(50, 512, kernel_size=(6,), stride=(1,))
    (6): Conv1d(50, 1024, kernel_size=(7,), stride=(1,))
  )
  (highways): HighWay(
    (_layers): ModuleList(
      (0): Linear(in_features=2048, out_features=4096, bias=True)
      (1): Linear(in_features=2048, out_features=4096, bias=True)
    )
    (_activation): ReLU()
  )
  (projection): Linear(in_features=2348, out_features=150, bias=True)
  (f): ModuleList(
    (0): LSTM(150, 256, batch_first=True)
    (1): LSTM(256, 256, batch_first=True)
  )
  (b): ModuleList(
    (0): LSTM(150, 256, batch_f

## 训练

In [18]:
opt = optim.Adam(model.parameters(),lr = config["lr"])
loss_function = torch.nn.NLLLoss()
for epoch in range(config["epochs"]):
    total_loss = 0
    print(f"Epoch: {epoch+1}")
    for batch in tqdm(data_loader):
        total_loss = 0
        w , c , i = batch
        w = w.to(device)
        c = c.to(device)
        f, b = model(w,c)
        f, b = f[-1], b[-1]
        k_max=torch.max(i)
        loss = 0
        for k in range(1,k_max):
            fpass=f[:,k-1,:]
            bpass=b[:,k-1,:]
            fl = model.ln(fpass).squeeze()
            bl = model.ln(bpass).squeeze()
            fll = torch.nn.functional.log_softmax(fl,dim=1).squeeze()
            bll = torch.nn.functional.log_softmax(bl,dim=1).squeeze()
            loss+=loss_function(fll,w[:,k])+loss_function(bll,w[:,k])
        loss.backward()
        opt.step()
        opt.zero_grad()
        model.zero_grad()
        total_loss += loss.detach().item()
    model.save_model(model_save_path)
    print('total_loss:',total_loss)

Epoch: 1


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


total_loss: 3546.50244140625
Epoch: 2


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


total_loss: 3521.490478515625
Epoch: 3


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


total_loss: 3480.632568359375
Epoch: 4


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


total_loss: 3414.54736328125
Epoch: 5


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


total_loss: 3278.797607421875
Epoch: 6


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


total_loss: 2848.858154296875
Epoch: 7


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


total_loss: 2211.5849609375
Epoch: 8


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


total_loss: 1741.710205078125
Epoch: 9


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


total_loss: 1446.1951904296875
Epoch: 10


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


total_loss: 1268.07275390625
Epoch: 11


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


total_loss: 1163.1085205078125
Epoch: 12


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


total_loss: 1097.5172119140625
Epoch: 13


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


total_loss: 1054.2308349609375
Epoch: 14


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


total_loss: 1023.152587890625
Epoch: 15


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


total_loss: 999.1707763671875
Epoch: 16


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


total_loss: 980.1627197265625
Epoch: 17


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


total_loss: 964.4329833984375
Epoch: 18


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


total_loss: 951.1318969726562
Epoch: 19


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


total_loss: 939.527587890625
Epoch: 20


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))




KeyboardInterrupt: 