In [1]:
%%capture
!pip install transformers
!pip install pypinyin
!pip install jieba
!pip install paddlepaddle

In [3]:
%%capture
import re,time,json
from collections import defaultdict
from torch.utils.data import DataLoader
from pypinyin import pinyin, Style
from tqdm import tqdm
import pickle

import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
from copy import deepcopy

from transformers import (BertTokenizer,BertConfig,BertModel)

from model.Embedding import *
from model.fusionDataset import FusionDataset

import jieba
import jieba.posseg as pseg
import paddle

# Seed numpy and torch before any weights are created or data is shuffled, so
# that repeated Restart-and-Run-All executions start from the same RNG state.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load the AnchiBERT configuration, tokenizer and encoder from the 'AnchiBERT'
# checkpoint. `config` here is a BertConfig; later cells rebind the name
# `config` to plain dicts for the downstream models.
config = BertConfig.from_pretrained('AnchiBERT')
tokenizer = BertTokenizer.from_pretrained('AnchiBERT')
Anchibert = BertModel.from_pretrained('AnchiBERT',config=config)

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

### Load necessary preprocessed data

In [4]:
# Build the lookup tables that are later passed to FusionDataset.
#
# Glyph and pinyin ids follow the same convention:
#   0 -> shared by the special tokens [CLS], [SEP] and [PAD]
#   1 -> fallback returned for any key that is not in the map
#   2.. -> the keys of the JSON file, numbered in file order
# POS tags are read as-is from the JSON file; unknown tags fall back to 0.
#
# The files are opened as UTF-8 explicitly (matching how the couplet text
# files are read) instead of relying on the platform default encoding, which
# is not UTF-8 everywhere and would break on non-ASCII keys.
with open('data/char_map.json','r',encoding='utf8') as f:
    # Reverse map (id -> glyph); ids without an entry render as '_'.
    ix2glyph = defaultdict(lambda : '_')
    ix2glyph[0] = '[PAD]'
    glyph2ix = defaultdict(lambda : 1)
    glyph2ix.update({'[CLS]':0,'[SEP]':0,'[PAD]':0})
    for i, k in enumerate(json.load(f).keys(),2):
        glyph2ix[k] = i
        ix2glyph[i] = k
with open('data/pinyin_map.json','r',encoding='utf8') as f:
    pinyin2ix = defaultdict(lambda : 1)
    pinyin2ix.update({'[CLS]':0,'[SEP]':0,'[PAD]':0})
    for i,k in enumerate(json.load(f).keys(),2):
        pinyin2ix[k] = i
with open('data/pos_tags.json','r',encoding='utf8') as f:
    pos2ix = defaultdict(lambda : 0)
    pos2ix.update(json.load(f))

In [5]:
# Training corpus: first lines of the couplets, one list of
# whitespace-separated tokens per row.
with open("couplet/train/in.txt", encoding='utf8') as f:
    tr_in = [line.strip().split() for line in f]

# Training corpus: second lines of the couplets, same layout as above.
with open("couplet/train/out.txt", encoding='utf8') as f:
    tr_out = [line.strip().split() for line in f]

# Pickled POS data for the first and second lines.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# files that come from a trusted source.
with open('data/train_in_pos.pt', 'rb') as f:
    tr_pos_in = pickle.load(f)

with open('data/train_out_pos.pt', 'rb') as f:
    tr_pos_out = pickle.load(f)

In [6]:
# Size of the full training corpus.
total_len = len(tr_in)
display(total_len)

# Only the first seventh of the corpus is used in this notebook.
# NOTE: despite its name, `half` holds total_len // 7, not total_len // 2.
half = total_len // 7
display(half)

# 80% of that subset is used for training; the remainder is the validation split.
train_split = int(0.80 * half)


770491

110070

In [7]:
# Training split: the first `train_split` rows of every parallel sequence.
train_slice = slice(None, train_split)
trainSet = FusionDataset(
    tr_in[train_slice], tokenizer,
    glyph2ix, pinyin2ix, pos2ix,
    tr_out[train_slice],
    tr_pos_in[train_slice], tr_pos_out[train_slice],
    device=device,  # pass `device` to have the dataset loaded onto the GPU
)

88056it [00:34, 2522.45it/s]
88056it [00:37, 2323.02it/s]


In [8]:
# Validation split: rows from `train_split` up to (not including) `half`.
val_slice = slice(train_split, half)
valSet = FusionDataset(
    tr_in[val_slice], tokenizer,
    glyph2ix, pinyin2ix, pos2ix,
    tr_out[val_slice],
    tr_pos_in[val_slice], tr_pos_out[val_slice],
    device=device,  # pass `device` to have the dataset loaded onto the GPU
)

22014it [00:08, 2483.64it/s]
22014it [00:09, 2371.53it/s]


In [9]:
from model.fusion_transformer import Fusion_Anchi_Trans_Decoder, Fusion_Anchi_Transformer, Anchi_Decoder,Anchi_Transformer

In [10]:
from utils.trans_trainer import train

## Fusion_Anchi_Trans_Decoder

In [None]:
# Hyper-parameters for Fusion_Anchi_Trans_Decoder.
# Candidate values noted by the author: batch_size in [32, 64, 128],
# lr in [0.1, 0.01, 0.001].
config = {
    'max_position_embeddings': 50,
    'hidden_size': 768,
    'font_weight_path': 'data/glyph_weight.npy',
    'pinyin_embed_dim': 30,  # trainable
    'pinyin_path': 'data/pinyin_map.json',
    'tag_size': 30,
    'tag_emb_dim': 10,  # trainable
    'layer_norm_eps': 1e-12,
    'hidden_dropout': 0.1,
    'nhead': 12,
    'num_layers': 6,  # trainable
    'output_dim': 9110,  # fixed: the glyph dimension is used as the output size
    'device': device,
}

# Run name scheme:
# <model_name>_<optim>_<batch_num>_<lr>_<epoch>_<num_layer>_<pinyin_embed_dim>_<tag_emb_dim>_<train_data_size>
name = 'fu_anchi_de_Adam_128_0001_60_6_30_10_110k'

# NOTE(review): the same `Anchibert` instance is passed to every train() call
# in this notebook; if train() updates its weights, later runs start from an
# already modified encoder. Confirm against utils.trans_trainer.
# The model is built inline (not bound to a name) so nothing here keeps it
# alive when the CUDA cache is emptied below.
train(
    Fusion_Anchi_Trans_Decoder(config), trainSet, valSet,
    batch_size=128, lr=0.0001, epoch=60,
    bert=Anchibert, name=name, with_trans=True,
    optimizer_name='Adam',
    scheduleFactor=0.5, schedule_Patience=5, min_lr=1e-06,
    verbose=True, patience=10, store='result/',
)
torch.cuda.empty_cache()

  0%|          | 0/60 [00:00<?, ?it/s]

Epoch: 01 | Epoch Time: 5m 26s
	Training Loss: 5.41640 	Validation Loss: 3.40628


  2%|▏         | 1/60 [05:27<5:21:59, 327.45s/it]

## Anchi_Decoder

In [None]:
# Hyper-parameters for Anchi_Decoder.
config = {
    'max_position_embeddings': 50,
    'hidden_size': 768,
    'layer_norm_eps': 1e-12,
    'hidden_dropout': 0.1,
    'nhead': 12,
    'num_layers': 6,  # trainable
    'output_dim': 9110,  # fixed: the glyph dimension is used as the output size
    'device': device,
}

# Run name scheme:
# <model_name>_<optim>_<batch_num>_<lr>_<epoch>_<num_layer>_<train_size>
# NOTE(review): the name below has one more field ("_10_") than the scheme
# above describes -- confirm which of the two is intended before relying on it.
name = 'anchi_de_Adam_128_0001_10_60_6_110k'

# NOTE(review): the same `Anchibert` instance is passed to every train() call
# in this notebook; if train() updates its weights, this run does not start
# from the pristine checkpoint. Confirm against utils.trans_trainer.
train(
    Anchi_Decoder(config), trainSet, valSet,
    batch_size=128, lr=0.0001, epoch=60,
    bert=Anchibert, name=name, with_trans=True,
    optimizer_name='Adam',
    scheduleFactor=0.5, schedule_Patience=5, min_lr=1e-06,
    verbose=True, patience=10, store='result/',
)
torch.cuda.empty_cache()

## Fusion_Anchi_Transformer

In [None]:
# Hyper-parameters for Fusion_Anchi_Transformer.
config = {
    'max_position_embeddings': 50,
    'hidden_size': 768,
    'font_weight_path': 'data/glyph_weight.npy',
    'pinyin_embed_dim': 30,  # trainable
    'pinyin_path': 'data/pinyin_map.json',
    'tag_size': 30,
    'tag_emb_dim': 10,  # trainable
    'layer_norm_eps': 1e-12,
    'hidden_dropout': 0.1,
    'nhead': 12,
    'num_encoder_layers': 5,  # trainable
    'num_decoder_layers': 6,  # trainable
    'output_dim': 9110,  # fixed: the glyph dimension is used as the output size
    'dim_feedforward': 3072,
    'activation': 'relu',
    'trans_dropout': 0.1,
    'device': device,
}

# Run name scheme:
# <model_name>_<optim>_<batch_num>_<lr>_<epoch>_<pinyin_embed_dim>_<tag_emb_dim>_<encoder layer>_<decoder layer>_<train_data_size>
name = 'fu_anchi_tra_Adam_128_01_60_30_10_5_6_110k'

# NOTE(review): the same `Anchibert` instance is passed to every train() call
# in this notebook; if train() updates its weights, this run does not start
# from the pristine checkpoint. Confirm against utils.trans_trainer.
train(
    Fusion_Anchi_Transformer(config), trainSet, valSet,
    batch_size=128, lr=0.01, epoch=60,
    bert=Anchibert, name=name, with_trans=True,
    optimizer_name='Adam',
    scheduleFactor=0.5, schedule_Patience=5, min_lr=1e-06,
    verbose=True, patience=10, store='result/',
)
torch.cuda.empty_cache()

## Anchi_Transformer

In [None]:
# Hyper-parameters for Anchi_Transformer.
config = {
    'max_position_embeddings': 50,
    'hidden_size': 768,
    'layer_norm_eps': 1e-12,
    'hidden_dropout': 0.1,
    'nhead': 12,
    'num_encoder_layers': 6,  # trainable
    'num_decoder_layers': 6,  # trainable
    'output_dim': 9110,  # fixed: the glyph dimension is used as the output size
    'dim_feedforward': 3072,
    'activation': 'relu',
    'trans_dropout': 0.1,
    'device': device,
}

# Run name scheme:
# <model_name>_<optim>_<batch_num>_<lr>_<epoch>_<encoder layer>_<decoder layer>_<train_data_size>
name = 'anchi_tra_Adam_128_01_60_6_6_110k'

# NOTE(review): the same `Anchibert` instance is passed to every train() call
# in this notebook; if train() updates its weights, this run does not start
# from the pristine checkpoint. Confirm against utils.trans_trainer.
train(
    Anchi_Transformer(config), trainSet, valSet,
    batch_size=128, lr=0.01, epoch=60,
    bert=Anchibert, name=name, with_trans=True,
    optimizer_name='Adam',
    scheduleFactor=0.5, schedule_Patience=5, min_lr=1e-06,
    verbose=True, patience=10, store='result/',
)
torch.cuda.empty_cache()