### 目录
1. 导入相关工具包
2. 训练词向量
3. 特征工程
4. 模型训练

## 1. 导入相关工具包

In [None]:
import csv
import datetime
import gc
import logging
import os
import pickle
import random
import time
from collections import Counter, defaultdict
from multiprocessing import Pool

import gensim
import Levenshtein
import lightgbm as lgb
import numpy as np
import pandas as pd
import textdistance
from fuzzywuzzy import fuzz
from gensim.models import word2vec
from scipy.linalg import norm
from scipy.spatial.distance import cosine, cityblock, canberra, euclidean, minkowski, braycurtis, correlation
from scipy.stats import skew, kurtosis
from sklearn import linear_model
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import PolynomialFeatures
from tqdm import tqdm
# Seed Python's built-in RNG.
# NOTE(review): NumPy's global RNG is not seeded here, while np.random.normal is
# used later in this file — confirm whether those draws should be reproducible.
random.seed(1)
# Silence NumPy floating-point warnings for divide-by-zero and invalid operations.
np.seterr(divide='ignore', invalid='ignore')
# Enable the tqdm-backed `progress_apply` on pandas objects (bar label: 'My bar').
tqdm.pandas(desc='My bar')

import torch
import torch.nn.functional as F
from torch import nn, optim
# Fix the PyTorch RNG seed.
random_seed = 2019
torch.manual_seed(random_seed)

# Allow up to 200 characters per cell, 200 columns and 200 rows when frames are printed.
for _display_option in ('display.max_colwidth', 'display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 200)

## 2. 训练词向量

In [None]:
# Locations of the raw competition data.
data_path = "/home/kesci/input/bytedance/"
train_path = "{}train_final.csv".format(data_path)
test_path = "{}test_final_part1.csv".format(data_path)

# Base directories for outputs, intermediate data and word2vec models.
C_OutputBase = '/home/kesci/work/output/'
C_LocalDataBase = '/home/kesci/work/local-data/'
C_Word2VecModelBase = '/home/kesci/work/word2vec-models/'

# File names and artefact paths.
# (an alternative location for the test features would be
#  os.path.join(C_LocalDataBase, "test_feat.csv"))
test_feat_path = './data/test_feature.csv'
C_AllText = 'text.txt'
embedding_size = 300
C_W2VModel = 'Word2Vec_%d.model' % embedding_size
word2idx_path = "./word2idx.pkl"
idx2word_path = "./idx2word.pkl"
embedding_path = "./pretrain_emb.npy"
# Special vocabulary tokens.
UNK, PAD = "<UNK>", "<PAD>"

# Log record layout: timestamp - level - message; everything from DEBUG up is emitted.
C_LogFormat = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=C_LogFormat, level=logging.DEBUG)

# Create each working directory that does not exist yet, logging every creation.
for _base_dir in (C_OutputBase, C_LocalDataBase, C_Word2VecModelBase):
    if os.path.exists(_base_dir):
        continue
    logging.debug('Create Dir ' + _base_dir)
    os.makedirs(_base_dir)

In [None]:
# Copy the query and title text of every CSV row into an already-open output file.
def extract_text(file, of, which):
    """Write fields 1 and 3 of each comma-separated line of `file` to `of`.

    Each field is written on its own line. Lines are split on ',' only.

    Args:
        file: path of the CSV file to read.
        of: writable text file object that receives the extracted text.
        which: 'test' (4 fields expected per line) or 'train' (5 fields
            expected per line); also used in the progress log messages.
    """
    total = C_TrainLen
    if which == 'test':
        total = C_TestLen

    # Expected number of fields per line; unknown dataset names are not checked.
    if which == 'test':
        expected_fields = 4
    elif which == 'train':
        expected_fields = 5
    else:
        expected_fields = None

    with open(file, 'r') as source:
        for line_no, line in enumerate(source, start=1):
            fields = line.strip().split(',')
            if expected_fields is not None:
                assert len(fields) == expected_fields
            # Field 1 and field 3 each go on a separate output line.
            for position in (1, 3):
                of.write(fields[position])
                of.write('\n')
            # Report progress every C_NForPrint lines.
            if line_no % C_NForPrint == 0:
                logging.info('%10d/%10d on %s' % (line_no, total, which))
        
# Dump the query/title text of the test set, then the train set, into one corpus file.
# NOTE(review): C_TestFile and C_TrainFile are not defined in the visible part of
# this file — confirm where they are supposed to come from.
# The context manager flushes and closes the corpus file even if parsing raises.
with open(C_LocalDataBase + C_AllText, 'w') as of:
    logging.info('start to parse test dataset...')
    extract_text(C_TestFile, of, 'test')
    logging.info('start to parse train dataset...')
    extract_text(C_TrainFile, of, 'train')

In [None]:
# Train a skip-gram (sg=1) Word2Vec model on the extracted corpus and save it.
# The wall-clock time is printed before training and after saving.
corpus_file = C_LocalDataBase + C_AllText
print(time.strftime('%F %T'))
sentences = word2vec.LineSentence(corpus_file)
w2v_model = word2vec.Word2Vec(
    sentences,
    size=C_W2VSize,
    window=C_W2VWindow,
    min_count=C_W2VMinCount,
    workers=C_W2VWorkers,
    sg=1,
)
w2v_model.save(C_Word2VecModelBase + C_W2VModel_300)
print(time.strftime('%F %T'))

### 测试词向量

In [None]:
# Smoke-test the trained model: print the vector of token '1' and its nearest neighbours.
# NOTE(review): C_W2VModel_300 is not defined in the visible part of this file
# (only C_W2VModel is) — confirm which name is intended.
w2v_model = gensim.models.Word2Vec.load(C_Word2VecModelBase + C_W2VModel_300)
# Go through the KeyedVectors in `.wv`, as the feature code in this file does,
# rather than indexing the model object directly.
print(w2v_model.wv['1'])
print(w2v_model.wv.most_similar(['1']))

## 3. 特征工程

### 特征提取及辅助函数

In [None]:
# Count the tokens shared by query and title.
def q_t_common_words(query, title):
    """Return how many distinct items occur in both `query` and `title`."""
    shared = set(query).intersection(set(title))
    return len(shared)

# Jaccard similarity between the token sets of query and title.
def jaccard(query, title):
    """Return |Q & T| / |Q | T| for the distinct items of `query` and `title`.

    Args:
        query: iterable of hashable tokens.
        title: iterable of hashable tokens.

    Returns:
        The Jaccard similarity as a float in [0, 1]; 0.0 when both inputs are
        empty (the ratio is undefined there and used to raise
        ZeroDivisionError).
    """
    query_tokens = set(query)
    title_tokens = set(title)
    union_size = len(query_tokens | title_tokens)
    if union_size == 0:
        # Nothing to compare: treat two empty inputs as having no overlap.
        return 0.0
    return len(query_tokens & title_tokens) / union_size

# Distance / similarity between the bag-of-words count vectors of query and title.
def query_title_similarity(query, title, name):
    """Compare `query` and `title` through their word-count vectors.

    Args:
        query: query text.
        title: title text.
        name: 'euclid' for the Euclidean distance, 'manhattan' for the
            Manhattan distance; any other value selects cosine similarity.

    Returns:
        The requested measure, or 0 when anything in the computation raises
        (a message is printed in that case).
    """
    try:
        vectorizer = CountVectorizer(analyzer='word', token_pattern=u"(?u)\\b\\w+\\b")
        counts = vectorizer.fit_transform([query, title]).toarray()
        query_counts, title_counts = counts[0], counts[1]
        if name == 'euclid':
            return np.linalg.norm(query_counts - title_counts)
        if name == 'manhattan':
            return np.sum(np.abs(query_counts - title_counts))
        # Default: cosine similarity.
        norms = np.linalg.norm(query_counts) * np.linalg.norm(title_counts)
        return np.sum(query_counts * title_counts) / norms
    except Exception:
        print('Meet a outlier sample!')
        return 0
        
# Load the trained word2vec model; get_w2v below reads it as a module-level global.
w2v_model = gensim.models.Word2Vec.load('/home/kesci/work/word2vec-models/Word2Vec_300.model')

# Mean word2vec vector of a token list.
def _mean_w2v_vector(words, dim=300):
    """Average the word2vec vectors of the tokens in `words` that the model knows.

    Returns a zero vector of length `dim` when no token is in the vocabulary,
    so the result is never the NaN vector a division by zero would give.
    """
    total = np.zeros(dim)
    hits = 0
    for w in words:
        if w in w2v_model.wv:
            total += w2v_model.wv[w]
            hits += 1
    if hits == 0:
        return total
    return total / hits


# Metric id -> (distance function, whether the result is divided by the vector length).
_W2V_DISTANCES = {
    1: (cosine, False),
    2: (canberra, True),
    3: (cityblock, True),
    4: (euclidean, False),
    5: (braycurtis, False),
    6: (minkowski, False),
    7: (correlation, False),
}


# Distance between the mean word vectors of query and title.
def get_w2v(query, title, num):
    """Return one distance between the mean word2vec vectors of two token lists.

    Args:
        query: iterable of query tokens.
        title: iterable of title tokens.
        num: metric id — 1 cosine, 2 canberra (per dimension), 3 cityblock
            (per dimension), 4 euclidean, 5 braycurtis, 6 minkowski,
            7 correlation.

    Returns:
        The distance value; 0 if the distance function raises; None for an
        unknown `num`.
    """
    query_vec = _mean_w2v_vector(query)
    # The title fallback is its own zero vector, not the query accumulator.
    title_vec = _mean_w2v_vector(title)

    entry = _W2V_DISTANCES.get(num)
    if entry is None:
        return None
    distance, per_dimension = entry
    try:
        value = distance(query_vec, title_vec)
        if per_dimension:
            value = value / len(query_vec)
        return value
    except Exception:
        # Deliberate best effort: one bad sample must not abort a whole feature pass.
        return 0

# Word n-gram overlap flag.
def ngram_nums(query, title, n=2):
    """Return 1 if `query` and `title` share at least one word n-gram, else 0.

    Args:
        query: query text.
        title: title text.
        n: n-gram size (default 2).

    Returns:
        1 or 0; 0 is also returned when anything in the computation raises.
    """
    try:
        tokenize = CountVectorizer(
            ngram_range=(n, n), analyzer='word', token_pattern=u"(?u)\\b\\w+\\b"
        ).build_analyzer()
        shared = set(tokenize(query)) & set(tokenize(title))
        return int(len(shared) > 0)
    except Exception:
        return 0

# Feature extraction entry point.
def extract_feature(data):
    """Compute the hand-crafted query/title features for a frame of samples.

    Args:
        data: DataFrame with at least the columns `query_id`, `query` and
            `title`, where query and title are whitespace-separated token
            strings. The frame is modified in place until the first merge, so
            pass a copy if the original must stay intact.

    Returns:
        A DataFrame in which `query` and `title` have been dropped and the
        derived feature columns added.
    """
    time1 = datetime.datetime.now()
    print('Extracting features start.')

    data['query_split_30'] = data['query'].apply(lambda x: x.split())
    data['title_split_30'] = data['title'].apply(lambda x: x.split())

    # Word-vector distance features: (column name, metric id understood by get_w2v).
    # Metric id 6 (minkowski) is not extracted.
    w2v_features = (
        ('vec_cosine', 1),
        ('vec_canberra', 2),
        ('vec_cityblock', 3),
        ('vec_euclidean', 4),
        ('vec_braycurtis', 5),
        ('vec_correlation', 7),
    )
    for column, metric in w2v_features:
        # `metric` is bound as a default argument so each lambda keeps its own id.
        data[column] = data.progress_apply(
            lambda index, metric=metric: get_w2v(index['query_split_30'], index['title_split_30'], metric),
            axis=1)
    for column, _ in w2v_features:
        data[column] = data[column].astype('float32')

    data.drop(['query_split_30', 'title_split_30'], axis=1, inplace=True)
    gc.collect()

    # Length and overlap statistics.
    data['query_split'] = data['query'].apply(lambda x: x.split())
    data['title_split'] = data['title'].apply(lambda x: x.split())
    data['query_len'] = data['query_split'].map(len)
    data['title_len'] = data['title_split'].map(len)
    data['q_t_rate'] = data['query_len'] / data['title_len']
    data['q_t_common_words'] = data.apply(lambda index: q_t_common_words(index.query_split, index.title_split), axis=1)
    data['common_words_ql_rate'] = data['q_t_common_words'] / data['query_len']
    data['common_words_tl_rate'] = data['q_t_common_words'] / data['title_len']

    data.drop(['query_split', 'title_split'], axis=1, inplace=True)
    gc.collect()

    data['title_len'] = data['title_len'].astype('int16')
    data['q_t_rate'] = data['q_t_rate'].astype('float32')
    data['q_t_common_words'] = data['q_t_common_words'].astype('int16')
    data['common_words_ql_rate'] = data['common_words_ql_rate'].astype('float32')

    # Squared query-coverage rate. This is the x0^2 term that a degree-2
    # polynomial expansion of [common_words_ql_rate, common_words_tl_rate]
    # yields, computed directly: no positional concat that misaligns rows when
    # the index is not 0..n-1, and no failure on the NaN/inf ratios that empty
    # queries or titles produce.
    data['q_in_t_n_ql_r_sq'] = (data['common_words_ql_rate'].astype('float64') ** 2).astype('float32')

    data.drop(['common_words_tl_rate', 'query_len'], axis=1, inplace=True)

    # Count features: rows per query_id and occurrences of each title.
    query_cnt = data.groupby(['query_id'], as_index=False)['query'].count().rename(columns={'query': 'query_cnt'})
    data = pd.merge(data, query_cnt, on='query_id', how='left')
    title_cnt = data['title'].value_counts().reset_index(name='title_cnt').rename(columns={'index': 'title'})
    data = pd.merge(data, title_cnt, on='title', how='left')
    del query_cnt
    gc.collect()

    # Shortest title length within each query_id.
    title_len_min = data.groupby(['query_id'], as_index=False)['title_len'].agg(min).rename(columns={'title_len': 'title_len_min'})
    data = pd.merge(data, title_len_min, on='query_id', how='left')
    del title_len_min
    gc.collect()

    data['query_cnt'] = data['query_cnt'].astype('int16')
    data['title_len_min'] = data['title_len_min'].astype('int16')

    # String similarity features from the Levenshtein package (raw strings).
    data['levenshtein_distance'] = data.apply(lambda line: Levenshtein.distance(line['query'], line['title']), axis=1)
    data['levenshtein_ratio'] = data.apply(lambda line: Levenshtein.ratio(line['query'], line['title']), axis=1)
    data['levenshtein_jaro'] = data.apply(lambda line: Levenshtein.jaro(line['query'], line['title']), axis=1)
    data['levenshtein_jaro_winkler'] = data.apply(lambda line: Levenshtein.jaro_winkler(line['query'], line['title']), axis=1)

    data['levenshtein_ratio'] = data['levenshtein_ratio'].astype('float32')
    data['levenshtein_jaro'] = data['levenshtein_jaro'].astype('float32')
    data['levenshtein_jaro_winkler'] = data['levenshtein_jaro_winkler'].astype('float32')

    # The rank is taken from the Levenshtein-package distance computed above,
    # before that column is overwritten further down.
    data['leven_dis_rank'] = data.groupby(['query'], as_index=False)['levenshtein_distance'].rank(method='average', ascending=True)
    data['hamming_similarity'] = data.apply(lambda line: textdistance.Hamming(qval=None).similarity(line['query'], line['title']), axis=1)
    data['hamming_normalized_distance'] = data.apply(lambda line: textdistance.Hamming(qval=None).normalized_distance(line['query'], line['title']), axis=1)

    data['leven_dis_rank'] = data['leven_dis_rank'].astype('float32')
    data['hamming_similarity'] = data['hamming_similarity'].astype('int32')
    data['hamming_normalized_distance'] = data['hamming_normalized_distance'].astype('float32')

    # textdistance-based features; `levenshtein_distance` is replaced by the textdistance value here.
    data['levenshtein_distance'] = data.apply(lambda line: textdistance.Levenshtein(qval=None).distance(line['query'], line['title']), axis=1)
    data['levenshtein_similarity'] = data.apply(lambda line: textdistance.Levenshtein(qval=None).similarity(line['query'], line['title']), axis=1)
    data['levenshtein_similarity_rank'] = data.groupby(['query'], as_index=False)['levenshtein_similarity'].rank(method='average', ascending=True)

    # Only the within-query rank of the Jaccard similarity is kept.
    data['jaccard_similarity'] = data.apply(lambda line: textdistance.Jaccard(qval=None).similarity(line['query'], line['title']), axis=1)
    data['jaccard_similarity_rank'] = data.groupby(['query'], as_index=False)['jaccard_similarity'].rank(method='average', ascending=True)
    data.drop(['jaccard_similarity'], axis=1, inplace=True)
    gc.collect()

    data['levenshtein_distance'] = data['levenshtein_distance'].astype('int32')
    data['levenshtein_similarity'] = data['levenshtein_similarity'].astype('int32')
    data['levenshtein_similarity_rank'] = data['levenshtein_similarity_rank'].astype('float32')
    data['jaccard_similarity_rank'] = data['jaccard_similarity_rank'].astype('float32')

    # The raw text is no longer needed once all features are computed.
    data.drop(['query', 'title'], axis=1, inplace=True)
    gc.collect()

    time2 = datetime.datetime.now()
    print('Extracting features end.')
    print('Cost time: {}s'.format((time2 - time1).seconds))
    print('After feature engineering dataset size:\n{}'.format(data.shape))
    return data

### 提取特征并保存特征文件到磁盘

In [None]:
# Column dtypes used when reading the raw training CSV.
orin_types = {'query_id': np.int32, 'query_title_id': np.int16, 'label': np.int8}


# Load one slice of the training set.
def load_train_data(rows=4000000, skiprows=696000000,
                    path='/home/kesci/input/bytedance/train_final.csv'):
    """Read `rows` header-less CSV lines starting after the first `skiprows` lines.

    With the defaults this loads the 4,000,000 rows from line 696,000,000 to
    line 700,000,000, exactly what the previous hard-coded call did. `rows`
    used to be accepted but ignored; it is now honoured, and its default was
    changed from 2,000,000 to 4,000,000 so that a call without arguments loads
    the same data as before.

    Args:
        rows: number of rows to read.
        skiprows: number of leading lines to skip.
        path: CSV file to read.

    Returns:
        DataFrame with columns query_id, query, query_title_id, title, label.
    """
    train = pd.read_csv(path, encoding='utf-8',
                        names=['query_id', 'query', 'query_title_id', 'title', 'label'],
                        skiprows=skiprows, nrows=rows, dtype=orin_types)
    print('Training set size:{}'.format(train.shape))
    return train
    
# Run the feature extraction for one slice of the training data.
time_now = datetime.datetime.now().strftime('%Y/%m/%d-%H:%M:%S')
print('Time: {}'.format(time_now))
# time.clock() was removed in Python 3.8; perf_counter() is its replacement.
start = time.perf_counter()
print('Data loading ...')
train = load_train_data()
print('=' * 50)

# Extract the features for this slice (rows 696,000,000 to 700,000,000 with the
# loader defaults) and write them to disk; a copy is passed because
# extract_feature modifies its input.
train_feature_696_70000w = extract_feature(train.copy())
train_feature_696_70000w.to_csv('/home/kesci/work/data/ml/train_feature_696_70000w.csv', index=None)
# Free the large frames before the next stage.
del train_feature_696_70000w, train
gc.collect()

print('Train features extracted over.')
print('-' * 50)
time_now = datetime.datetime.now().strftime('%Y/%m/%d-%H:%M:%S')
print('Time: {}'.format(time_now))

## 4. 模型训练

In [None]:
# Configure logging, then load the word2vec model saved under the model directory.
logging.basicConfig(format=C_LogFormat, level=logging.DEBUG)
w2v_model_path = C_Word2VecModelBase + C_W2VModel
w2v_model = gensim.models.Word2Vec.load(w2v_model_path)

In [None]:
# Build the vocabulary from the training CSV.
def build_vocab(train_path, min_count=5):
    """Count query/title tokens and map the frequent ones to integer ids.

    Ids 0 and 1 are reserved for UNK and PAD; the remaining tokens get
    consecutive ids in the order they are first seen in the file.

    Args:
        train_path: CSV file whose rows are
            (query_id, query, query_title_id, title, label).
        min_count: minimum number of occurrences for a token to be kept.

    Returns:
        (word2idx, idx2word): token -> id and id -> token dictionaries.

    Raises:
        ValueError: if a row does not have exactly five fields.
    """
    counter = Counter()
    # The context manager closes the file even if a malformed row raises;
    # the encoding matches the one used when this file is read with pandas.
    with open(train_path, newline='', encoding='utf-8') as csvfile:
        for i, row in enumerate(csv.reader(csvfile)):
            _query_id, query, _query_title_id, title, _label = row
            counter.update(query.split())
            counter.update(title.split())
            if i % 10000000 == 0:
                print(i, len(counter))
            # Stop once the row index exceeds 10000 * 10000.
            if i > 10000 * 10000:
                break

    word2idx = {UNK: 0, PAD: 1}
    for word, freq in counter.items():
        # Skip corpus tokens that collide with the special tokens so that the
        # reserved ids 0 and 1 are never reassigned.
        if freq >= min_count and word not in word2idx:
            word2idx[word] = len(word2idx)
    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word

# Build the vocabulary from the training file and persist both mappings.
word2idx, idx2word = build_vocab(train_path, min_count=7)
# Context managers make sure the pickle files are flushed and closed.
with open(word2idx_path, 'wb') as word2idx_file:
    pickle.dump(word2idx, word2idx_file)
with open(idx2word_path, 'wb') as idx2word_file:
    pickle.dump(idx2word, idx2word_file)

In [None]:
# Build the word-embedding matrix from the vocabulary and the trained word2vec
# model, then save it to disk.
vocab_size = len(idx2word)
print("index2word len:", vocab_size)
embedding_size = w2v_model.layer1_size
print("embedding size:", embedding_size)

pretrain_emb = []
count = 0  # number of vocabulary words without a word2vec vector
for i in range(vocab_size):
    word = idx2word[i]
    if word in w2v_model.wv:
        pretrain_emb.append(w2v_model.wv[word])
        continue
    count += 1
    # PAD gets an all-zero row; every other missing word a standard-normal row.
    if word == PAD:
        rand_emb = np.zeros(embedding_size)
    else:
        rand_emb = np.random.normal(loc=0, scale=1, size=embedding_size)
    pretrain_emb.append(rand_emb)

print("rand init count:",count)
pretrain_emb = np.array(pretrain_emb)
np.save(embedding_path, pretrain_emb)

In [None]:
# Sizes of the whole data set and its validation part, and the matching feature file.
all_data_size = 10 ** 8
valid_size = 10 ** 6
train_size = all_data_size - valid_size
feat_path = "./data/train_feature_1E.csv"
# skip_size is 0 when training on the first 100 million rows, 100 million for
# the second 100 million rows, and so on.
skip_size = 0 * all_data_size

In [None]:
output_size = 2  # width of the final linear layer
feat_dim = 24  # width of the hand-crafted feature vector fed to the model (alternative noted by the author: 37)


class ESIM_LSTM(nn.Module):
    """ESIM-style query/title matcher combined with hand-crafted features.

    Query and title share one embedding table and one bidirectional LSTM
    encoder. The encodings are soft-aligned against each other, enriched,
    composed by a second bidirectional LSTM, pooled and merged with a dense
    projection of the feature vector before the final linear layer.

    Args:
        pretrain: when true (together with `is_training`), initialise the
            embedding table from the file at `embedding_path`, if it exists,
            and freeze it.
        is_training: see `pretrain`.
    """

    def __init__(self, pretrain=True, is_training=True):
        super(ESIM_LSTM, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_dim = 128
        self.feat_dim = feat_dim
        self.feat_hidden_dim = 100
        self.merge_dim = 300
        self.dropout_rate = 0.2
        self.embedding_dim = embedding_size

        # The embedding table is created the same way with or without pretraining.
        self.word_embeds = nn.Embedding(vocab_size, self.embedding_dim)
        if pretrain and os.path.exists(embedding_path) and is_training:
            print("Loading pretrain embedding...")
            self.word_embeds.weight.data.copy_(torch.from_numpy(np.load(embedding_path)))
            # Keep the pretrained vectors fixed during training.
            self.word_embeds.weight.requires_grad = False
            print("Loaded")

        # Single-layer encoder. nn.LSTM applies its `dropout` argument only
        # between stacked layers, so with num_layers=1 it did nothing except
        # raise a warning; dropout on the encodings is done by self.dropout.
        self.q_encoder = nn.LSTM(self.embedding_dim, hidden_size=self.hidden_dim,
                                 num_layers=1,
                                 bidirectional=True, batch_first=True)

        # Input: four concatenated tensors of width 2*hidden_dim each.
        self.composition_layer = nn.LSTM(input_size=4*2*self.hidden_dim,
                                         hidden_size=self.hidden_dim,
                                         num_layers=1,
                                         batch_first=True,
                                         bidirectional=True)
        # Pooled query and title representations are each 4*hidden_dim wide.
        self.esim_fc = nn.Linear(8*self.hidden_dim, 4*self.hidden_dim)
        self.feat_fc = nn.Linear(self.feat_dim, self.feat_hidden_dim)
        self.merge_fc = nn.Linear(4*self.hidden_dim+self.feat_hidden_dim, self.merge_dim)
        self.final_fc = nn.Linear(self.merge_dim, output_size)
        self.dropout = nn.Dropout(p=self.dropout_rate)

    @staticmethod
    def _attention(q1, q2):
        """Soft-align two encoded sequences against each other."""
        # align_mat: [batch, q1_len, q2_len]
        align_mat = torch.bmm(q1, q2.transpose(1, 2))
        # align_a: [batch, q1_len, q2_feature_dim]
        align_a = torch.bmm(F.softmax(align_mat, dim=2), q2)
        # align_b: [batch, q2_len, q1_feature_dim]
        align_b = torch.bmm(F.softmax(align_mat, dim=1).transpose(1, 2), q1)
        return align_a, align_b

    @staticmethod
    def _pooling(inputs):
        """Concatenate the max and the mean over dimension 1."""
        max_output, _ = torch.max(inputs, 1)
        avg_output = torch.mean(inputs, 1)
        return torch.cat([max_output, avg_output], 1)

    def forward(self, query, title, feat):
        """Score a batch of (query, title, feature) triples.

        Args:
            query: token-id tensor, [batch_size, q_len].
            title: token-id tensor, [batch_size, t_len].
            feat: feature tensor whose last dimension is `feat_dim`.

        Returns:
            Tensor of shape [batch_size, output_size] (unnormalised scores).
        """
        # [batch_size, len] -> [batch_size, len, embedding_dim]
        q1_embed = self.word_embeds(query)
        q2_embed = self.word_embeds(title)
        # Encodings: [batch_size, len, 2*hidden_dim]
        q1_encoding, _ = self.q_encoder(q1_embed)
        q2_encoding, _ = self.q_encoder(q2_embed)
        q1_encoding = self.dropout(q1_encoding)
        q2_encoding = self.dropout(q2_encoding)
        # align_a: [batch, q1_len, 2*hidden_dim], align_b: [batch, q2_len, 2*hidden_dim]
        align_a, align_b = self._attention(q1_encoding, q2_encoding)
        # m_a / m_b: encoding, aligned counterpart, absolute difference and
        # product concatenated -> last dimension is 4 * (2*hidden_dim).
        m_a = torch.cat([q1_encoding, align_a, torch.abs(q1_encoding-align_a), q1_encoding*align_a], 2)
        m_b = torch.cat([q2_encoding, align_b, torch.abs(q2_encoding-align_b), q2_encoding*align_b], 2)
        v_a, _ = self.composition_layer(m_a)
        v_b, _ = self.composition_layer(m_b)

        v_a = self._pooling(v_a)
        v_b = self._pooling(v_b)
        v = torch.cat([v_a, v_b], 1)
        v = self.dropout(v)
        esim_output = F.leaky_relu(self.esim_fc(v))
        feat_output = F.leaky_relu(self.feat_fc(feat))
        merge_output = torch.cat([esim_output, feat_output], dim=1)
        output = self.final_fc(F.leaky_relu(self.merge_fc(merge_output)))
        return output

In [None]:
# Training hyper-parameters.
batch_size = 512
epoch_num = 15
begin_epoch = 0
init_lr = 0.0002
weight_decay = 0.0
print_step = 10000  # print the running loss every print_step batches
use_cuda = True

# The model to train, moved to the GPU when requested, and its loss.
model = ESIM_LSTM(pretrain=True, is_training=True)
if use_cuda:
    model = model.cuda()
loss_fn = nn.CrossEntropyLoss()

In [None]:
# 模型训练与验证，并保存验证得分最高的模型
optimizer = optim.Adam(model.parameters(), lr=init_lr, weight_decay=weight_decay)
max_qauc = 0
for epoch in range(begin_epoch, begin_epoch+epoch_num):
    running_loss = 0.0
    count = 0
    model = model.train()
    nowTime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("Begin | Time: %s, Epoch: %d" % (nowTime, epoch))
    for query, title, feat, y in gen_batch_data(train_path, batch_size):
        optimizer.zero_grad()
        if use_cuda:
            query, title, feat, y = query.cuda(), title.cuda(), feat.cuda(), y.cuda()
        output = model(query, title, feat)
        loss = loss_fn(output, y)
        loss.backward()
        optimizer.step()
        cur_loss = loss.item()
        running_loss += cur_loss
        count += 1
        if count % print_step == 0:
            nowTime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print("Time: %s, Epoch: %d, Count: %d, Loss: %.4f" % (nowTime, epoch, count, running_loss/count))
    lr = max(0.0001, (init_lr * (0.9)**(epoch+1-begin_epoch)))
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    nowTime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("END | Time: %s, Epoch: %d, Loss: %.4f, lr: %.5f" % (nowTime, epoch, running_loss/count, lr))