In [1]:
from collections import defaultdict
from math import sqrt

import numpy as np
import tensorflow as tf

In [None]:
# variables,train_corpus,min_count=1
def build_vocab(model, sentences, min_count, trim_rule=None, null_word=False):
    """Run the three vocabulary passes on ``model``, in order.

    1. ``scan_vocab`` counts raw word frequencies over ``sentences``.
    2. ``scale_vocab`` merges those counts into the model, honouring ``min_count``.
    3. ``finalize_vocab`` builds the derived tables (and the optional null word).

    Nothing is returned; all results are stored on ``model``.
    """
    raw_counts = scan_vocab(model, sentences, trim_rule=trim_rule)
    scale_vocab(model, raw_counts, min_count)
    finalize_vocab(model, null_word)
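'''
Count word frequencies over the documents: vocab {word: frequency} (some words are filtered out later by frequency);
record tag lengths: docvecs {tag: [index, total number of words in the documents carrying that tag]}
'''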
def scan_vocab(model, documents, trim_rule=None):
    """Count raw word frequencies and register document tags on ``model``.

    Args:
        model: Object exposing ``docvecs`` (mapping ``tag -> [index, word_total]``),
            ``offset2doctag`` (list of tags ordered by index) and
            ``max_vocab_size`` (falsy disables pruning).
        documents: Iterable of objects with ``words`` and ``tags`` sequences.
        trim_rule: Forwarded unchanged to ``utils.prune_vocab``.

    Returns:
        A ``defaultdict(int)`` mapping every seen word to its raw count.

    Side effects:
        Adds unseen tags to ``model.docvecs`` / ``model.offset2doctag``, adds each
        document's length to the word total of every tag it carries, and sets
        ``model.n_newTags`` and ``model.corpus_count``.
    """
    min_reduce = 1
    vocab = defaultdict(int)
    n_new_tags = 0
    n_documents = 0
    for document in documents:
        n_documents += 1
        document_length = len(document.words)

        for tag in document.tags:
            if tag in model.docvecs:
                model.docvecs[tag][1] += document_length
            else:
                n_new_tags += 1
                # docvecs entry is [index into offset2doctag, summed document length]
                model.docvecs[tag] = [len(model.offset2doctag), document_length]
                model.offset2doctag.append(tag)

        for word in document.words:
            vocab[word] += 1

        # Keep memory bounded while scanning.
        # NOTE(review): `utils` is not defined in the visible source -
        # presumably gensim.utils; confirm it is imported elsewhere.
        if model.max_vocab_size and len(vocab) > model.max_vocab_size:
            utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
            # Raise the bar for the next prune; without this every prune used
            # the same threshold of 1 and could never shrink the dict further.
            min_reduce += 1

    model.n_newTags = n_new_tags
    model.corpus_count = n_documents
    return vocab
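'''
Merge the word counts from scan_vocab into model.vocab according to the min_count filter; model.vocab = {} key: word(str) -> [wordindex, count, 2**32 * word_probability]
Also compute word_probability for every retained word (the probability that the word is KEPT when subsampling), i.e. the paper's treatment of high-frequency words
'''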
def scale_vocab(model, vocabs, min_count, dry_run=False):
    """Merge raw counts into ``model.vocab`` and set subsampling thresholds.

    Args:
        model: Object exposing ``sample``, ``vocab`` (mapping
            ``word -> [index, count, sample_int]``) and ``index2word`` (list).
        vocabs: Mapping ``word -> raw count`` as produced by ``scan_vocab``.
        min_count: Words whose raw count is below this value are ignored.
        dry_run: When true, ``model.vocab`` and ``model.index2word`` are left
            untouched; only ``model.n_newWords`` is set.

    Side effects:
        Sets ``model.n_newWords`` to the number of retained words that were not
        already in ``model.vocab``. Unless ``dry_run``, adds new words, increases
        the counts of existing ones, and stores ``int(round(p * 2 ** 32))`` as the
        third entry of each retained word, where ``p`` is its keep-probability
        (capped at 1.0).
    """
    sample = model.sample
    n_words_exist = len(model.vocab)
    new_words, pre_exist_words = [], []
    retain_total = 0

    for word, count in vocabs.items():
        if count < min_count:
            continue
        retain_total += count
        if word in model.vocab:
            pre_exist_words.append(word)
            if not dry_run:
                model.vocab[word][1] += count
        else:
            new_words.append(word)
            if not dry_run:
                model.vocab[word] = [len(model.index2word), count, 0]
                model.index2word.append(word)

    n_new_words = len(new_words)
    if not dry_run:
        # Only meaningful when the vocab was actually extended; on a dry run the
        # vocab is deliberately unchanged, so this used to fail for any new word.
        assert n_words_exist + n_new_words == len(model.vocab)
    model.n_newWords = n_new_words

    if dry_run:
        # Everything below only writes sample thresholds into model.vocab.
        return

    # Precalculate the count above which words start being downsampled.
    if not sample:
        # no words downsampled
        threshold_count = retain_total
    elif sample < 1.0:
        # traditional meaning: parameter is a proportion of the retained total
        threshold_count = sample * retain_total
    else:
        # shorthand: sample >= 1 means downsample all words with a higher count
        threshold_count = int(sample * (3 + sqrt(5)) / 2)

    for word in new_words + pre_exist_words:
        count = vocabs[word]
        # (sqrt(f / t) + 1) * (t / f): keep-probability form that avoids zero.
        word_probability = (sqrt(count / threshold_count) + 1) * (threshold_count / count)
        word_probability = min(word_probability, 1.0)
        model.vocab[word][2] = int(round(word_probability * 2 ** 32))
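'''
Precompute the negative-sampling table: model.cum_table is an ascending array whose last entry is 2**31 - 1,
so that every word occupies a span of model.cum_table according to its frequency: len(w) = count(w) ** 0.75 / sum_u(count(u) ** 0.75)
'''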
def finalize_vocab(model, null_word=False):
    """Build the negative-sampling table and optionally add the null word.

    When ``model.negative`` is truthy, ``model.cum_table`` is replaced by an
    array with one slot per entry of ``model.index2word``; slot ``i`` holds the
    rounded cumulative share of ``count ** 0.75`` up to and including word
    ``i``, scaled to ``2 ** 31 - 1``.

    When ``null_word`` is true and ``'\\0'`` is not yet known, it is appended to
    ``model.index2word`` and stored in ``model.vocab`` with a count of 1.
    """
    power = 0.75
    domain = 2 ** 31 - 1

    if model.negative:
        model.cum_table = np.zeros(len(model.index2word))
        weights = [model.vocab[word][1] ** power for word in model.index2word]

        # Normaliser (Z in the paper), accumulated in vocabulary order so the
        # final cumulative value below equals it exactly.
        normaliser = 0.0
        for weight in weights:
            normaliser += weight

        running = 0.0
        for slot, weight in enumerate(weights):
            running += weight
            model.cum_table[slot] = round(running / normaliser * domain)

        if len(model.cum_table) > 0:
            assert model.cum_table[-1] == domain

    if null_word and '\0' not in model.vocab:
        # Padding pseudo-word: only ever used as input, never predicted, so its
        # count and sample threshold are placeholders.
        entry = [len(model.vocab), 1, 0]
        model.index2word.append('\0')
        model.vocab['\0'] = entry
'''
Negative-sampling function
'''
def _random_sample_negative(model, predictword):
#     采样一定会采到中心词
    word_indices = [predictword[0]]
    while len(word_indices) < model.negative + 1:
#         model.cum_table[-1])==int的上限
        w = model.cum_table.searchsorted(np.random.randint(model.cum_table[-1]))
        # w=np.random.randint(len(model.cum_table))
        if w != predictword[0]:
            word_indices.append(w)
    return word_indices
''''''
def generate_label(model, sentences):
    """Build (doc-tag index, sampled word indices) training pairs.

    For every in-vocabulary word of every sentence one set of indices
    ``[word, negatives...]`` is drawn via ``_random_sample_negative`` and paired
    with each of the sentence's known tag indices.

    Args:
        model: Object exposing ``docvecs`` (``tag -> [index, ...]``) and ``vocab``
            (``word -> [index, count, sample_int]``).
        sentences: Iterable of objects with ``words`` and ``tags`` sequences.

    Returns:
        ``(batch, label)`` NumPy arrays of equal length: ``batch`` holds tag
        indices, ``label`` the matching rows of sampled word indices.
    """
    batch, label = [], []
    for sentence in sentences:
        doctag_indexes = [model.docvecs[tag][0]
                          for tag in sentence.tags if tag in model.docvecs]

        for word in sentence.words:
            # Words absent from the vocabulary (e.g. filtered by min_count)
            # used to raise KeyError; skip them instead.
            if word not in model.vocab:
                continue
            word_indices = _random_sample_negative(model, model.vocab[word])
            batch.extend(doctag_indexes)
            # one label row per tag index just added to `batch`
            label.extend([word_indices] * len(doctag_indexes))
    return np.array(batch), np.array(label)

def generate_batch_words(model, sentences):
    """Build skip-gram (centre word index, sampled word indices) pairs.

    Each sentence is first reduced to its in-vocabulary words that survive
    random subsampling (``sample_int > rand * 2 ** 32``). Every surviving word is
    then paired with each other word within ``model.window`` positions; for each
    such pair a row ``[context, negatives...]`` is drawn via
    ``_random_sample_negative``.

    Args:
        model: Object exposing ``vocab`` (``word -> [index, count, sample_int]``),
            ``window`` and whatever ``_random_sample_negative`` reads.
        sentences: Iterable of objects with a ``words`` sequence.

    Returns:
        ``(batch, label)`` NumPy arrays of equal length.
    """
    batch, label = [], []
    for sentence in sentences:
        # Unknown words used to raise KeyError here; drop them like any other
        # word that is not trained on.
        word_vocabs = [
            model.vocab[w] for w in sentence.words
            if w in model.vocab and model.vocab[w][2] > np.random.rand() * 2 ** 32
        ]
        for pos, word in enumerate(word_vocabs):
            start = max(0, pos - model.window)
            window_words = word_vocabs[start:(pos + model.window + 1)]
            for pos2, word2 in enumerate(window_words, start):
                # don't train on the `word` itself
                if pos2 == pos:
                    continue
                batch.append(word[0])
                # The sampler is a module-level function taking the model as its
                # first argument (as generate_label calls it), not a method.
                label.append(_random_sample_negative(model, word2))

    return np.array(batch), np.array(label)

In [None]:
class tfDoc2vec(object):
    """Doc-vector trainer written against the TensorFlow 1.x graph API.

    Word vectors, output weights, vocabulary, cumulative table and ``sample``
    are copied from a pre-trained ``other_model``; document tags and their
    vectors are collected and trained by this object.
    """

    def __init__(self, negative, window, batch_words, other_model):
        """Copy state from ``other_model`` and normalise the vocabulary layout.

        Args:
            negative: Number of negative samples per positive example.
            window: Context window radius used for skip-gram pairs.
            batch_words: Stored on the instance; not read by this class itself.
            other_model: Pre-trained model exposing ``wv.vocab``,
                ``wv.index2word``, ``wv.syn0``, ``syn1neg``, ``cum_table`` and
                ``sample``.
        """
        self.vocab = other_model.wv.vocab
        self.index2word = other_model.wv.index2word
        self.syn1neg = other_model.syn1neg.astype(np.float32)
        self.syn0 = other_model.wv.syn0.astype(np.float32)
        self.cum_table = other_model.cum_table
        self.negative = negative
        self.window = window
        self.batch_words = batch_words
        self.max_vocab_size = 10000000
        self.corpus_count = 0
        self.sample = other_model.sample
        # init_weights used to read `model.seed` from an undefined/global name;
        # keep the seed on the instance instead (falls back to 1).
        self.seed = getattr(other_model, 'seed', 1)
        self.vector_size = 300
        self.layer1_size = 300
        self.docvecs = {}
        self.offset2doctag = []
        self.docvec_syn0 = []
        self.n_newWords = None
        self.n_newTags = None
        self.learn_words = False
        self.convertDataStructure()

    def train(self, sentences, epochs=None):
        """Build the graph and train the doc vectors (and optionally words).

        Args:
            sentences: Corpus handed to ``get_batch_sentences``.
            epochs: Number of passes over the batches; ``None`` means one pass.

        Side effects:
            After at least one DBOW step, stores the trained document vectors in
            ``self.docvec_syn0`` and the output weights in ``self.syn1neg``.
        """
        # range(None) used to raise TypeError with the default argument.
        n_epochs = 1 if epochs is None else epochs

        # number of new words / new tags (sizes of the freshly created weights)
        n_newWords = tf.placeholder(dtype=tf.int32, name='n_newWords')
        n_newTags = tf.placeholder(dtype=tf.int32, name='n_newTags')
        # existing word vectors, document vectors and output weights (syn1neg)
        wv_plh = tf.placeholder(dtype=tf.float32, name='wv_plh')
        doc_vec = tf.placeholder(dtype=tf.float32, name='doc_vec')
        sp_plh = tf.placeholder(dtype=tf.float32, name='sp_plh')

        w2v_embeddings, _ = self.createWeights(
            wv_plh, n_newWords, self.vector_size,
            ['exist_vocabs', 'new_vocabs', 'w2v_embeddings'])
        d2v_embeding, _ = self.createWeights(
            doc_vec, n_newTags, self.layer1_size,
            ['exist_doc2vec', 'new_doc2vec', 'd2v_embeding'])
        nce_weights, _ = self.createWeights(
            sp_plh, n_newWords, self.vector_size,
            ['exit_nce_weights', 'new_nce_weights', 'nce_weights'])

        optimizer_sg, loss_sg = self.build_skip_gram(w2v_embeddings, nce_weights)
        optimizer_db, loss_db = self.build_dbow(d2v_embeding, nce_weights)

        # NOTE(review): get_batch_sentences is not defined in the visible
        # source; it is assumed to return (iterable of sentence batches, _).
        batch_sentences, _ = get_batch_sentences(self, sentences)

        doc_syn0 = softmax_weight = None
        with tf.Session() as sess:
            print('start train skip_gram model')
            sess.run(tf.global_variables_initializer(),
                     feed_dict={
                         n_newWords: self.n_newWords,
                         n_newTags: self.n_newTags,
                         wv_plh: self.syn0,
                         doc_vec: [],
                         sp_plh: self.syn1neg
                     })
            for epoch in range(n_epochs):
                for batch in batch_sentences:
                    # train word2vec skip-gram model
                    if self.learn_words:
                        inputs_sg, labels_sg = generate_batch_words(self, batch)
                        if len(labels_sg) > 0 and len(inputs_sg) > 0:
                            feed_dict = {'inputs_sg:0': inputs_sg, 'labels_sg:0': labels_sg}
                            _, cur_loss = sess.run([optimizer_sg, loss_sg], feed_dict=feed_dict)
                    # train doc2vec distributed-bag-of-words model
                    inputs_db, labels_db = generate_label(self, batch)
                    if len(inputs_db) == 0 or len(labels_db) == 0:
                        # nothing to feed; an empty array does not match the
                        # [None, 1 + negative] label placeholder
                        continue
                    feed_dict = {'inputs_db:0': inputs_db, 'labels_db:0': labels_db}
                    _, cur_loss, doc_syn0, softmax_weight = sess.run(
                        [optimizer_db, loss_db, d2v_embeding, nce_weights],
                        feed_dict=feed_dict)
                    print('loss:{} at epoch {}'.format(cur_loss, epoch))

        # Only overwrite stored weights when a training step actually ran;
        # previously an empty corpus raised NameError here.
        if doc_syn0 is not None:
            self.docvec_syn0 = doc_syn0
            self.syn1neg = softmax_weight

    def convertDataStructure(self):
        """Replace each vocab object by the list ``[index, count, sample_int]``."""
        self.vocab = {word: [v.index, v.count, v.sample_int]
                      for word, v in self.vocab.items()}

    def init_weights(self, new_words, new_tags):
        """Create initial values ``new_wv_syn0``, ``new_sp`` and ``new_doc_syn0``.

        Args:
            new_words: Sequence of new word strings.
            new_tags: Sequence of new document tags.
        """
        newsyn0 = np.zeros((len(new_words), self.vector_size))
        for i, word in enumerate(new_words):
            # deterministic seed built from the word plus a fixed suffix
            newsyn0[i] = self.seeded_vector(word + str(1))
        if self.negative:
            self.new_sp = np.zeros((len(new_words), self.layer1_size))

        self.new_doc_syn0 = np.zeros((len(new_tags), self.vector_size))
        for i, tag in enumerate(new_tags):
            # deterministic seed built from the instance seed and the tag
            self.new_doc_syn0[i] = self.seeded_vector("%d %s" % (self.seed, tag))
        self.new_wv_syn0 = newsyn0

    def seeded_vector(self, seed_string):
        """Create one 'random' vector (but deterministic by seed_string)"""
        # Note: built-in hash() may vary by Python version or even (in Py3.x) per launch
        once = np.random.RandomState(hash(seed_string) & 0xffffffff)
        return (once.rand(self.vector_size) - 0.5) / self.vector_size

    def createWeights(self, exist, new, size, names):
        """Create an 'existing' and a 'new' variable and expose them as one tensor.

        Args:
            exist: Initial value of the existing weights.
            new: Number of new rows to create (scalar int tensor or int).
            size: Width of each new row.
            names: ``[existing variable name, new variable name, output name]``.

        Returns:
            ``(embeddings, new_weight)``; ``embeddings`` is whichever of the two
            variables is non-empty, or their row-wise concatenation.
        """
        exist_str, new_str, concat_str = names[0], names[1], names[2]
        exist_weight = tf.Variable(exist, validate_shape=False, name=exist_str)
        new_weight = tf.Variable(tf.truncated_normal([new, size], dtype=tf.float32),
                                 validate_shape=False, name=new_str)
        embeddings = tf.case(
            [(tf.equal(tf.shape(exist_weight)[0], 0), lambda: new_weight),
             (tf.equal(tf.shape(new_weight)[0], 0), lambda: exist_weight)],
            default=lambda: tf.concat([exist_weight, new_weight], axis=0))
        # identity op, used only to give the result a stable name
        embeddings = tf.identity(embeddings, name=concat_str)
        return embeddings, new_weight

    def build_skip_gram(self, embeddings, nce_weights):
        """Build the skip-gram loss and optimizer; returns ``(optimizer, loss)``."""
        train_inputs = tf.placeholder(tf.int32, shape=[None], name='inputs_sg')
        train_labels = tf.placeholder(tf.int32, shape=[None, 1+self.negative], name='labels_sg')

        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        # Output vectors belong to the sampled labels (context + negatives), as
        # in build_dbow; looking them up with train_inputs left the label
        # placeholder unused and produced the wrong shape.
        layer_out = tf.nn.embedding_lookup(nce_weights, train_labels)

        loss = self._train_sg_pair(embed, layer_out, train_inputs)
        loss = tf.identity(loss, name='lose_sg')
        optimizer = tf.train.AdamOptimizer().minimize(loss, name='optimizer_sg')
        return optimizer, loss

    def build_dbow(self, doc2vec, nce_weights):
        """Build the DBOW loss and optimizer; returns ``(optimizer, loss)``."""
        train_inputs = tf.placeholder(tf.int32, shape=[None], name='inputs_db')
        train_labels = tf.placeholder(tf.int32, shape=[None, 1+self.negative], name='labels_db')

        layer_in = tf.nn.embedding_lookup(doc2vec, train_inputs)
        layer_out = tf.nn.embedding_lookup(nce_weights, train_labels)
        # Output weights receive no gradient in this graph.
        # NOTE(review): the original author marked this line as uncertain.
        layer_out = tf.stop_gradient(layer_out)

        loss = self._train_sg_pair(layer_in, layer_out, train_inputs)
        loss = tf.identity(loss, name='lose_db')
        optimizer = tf.train.AdamOptimizer().minimize(loss, name='optimizer_db')
        return optimizer, loss

    def _train_sg_pair(self, layer_in, layer_out, train_inputs):
        """Negative-sampling loss for one batch.

        Args:
            layer_in: Input vectors, one per example.
            layer_out: Output vectors for the true word followed by the
                negatives of each example.
            train_inputs: Input ids; only their count is used, to size the labels.

        Returns:
            Scalar tensor: sigmoid cross-entropy summed over the ``1 + negative``
            candidates and averaged over the batch.
        """
        layer_in = tf.expand_dims(layer_in, axis=1)

        # dot product of the input vector with every candidate output vector
        logits = tf.reduce_sum(layer_in * layer_out, axis=2)

        # Target is (1, 0, 0, ...): position 0 holds the true word. The closing
        # parenthesis of tf.tile was misplaced, which was a SyntaxError.
        labels_one_hot = tf.one_hot(
            tf.tile([0], multiples=tf.shape(train_inputs)), self.negative + 1)

        loss = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=labels_one_hot,
            logits=logits)
        loss = tf.reduce_sum(loss, axis=1)
        loss = tf.reduce_mean(loss)
        return loss

In [None]:
def docSimilar(model, docs, doctag):
    """Return the pairwise cosine-distance table of the documents' vectors.

    The vectors come from ``getVec(model, doctag)``; the resulting square
    DataFrame uses ``docs.index`` for both its rows and its columns.
    """
    vectors = np.array(getVec(model, doctag))
    distances = np.array([
        [spatial.distance.cosine(left, right) for right in vectors]
        for left in vectors
    ])
    labels = docs.index
    return pd.DataFrame(distances, columns=labels, index=labels)

def getVec(model, doctags):
    """Return one vector per entry of ``doctags``.

    Each entry is an iterable of tags; its vector is the element-wise average
    of the rows of ``model.docvec_syn0`` selected through ``model.docvecs``.
    """
    def _average(tags):
        rows = [model.docvecs[tag][0] for tag in tags]
        return np.average(model.docvec_syn0[rows], axis=0)

    return [_average(tags) for tags in doctags]

def evaluation(docs, model, targetSku='124269'):
    """Rank ``docs`` by cosine distance to the target and write resultcompare.csv.

    The reference is ``model.docvecs[targetSku]``; it is compared against every
    item yielded by iterating ``model.docvecs``. The distances are stored in a
    new ``distance_to_1st`` column, ``docs`` is sorted by it in place, and the
    result is written to ``resultcompare.csv`` in the working directory.
    """
    reference = model.docvecs[targetSku]
    docs['distance_to_1st'] = [
        spatial.distance.cosine(reference, candidate)
        for candidate in model.docvecs
    ]
    docs.sort_values(by='distance_to_1st', inplace=True)
    docs.to_csv('resultcompare.csv')

In [None]:
# Script entry point: load the product data and a pre-trained Doc2Vec model,
# train document vectors with tfDoc2vec and plot them.
# NOTE(review): pd, g, descriptionPreprocess, TaggedSentence, DocvecsArray and
# drawDiagram are not defined in the visible source - confirm they are
# imported/defined elsewhere.
if __name__ =='__main__':
    # data and model file path
    modelpath = '../data_prepare/nlp_model/doc2vec.bin'
    data_path = '../data_prepare/caas_stage/product.csv.gz'
    # load the product data and turn it into a tagged training corpus
    data = pd.read_csv(data_path, compression='gzip', index_col='productSku',encoding='utf-8')
    corpus,docs = descriptionPreprocess(data)
    doctag = docs['productCategoryIds'].values
    
    train_corpus = TaggedSentence(corpus,doctag)
    
    # load the pre-trained model and replace its docvecs with a fresh DocvecsArray
    model = g.Doc2Vec.load(modelpath)
    model.docvecs = DocvecsArray()
    tfdocVec = tfDoc2vec(other_model=model, negative=5, window=1, batch_words=200)
    
    # start training: build the vocabulary, then run one epoch
    build_vocab(tfdocVec,train_corpus,min_count=1)
    tfdocVec.train(train_corpus, epochs=1)

    # average the trained tag vectors per document and draw them
    doc2vec = getVec(tfdocVec, doctag)
    drawDiagram(doc2vec, docs, label='productName', bounds_x=None, bounds_y=None)