<a href="https://colab.research.google.com/github/chongzicbo/nlp-ml-dl-notes/blob/master/code/text_similarity/NLP10%EF%BC%9Asiamese_text_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
import numpy as np
import pandas as pd
import os
import math


def sentences_to_indices(X, word_to_index, max_len):
  """
  把字符串数组转换为字符数值索引数组
  :param X:string 数组
  :param word_to_index:
  :param max_len:最长的序列长度
  :return:
  """
  m = X.shape[0]
  X_indices = np.zeros((m, max_len))
  for i in range(m):
    # split字符串
    sentence_words = X[i].split(" ")
    for j, w in enumerate(sentence_words):
      if j >= max_len:
        break
      X_indices[i, j] = word_to_index[w]
  return X_indices


def load_dataset(data_dir, max_seq_len, embed_dim, word_level=True):
    """
    读取数据，对数据进行预处理，并生成embed_matrix
    :param data_dir:数据目录
    :param max_seq_len:
    :param embed_dim:词向量维度
    :param word_level:
    :return:
    """
    question_path = os.path.join(data_dir, "question.csv")
    train_path = os.path.join(data_dir, "train.csv")
    if word_level:
        embed_path = os.path.join(data_dir, "word_embed.txt")  # 词向量
    else:
        embed_path = os.path.join(data_dir, "char_embed.txt")  # 字符向量

    # 读取数据
    question = pd.read_csv(question_path)
    train = pd.read_csv(train_path)

    # 把train里面的问题id匹配到句子
    train = pd.merge(train, question, left_on=["q1"], right_on=["qid"], how="left")  # 匹配第一个问题
    train = pd.merge(train, question, left_on=["q2"], right_on=["qid"], how="left")  # 匹配第二个问题

    if word_level:
        train = train[["label", "words_x", "words_y"]]
    else:
        train = train[["label", "chars_x", "chars_y"]]
    train.columns = ["label", "q1", "q2"]

    word_to_vec_map = pd.read_csv(embed_path, sep=" ", header=None, index_col=0)
    word = word_to_vec_map.index.values

    # word2id,id2word
    word_to_index = dict([(word[i], i+1) for i in range(len(word))])
    index_to_word = dict([(i+1, word[i]) for i in range(len(word))])

    train_q1_indices = sentences_to_indices(train.q1.values, word_to_index, max_seq_len)
    train_q2_indices = sentences_to_indices(train.q2.values, word_to_index, max_seq_len)
    label = train.label.values

    vocab_len = len(word_to_index)+1
    embed_matrix = np.zeros((vocab_len, embed_dim))
    for word, index in word_to_index.items():
        embed_matrix[index, :] = word_to_vec_map.loc[word].values

    return train_q1_indices, train_q2_indices, label, embed_matrix, word_to_index, index_to_word


def load_test_data(data_dir, max_seq_len, word_level=True):
    """
    读取测试数据
    :param max_seq_len:
    :param word_level:
    :return:
    """
    question_path = os.path.join(data_dir, "question.csv")
    test_path = os.path.join(data_dir, "test.csv")
    if word_level:
        embed_path = os.path.join(data_dir, "word_embed.txt")
    else:
        embed_path = os.path.join(data_dir, "char_embed.txt")

    # 读取数据
    question = pd.read_csv(question_path)
    test = pd.read_csv(test_path)

    test = pd.merge(test, question, left_on=["q1"], right_on=["qid"], how="left")
    test = pd.merge(test, question, left_on=["q2"], right_on=["qid"], how="left")

    if word_level:
        test = test[["words_x", "words_y"]]
    else:
        test = test[["chars_x", "chars_y"]]
    test.columns = ["q1", "q2"]
    word_to_vec_map = pd.read_csv(embed_path, sep=" ", header=None, index_col=0)
    word = word_to_vec_map.index.values

    # word2id,id2word
    word_to_index = dict([(word[i], i+1) for i in range(len(word))])
    index_to_word = dict([(i+1, word[i]) for i in range(len(word))])

    test_q1_indices = sentences_to_indices(test.q1.values, word_to_index, max_seq_len)
    test_q2_indices = sentences_to_indices(test.q2.values, word_to_index, max_seq_len)
    return test_q1_indices, test_q2_indices



In [35]:
import numpy as np
import pandas as pd

np.random.seed(0)

from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, GaussianNoise, \
    Input, Dropout, LSTM, Activation, BatchNormalization, concatenate, Subtract, Dot, Multiply, Bidirectional, Lambda
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras import optimizers
import tensorflow as tf
import tensorflow.keras.callbacks as kcallbacks

np.random.seed(1)
import warnings

warnings.filterwarnings("ignore")

MAX_SEQUENCE_LENGTH = 15  # 20 for character level and 15 for word level
EMBEDDING_DIM = 300
lstm_num = 64
lstm_drop = 0.5
BATCH_SIZE = 100


def trainLSTM(train_q1, train_q2, train_label, embed_matrix):
    question1 = Input(shape=(MAX_SEQUENCE_LENGTH,), batch_size=BATCH_SIZE)
    question2 = Input(shape=(MAX_SEQUENCE_LENGTH,), batch_size=BATCH_SIZE)
    embed_layer = Embedding(embed_matrix.shape[0], EMBEDDING_DIM,weights=[embed_matrix]) #
    q1_embed = embed_layer(question1)
    q2_embed = embed_layer(question2)

    shared_lstm1 = LSTM(lstm_num, return_sequences=True)
    shared_lstm2 = LSTM(lstm_num)

    q1 = shared_lstm1(q1_embed)
    q1 = Dropout(lstm_drop)(q1)
    q1 = BatchNormalization()(q1)
    q1 = shared_lstm2(q1)

    q2 = shared_lstm1(q2_embed)
    q2 = Dropout(lstm_drop)(q2)
    q2 = BatchNormalization()(q2)
    q2 = shared_lstm2(q2)

    # 求distance (batch_size,lstm_num)
    d = Subtract()([q1, q2])
    distance = Multiply()([d, d])
    # 求angle (batch_size,lstm_num)
    angle = Multiply()([q1, q2])
    merged = concatenate([distance, angle])
    merged = Dropout(0.3)(merged)
    merged = BatchNormalization()(merged)

    merged = Dense(256, activation="relu")(merged)
    merged = Dropout(0.3)(merged)
    merged = BatchNormalization()(merged)

    merged = Dense(64, activation="relu")(merged)
    merged = Dropout(0.3)(merged)
    merged = BatchNormalization()(merged)

    res = Dense(1, activation="sigmoid")(merged)
    model = Model(inputs=[question1, question2], outputs=res)
    model.compile(loss=keras.losses.BinaryCrossentropy(), optimizer="adam", metrics=["accuracy"])
    model.summary()

    hist = model.fit([train_q1, train_q2],train_label,epochs=30, batch_size=BATCH_SIZE, validation_split=0.2,shuffle=True)


In [33]:

train_q1_indices, train_q2_indices, train_label, embed_matrix, word_to_index, index_to_word = load_dataset("/content/drive/My Drive/data/text_similarity/data", MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, False)
print('train_q1: ', train_q1_indices.shape)
print('train_q2: ', train_q2_indices.shape)
print('train_label: ', tf.one_hot(train_label,depth=2).shape)
print('embed_matrix: ', embed_matrix.shape)

# 加载test 数据
test_q1, test_q2 = load_test_data("/content/drive/My Drive/data/text_similarity/data", MAX_SEQUENCE_LENGTH, word_level=False)
print('test_q1: ', test_q1.shape)
print('test_q2: ', test_q2.shape)
print("word_to_index len:",len(word_to_index))

train_q1:  (254386, 15)
train_q2:  (254386, 15)
train_label:  (254386, 2)
embed_matrix:  (3049, 300)
test_q1:  (172956, 15)
test_q2:  (172956, 15)
word_to_index len: 3048


In [36]:
trainLSTM(train_q1_indices[:243000], train_q2_indices[:243000], train_label[:243000], embed_matrix) #数据数量无法整除BATCH_SIZE时会报错

Model: "functional_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(100, 15)]          0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(100, 15)]          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (100, 15, 300)       914700      input_7[0][0]                    
                                                                 input_8[0][0]                    
__________________________________________________________________________________________________
lstm_6 (LSTM)                   (100, 15, 64)        93440       embedding_3[0][0]     