In [8]:
import matchzoo as mz
import typing
from pathlib import Path
import pandas as pd
import csv

def read_data(path):
    table = pd.read_csv(path, sep='\t', header=0, quoting=csv.QUOTE_NONE)
    df = pd.DataFrame({
        'text_left': table['sentence1'],
        'text_right': table['sentence2'],
        'label': table['Label']
    })
    return mz.pack(df)

def load_data(
    stage: str = 'train',
    task: str = 'ranking',
    filtered: bool = False,
    return_classes: bool = False
) -> typing.Union[mz.DataPack, tuple]:
    if stage not in ('train', 'dev', 'test'):
        raise ValueError(f"{stage} is not a valid stage."
                         f"Must be one of `train`, `dev`, and `test`.")

    data_root = Path("/Users/lifang/desktop/project/corpus/LCQMC/data")
    file_path = data_root.joinpath(f'{stage}.txt')
    data_pack = read_data(file_path)
    if task == 'ranking':
        task = mz.tasks.Ranking()
    if task == 'classification':
        task = mz.tasks.Classification()
    if isinstance(task, mz.tasks.Ranking):
        return data_pack
    elif isinstance(task, mz.tasks.Classification):
        data_pack.one_hot_encode_label(task.num_classes, inplace=True)
        if return_classes:
            return data_pack, [False, True]
        else:
            return data_pack
    else:
        raise ValueError(f"{task} is not a valid task."
                         f"Must be one of `Ranking` and `Classification`.")


In [10]:
task = mz.tasks.Ranking()    
train_raw = load_data(stage='train', task=task)  #qa是datasets下新建的包，放置中文数据
test_raw = load_data(stage='test', task=task)
print(train_raw.left.head())
print(train_raw.right.head())
print(train_raw.relation.head())
print(train_raw.frame().head())

                  text_left
id_left                    
L-0        喜欢打篮球的男生喜欢什么样的女生
L-1            我手机丢了，我想换个手机
L-2                大家觉得她好看吗
L-3               求秋色之空漫画全集
L-4      晚上睡觉带着耳机听音乐有什么害处吗？
               text_right
id_right                 
R-0       爱打篮球的男生喜欢什么样的女生
R-1           我想买个新手机，求推荐
R-2            大家觉得跑男好看吗？
R-3             求秋色之空全集漫画
R-4          孕妇可以戴耳机听音乐吗?
  id_left id_right  label
0     L-0      R-0      1
1     L-1      R-1      1
2     L-2      R-2      0
3     L-3      R-3      1
4     L-4      R-4      0
  id_left           text_left id_right       text_right  label
0     L-0    喜欢打篮球的男生喜欢什么样的女生      R-0  爱打篮球的男生喜欢什么样的女生      1
1     L-1        我手机丢了，我想换个手机      R-1      我想买个新手机，求推荐      1
2     L-2            大家觉得她好看吗      R-2       大家觉得跑男好看吗？      0
3     L-3           求秋色之空漫画全集      R-3        求秋色之空全集漫画      1
4     L-4  晚上睡觉带着耳机听音乐有什么害处吗？      R-4     孕妇可以戴耳机听音乐吗?      0


In [12]:
path_vec = "/Users/lifang/desktop/project/corpus/word2vec/WordVector_60dimensional/wiki.zh.text.vector"
emb = mz.embedding.load_from_file(path_vec, mode='word2vec')
# print(emb.shape)
print(type(emb))

<class 'matchzoo.embedding.embedding.Embedding'>


In [15]:
model_class = mz.models.ArcI
# preprocessor_class = mz.preprocessors.chinese_preprocessor.ChinesePreprocessor()
# print(preprocessor_class)
preprocessor_class = mz.preprocessors.BasicPreprocessor()
preprocessor_class._units = [
            # mz.preprocessors.units.tokenize.ChineseTokenize(),
            mz.preprocessors.units.tokenize.ChineseTokenize(),
            # mz.preprocessors.units.lowercase.Lowercase(),
            mz.preprocessors.units.punc_removal.PuncRemoval(),
        ]

model, preprocessor, data_generator_builder, embedding_matrix = mz.auto.prepare(
    task=task,
    model_class=model_class,
    preprocessor=preprocessor_class,
    data_pack=train_raw,
    embedding=emb
)


In [16]:
print(model.params)   # 展示模型中可调参数
model.params['mlp_num_units'] = 3  # 直接调整参数
print("embedding_matrix: \n", type(embedding_matrix), '\n', embedding_matrix)

model_class                   <class 'matchzoo.models.arci.ArcI'>
input_shapes                  [(30,), (30,)]
task                          Ranking Task
optimizer                     adam
with_embedding                True
embedding_input_dim           4763
embedding_output_dim          60
embedding_trainable           True
with_multi_layer_perceptron   True
mlp_num_units                 128
mlp_num_layers                3
mlp_num_fan_out               64
mlp_activation_func           relu
num_blocks                    1
left_filters                  [32]
left_kernel_sizes             [3]
right_filters                 [32]
right_kernel_sizes            [3]
conv_activation_func          relu
left_pool_sizes               [2]
right_pool_sizes              [2]
padding                       same
dropout_rate                  0.0
embedding_matrix: 
 <class 'numpy.ndarray'> 
 [[ 0.01798805  0.13321907  0.02238536 ... -0.02820211  0.18273228
   0.18436114]
 [-0.14500743 -0.0936597  -0.124800

In [17]:
train_processed = preprocessor.transform(train_raw, verbose=0)
test_processed = preprocessor.transform(test_raw, verbose=0)

vocab_unit = preprocessor.context['vocab_unit']   # 此部分是为了显示处理过程
print('Orig Text:', train_processed.left.loc['L-0']['text_left'])
sequence = train_processed.left.loc['L-0']['text_left']
print('Transformed Indices:', sequence)
print('Transformed Indices Meaning:',
      '/'.join([vocab_unit.state['index_term'][i] for i in sequence]))

train_gen = data_generator_builder.build(train_processed)
test_gen = data_generator_builder.build(test_processed)
model.fit_generator(train_gen, epochs=1)
model.evaluate_generator(test_gen)

Orig Text: [3431, 978, 3431, 3431, 1042, 3431, 3431, 3630, 3431, 3431, 628, 3431, 3431, 296, 3431, 3431, 1251, 3431, 3431, 4480, 3431, 3431, 2158, 3431, 3431, 4345, 3431, 3431, 1042, 3431]
Transformed Indices: [3431, 978, 3431, 3431, 1042, 3431, 3431, 3630, 3431, 3431, 628, 3431, 3431, 296, 3431, 3431, 1251, 3431, 3431, 4480, 3431, 3431, 2158, 3431, 3431, 4345, 3431, 3431, 1042, 3431]
Transformed Indices Meaning:  /男/ / /生/ / /喜/ / /欢/ / /什/ / /么/ / /样/ / /的/ / /女/ / /生/ 


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/1


{mean_average_precision(0.0): 0.5073902492830356}