In [1]:
import numpy as np
import pandas as pd
from sentence_transformers import InputExample, SentenceTransformer
from torch.utils.data import DataLoader

2022-03-15 14:49:39.156143: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


## 读取数据

In [2]:
corpus_data = pd.read_csv("./input/corpus.tsv", sep="\t", names=["doc", "title"])
dev_data = pd.read_csv("./input/dev.query.txt", sep="\t", names=["query", "title"])
train_data = pd.read_csv("./input/train.query.txt", sep="\t", names=["query", "title"])
qrels = pd.read_csv("./input/qrels.train.tsv", sep="\t", names=["query", "doc"])

In [3]:
corpus_data = corpus_data.set_index("doc")
dev_data = dev_data.set_index("query")
train_data = train_data.set_index("query")
qrels = qrels.set_index("query")

In [4]:
# corpus_data_copy = corpus_data.copy()
# corpus_data_copy['title_len'] = corpus_data_copy['title'].apply(len)
# noise_title = corpus_data_copy[corpus_data_copy['title_len'] < 10]['title'].values

In [5]:
# len(noise_title)

In [6]:
corpus_data.head()

Unnamed: 0_level_0,title
doc,Unnamed: 1_level_1
1,铂盛弹盖文艺保温杯学生男女情侣车载时尚英文锁扣不锈钢真空水杯
2,可爱虎子华为荣耀X30i手机壳荣耀x30防摔全包镜头honorx30max液态硅胶虎年情侣女...
3,190色素色亚麻棉平纹布料 衬衫裙服装定制手工绣花面料 汇典亚麻
4,松尼合金木工开孔器实木门开锁孔木板圆形打空神器定位打孔钻头
5,微钩绿蝴蝶材料包非成品 赠送视频组装教程 需自备钩针染料


In [7]:
dev_data.head()

Unnamed: 0_level_0,title
query,Unnamed: 1_level_1
200001,甲黄酸阿怕替尼片
200002,索泰zbox
200003,kfc游戏机
200004,bunny成兔粮
200005,铁线威灵仙


In [8]:
train_data.head()

Unnamed: 0_level_0,title
query,Unnamed: 1_level_1
1,美赞臣亲舒一段
2,慱朗手动料理机
3,電力貓
4,掏夹缝工具
5,飞推vip


In [9]:
qrels.head()

Unnamed: 0_level_0,doc
query,Unnamed: 1_level_1
1,679139
2,35343
3,781652
4,557516
5,588014


## 分词

In [10]:
import jieba

" ".join(jieba.cut("美赞臣亲舒一段"))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.510 seconds.
Prefix dict has been built successfully.


'美赞臣 亲舒 一段'

In [11]:
def title_cut(x):
    return list(jieba.cut(x))


from joblib import Parallel, delayed

corpus_title = Parallel(n_jobs=-1)(
    delayed(title_cut)(title) for title in corpus_data["title"]
)
train_title = Parallel(n_jobs=-1)(
    delayed(title_cut)(title) for title in train_data["title"]
)
# dev_title = dev_data['title'].apply(title_cut)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary .

## IDF

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

idf = TfidfVectorizer(analyzer=lambda x: x)
idf.fit(train_title)

TfidfVectorizer(analyzer=<function <lambda> at 0x7f9fa4911940>)

In [13]:
vocab = idf.get_feature_names()



In [14]:
train_ids = [[idf.vocabulary_[x] for x in title] for title in train_title]
train_ids[:2]

[[60517, 17245, 14225], [37210, 42871, 37525, 40991, 43206]]

In [15]:
idf.idf_, len(idf.idf_)

(array([ 4.14517079, 11.81978828, 10.721176  , ..., 11.81978828,
        11.81978828, 11.81978828]),
 77509)

In [16]:
corpus_idf = idf.transform(corpus_title)
train_idf = idf.transform(train_title)

In [17]:
[idf.vocabulary_[x] for x in train_title[0]]

[60517, 17245, 14225]

## 负样本构造

In [18]:
(corpus_idf[:, 60517] != 0).nonzero()

(array([  1009,  26789,  41852,  57032,  94147, 116564, 127764, 166304,
        191889, 236547, 257511, 280519, 285231, 285492, 303973, 345935,
        369300, 433723, 435556, 499075, 519905, 550179, 572794, 574646,
        614743, 623935, 679138, 681825, 699256, 735695, 754510, 763358,
        773570, 839213, 845106, 869273, 922629, 926762, 961398, 987809,
        997086], dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       dtype=int32))

In [19]:
train_data["title"].iloc[0], corpus_data["title"].iloc[926762]

('美赞臣亲舒一段', '有现货美国美赞臣Enfamil婴幼儿宝宝多维综合维生素滴剂含铁50ML')

In [20]:
keyword_corpus_idxs, keyword_idxs = corpus_idf.nonzero()
inverse_keyword_map = {}
for x, y in zip(keyword_idxs, keyword_corpus_idxs):
    if vocab[x] in inverse_keyword_map:
        inverse_keyword_map[vocab[x]].append(y)
    else:
        inverse_keyword_map[vocab[x]] = [y]

In [21]:
vocab = idf.get_feature_names()

In [22]:
vocab[20001], idf.vocabulary_["公文包"]

('公文包', 20001)

In [23]:
corpus_idf.shape, max(keyword_corpus_idxs), max(keyword_idxs)

((1001500, 77509), 1001499, 77466)

In [24]:
from tqdm import tqdm_notebook

In [25]:
MAX_NEG_SAMPLES = 10

train_neg_piar = []
for idx in tqdm_notebook(range(1, train_data.shape[0] + 1 - 5000)):
    idx_keyword = train_title[idx - 1]
    idx_keyword_idf = idf.idf_[train_ids[idx - 1]]
    idx_top1_word = idx_keyword[idx_keyword_idf.argmax()]
    # idx_start_word, idx_end_word = idx_keyword[0], idx_keyword[-1]

    if idx_top1_word in inverse_keyword_map:
        negative_idx = inverse_keyword_map[idx_top1_word][:MAX_NEG_SAMPLES]
    else:
        negative_idx = np.random.randint(corpus_data.shape[0], size=MAX_NEG_SAMPLES)

    """
    idx_keyword = []
    if len(idx_top1_word) >= 2 and idx_top1_word in inverse_keyword_map:
        idx_keyword += inverse_keyword_map[idx_top1_word]
    if len(idx_start_word) >= 2 and idx_start_word in inverse_keyword_map:
        idx_keyword += inverse_keyword_map[idx_start_word]
    if len(idx_end_word) >= 2 and idx_end_word in inverse_keyword_map:
        idx_keyword += inverse_keyword_map[idx_end_word]
    negative_idx = sum(negative_idx, [])
    """

    # negative_idx = list(set(negative_idx))
    negative_idx = [x + 1 for x in negative_idx]
    positive_idx = qrels.loc[idx].ravel()[0]
    if positive_idx in negative_idx:
        negative_idx.remove(positive_idx)

    train_neg_piar.append(negative_idx)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for idx in tqdm_notebook(range(1, train_data.shape[0] + 1 - 5000)):


  0%|          | 0/95000 [00:00<?, ?it/s]

In [26]:
idx_keyword[idx_keyword_idf.argmax()]

'江玲'

In [27]:
positive_idx in negative_idx

False

In [28]:
corpus_data.loc[qrels.loc[idx].ravel()[0]]

title    江铃尿素江铃柴油车专用尿素通用柴油车国五国六尿素春节后发货哦
Name: 188941, dtype: object

In [29]:
corpus_data.loc[negative_idx]

Unnamed: 0_level_0,title
doc,Unnamed: 1_level_1
296459,江玲宝典皮卡江铃e100电动3域虎5e200汽车驭胜S350专用手机支架


## 训练集构造

In [30]:
from tqdm import tqdm_notebook

train_examples = []
# for idx in tqdm_notebook(range(1, train_data.shape[0] + 1 - 5000)):
for idx in tqdm_notebook(range(1, 1000)):

    train_examples.append(
        InputExample(
            texts=[
                train_data.loc[idx]["title"],
                corpus_data.loc[qrels.loc[idx].ravel()[0]]["title"],
            ],
            label=1.0,
        )
    )
    
    if idx-1 in train_neg_piar:
        for neg_idx in train_neg_piar[idx-1]:
            if neg_idx % 2 == 0:
                train_examples.append(
                    InputExample(
                        texts=[
                            train_data.loc[idx]["title"],
                            corpus_data.loc[neg_idx]["title"],
                        ],
                        label=0.0,
                    )
                )
            else:
                train_examples.append(
                    InputExample(
                        texts=[
                            corpus_data.loc[neg_idx]["title"],
                            train_data.loc[idx]["title"],

                        ],
                        label=0.0,
                    )
                )
        
    rand_idx = np.random.randint(1, corpus_data.shape[0], size=20)
    for neg_idx in rand_idx:
        train_examples.append(
            InputExample(
                texts=[
                    train_data.loc[idx]["title"],
                    corpus_data.loc[neg_idx]["title"],
                ],
                label=0.0,
            )
        )
        
        
        # print(corpus_data.loc[neg_idx]["title"])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for idx in tqdm_notebook(range(1, 1000)):


  0%|          | 0/999 [00:00<?, ?it/s]

In [31]:
[str(x) for x in train_examples[:10]]

['<InputExample> label: 1.0, texts: 美赞臣亲舒一段; 领券满减】美赞臣安婴儿A+亲舒 婴儿奶粉1段850克 0-12个月宝宝',
 '<InputExample> label: 0.0, texts: 美赞臣亲舒一段; 印章收纳包营业执照收纳包大容量带密码锁多格印章执照一体包便携',
 '<InputExample> label: 0.0, texts: 美赞臣亲舒一段; 加厚加大号铁书立/学生书夹书挡书靠书立书架办公桌面收纳黑白色',
 '<InputExample> label: 0.0, texts: 美赞臣亲舒一段; 欧式实木茶几2.4米电视柜2.2米客厅组合小户型橡木法式香槟色家具',
 '<InputExample> label: 0.0, texts: 美赞臣亲舒一段; 夏季港风复古chic盐系网红炸街小个子休闲上衣职业短裤两件套装女',
 '<InputExample> label: 0.0, texts: 美赞臣亲舒一段; 模具加工定做订做 来图来样免费设计出图注塑制作 塑料制品开模',
 '<InputExample> label: 0.0, texts: 美赞臣亲舒一段; 适用于雪佛兰赛欧3大灯罩15-18款赛欧3前大灯透明灯罩赛欧3大灯壳',
 '<InputExample> label: 0.0, texts: 美赞臣亲舒一段; 百乐满极光平衡式燃气热水器，16升、20升、24升，不懂用多',
 '<InputExample> label: 0.0, texts: 美赞臣亲舒一段; 适用于宝马E53 E70后下摆臂胶 E60 520 525 530 730 740后H臂胶套',
 '<InputExample> label: 0.0, texts: 美赞臣亲舒一段; 扬子洲 林可霉素利多卡因凝胶 20g*1支/盒 轻度烧伤创伤 蚊虫叮咬']

In [32]:
train_neg_piar[0]

[987810]

## 验证集构造

In [33]:
from sentence_transformers import evaluation

eval_s1 = []
eval_s2 = []
eval_socre = []

for idx in tqdm_notebook(range(train_data.shape[0] - 1000, train_data.shape[0] + 1)):
    eval_s1.append(train_data.loc[idx]["title"])
    eval_s2.append(corpus_data.loc[qrels.loc[idx].ravel()[0]]["title"])
    eval_socre += [1]

    if idx-1 in train_neg_piar:
        for neg_idx in train_neg_piar[idx-1]:
            eval_s1.append(train_data.loc[idx]["title"])
            eval_s2.append(corpus_data.loc[neg_idx]["title"])
            eval_socre += [0]
        
    rand_idx = np.random.randint(corpus_data.shape[0], size=10)
    for neg_idx in rand_idx:
        eval_s1 += [train_data.loc[idx]["title"]]
        eval_s2 += [corpus_data.loc[neg_idx]["title"]]
        eval_socre += [0]


evaluator = evaluation.EmbeddingSimilarityEvaluator(
    eval_s1, eval_s2, eval_socre, write_csv=True
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for idx in tqdm_notebook(range(train_data.shape[0] - 1000, train_data.shape[0] + 1)):


  0%|          | 0/1001 [00:00<?, ?it/s]

In [38]:
idx = 11
len(eval_s1[idx]), len(eval_s2[idx]), eval_socre[idx]

(5, 30, 1)

## sentence-bert

In [39]:
from sentence_transformers import SentenceTransformer, models, util
from torch import nn

word_embedding_model = models.Transformer("bert-base-chinese", max_seq_length=50)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
dense_model = models.Dense(
    in_features=pooling_model.get_sentence_embedding_dimension(),
    out_features=128,
    activation_function=nn.Tanh(),
)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [40]:
from sentence_transformers import InputExample, SentenceTransformer, losses

# Define your train dataset, the dataloader and the train loss
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=100)
train_loss = losses.CosineSimilarityLoss(model)


# Tune the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=2,
    warmup_steps=100,
    evaluator=evaluator,
    evaluation_steps=1000,
    show_progress_bar=True,
    output_path="./",
    checkpoint_save_steps=10000,
    save_best_model=True,
    checkpoint_path='./'
)



Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/210 [00:00<?, ?it/s]

Iteration:   0%|          | 0/210 [00:00<?, ?it/s]

In [41]:
len(train_examples)

20979

## 验证 & 提交

In [46]:
query_len = train_data.shape[0]
corpus_len = corpus_data.shape[0]

query_len = 1000
corpus_len = 10000
query_sentences = list(train_data["title"])[:query_len]
corpus_sentences = list(corpus_data["title"].iloc[:])[:corpus_len]
corpus_sentences = [x for x in corpus_sentences if len(x) > 10]

In [59]:
model.eval()

SentenceTransformer(
  (0): Transformer({'max_seq_length': 50, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Dense({'in_features': 768, 'out_features': 128, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)

In [55]:
query_embeddings = model.encode(query_sentences, batch_size=500, show_progress_bar=True)
corpus_embeddings = model.encode(corpus_sentences, batch_size=500, show_progress_bar=True)

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

In [50]:
from sklearn.preprocessing import normalize

query_embeddings = normalize(query_embeddings)
corpus_embeddings = normalize(corpus_embeddings)

In [60]:
query_embeddings.shape

(1000, 128)

In [61]:
cos_sim = util.cos_sim(query_embeddings, corpus_embeddings)

In [62]:
for query_idx in range(0, 200):
    query_sim = cos_sim[query_idx, :]
    corpus_idx = query_sim.argmax().item()
    ids = query_sim.argsort().numpy()[::-1]
    print(
        "{} \t {} \t {}".format(
            query_sentences[query_idx],
            corpus_data["title"].iloc[corpus_idx],
            corpus_data["title"].loc[qrels['doc'].loc[query_idx+1]],
            query_sim[corpus_idx],
        )
    )
    print(query_sim[corpus_idx])
    # print(np.where(ids == qrels['doc'].loc[query_idx+1])[0][0])
    
    # break

美赞臣亲舒一段 	 原装 HP DL388 GEN8 服务器RAID卡1G缓存带电池    633542-001 	 领券满减】美赞臣安婴儿A+亲舒 婴儿奶粉1段850克 0-12个月宝宝
tensor(0.6575)
慱朗手动料理机 	 P棉蓄温毯子户外保温热反射毯子地垫被子露营出差车载家用沙发毯 	 Braun/博朗 MQ3035/3000/5025料理棒手持小型婴儿辅食家用搅拌机
tensor(0.6544)
電力貓 	 男女个性潮款钛钢无耳洞耳骨夹日韩欧美流行女士耳骨夹潮男耳骨夹 	 小米WiFi电力猫无线路由器套装一对300M穿墙宝家用信号增强扩展器
tensor(0.4757)
掏夹缝工具 	 南卡巡更棒电子巡更系统4G网络巡更棒打点器巡逻棒巡更机巡检仪 	 电梯地坎清洁工具除灰尘神器轿厢门槽缝隙掏勺维保打扫奥的斯三菱
tensor(0.6929)
飞推vip 	 【正版现货】纺织高职高专规划教材:纺织品检验 田恬 纺织品检验基础知识 纺织纤维鉴别 纱线质量的检验 色差评定书籍 服装质量检 	 飞逗推拍 店主邀请码 去水印 创意视频一键制作视频
tensor(0.5147)
多功能托地把 	 微星 GE75 GP75 GL75 MS-17E1 17E2 17E3 17E5 A壳B壳C壳D壳 屏轴 	 免手洗拖把家用一拖净刮刮乐干湿两用懒人拖平板墩布托帕拖地神器
tensor(0.5571)
充气浮力袖 	 湖北腊鱼风干鱼块湖南特产咸鱼干货水产腌鱼块武汉腊鱼非烟熏500g 	 学游泳神器装备充气腰背漂水袖浮臂三角浮力儿童游泳辅助工具大人
tensor(0.7045)
盒马花胶鸡汤锅 	 茶色超轻近视眼镜框女可配防蓝光有度数镜片韩版圆脸显瘦眼睛架潮 	 盒马鲜生工坊代购 花胶奶冻150g 入口Q弹 奶味浓郁 香甜丝滑
tensor(0.6574)
塞塞乐 	 验钞灯手电筒儿童小迷你电池三色便携紫光微小型激光红外线钥匙扣 	 婴儿童玩具6个月以上8宝宝益智早教0一1岁男孩女孩六9月十7新生礼
tensor(0.4963)
广汽传祺gs5挡风遮雨条子 	 韩国东大门虎年本命年手链女招财转运珠红色手串手饰品送闺蜜礼物 	 2021款广汽传祺GS5晴雨挡遮雨板传奇GS5配件车窗雨眉防雨条挡雨板
tensor(0.8650)
冰墩敦人偶服装 	 立顿茉莉花茶包独立纸包

In [98]:
corpus_data["title"].iloc[ids[:4]+1]

doc
133974            北欧实木吧台桌椅家用高脚桌靠墙现代简约设计师餐厅岛台小吧台桌
924399             金家用纱窗压条嵌条卡条实心塑钢窗H纱网窗纱压纱胶条牛筋老式
86959     李宁篮球鞋男子驭帅13 14 15代䨻科技耐磨缓震高帮运动鞋 ABAR043
875103         2021新款真皮女靴子唐焉欧阳娜娜同款rv方扣短靴厚底漆皮马丁靴女
Name: title, dtype: object

In [59]:
query_sim.shape

torch.Size([999372])

In [369]:
corpus_data_copy = corpus_data.copy()
corpus_data_copy['title_len'] = corpus_data_copy['title'].apply(len)
noise_title = corpus_data_copy[corpus_data_copy['title_len'] < 10]['title'].values

In [56]:
np.array([1,2,3]).argsort()

array([0, 1, 2])

In [375]:
np.random.choice(noise_title, 10)

array(['永劫无间网吧特权', '老颜鲜生改价专拍', '水肌魔塑套餐', '各种稳定环大合集', '儿童安全门螺杆', '钢笔书法章法',
       '华为管理法（新版）', '桂花树上的香', '紫苏精华系列套装', '白白叶叶小铺杯子6'], dtype=object)

In [393]:
corpus_data[corpus_data['title'] == '直销与传销']

Unnamed: 0_level_0,title
doc,Unnamed: 1_level_1
9155,直销与传销
