In [1]:
!ls ./input/* -lh

-rw-rw-r-- 1 lyz lyz  91M 3月   1 15:25 ./input/corpus.tsv
-rw-rw-r-- 1 lyz lyz 1.5K 3月   9 10:35 ./input/data_check.py
-rw-rw-r-- 1 lyz lyz  26K 3月  10 14:12 ./input/dev.query.txt
-rw-rw-r-- 1 lyz lyz 1.3M 3月  10 14:13 ./input/qrels.train.tsv
-rw-rw-r-- 1 lyz lyz 2.3M 3月  10 14:13 ./input/train.query.txt


# 数据集读取

In [18]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm_notebook

In [12]:
corpus_data = pd.read_csv( "./input/corpus.tsv", sep="\t", names=["doc", "title"])
dev_data = pd.read_csv("./input/dev.query.txt", sep="\t", names=["query", "title"])
train_data = pd.read_csv("./input/train.query.txt", sep="\t", names=["query", "title"])
qrels = pd.read_csv("./input/qrels.train.tsv", sep="\t", names=["query", "doc"])

In [13]:
corpus_data = corpus_data.set_index("doc")
dev_data = dev_data.set_index("query")
train_data = train_data.set_index("query")
qrels = qrels.set_index("query")

In [14]:
qrels.head()

Unnamed: 0_level_0,doc
query,Unnamed: 1_level_1
1,679139
2,35343
3,781652
4,557516
5,588014


In [15]:
for idx in range(1, 20):
    print(
        train_data.loc[idx]["title"],
        "\t",
        corpus_data.loc[qrels.loc[idx].ravel()[0]]["title"],
    )

美赞臣亲舒一段 	 领券满减】美赞臣安婴儿A+亲舒 婴儿奶粉1段850克 0-12个月宝宝
慱朗手动料理机 	 Braun/博朗 MQ3035/3000/5025料理棒手持小型婴儿辅食家用搅拌机
電力貓 	 小米WiFi电力猫无线路由器套装一对300M穿墙宝家用信号增强扩展器
掏夹缝工具 	 电梯地坎清洁工具除灰尘神器轿厢门槽缝隙掏勺维保打扫奥的斯三菱
飞推vip 	 飞逗推拍 店主邀请码 去水印 创意视频一键制作视频
多功能托地把 	 免手洗拖把家用一拖净刮刮乐干湿两用懒人拖平板墩布托帕拖地神器
充气浮力袖 	 学游泳神器装备充气腰背漂水袖浮臂三角浮力儿童游泳辅助工具大人
盒马花胶鸡汤锅 	 盒马鲜生工坊代购 花胶奶冻150g 入口Q弹 奶味浓郁 香甜丝滑
塞塞乐 	 婴儿童玩具6个月以上8宝宝益智早教0一1岁男孩女孩六9月十7新生礼
广汽传祺gs5挡风遮雨条子 	 2021款广汽传祺GS5晴雨挡遮雨板传奇GS5配件车窗雨眉防雨条挡雨板
冰墩敦人偶服装 	 灯笼布偶熊猫道具服装吉祥物人穿玩偶宣传传单服活动人偶服装
寵物罐頭密封蓋 	 仁可宠物 猫罐头保鲜盒密封盖防潮可加热猫咪罐头勺喂食勺猫用品
15 蒸汽爱美克闸阀 	 埃美柯 8135闸阀304不锈钢蒸汽用闸阀中型Z15W-16P耐温腐蚀4分6分
电动切面机 	 复兴牌面条机电动家用不锈钢压面机多功能半全自动四种面条DMT-6
医用震动排痰机 	 普门排痰机振动背心式慢阻肺支气管扩张肺气肿医用咳痰祛痰神器
草莓盆专用夹 	 大棚草莓钩盆器新款农具草莓采摘神器摘取自如温室水果铁丝钩子
lg洗烘套装 	 LG RC90V9AV2W RC90V9JV2W RH10进口9/10KG热泵双变频干衣烘干机
芝士脆 	 山居小食 芝士小脆棒 香酥小零食 罐装 110g包邮
笔记本应用书籍 	 ThinkPad笔记本电脑应用技术精粹


# 词向量

In [16]:
import jieba

" ".join(jieba.cut("美赞臣亲舒一段"))

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.593 seconds.
Prefix dict has been built successfully.


'美赞臣 亲舒 一段'

In [17]:
def title_cut(x):
    return list(jieba.cut(x))

from joblib import Parallel, delayed

corpus_title = Parallel(n_jobs=4)(delayed(title_cut)(title) for title in corpus_data["title"])
train_title = Parallel(n_jobs=4)(delayed(title_cut)(title) for title in train_data["title"])
dev_title = Parallel(n_jobs=4)(delayed(title_cut)(title) for title in dev_data["title"])

Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.519 seconds.
Prefix dict has been built successfully.
Loading model cost 0.523 seconds.
Prefix dict has been built successfully.
Loading model cost 0.527 seconds.
Prefix dict has been built successfully.
Loading model cost 0.530 seconds.
Prefix dict has been built successfully.


In [19]:
from gensim.models import Word2Vec
from gensim.test.utils import common_texts


if os.path.exists("word2vec.model"):
    model = Word2Vec.load("word2vec.model")
else: 
    model = Word2Vec(
        sentences=list(corpus_title) + list(train_title) + list(dev_title),
        vector_size=128,
        window=5,
        min_count=1,
        workers=4,
    )
    model.save("word2vec.model")

In [20]:
model.wv.most_similar("小天鹅")

[('金羚', 0.8855313658714294),
 ('海尔', 0.8832445740699768),
 ('韩电', 0.865885317325592),
 ('三洋', 0.8579813838005066),
 ('惠而浦', 0.8569628596305847),
 ('波轮', 0.8547092080116272),
 ('吉德', 0.8522803783416748),
 ('容声', 0.8349811434745789),
 ('xqb50', 0.8298338055610657),
 ('荣事达', 0.8265526294708252)]

In [21]:
model.wv.index_to_key[:10]

[' ', '新款', '女', '/', '2021', '-', '加厚', '儿童', '秋冬', '外套']

In [22]:
model.wv.key_to_index["女"]

2

In [23]:
train_w2v_ids = [[model.wv.key_to_index[xx] for xx in x] for x in train_title]
corpus_w2v_ids = [[model.wv.key_to_index[xx] for xx in x] for x in corpus_title]
dev_w2v_ids = [[model.wv.key_to_index[xx] for xx in x] for x in dev_title]

# all_text = " ".join(train_data["title"])
# all_query_word = list(jieba.cut(all_text))
# all_query_word = [x for x in all_query_word if len(x) >= 2]
# all_query_ids = [model.wv.key_to_index[xx] for xx in all_query_word]

## IDF

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

idf = TfidfVectorizer(analyzer=lambda x: x)
idf.fit(train_title + corpus_title)

TfidfVectorizer(analyzer=<function <lambda> at 0x7f503c611a60>)

In [25]:
idf.idf_, len(idf.idf_)

(array([ 2.46292242,  8.5771301 ,  7.7050655 , ..., 14.21903717,
        14.21903717, 14.21903717]),
 640554)

In [26]:
token = np.array(idf.get_feature_names())
drop_token = token[np.where(idf.idf_ < 10)[0]]
drop_token = list(set(drop_token))
drop_token += ['领券']



In [27]:
drop_token_ids = [model.wv.key_to_index[x] for x in drop_token]

In [28]:
[idf.idf_[idf.vocabulary_[xx]] for xx in train_title[0]]

[11.083542956587955, 13.52588999195716, 10.621724911928661]

# 句子编码

In [29]:
def unsuper_w2c_encoding(s, pooling="max"):
    feat = []
    corpus_query_word = [x for x in s if x not in drop_token_ids]
    if len(corpus_query_word) == 0:
        return np.zeros(128)
    
    feat = model.wv[corpus_query_word]

    if pooling == "max":
        return np.array(feat).max(0)
    if pooling == "avg":
        return np.array(feat).mean(0)


# def unsuper_w2c_encoding(s, pooling="avg", debug=False):
#     feat = []
    
#     # corpus_query_word = list(set(s) & set(all_query_ids))

#     for w in s:
        
#         if idf.idf_[idf.vocabulary_[w]] > 11:
#             if debug:
#                 print(w)
#             feat.append(model.wv[w])
        
#     if len(feat) == 0:
#         return np.zeros(128)


#     if pooling == "max":
#         return np.array(feat).max(0)
#     if pooling == "avg":
#         return np.array(feat).mean(0)

In [30]:
unsuper_w2c_encoding(train_w2v_ids[0])

array([ 0.13844971,  0.02329931,  0.00218298,  0.30193067,  0.24012831,
       -0.03772005,  0.06984258, -0.00104502,  0.2525359 ,  0.02744666,
        0.43275502,  0.03205966,  0.26791453, -0.04520423,  0.09708406,
        0.03066566, -0.04605221, -0.02081237, -0.03759015,  0.01533887,
        0.17473975,  0.54764605, -0.03636033, -0.0470927 ,  0.06880933,
        0.30155945, -0.03811692,  0.4881905 , -0.02433527, -0.03787201,
       -0.05090072,  0.1707239 , -0.04582863,  0.26897642,  0.18457308,
        0.2454911 ,  0.01931062,  0.482265  ,  0.23519492, -0.01226895,
        0.28913426,  0.46038544,  0.27555916,  0.05334978,  0.0333859 ,
        0.17673498, -0.04632355, -0.01765269,  0.40820375,  0.14650379,
        0.38070896,  0.41259366,  0.06576917,  0.8626872 ,  0.23650156,
        0.0819394 ,  0.49214628, -0.02028468, -0.03448511, -0.00941355,
       -0.02544864, -0.02463404,  0.1276301 ,  0.1326973 ,  0.30747333,
       -0.01177661,  0.27607018,  0.42989215, -0.02167742, -0.07

In [31]:
_ = unsuper_w2c_encoding(corpus_w2v_ids[679139-1])

In [33]:
from tqdm import tqdm_notebook
# [corpus_w2v_ids[x] for x in qrels['doc'].values[:100] - 1]

corpus_mean_feat = [
    unsuper_w2c_encoding(s) for s in tqdm_notebook(corpus_w2v_ids[:1000])
]
corpus_mean_feat = np.vstack(corpus_mean_feat)

train_mean_feat = [
    unsuper_w2c_encoding(s) for s in tqdm_notebook(train_w2v_ids[:100])
]
train_mean_feat = np.vstack(train_mean_feat)

dev_mean_feat = [
    unsuper_w2c_encoding(s) for s in tqdm_notebook(dev_w2v_ids[:100])
]
dev_mean_feat = np.vstack(dev_mean_feat)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  unsuper_w2c_encoding(s) for s in tqdm_notebook(corpus_w2v_ids[:1000] + [corpus_w2v_ids[x] for x in qrels['doc'].values[:100] - 1])


  0%|          | 0/1100 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  unsuper_w2c_encoding(s) for s in tqdm_notebook(train_w2v_ids[:100])


  0%|          | 0/100 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  unsuper_w2c_encoding(s) for s in tqdm_notebook(dev_w2v_ids[:100])


  0%|          | 0/100 [00:00<?, ?it/s]

# 初步检索

In [34]:
from sklearn.preprocessing import normalize

corpus_mean_feat = normalize(corpus_mean_feat)
train_mean_feat = normalize(train_mean_feat)
dev_mean_feat = normalize(dev_mean_feat)

In [38]:
mrr = []
for idx in tqdm_notebook(range(1, 100)):
    dis = np.dot(train_mean_feat[idx - 1], corpus_mean_feat.T)
    ids = np.argsort(dis)[::-1]
    
    # print(train_title[idx-1], corpus_data.loc[qrels.loc[idx].ravel()[0]]["title"],  dis[qrels.loc[idx].ravel()-1])
    # print(corpus_title[ids[0]])
    # mrr.append(1/(np.where(ids == qrels.loc[idx].ravel()[0] - 1)[0][0] + 1))
    # break
    # print('')
    # mrr.append(ids[0]==idx+999)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for idx in tqdm_notebook(range(1, 100)):


  0%|          | 0/99 [00:00<?, ?it/s]

In [39]:
np.mean(mrr)

0.18181818181818182

In [229]:
with open('query_embedding', 'w') as up :
    for id, feat in zip(dev_data.index, dev_mean_feat):
        up.write('{0}\t{1}\n'.format(id, ','.join([str(x)[:6] for x in feat])))
        
with open('doc_embedding', 'w') as up :
    for id, feat in zip(corpus_data.index, corpus_mean_feat):
        up.write('{0}\t{1}\n'.format(id, ','.join([str(x)[:6] for x in feat])))