# MIND 微软新闻数据集

 MIND
  ├── MINDlarge_dev
  │   ├── \_\_placeholder\_\_
  │   ├── behaviors.tsv
  │   ├── entity_embedding.vec
  │   ├── news.tsv
  │   └── relation_embedding.vec
  ├── MINDlarge_test
  ├── MINDlarge_train
  ├── MINDsmall_dev
  ├── MINDsmall_train


In [38]:
import pandas as pd
import numpy as np

数据路径

In [39]:
# Root folder of the MIND split used by this notebook. It already ends with "/",
# so file names are appended to it directly.
data_path = "F:/DataSets/MIND/MINDsmall_train/"
behaviors_path = f"{data_path}behaviors.tsv"
entity_embeddings_path = f"{data_path}entity_embedding.vec"
news_path = f"{data_path}news.tsv"

* 曝光 ID（impression_id）。一次曝光（展示）的 ID。int64
* 用户 ID（user_id）。用户的匿名 ID。str
* 时间（time）。曝光时间，格式为“MM/DD/YYYY HH:MM:SS AM/PM”。str
* 历史（history）。该用户在本次曝光前的新闻点击历史记录（点击新闻的 ID 列表，以空格分隔）。点击的新闻文章按时间排序。str
* 曝光列表（impressions）。本次曝光中展示的新闻列表以及用户对其的点击行为，格式为“新闻ID-标签”（1 为点击，0 为未点击）。列表中的新闻顺序已被打乱。str

In [40]:
# The file is read with header=None, so the column names are supplied here.
feature_names = ['impression_id', 'user_id', 'time', 'history', 'impressions']
behaviors_file = pd.read_csv(behaviors_path, sep='\t', header=None, names=feature_names)
# Print the Python type of the first value of every column, space-separated on one line.
print(*(type(behaviors_file[column][0]) for column in feature_names))
behaviors_file.head(2)

<class 'numpy.int64'> <class 'str'> <class 'str'> <class 'str'> <class 'str'>


Unnamed: 0,impression_id,user_id,time,history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...


## 嵌入表示
读取实体嵌入向量，每个向量是一个100维的数组
问题：
1. 只有实体的嵌入，每个新闻有标题和摘要的若干实体，没有正文内容的实体
2. 暂用：对标题、摘要中实体的嵌入取平均（mean）来代表新闻嵌入；下文对标题和摘要分别计算（average_embedding1、average_embedding2）


In [61]:
# Read the tab-separated embedding file: column 0 holds the entity ID, the
# remaining columns hold the vector components.
entity_embeddings_file = pd.read_csv(entity_embeddings_path, header=None, sep='\t')

entity_ids = entity_embeddings_file.iloc[:, 0]

# If every row ends with a trailing tab, read_csv appends one extra column that is
# entirely NaN; left in place it would become the last element of every vector and
# turn every later mean into NaN. Dropping all-NaN columns is a no-op otherwise.
# NOTE(review): MIND .vec files are reported to carry this trailing tab — confirm
# that embedding_vectors.shape[1] == 100 after this line.
embedding_vectors = entity_embeddings_file.iloc[:, 1:].dropna(axis=1, how='all')

# Integer form of the IDs ("Q41" -> 41). Not used in the table built below, which
# keeps the original "Q…" strings so they can be compared with WikidataId values.
converted_entity_ids = entity_ids.apply(lambda raw_id: int(raw_id[1:]))

# Collapse each row of components into a single numpy array.
embedding_vectors = embedding_vectors.apply(lambda row: np.array(row), axis=1)

# Two-column table: entity ID string and its embedding vector.
entity_embeddings = pd.DataFrame({'EID': entity_ids, 'Vector': embedding_vectors})

# Using EID as the index is left disabled; later lookups filter on the 'EID' column.
# entity_embeddings.set_index('EID', inplace=True)
entity_embeddings.head(2)

Unnamed: 0,EID,Vector
0,Q41,"[-0.063388, -0.181451, 0.057501, -0.091254, -0..."
1,Q1860,"[0.060958, 0.069934, 0.015832, 0.079471, -0.02..."


## 构造用户兴趣表（行为表）
不是所有新闻都存在标题实体和摘要实体
步骤：
1. 构建新闻-嵌入表
2. 以user_id排序behaviors_file
3. 从impressions导出交互新闻id
4. 通过新闻id，得到新闻嵌入
5. groupby用户id和不同时间段，计算平均嵌入，作为用户长中短期兴趣

In [42]:
import json
# The file is read with header=None, so the column names are supplied here.
# Note: this rebinds feature_names, which previously held the behaviors columns.
feature_names = ['news_id', 'label', 'sub_label', 'title', 'abstract', 'url', 'title_entity', 'abstract_entity']
news = pd.read_csv(news_path, sep='\t', header=None, names=feature_names)

# Preview the raw title_entity column.
news.loc[:, 'title_entity']

0        [{"Label": "Prince Philip, Duke of Edinburgh",...
1        [{"Label": "Adipose tissue", "Type": "C", "Wik...
2                                                       []
3                                                       []
4        [{"Label": "Skin tag", "Type": "C", "WikidataI...
                               ...                        
51277    [{"Label": "Woolsey Fire", "Type": "N", "Wikid...
51278    [{"Label": "Broadway theatre", "Type": "F", "W...
51279                                                   []
51280    [{"Label": "MLS Cup", "Type": "U", "WikidataId...
51281    [{"Label": "Peugeot RCZ", "Type": "V", "Wikida...
Name: title_entity, Length: 51282, dtype: object

In [43]:
# Show the raw value and the Python type of one title_entity cell, one per line.
print(news['title_entity'][2], type(news['title_entity'][2]), sep='\n')

[]
<class 'str'>


实体存储在title_entity和abstract_entity中，以json格式的字符串存在

In [44]:
# Alias, not a copy: `df` and `news` are the same DataFrame object, so any column
# assigned on `df` also appears on `news`.
df = news

# Extract (WikidataId, Confidence) pairs from one entity cell.
def extract_wikidata_confidence(json_str):
    """Parse a JSON entity list and return its (WikidataId, Confidence) pairs.

    Args:
        json_str: A JSON-encoded list of entity objects, each carrying
            "WikidataId" and "Confidence" keys. Any non-string value (for
            example a float NaN or None) and any blank string is treated as
            "no entities".

    Returns:
        A list of (WikidataId, Confidence) tuples in the order they appear in
        the JSON, or an empty list when there is nothing to parse.

    Raises:
        json.JSONDecodeError: If a non-blank string is not valid JSON.
        KeyError: If an entity lacks "WikidataId" or "Confidence".
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses; a blank string is rejected here because json.loads would raise on it.
    if not isinstance(json_str, str) or not json_str.strip():
        return []
    return [(entity["WikidataId"], entity["Confidence"]) for entity in json.loads(json_str)]

# Parse both entity columns into lists of (WikidataId, Confidence) pairs:
# title_entity -> wikidata_confidence1, abstract_entity -> wikidata_confidence2.
for parsed_column, entity_column in (
    ('wikidata_confidence1', 'title_entity'),
    ('wikidata_confidence2', 'abstract_entity'),
):
    df[parsed_column] = df[entity_column].apply(extract_wikidata_confidence)

# print(df['wikidata_confidence1'])
# print(df['wikidata_confidence2'])

In [79]:
# Build an entity-id -> vector dictionary so that each lookup is a hash probe
# instead of a scan over the whole embeddings table.
def _build_embedding_lookup(embeddings_frame):
    """Map each value of the 'EID' column to the matching 'Vector' entry.

    When an EID occurs more than once the first occurrence is kept, which is
    the same row a "first matching row" filter would pick.
    """
    lookup = {}
    for entity_id, vector in zip(embeddings_frame['EID'], embeddings_frame['Vector']):
        lookup.setdefault(entity_id, vector)
    return lookup


# Average the embeddings of the entities attached to one news item.
def calculate_average_embedding(wikidata_confidence, embedding_lookup=None):
    """Return the element-wise mean of the embeddings of the given entities.

    Args:
        wikidata_confidence: Iterable of (entity_id, confidence) pairs. The
            confidence value is ignored; every found entity has equal weight.
        embedding_lookup: Optional dict mapping entity_id -> vector. When
            omitted, a lookup is built from the module-level
            `entity_embeddings` table on every call, which is correct but
            slow in a loop; pass a prebuilt dict when processing many rows.

    Returns:
        A numpy array holding the mean of all vectors that were found, or an
        empty array (`np.array([])`) when no entity has an embedding.
    """
    entity_ids = [wikidata_id for wikidata_id, _ in wikidata_confidence]
    if not entity_ids:
        # Nothing to look up; skip building the fallback lookup altogether.
        return np.array([])

    if embedding_lookup is None:
        embedding_lookup = _build_embedding_lookup(entity_embeddings)

    # Entities without an embedding are skipped silently.
    embeddings = [embedding_lookup[entity_id] for entity_id in entity_ids if entity_id in embedding_lookup]
    if not embeddings:
        return np.array([])
    return np.mean(embeddings, axis=0)


# Build the lookup once and reuse it for every row of both columns.
entity_vector_by_id = _build_embedding_lookup(entity_embeddings)
df['average_embedding1'] = df['wikidata_confidence1'].apply(
    lambda pairs: calculate_average_embedding(pairs, entity_vector_by_id)
)
df['average_embedding2'] = df['wikidata_confidence2'].apply(
    lambda pairs: calculate_average_embedding(pairs, entity_vector_by_id)
)

df.head()

Unnamed: 0,news_id,label,sub_label,title,abstract,url,title_entity,abstract_entity,wikidata_confidence1,wikidata_confidence2,average_embedding,average_embedding1,average_embedding2
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[],"[(Q80976, 1.0), (Q43274, 1.0), (Q9682, 0.97)]",[],"[0.004057333333333334, -0.03991733333333333, -...","[0.004057333333333334, -0.03991733333333333, -...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[(Q193583, 1.0)]","[(Q193583, 1.0)]","[-0.013597, -0.009758, 0.01712, -0.051993, 0.0...","[-0.013597, -0.009758, 0.01712, -0.051993, 0.0...","[-0.013597, -0.009758, 0.01712, -0.051993, 0.0..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...",[],"[(Q212, 0.946)]",[],[],"[-0.065324, -0.088163, -0.015203, -0.031949, 0..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ...",[],"[(Q155223, 1.0)]",[],[],"[0.003752, -0.061771, -0.037073, 0.02677, -0.0..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[(Q3179593, 1.0)]","[(Q3179593, 1.0), (Q171171, 1.0), (Q371820, 0....","[0.014257, 0.018351, 0.005199, 0.001773, 0.031...","[0.014257, 0.018351, 0.005199, 0.001773, 0.031...","[-0.008593666666666668, -0.0001680000000000003..."


验证一下

In [78]:
# # 80976 43274 9682
# print((entity_embeddings.loc[entity_embeddings["EID"]== "Q80976",'Vector'].values +
#       entity_embeddings.loc[entity_embeddings["EID"]== "Q43274",'Vector'].values +
#       entity_embeddings.loc[entity_embeddings["EID"]== "Q9682",'Vector'].values)/3)


保存csv文件

In [80]:
# Write the current `df` to a CSV file in the working directory, using the
# default to_csv options.
df.to_csv(path_or_buf="news_embedding.csv")