In [15]:
import pandas as pd

# Working directory: the item map is read from here by the next cell.
path = "/Users/Desktop/data/mind_new/"

# The file is read with header=None, so the column names are supplied
# explicitly.
news = pd.read_csv(
    '/Users/Downloads/MINDsmall_dev/news.tsv',
    sep='\t',
    header=None,
    names=[
        'id', 'category', 'subcategory', 'title', 'abstract', 'url',
        'title_entities', 'abstract_entities',
    ],
)

In [33]:
item_list = path + 'item_map.txt'

# Original news id -> integer id. Each non-blank line of the file holds the
# two values separated by whitespace.
item_map = {}
with open(item_list, 'r') as file:
    for line in file:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing on the two-value unpack below.
            continue
        key, value = fields
        item_map[key] = int(value)

# Keep only the news that appear in the item map. The explicit copy makes the
# filtered frame independent, so adding a column does not raise pandas'
# SettingWithCopyWarning.
news = news[news['id'].isin(set(item_map))].copy()
news['item_id'] = news['id'].map(item_map)

# Original news id -> one-sentence description of the item itself.
# NOTE(review): a missing abstract is rendered as the literal text "nan";
# confirm whether that is acceptable for the downstream prompt.
item_1stneighbor_text_dic = {}
for item_id, category, subcategory, abstract in zip(
        news['id'], news['category'], news['subcategory'], news['abstract']):
    item_1stneighbor_text_dic[item_id] = (
        f"The category of the news is {category}, the subcategory is {subcategory}, and the abstract of the news is {abstract}"
    )

In [19]:
import json

import copy

# item_1stneighbor_text_dic = {}
triples = []

single_cols = ["category", "subcategory", "title_entities", "abstract_entities", 'title', 'abstract']


def _entity_triples(item_id, raw_entities, field):
    """Return forward and reverse triples for one entity column of one news item.

    Parameters
    ----------
    item_id : original news id used as the item node.
    raw_entities : JSON text holding a list of entity dicts; any non-string
        value (e.g. NaN for a missing cell) yields no triples.
    field : relation prefix, 'title_entities' or 'abstract_entities'.

    Returns
    -------
    list of (head, relation, tail) tuples. Every forward relation
    ``<field>.<key>`` is paired with ``<field>.<key>.reverse``.
    """
    if not isinstance(raw_entities, str):
        return []

    found = []
    for entity in json.loads(raw_entities):
        if 'Label' in entity:
            label = entity['Label']
            found.append((item_id, f'{field}.Label', label))
            found.append((label, f'{field}.Label.reverse', item_id))

        # Only the first surface form is used. An empty list is skipped so it
        # cannot become a shared "[]" node connecting unrelated news.
        surface_forms = entity.get('SurfaceForms')
        if surface_forms:
            surface_form = surface_forms[0]
            found.append((item_id, f'{field}.SurfaceForms', surface_form))
            found.append((surface_form, f'{field}.SurfaceForms.reverse', item_id))
    return found


for index, row in news.iterrows():
    item_id = row['id']  # original news id
    category = row['category']
    subcategory = row['subcategory']

    triples.append((item_id, "category", category))
    triples.append((item_id, "subcategory", subcategory))

    triples.append((category, "category_reverse", item_id))
    triples.append((subcategory, "subcategory_reverse", item_id))

    triples.extend(_entity_triples(item_id, row['title_entities'], 'title_entities'))
    triples.extend(_entity_triples(item_id, row['abstract_entities'], 'abstract_entities'))


data = pd.DataFrame(triples, columns=['head', 'relation', 'tail'])
# Rows with a missing endpoint (e.g. a NaN category) cannot form an edge.
data = data.dropna()

In [32]:

import dgl
import random
import torch
from collections import Counter
from sklearn.preprocessing import LabelEncoder


def create_kg_from_data(data):
    """Turn a (head, relation, tail) table into a DGL graph with typed edges.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'head', 'relation' and 'tail'.

    Returns
    -------
    (g, node_encoder, relation_encoder)
        ``g`` has a single node type 'node' and a single edge type 'relation';
        the encoded relation of every edge is stored in the edge feature
        'etype'. The two fitted LabelEncoders translate between the original
        labels and the integer ids used in the graph.
    """
    # Heads and tails share one id space, so the node encoder sees both.
    all_nodes = pd.concat([data['head'], data['tail']]).unique()
    node_encoder = LabelEncoder()
    node_encoder.fit(all_nodes)

    relation_encoder = LabelEncoder()
    relation_encoder.fit(data['relation'].unique())

    src_ids = node_encoder.transform(data['head'])
    dst_ids = node_encoder.transform(data['tail'])
    etype_ids = relation_encoder.transform(data['relation'])

    edge_spec = {('node', 'relation', 'node'): (src_ids, dst_ids)}
    g = dgl.heterograph(edge_spec)
    g.edges['relation'].data['etype'] = torch.tensor(list(etype_ids))

    return g, node_encoder, relation_encoder



def node2vec_walk(g, start_node, K):
    """Sample up to ``K`` second-order neighbours behind each successor of a node.

    Parameters
    ----------
    g : graph object exposing ``successors(node)`` whose result has ``tolist()``.
    start_node : id of the node to expand.
    K : maximum number of second-order neighbours kept per first-order neighbour.

    Returns
    -------
    dict
        first-order neighbour id -> list of distinct second-order neighbour
        ids (plain ints). ``start_node`` itself is never listed. When more
        than ``K`` candidates exist, ``K`` of them are drawn uniformly without
        replacement using NumPy's global random state (``np`` is taken from
        the module namespace).
    """
    walk_2nd_neighbor_dic = {}

    for neighbor in g.successors(start_node).tolist():
        if neighbor in walk_2nd_neighbor_dic:
            # Parallel edges list the same neighbour again; one expansion is enough.
            continue

        # Parallel edges also repeat second-order ids. De-duplicate (keeping
        # first-seen order) so sampling without replacement cannot return the
        # same node twice.
        candidates = list(dict.fromkeys(
            n for n in g.successors(neighbor).tolist() if n != start_node
        ))

        if len(candidates) > K:
            # .tolist() keeps the return type uniform (list of Python ints).
            candidates = np.random.choice(candidates, K, replace=False).tolist()

        walk_2nd_neighbor_dic[neighbor] = candidates

    return walk_2nd_neighbor_dic

def get_edge_types(g, walk):
    """Return the 'etype' value of the edge between each consecutive pair in ``walk``.

    Parameters
    ----------
    g : graph exposing ``edge_ids(src, dst)`` and an ``edata['etype']`` feature.
    walk : sequence of node ids; a walk with fewer than two nodes yields [].

    Returns
    -------
    list with one scalar per step of the walk (``len(walk) - 1`` entries).
    """
    return [
        g.edata['etype'][g.edge_ids(src, dst)].item()
        for src, dst in zip(walk, walk[1:])
    ]


# -------------------------------------main----------------------------------
import numpy as np
# Render every node label as a string so heads and tails share one type
# before they are encoded.
data["tail"] = data["tail"].apply(str)
data["head"] = data["head"].apply(str)
graph, node_encoder, relation_encoder = create_kg_from_data(data)

item_id_name_map = news.set_index('id')['title'].to_dict()  # original news id -> title
item_2ndneighbor_text_dic = {}
cnt = 0

selected_relation = ["category", "subcategory", "title_entities.Label", "abstract_entities.Label"]

# One sentence template per selected relation.
relation_templates = {
    "category": "News under the same category ({neighbor}) also include: {item_2ndneighbor}.",
    "subcategory": "News under the same subcategory ({neighbor}) also include: {item_2ndneighbor}.",
    "title_entities.Label": "News with the same entity ({neighbor}) in the title also include: {item_2ndneighbor}.",
    "abstract_entities.Label": "News with the same entity ({neighbor}) in the abstract also include: {item_2ndneighbor}.",
}

# Node label <-> integer id, built once from the fitted encoder instead of
# calling transform / inverse_transform inside the loops.
node_names = node_encoder.classes_
node_index = {node_name: node_id for node_id, node_name in enumerate(node_names)}

# (head, tail) -> selected relation, read straight from the triples. Several
# triples can join the same pair (an entity that occurs in both title and
# abstract, or a surface form identical to its label), so a single edge-id
# lookup cannot be relied on to report a selected relation. When more than one
# selected relation applies, the one listed first in selected_relation wins.
relation_rank = {rel: rank for rank, rel in enumerate(selected_relation)}
pair_relation = {}
selected_triples = data[data['relation'].isin(selected_relation)]
for head, rel, tail in zip(selected_triples['head'],
                           selected_triples['relation'],
                           selected_triples['tail']):
    known = pair_relation.get((head, tail))
    if known is None or relation_rank[rel] < relation_rank[known]:
        pair_relation[(head, tail)] = rel

for item_id in item_id_name_map.keys():
    cnt += 1
    if cnt % 100 == 0:
        print(cnt)

    total_text = []

    item_node_id = node_index.get(str(item_id))
    if item_node_id is None:
        # The item contributed no triple, so it has no node; keep an empty
        # entry so later lookups by news id still succeed.
        item_2ndneighbor_text_dic[item_id] = ""
        continue

    cur_dic = node2vec_walk(graph, item_node_id, K=10)

    for neighbor_id, second_order_neighbors in cur_dic.items():
        neighbor = node_names[neighbor_id]
        relation = pair_relation.get((str(item_id), neighbor))
        if relation is None:
            # Neighbour reached only through a relation that is not selected.
            continue

        # Second-order neighbours are reported by title; anything that is not
        # a news item with a text title is skipped.
        second_order_neighbors_name_list = []
        for second_order_neighbor_id in second_order_neighbors:
            title = item_id_name_map.get(node_names[second_order_neighbor_id])
            if isinstance(title, str):
                second_order_neighbors_name_list.append(title)

        if not second_order_neighbors_name_list:
            continue

        item_2ndneighbor = "; ".join(second_order_neighbors_name_list)
        total_text.append(
            relation_templates[relation].format(neighbor=neighbor, item_2ndneighbor=item_2ndneighbor)
        )

    item_2ndneighbor_text_dic[item_id] = " ".join(total_text)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100


In [36]:
import json

def prompt_generation(id, name, item_1stneighbor_text_dic, item_2ndneighbor_text_dic):
    """Assemble the prompt text describing one news item.

    Parameters
    ----------
    id : key used to look the item up in both text dictionaries.
    name : title of the news, inserted verbatim.
    item_1stneighbor_text_dic : id -> basic-information text.
    item_2ndneighbor_text_dic : id -> related-news text.

    Returns
    -------
    str
        Multi-line prompt containing the title, the basic information and
        the related news.

    Raises
    ------
    KeyError
        If ``id`` is missing from either dictionary.
    """
    basic_info = item_1stneighbor_text_dic[id]
    order_2nd_info = item_2ndneighbor_text_dic[id]

    return f"""
    The title of the news is: {name}. Below is the information about the news:
    1) Basic information: {basic_info} ;
    2) Related news: {order_2nd_info}
    
    """


# Mapped integer id -> prompt text for that news item.
question_dic = {}
item_id2id_map = news.set_index('id')['item_id'].to_dict()  # original news id to mapped id

# The loop variables are deliberately not called `id` / `name`: at notebook
# top level, `id` would replace the builtin for every later cell.
for news_id, news_title in item_id_name_map.items():
    prompt = prompt_generation(news_id, news_title, item_1stneighbor_text_dic, item_2ndneighbor_text_dic)
    mapped_id = item_id2id_map[news_id]
    question_dic[mapped_id] = prompt


print("The number of requests is: ", len(question_dic))


# JSON object keys are always strings, so the integer ids are written as text.
with open('/Users/Desktop/data/mind_new/llm_input_item.json', 'w') as f:
    json.dump(question_dic, f)

The number of requests is:  15174


In [None]:
# First version of the LLM instruction: role, task description, a 150-word
# limit and one sample answer. The text is sent to the model as-is, so it is
# left untouched.
instruction = "Assume you are an expert in news media. You are given a piece of news. I will give you the basic information of the news and some related news that has the same category or key words with the current news.  Please help me summarize the news and related news, and analyze what kind of users would like reading this type of news. please give your answer in a coherent paragraph under 150 words. Here is a sample answer: Hunter Biden is stepping down from the board of a Chinese-backed private equity company and pledging to forgo all foreign work if his father, former U.S. Vice President Joe Biden, is elected president in 2020. This decision comes amid increasing attacks from President Donald Trump, who has been scrutinizing the Biden family's foreign business dealings. Related news covers a wide range of political and social issues, including police actions, fundraising efforts, acts of kindness, and other political controversies involving Trump and Biden. This news would attract politically engaged readers, particularly those interested in the 2020 U.S. presidential election, the ongoing scrutiny of the Biden family, and the broader implications of U.S.-China relations. It would also appeal to readers following Trump's criticisms and the political dynamics between the Biden and Trump camps, as well as those interested in political ethics, campaign strategies, and international business relations."

In [37]:
# Second version of the LLM instruction: same role and 150-word limit, but the
# summary must also cover "the category and main content", and the sample
# answer is rewritten accordingly.
instruction_v2 = "Assume you are an expert in news media. You are given a piece of news. I will give you the basic information of the news and some related news that has the same category or key words with the current news.  Please help me summarize the news and related news, including the category and main content, and analyze what kind of users would like reading this type of news. please give your answer in a coherent paragraph under 150 words. Here is a sample answer: Hunter Biden is stepping down from the board of a Chinese-backed private equity company and pledging to forgo all foreign work if his father, Joe Biden, is elected president in 2020. This move comes amidst attacks from President Donald Trump, who has been targeting Hunter Biden's business dealings. This news falls under the politics subcategory and is part of a broader narrative involving political figures and their controversies. Related news includes various political stories, such as Trump's impeachment inquiry, Democratic fundraising efforts, and other political maneuvers. This type of news would appeal to politically engaged readers, particularly those interested in U.S. politics, election campaigns, and the ongoing scrutiny of political figures and their families. These readers are likely to follow political developments closely and have a keen interest in the implications of such events on the upcoming elections."

'\n    The title of the news is: Hunter Biden Steps Down From Chinese Board as Trump Attacks. Below is the information about the news:\n    1) Basic information: The category of the news is news, the subcategory is newspolitics, and the abstract of the news is Hunter Biden is stepping down from the board of a Chinese-backed private equity company and promising to forgo all foreign work if his father, former U.S. Vice President Joe Biden, is elected president in 2020. ;\n    2) Related news: News under the same category (news) also include: Police blew up an innocent man\'s house in search of an armed shoplifter. Too bad, court rules.; How much money did Beto O\'Rourke raise in latest fundraising period?; GoFundMe Honors Good Samaritans at First Heroes Celebration: \'Amazing Things Are Happening\'; \'Be Kind! Just Try It! I Will!\': 98-Year-Old Leads Glenwood, Minn., Kindness Movement; Football player changes last name for stepdad; Facebook takedowns show new Russian activity targeted B

In [31]:
# Inspect the generated related-news text (original news id -> text).
# NOTE(review): this displays the whole dictionary; consider showing only a
# few entries to keep the saved notebook small.
item_2ndneighbor_text_dic

{'N61837': "News under the same category (news) also include: Kelsey Berreth Murder Suspect's Secret Ex: He 'Told Me to Take Care of the Problem'; Police release images of two men wanted for robbing NYPD captain; Hey Google and Alexa, how easy is it to take control?; Energy giants face 35% output cut to hit Paris climate goals: watchdog; Joe Biden or Elizabeth Warren? New Polls Differ on Who's Leading 2020 Race. News under the same subcategory (newsworld) also include: Iran, Russia lash out at US plans to protect oil in Syria; 2 'Unprofessional' Poll Workers Arrested On Election Day; Catalan President Makes Secession Pledge in Challenge to Spain; Today in History: October 28; 12 Hours. 4 Syrian Hospitals Bombed. One Culprit: Russia.. News with the same entity (Ukraine) in the abstract also include: State Department agrees to turn over Ukraine documents, potentially providing Democrats with impeachment fodder; White House Aides Feared That Trump Had Another Ukraine Back Channel; Top Rep