In [None]:
import torch
import networkx as nx
import itertools
import json
from tqdm import tqdm
import numpy as np
from scipy import sparse
import pickle
from scipy.sparse import csr_matrix, coo_matrix
from multiprocessing import Pool
from collections import OrderedDict
from transformers import RobertaTokenizer, RobertaForMaskedLM

In [None]:
# Relation vocabulary used for the knowledge graph edges.
# The position of a name in this list is its relation id (see load_data),
# so the order must not be changed.
merged_relations = [
    "antonym",
    "atlocation",
    "capableof",
    "causes",
    "createdby",
    "isa",
    "desires",
    "hassubevent",
    "partof",
    "hascontext",
    "hasproperty",
    "madeof",
    "notcapableof",
    "notdesires",
    "receivesaction",
    "relatedto",
    "usedfor",
]

In [None]:
def load_data(cpnet_vocab_path):
    """Read the concept vocabulary and build id <-> name lookup tables.

    Args:
        cpnet_vocab_path: path to a UTF-8 text file with one concept per line;
            the (0-based) line number becomes the concept id.

    Returns:
        A 4-tuple ``(id2cpt, cpt2id, id2rel, rel2id)``:
        the list of concepts, the concept -> id dict, the module-level
        ``merged_relations`` list, and the relation -> id dict.
    """
    id2cpt = []
    with open(cpnet_vocab_path, "r", encoding="utf-8") as vocab_file:
        for raw_line in vocab_file:
            # Surrounding whitespace (including the newline) is not part of the name.
            id2cpt.append(raw_line.strip())

    # If a name occurs more than once, the last index wins.
    cpt2id = dict(zip(id2cpt, range(len(id2cpt))))

    id2rel = merged_relations
    rel2id = dict(zip(id2rel, range(len(id2rel))))
    return id2cpt, cpt2id, id2rel, rel2id
#id2concept, concept2id, id2relation, relation2id = load_data("../input/cp-net-en/CPnet_en/concept.txt")

In [None]:
#load concept net data
def load_cpnet(cpnet_graph_path):
    """Load the pickled knowledge graph and derive a weighted simple graph from it.

    Args:
        cpnet_graph_path: path to a graph that was saved with pickle
            (e.g. via the former ``nx.write_gpickle``).

    Returns:
        ``(cpnet_simple, graph)`` where ``graph`` is the loaded graph as-is and
        ``cpnet_simple`` is an undirected ``nx.Graph`` in which all edges between
        the same two nodes are collapsed into one edge whose ``weight`` is the
        sum of the individual weights (a missing weight counts as 1.0).
    """
    # nx.read_gpickle was removed in NetworkX 3.0; it was a thin wrapper around
    # pickle.load, so fall back to that when the helper is not available.
    if hasattr(nx, "read_gpickle"):
        graph = nx.read_gpickle(cpnet_graph_path)
    else:
        # NOTE: unpickling can execute arbitrary code - only load trusted files.
        with open(cpnet_graph_path, "rb") as fin:
            graph = pickle.load(fin)

    cpnet_simple = nx.Graph()
    # example edge: 0 1 {'rel': 0, 'weight': 1.0}
    for u, v, data in graph.edges(data=True):
        w = data.get("weight", 1.0)
        if cpnet_simple.has_edge(u, v):
            # Parallel (or reverse-direction) edge: accumulate its weight.
            cpnet_simple[u][v]["weight"] += w
        else:
            cpnet_simple.add_edge(u, v, weight=w)
    return cpnet_simple, graph
# cpnet_simple, cpnet = load_cpnet("../input/cp-net-en/CPnet_en/conceptnet.en.pruned.graph")

In [None]:

# sentence = "hello world!"
# model = AutoModel.from_pretrained("bert-base-uncased")
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# input_ids= tokenizer.encode(sentence)
# max_len = 15
# input_ids += [tokenizer.pad_token_id]*(max_len - len(input_ids))
# mask = (torch.tensor(input_ids)!=tokenizer.pad_token_id).long()

In [None]:
def concepts_to_adj_matrices_2hop_all_pair__use_LM__Part1(data, cpnet_simple):
    '''
    Collect the extra nodes that connect pairs of grounded QA concepts.

    For every pair of distinct concepts taken from the question and answer
    concepts, the nodes adjacent to both of them in ``cpnet_simple`` are
    gathered. Concepts that are not in the graph are ignored for this step.

    input:
        data: tuple (q_ids, ans_ids, qa_data) - iterables of question / answer
            concept ids and the QA context, which is passed through untouched
        cpnet_simple: graph supporting ``node in g.nodes`` and ``g[node]``
            (iterating ``g[node]`` yields the neighbours of ``node``)
    output:
        (sorted q_ids, sorted ans_ids, qa_data, sorted extra node ids);
        the extra nodes never contain a question or answer concept
    '''
    q_ids, ans_ids, qa_data = data
    qa_nodes = set(q_ids) | set(ans_ids)

    # Build every neighbour set once instead of once per visited pair.
    neighbours = {
        cid: set(cpnet_simple[cid])
        for cid in qa_nodes
        if cid in cpnet_simple.nodes
    }

    # Set intersection is symmetric, so each unordered pair is enough.
    extra_nodes = set()
    for nbrs_a, nbrs_b in itertools.combinations(neighbours.values(), 2):
        extra_nodes |= nbrs_a & nbrs_b

    # Drop nodes that are already part of the question / answer concepts.
    extra_nodes -= qa_nodes
    return (sorted(q_ids), sorted(ans_ids), qa_data, sorted(extra_nodes))

In [None]:
# get relevance score
def get_relevance_score(cids, question, id2concept, tokenizer, model, batch_size=50):
    '''
    Score how relevant each concept is to the QA context with a language model.

    One sentence is built per concept ("<question> <concept words>.") plus one
    for the bare QA context, which is stored under the pseudo concept id -1
    (the QA context node, represented as z in the paper). The score of a
    sentence is the negated loss that ``model`` returns for it.

    input:
        cids: list of concept ids (not modified)
        question: qa context
        id2concept: maps a concept id to its name; "_" separates the words
        tokenizer: huggingface tokenizer (needs ``encode`` and ``pad_token_id``)
        model: pretrained language model; called as
            ``model(input_ids, attention_mask=..., masked_lm_labels=...)`` and
            expected to return the per-sentence loss as first output
        batch_size: number of sentences per forward pass
    output:
        OrderedDict concept id -> score, ordered from high to low score
    '''
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Work on a copy and put the QA context node (-1) in front.
    cids = [-1] + list(cids)
    question = question.lower()

    # Build and tokenize one sentence per node.
    sents = []
    for cid in cids:
        if cid == -1:
            sent = question
        else:
            # Concept names use "_" between words; the space keeps the concept
            # from being glued to the last word of the question.
            sent = "{} {}.".format(question, " ".join(id2concept[cid].split("_")))
        sents.append(tokenizer.encode(sent, add_special_tokens=True))

    # Run on whatever device the model lives on instead of assuming CUDA.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    pad_id = tokenizer.pad_token_id
    scores = []
    for start in range(0, len(sents), batch_size):
        batch = sents[start:start + batch_size]
        max_len = max(len(seq) for seq in batch)
        # Right-pad every sequence of the batch to the same length.
        padded = [seq + [pad_id] * (max_len - len(seq)) for seq in batch]
        input_ids = torch.tensor(padded, device=device)  # shape is [bs, max_len]
        mask = (input_ids != pad_id).long()  # shape is [bs, max_len]
        with torch.no_grad():
            output = model(input_ids, attention_mask=mask, masked_lm_labels=input_ids)
        # Lower loss means the sentence is more plausible, so negate it.
        loss = output[0]
        scores += list(-loss.detach().cpu().numpy())

    assert len(sents) == len(scores) == len(cids)
    # score from high to low
    cid2score = OrderedDict(sorted(zip(cids, scores), key=lambda x: -x[1]))
    return cid2score

In [None]:
class RobertaForMaskedLMwithLoss(RobertaForMaskedLM):
    """RoBERTa LM head whose forward pass can also return one loss per sequence.

    When ``masked_lm_labels`` is given, the token-level cross entropy is
    weighted by ``attention_mask`` and summed per sequence, and that vector is
    prepended to the outputs.
    """

    def __init__(self, config):
        super().__init__(config)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, masked_lm_labels=None):
        """Run the encoder and LM head; optionally compute per-sequence loss.

        Returns a tuple:
            (per_sequence_loss,) prediction_scores, sequence_output, *remaining encoder outputs
        where the leading loss is only present if ``masked_lm_labels`` is given.
        ``attention_mask`` is mandatory.
        """
        assert attention_mask is not None

        encoder_outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
        )
        # First encoder output: final-layer hidden states,
        # (batch_size, sequence_length, hidden_size) per the original comment.
        hidden = encoder_outputs[0]
        logits = self.lm_head(hidden)
        result = (logits, hidden) + encoder_outputs[2:]

        if masked_lm_labels is None:
            return result

        batch, length = input_ids.size()
        # reduction='none' keeps one loss value per token.
        # NOTE (original author's remark, translated): this is odd - the
        # scoring is supposed to use a masked LM.
        token_loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
        per_token = token_loss_fn(
            logits.view(-1, self.config.vocab_size),
            masked_lm_labels.view(-1),
        ).view(batch, length)
        # Positions with attention_mask == 0 contribute nothing to the sum.
        per_sequence = (per_token * attention_mask).sum(dim=1)
        return (per_sequence,) + result

In [None]:
def concepts_to_adj_matrices_2hop_all_pair__use_LM__Part2(data, id2concept, model=None, tokenizer=None):
    """Attach language-model relevance scores to the nodes of one QA example.

    Args:
        data: tuple (q_ids, ans_ids, qa_data, extra_nodes) where the three id
            collections are lists and qa_data is the QA context string.
        id2concept: maps a concept id to its name.
        model: optional already-loaded scoring model. Loading "roberta-large"
            is expensive, so callers processing many examples should load it
            once and pass it in. If omitted, it is loaded here, moved to CUDA
            and put in eval mode (the original behaviour).
        tokenizer: optional already-loaded tokenizer; loaded from
            "roberta-large" if omitted.

    Returns:
        (q_ids, ans_ids, qa_data, extra_nodes, cid2score)
    """
    q_ids, ans_ids, qa_data, extra_nodes = data
    if model is None:
        model = RobertaForMaskedLMwithLoss.from_pretrained("roberta-large").cuda()
        model.eval()
    if tokenizer is None:
        tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
    cid2score = get_relevance_score(q_ids + ans_ids + extra_nodes, qa_data, id2concept, tokenizer, model)
    print(cid2score)
    return (q_ids, ans_ids, qa_data, extra_nodes, cid2score)

In [None]:
def concepts2adj(node_ids, id2relation, cpnet):
    """Build the relation-wise adjacency matrix of a set of nodes.

    Args:
        node_ids: sequence of concept ids; position i becomes row/column i.
        id2relation: relation vocabulary; only its length is used.
        cpnet: graph supporting ``has_edge(u, v)`` and ``cpnet[u][v].values()``
            yielding edge-attribute dicts that carry a ``'rel'`` id.

    Returns:
        ``(adj, cids)``: ``adj`` is a scipy COO matrix of shape
        ``(n_rel * n_node, n_node)`` with ``adj[r * n_node + s, t] == 1`` when an
        edge with relation ``r`` exists from node s to node t; ``cids`` is
        ``node_ids`` as an int32 array. An empty ``node_ids`` gives a (0, 0)
        matrix.
    """
    cids = np.array(node_ids, dtype=np.int32)
    n_rel = len(id2relation)
    n_node = cids.shape[0]
    adj = np.zeros((n_rel, n_node, n_node), dtype=np.uint8)
    for s, s_c in enumerate(cids):
        for t, t_c in enumerate(cids):
            if not cpnet.has_edge(s_c, t_c):
                continue
            for e_attr in cpnet[s_c][t_c].values():
                rel = e_attr['rel']
                # Relations outside the vocabulary are ignored.
                if 0 <= rel < n_rel:
                    adj[rel][s][t] = 1
    # cids += 1  # note!!! index 0 is reserved for padding
    # Explicit target shape: reshape(-1, 0) cannot be resolved when n_node == 0.
    adj = coo_matrix(adj.reshape(n_rel * n_node, n_node))
    return adj, cids
def concepts_to_adj_matrices_2hop_all_pair__use_LM__Part3(data, id2relation, cpnet):
    """Assemble the final subgraph record of one QA example.

    Node order is: question concepts, answer concepts, then the extra nodes
    ranked from highest to lowest relevance score.

    Args:
        data: tuple (q_ids, ans_ids, qa_data, extra_nodes, cid2scores); the id
            collections are lists, cid2scores maps a concept id to its score.
        id2relation: relation vocabulary, forwarded to ``concepts2adj``.
        cpnet: knowledge graph, forwarded to ``concepts2adj``.

    Returns:
        dict with keys "adj", "concepts", "qmask", "amask" and "cid2score".
    """
    q_ids, ans_ids, qa_data, extra_nodes, cid2scores = data

    ranked_extra = sorted(extra_nodes, key=lambda cid: -cid2scores[cid])
    schema_graph = q_ids + ans_ids + ranked_extra

    # Boolean masks over node positions: question block first, answers next.
    n_question = len(q_ids)
    n_question_answer = n_question + len(ans_ids)
    positions = np.arange(len(schema_graph))
    qmask = positions < n_question
    amask = (positions >= n_question) & (positions < n_question_answer)

    adj, concepts = concepts2adj(schema_graph, id2relation, cpnet)
    return {
        "adj": adj,
        "concepts": concepts,
        "qmask": qmask,
        "amask": amask,
        "cid2score": cid2scores,
    }

In [None]:
def generate_adj_data_from_grounded_concepts__use_LM(grounded_path, cpnet_graph_path, cpnet_vocab_path, output_path, num_processes):
    """
    Build LM-scored subgraph data from grounded question/answer concepts.

    The complete pipeline (kept commented out at the end of the body) is meant to save
        (1) adjacency matrices (each in the form of a (R*N, N) coo sparse matrix)
        (2) concepts ids
        (3) qmask that specifies whether a node is a question concept
        (4) amask that specifies whether a node is an answer concept
        (5) cid2score that maps a concept id to its relevance score given the QA context

    NOTE: in its current state the function only runs the three pipeline parts
    on the FIRST example as a smoke test. Nothing is written, so ``output_path``
    and ``num_processes`` are unused, and the function returns None.

    Args:
        grounded_path: JSONL file with one grounded (question, answer choice)
            pair per line, with keys "qc", "ac" and "ans".
        cpnet_graph_path: pickled knowledge graph, see ``load_cpnet``.
        cpnet_vocab_path: concept vocabulary file, see ``load_data``.
        output_path: destination of the pickled result (only used by the
            commented-out pipeline).
        num_processes: worker count (only used by the commented-out pipeline).

    Raises:
        ValueError: if the grounded or the statement file is empty.
    """
    print(f'generating adj data for {grounded_path}...')
    id2concept, concept2id, id2relation, relation2id = load_data(cpnet_vocab_path)
    cpnet_simple, cpnet = load_cpnet(cpnet_graph_path)
    qa_data = []
    # The statement file is located by replacing every "grounded" in the path.
    statement_path = grounded_path.replace('grounded', 'statement')

    # ---------------------------------- Get q_concept ids, answer_concept ids, question-answer description
    with open(grounded_path, 'r', encoding='utf-8') as fin_ground, open(statement_path, 'r', encoding='utf-8') as fin_state:
        lines_ground = fin_ground.readlines()
        lines_state = fin_state.readlines()

    if not lines_ground or not lines_state:
        # Otherwise the checks below fail with an unhelpful IndexError / ZeroDivisionError.
        raise ValueError(
            "no examples found in {!r} and/or {!r}".format(grounded_path, statement_path)
        )

    print(lines_ground[0], lines_state[0])
    assert len(lines_ground) % len(lines_state) == 0
    # number of choices per question
    num_choice = len(lines_ground) // len(lines_state)
    for j, line in enumerate(lines_ground):
        dic = json.loads(line)
        # get question concept and answer concept mapping idx
        qc_ids = set(concept2id[c] for c in dic["qc"])
        answers_ids = set(concept2id[c] for c in dic["ac"])
        # remove answer ids in question ids
        qc_ids = qc_ids - answers_ids
        # each statement line covers num_choice consecutive grounded lines
        statement_obj = json.loads(lines_state[j // num_choice])
        # The space keeps the answer from being glued to the end of the question stem.
        qa_context = "{} {}.".format(statement_obj['question']["stem"], dic["ans"])
        qa_data.append((qc_ids, answers_ids, qa_context))

    # Smoke test: run the three parts on the first example only.
    qids, ansids, context, intersect_ = concepts_to_adj_matrices_2hop_all_pair__use_LM__Part1(qa_data[0], cpnet_simple)
    qids, ansids, context, intersect_, cid2score = concepts_to_adj_matrices_2hop_all_pair__use_LM__Part2((qids, ansids, context, intersect_), id2concept)
    res = concepts_to_adj_matrices_2hop_all_pair__use_LM__Part3((qids, ansids, context, intersect_, cid2score), id2relation, cpnet)
    #print(res)
    # To process every example, uncomment the block below.
    # NOTE: it does not match the current helper signatures yet: the first
    # `with Pool(...)` lacks `as p`, and Part1 / Part2 / Part3 also need
    # cpnet_simple / id2concept / (id2relation, cpnet) respectively.

#     with Pool(num_processes):
#         res1 = list(tqdm(p.imap(concepts_to_adj_matrices_2hop_all_pair__use_LM__Part1, qa_data), total=len(qa_data)))

#     res2 = []
#     for j, _data in enumerate(res1):
#         if j % 100 == 0: print (j)
#         res2.append(concepts_to_adj_matrices_2hop_all_pair__use_LM__Part2(_data))

#     with Pool(num_processes) as p:
#         res3 = list(tqdm(p.imap(concepts_to_adj_matrices_2hop_all_pair__use_LM__Part3, res2), total=len(res2)))

#     # res is a list of responses
#     with open(output_path, 'wb') as fout:
#         pickle.dump(res3, fout)

#     print(f'adj data saved to {output_path}')
    

In [None]:
# Run the preprocessing on the CSQA training split.
# output_path / num_processes are passed as None, exactly as before.
generate_adj_data_from_grounded_concepts__use_LM(
    grounded_path="../input/csqa-with-subgraph/csqa/grounded/train.grounded.jsonl",
    cpnet_graph_path="../input/cp-net-en/CPnet_en/conceptnet.en.pruned.graph",
    cpnet_vocab_path="../input/cp-net-en/CPnet_en/concept.txt",
    output_path=None,
    num_processes=None,
)