In [315]:
import pandas as pd
import numpy as np
import json
from tqdm.autonotebook import tqdm
import re
import nltk
import http.client, urllib.request, urllib.parse, urllib.error, base64
from nltk.corpus import stopwords
from pyNTCIREVAL import Labeler
from pyNTCIREVAL.metrics import MSnDCG, nERR, nDCG
from sklearn.metrics.pairwise import cosine_similarity
import random
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [98]:
def getVectors(queries):
    """Fetch one embedding vector per query from the MS Turing encode endpoint.

    Parameters
    ----------
    queries : iterable
        Texts to encode. Each item is converted with ``str`` and has its
        apostrophes removed before being sent.

    Returns
    -------
    dict
        ``{query: vector}`` built from the ``'query'`` and ``'vector'`` fields
        of each item in the JSON response. An empty dict is returned when
        ``queries`` is empty, when the request fails, or when the response is
        not a JSON list (the problem is printed in the last two cases).
    """
    import os  # local import: `os` is not imported by the notebook's import cell

    queries = [str(q).replace("'", "") for q in queries]
    if not queries:
        # Nothing to encode: skip the network round trip.
        return {}

    headers = {
        # Request headers
        'Content-Type': 'application/json',
        # NOTE(review): a subscription key is committed in this notebook. It is
        # kept only as a fallback so existing runs keep working; set the
        # MSTURING_SUBSCRIPTION_KEY environment variable and rotate this key.
        'Ocp-Apim-Subscription-Key': os.environ.get(
            'MSTURING_SUBSCRIPTION_KEY', '924c1505854b4da4a6144a1cce92937f'),
    }
    params = urllib.parse.urlencode({})
    # json.dumps escapes double quotes, backslashes and non-ASCII characters,
    # which the previous str(...).replace("'", '"') construction did not.
    body = json.dumps({"queries": queries})

    conn = http.client.HTTPSConnection('api.msturing.org')
    try:
        conn.request("POST", "/gen/encode?%s" % params, body, headers)
        data = json.loads(conn.getresponse().read())
    except Exception as e:
        # Best effort, as before: report the failure, but return an empty
        # result instead of crashing on an unbound `data`.
        print(e)
        return {}
    finally:
        conn.close()

    if not isinstance(data, list):
        # Presumably an error payload from the service; show it to the user.
        print(data)
        return {}

    return {item['query']: item['vector'] for item in data}

In [106]:
stop_words = set(stopwords.words('english'))

# Matches every character that is not an ASCII letter or digit.
regex = re.compile('[^a-zA-Z0-9]')


def preprocessingText(doc):
    """Normalise a text: strip punctuation, lowercase, drop English stop words.

    Parameters
    ----------
    doc : str
        Raw text.

    Returns
    -------
    str
        Lowercased tokens joined by single spaces, without stop words.
    """
    # Lowercase BEFORE filtering: the stop-word set is compared
    # case-sensitively, so capitalised stop words ("The", "How") used to
    # survive the filter and were only lowercased afterwards.
    tokens = regex.sub(' ', doc).lower().split()
    return " ".join(w for w in tokens if w not in stop_words)

def evaluate(qrels, ranked_list):
    """Score one ranked list with MSnDCG at cutoff 10.

    Parameters
    ----------
    qrels : dict
        Relevance judgements for a single query, passed to ``Labeler``.
    ranked_list : list
        Ranked items to evaluate.

    Returns
    -------
    The value returned by ``MSnDCG.compute`` for the labelled ranking.
    """
    gain_per_level = [1, 2, 3, 4]  # one gain value per non-zero relevance level
    total_levels = 5  # passed to the labeler as the number of relevance levels

    judge = Labeler(qrels)
    labelled = judge.label(ranked_list)
    docs_per_level = judge.compute_per_level_doc_num(total_levels)
    return MSnDCG(docs_per_level, gain_per_level, cutoff=10).compute(labelled)

# Split query ids into train/test per entity type: the first half of each
# type's queries goes to train and the rest to test; a type with a single
# query contributes to train only.
trainIds, testIds = [], []
for name, group in df.groupby("entityType"):
    ids = list(group.query_id.unique())
    n_queries = group.query_id.nunique()
    if n_queries <= 1:
        trainIds.extend(ids)
        continue
    mid = n_queries // 2
    trainIds.extend(ids[:mid])
    testIds.extend(ids[mid:])

In [40]:
# Relevance judgements for the AKGG (property) and AM (verb/object) collections.
df = pd.read_csv("data/AKG/Test Collection/AKGG/akg_standard_akgg_property_rele.csv")
df_action = pd.read_csv("data/AKG/Test Collection/AM/akg_standard_am_verb_object_rele.csv")

# Formal-run topics: one record per query.
with open("data/AKG/Formal Run Topics/AKGG_Formal_Run_Topic.json") as json_file:
    data = json.load(json_file)

qid = [p['queryId'] for p in data['queries']]
query = [p['query'] for p in data['queries']]
entity = [p['entity'] for p in data['queries']]
entityType = [' '.join(p['entityTypes']) for p in data['queries']]
action = [p['action'] for p in data['queries']]
topic = pd.DataFrame({"query_id": qid, "query": query, "entity": entity, "entityType": entityType, "action": action})

for c in ["query", "entityType", "action", "entity"]:
    # `.str.replace` removes apostrophes inside each string. The previous
    # `Series.replace("'", "")` only replaced cells whose whole value was "'",
    # so apostrophes were never actually stripped.
    topic[c] = topic[c].str.lower().str.replace("'", "", regex=False)

# Attach the topic fields to every judged (query, property) row.
df = df.merge(topic, how="inner", on="query_id")

In [282]:
import json

# Load the AM formal-run topics into a frame with one row per query.
with open("data/AKG/Formal Run Topics/AM_Formal_Run_Topic.json") as json_file:
    data = json.load(json_file)

# The records are read from the first element of data['queries'].
am_records = data['queries'][0]
qid = [p['queryId'] for p in am_records]
entity = [p['entity'] for p in am_records]
entityType = [' '.join(p['entityTypes']) for p in am_records]
entityurl = [p['entityurl'] for p in am_records]
am_topic = pd.DataFrame({"query_id": qid, "url": entityurl, "entity": entity, "entityType": entityType})

In [39]:
# Load the wikiHow dump and normalise its text columns.
df_wiki = pd.read_csv("data/wikihowSep.csv")
df_wiki['headline'] = df_wiki['headline'].str.replace("\n", "")
df_wiki['title'] = df_wiki['title'].str.replace("How to", "")

# Every text column goes through the same preprocessing; str() turns missing
# values into text before they are cleaned.
for text_column in ['overview', 'headline', 'text', 'sectionLabel', 'title']:
    df_wiki[text_column] = [preprocessingText(str(value)) for value in df_wiki[text_column]]

In [46]:
class AutoVivification(dict):
    """Dictionary that creates nested dictionaries on first access (Perl-style autovivification)."""

    def __getitem__(self, item):
        # Existing keys behave exactly like a plain dict lookup.
        if item in self:
            return dict.__getitem__(self, item)
        # Missing key: store a fresh instance of the same class and hand it back,
        # so chained lookups such as d[a][b][c] = v work without setup.
        child = type(self)()
        self[item] = child
        return child
# Load the participant runs into run[runid][queryid][property] = rank.
with open("data/AKG/Participants Runs/AKGG/akgg-formalrun-cuis.json") as json_file:
    data = json.load(json_file)
    run = AutoVivification()
    for participant_run in data['runs']:
        run_id = participant_run['runid']
        for query_result in participant_run['results']:
            query_key = str(query_result['queryid'])
            for ranked_prop in query_result['properties']:
                run[run_id][query_key][str(ranked_prop['property'])] = ranked_prop['rank']

# Flatten run '1' into one (query_id, property) row per submitted property,
# then attach the topic fields.
qids, props = [], []
for query_key, ranked_props in run['1'].items():
    submitted = list(ranked_props.keys())
    props.extend(submitted)
    qids.extend([int(query_key)] * len(submitted))
df_run = pd.DataFrame({"query_id": qids, "property": props})
df_run = df_run.merge(topic, how="left", on="query_id")

In [65]:
import collections
# qrel[query_id][property] = relevance label, with string keys on both levels.
qrel = collections.defaultdict(dict)
judgements = df[['query_id', 'property', 'rele_label']].values
for judged_query, judged_prop, grade in judgements:
    qrel[str(judged_query)][str(judged_prop)] = int(grade)

In [126]:
# Pool the judged properties and the properties submitted in run '1',
# restricted to training queries, and index them by entity type.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the resulting frame is the same.
pool_columns = ["query_id", "entityType", "property"]
dfp = pd.concat([df[pool_columns], df_run[pool_columns]])
dfp = dfp[dfp.query_id.isin(trainIds)]
type2prop = dfp.groupby("entityType")['property'].unique().to_dict()

In [143]:
# Distinct properties in the pooled training frame vs. in the run's training queries alone.
(
    dfp['property'].nunique(),
    df_run.loc[df_run['query_id'].isin(trainIds), 'property'].nunique(),
)

(287, 175)

In [127]:
# Number of pooled training rows in which each property occurs.
rows_per_property = dfp.groupby("property").size()
prop2popularity = rows_per_property.to_dict()

In [128]:
# Same count, keyed by the (entityType, property) pair.
rows_per_type_and_property = dfp.groupby(["entityType", "property"]).size()
type2prop2popularity = rows_per_type_and_property.to_dict()

In [313]:
# Collect every distinct entity / entity type / action (apostrophes removed)
# followed by every property name split from camelCase into words, then fetch
# one vector per term.
terms = [
    value.replace("\'", "")
    for column in ['entity', 'entityType', 'action']
    for value in df[column].unique().tolist()
]
terms.extend(camel_case_split(prop_name) for prop_name in df.property.unique())
term2MSvec = getTermMSvec(terms)

In [319]:
# Show how many terms received a vector. The recorded output of this cell is a
# count, and dumping every key would flood the notebook.
len(term2MSvec)

767

In [321]:
# Fit a Keras tokenizer on the terms that have a vector.
MAX_NUM_WORDS = 1000
vectorised_terms = list(term2MSvec)
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(vectorised_terms)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

def get_pretrain_embeddings(path, word_index, EMBEDDING_DIM=300):
    """Build a Keras ``Embedding`` layer initialised from pre-trained GloVe vectors.

    Parameters
    ----------
    path : str
        Prefix of the data directory; vectors are read from
        ``<path>data/w2v/glove.42B.300d.txt`` (``path`` is concatenated as-is,
        so it should end with a path separator).
    word_index : dict
        Mapping ``word -> integer index``; row ``i`` of the embedding matrix
        belongs to the word with index ``i``.
    EMBEDDING_DIM : int, default 300
        Dimensionality of the vectors in the GloVe file.

    Returns
    -------
    keras.layers.Embedding
        Layer with ``len(word_index) + 1`` rows, initialised with the GloVe
        vector of every word found in the file and zeros for the others.
    """
    # Imported locally so the function works on a fresh kernel: in the visible
    # notebook `os` and `Constant` are never imported, and `Embedding` is only
    # imported in a later cell.
    import os
    from keras.initializers import Constant
    from keras.layers import Embedding

    MAX_NUM_WORDS = len(word_index)
    BASE_DIR = path + 'data/'
    GLOVE_DIR = os.path.join(BASE_DIR, 'w2v')
    print('Indexing word vectors.')

    embeddings_index = {}
    with open(os.path.join(GLOVE_DIR, 'glove.42B.300d.txt'), encoding="utf-8") as f:
        for line in f:
            # Each line is "<word> <coef_1> ... <coef_n>".
            word, coefs = line.split(maxsplit=1)
            coefs = np.fromstring(coefs, 'f', sep=' ')
            embeddings_index[word] = coefs

    print('Found %s word vectors.' % len(embeddings_index))

    print('Preparing embedding matrix.')

    # prepare embedding matrix (+1 because row 0 is not assigned to any word)
    num_words = MAX_NUM_WORDS + 1
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    found = 0
    for word, i in word_index.items():
        if i > MAX_NUM_WORDS:
            # Index outside the matrix; skip instead of raising IndexError.
            continue
        embedding_vector = embeddings_index.get(word)
        # Words not found in the embedding index (or with an empty vector)
        # keep their all-zeros row.
        if embedding_vector is None or embedding_vector.shape[0] == 0:
            continue
        embedding_matrix[i] = embedding_vector
        found += 1

    print("Token num: %d, Found Tokens: %d" % (len(word_index), found))

    # load pre-trained word embeddings into an Embedding layer
    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                embeddings_initializer=Constant(embedding_matrix))

    return embedding_layer

In [251]:
def getTermMSvec(all_properties, batch_size=20):
    """Fetch vectors for many terms by calling ``getVectors`` in batches.

    Parameters
    ----------
    all_properties : list
        Terms to encode; sliced into consecutive batches.
    batch_size : int, default 20
        Number of terms sent per ``getVectors`` call.

    Returns
    -------
    dict
        Union of the dictionaries returned by ``getVectors`` for every batch.
    """
    term2vec = {}
    for start in range(0, len(all_properties), batch_size):
        # dict.update replaces the inner copy loop, whose loop variable
        # shadowed the batch index.
        term2vec.update(getVectors(all_properties[start:start + batch_size]))
    return term2vec

# qid2MSvec = {}
# for i, j, k, l in df[["query_id", "entity", "action", "entityType"]].drop_duplicates().values:
#     q = str(k +" " +j + " " + l).replace("\'", "")
#     try:
#         data = getVectors([q])
#         qid2MSvec[i] = data[q]
#     except Exception as e:
#         print(e)
#         print(q)
# prop2MSvec = getTermMSvec(dfp.property.unique().tolist())
# type2MSvec = getTermMSvec(dfp.entityType.unique().tolist())
# action2MSvec = getTermMSvec(df.action.unique().tolist())
# entity2MSvec = getTermMSvec(df.entity.unique().tolist())

In [316]:
def camel_case_split(identifier):
    """Split a camelCase / PascalCase identifier into lowercase, space-separated words."""
    # Lazily consume characters up to a lower->Upper boundary, an
    # acronym->Word boundary, or the end of the string.
    pieces = re.findall('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
    return " ".join(pieces).lower()

In [269]:
from keras.layers import Input, Concatenate, Embedding, Multiply, Dot, Dense, Subtract, Activation, SimpleRNN, Flatten, Lambda
from keras.models import Model
from keras import backend as K
import numpy as np

def bpr_triplet_loss(X):
    """Pairwise loss on a (positive score, negative score) pair.

    Computes ``1 - log(sigmoid(positive - negative))``, so the value shrinks
    as the positive item is scored further above the negative one.
    """
    pos_score, neg_score = X
    margin = pos_score - neg_score
    return 1 - K.log(K.sigmoid(margin))


def identity_loss(y_true, y_pred):
    """Return the mean of ``y_pred``; ``y_true`` is multiplied by zero and so has no effect on the value."""
    passthrough = y_pred - 0 * y_true
    return K.mean(passthrough)


class BPR():
    """Pairwise ranker over pre-computed entity / type / action / property vectors.

    A single shared linear layer scores the concatenation
    ``[entity, entityType, action, property]``. Training feeds a positive and
    a negative property for the same query and minimises ``bpr_triplet_loss``
    on the two scores.

    Attributes
    ----------
    model : keras Model
        Training model with five inputs (entity, type, action, positive
        property, negative property); its output is the pairwise loss.
    predictor : keras Model
        Scoring model with four inputs (entity, type, action, property); its
        output is the score of the property. Shares weights with ``model``.
    """

    def __init__(self, input_dim=100):
        """Build and compile the model.

        Parameters
        ----------
        input_dim : int, default 100
            Length of each input vector.
        """
        self.entityInput = Input(shape=(input_dim,))
        self.entityTypeInput = Input(shape=(input_dim,))
        self.actionInput = Input(shape=(input_dim,))

        self.propPosInput = Input(shape=(input_dim,))
        self.propNegInput = Input(shape=(input_dim,))

        query_inputs = [self.entityInput, self.entityTypeInput, self.actionInput]

        pos_features = Concatenate()(query_inputs + [self.propPosInput])
        neg_features = Concatenate()(query_inputs + [self.propNegInput])

        # One shared layer so that positive and negative candidates are scored
        # with the same weights.
        scorer = Dense(1, activation="linear")
        pos_score = scorer(pos_features)
        neg_score = scorer(neg_features)

        self.pred = Lambda(bpr_triplet_loss, output_shape=(1,))([pos_score, neg_score])

        self.model = Model(inputs=query_inputs + [self.propPosInput, self.propNegInput], outputs=self.pred)
        # The network output already is the loss; identity_loss just averages it.
        self.model.compile(optimizer="adam", loss=identity_loss)
        self.predictor = Model(query_inputs + [self.propPosInput], [pos_score])

    def generate_train_data(self, df):
        """Sample (query, positive property, negative property) training triples.

        For every row of ``df``, ``int(rele_label)`` triples are generated, so
        rows with a higher label are seen more often and rows labelled 0
        produce none. The negative property is drawn uniformly from the
        properties of ``df`` that do not occur in any row of the same query.

        Vectors are looked up in the module-level dictionaries
        ``entity2MSvec``, ``type2MSvec``, ``action2MSvec`` and ``prop2MSvec``.
        NOTE(review): these are not defined in any active cell of the visible
        notebook; they must exist before this method is called.

        Parameters
        ----------
        df : pandas.DataFrame
            Needs the columns ``query_id``, ``entity``, ``entityType``,
            ``action``, ``property`` and ``rele_label``.

        Returns
        -------
        (list of numpy.ndarray, numpy.ndarray)
            The five model inputs (entity, type, action, positive property,
            negative property) and a vector of ones with one entry per triple.
        """
        x_entity, x_type, x_action, x_pos_prop, x_neg_prop = [], [], [], [], []
        # Loop-invariant: the candidate pool is the same for every row.
        all_props = df['property'].unique().tolist()
        for name, group in df.groupby("query_id"):
            judged_props = set(group['property'])
            # Filtering up front samples from the same distribution as
            # draw-and-retry, but cannot loop forever when every property is
            # already judged for this query.
            cand_neg_prop = [p for p in all_props if p not in judged_props]
            if not cand_neg_prop:
                continue
            for idx, row in group.iterrows():
                for n in range(int(row['rele_label'])):
                    x_entity.append(entity2MSvec[row['entity']])
                    x_type.append(type2MSvec[row['entityType']])
                    x_action.append(action2MSvec[row['action'].replace("\'", "")])
                    x_pos_prop.append(prop2MSvec[row['property']])
                    x_neg_prop.append(prop2MSvec[random.choice(cand_neg_prop)])
        n_samples = len(x_entity)
        inputs = [np.array(x) for x in (x_entity, x_type, x_action, x_pos_prop, x_neg_prop)]
        # One dummy target per triple (previously sized from an undefined `x_query`).
        return inputs, np.ones(n_samples)
        
        
# print(x_query)
# Training rows are the judged rows whose query belongs to the train split.
df_train = df.loc[df['query_id'].isin(trainIds)]
bpr = BPR()

In [271]:
# Train for 10 rounds; after each round, re-rank the candidate properties of
# every test query and report (training loss, mean MSnDCG@10).

# Loop-invariant: the distinct test queries do not change between rounds.
query_rows = df[['query_id', 'entity', 'action', 'entityType']].drop_duplicates()
test_rows = query_rows[query_rows['query_id'].isin(set(testIds))]

for i in range(10):
    # Negatives are re-sampled every round.
    x_train, y_train = bpr.generate_train_data(df_train)
    history = bpr.model.fit(x_train, y_train, verbose=0)
    res = []
    for idx, row in test_rows.iterrows():
        qrels = qrel[str(row['query_id'])]
        cand_properties = list(type2prop[row['entityType']])
        n_cand = len(cand_properties)

        # Score all candidates of the query in one predict() call instead of
        # one call per property.
        scores = bpr.predictor.predict([
            np.tile(entity2MSvec[row['entity']], (n_cand, 1)),
            np.tile(type2MSvec[row['entityType']], (n_cand, 1)),
            # Apostrophes are stripped to match the lookup in
            # BPR.generate_train_data.
            np.tile(action2MSvec[row['action'].replace("\'", "")], (n_cand, 1)),
            np.array([prop2MSvec[p] for p in cand_properties]),
        ])[:, 0]

        # Stable descending sort: ties keep the candidate order.
        ranked = sorted(zip(cand_properties, scores), key=lambda pair: pair[1], reverse=True)
        rank = [prop for prop, _ in ranked][:20]
        our = evaluate(qrels, rank)

        res.append(our)
    print(history.history["loss"][0], np.mean(res))


1.5450881636370655 0.4314965225135292
1.5526838118153288 0.4334187952565266
1.5433020704107425 0.42797148087480374
1.5432055357457692 0.43175021703697974
1.5394703051735423 0.4315754185314166
1.5428768547973193 0.435186726529109
1.5386353054275614 0.43729759427748344
1.5342632471693523 0.4369458369801347
1.5409849028737876 0.4326755318364465
1.5374256969823366 0.42595449066245117


In [242]:
# Baseline: rank each query's candidate properties by cosine similarity
# between the entity-type vector and the property vector, and compare with
# the ranking submitted in run '1'. Evaluated on every query, not only the
# test split.
res = []   # MSnDCG@10 of the cosine ranking, one value per query
res2 = []  # MSnDCG@10 of run '1', one value per query
query_rows = df[['query_id', 'entity', 'action', 'entityType']].drop_duplicates()
for idx, row in query_rows.iterrows():
    query_key = str(row['query_id'])
    qrels = qrel[query_key]

    similarity = {
        p: cosine_similarity([type2MSvec[row['entityType']]], [prop2MSvec[p]])[0][0]
        for p in type2prop[row['entityType']]
    }
    # Twenty most similar properties; ties keep the candidate order.
    rank = sorted(similarity, key=similarity.get, reverse=True)[:20]

    res.append(evaluate(qrels, rank))
    res2.append(evaluate(qrels, list(run["1"][query_key].keys())))
print(np.mean(res))
print(np.mean(res2))

0.4439482474153289
0.5324041751716548
