In [97]:
# Parse the evaluation sample into `data`.
# NOTE(review): `json` is imported in a later cell of this notebook; on a
# fresh kernel that cell has to run before this one.
with open('./data/evaluation_sample.json') as f:
    data = json.load(f)

In [None]:
import json
import spacy
import pickle
from random import sample 
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

# spaCy English pipeline; the helper functions below call it as `nlp`.
nlp = spacy.load("en_core_web_sm")

# Fill-mask pipeline built from the local model and tokenizer directories.
model = pipeline(
    'fill-mask',
    model="../model/brt/remap_10epochs",
    tokenizer="../remap_topic_tokenizer",
)

In [None]:
def load_voa():
    """Return the parsed content of ``./data/voa_sentence.json``.

    NOTE(review): other cells in this notebook read ``voa_sentences.json``
    (plural) -- confirm that this file name is the intended one.
    """
    with open('./data/voa_sentence.json') as voa_file:
        return json.load(voa_file)

def handle_examples(headword, sent_en):
    """Replace the first noun occurrence of ``headword`` in ``sent_en`` with ``[MASK]``.

    The sentence is tokenized with the module-level spaCy pipeline ``nlp``.
    The first token that is tagged NOUN or PROPN, is not a stop word, and
    whose surface text or lemma equals ``headword`` becomes ``[MASK]``.
    Every other token (before and after the match) is kept, and the tokens
    are joined with single spaces.

    Args:
        headword: the word to mask (compared case-sensitively).
        sent_en: the sentence to process.

    Returns:
        The masked sentence, or ``''`` when no token matches.
    """
    pieces = []
    found = False
    for token in nlp(sent_en):
        is_target = (
            not found
            and token.pos_ in ('PROPN', 'NOUN')
            and not token.is_stop
            and headword in (token.text, token.lemma_)
        )
        if is_target:
            pieces.append('[MASK]')
            found = True
        else:
            # Keep the left context as well: the previous version only
            # started collecting tokens after the match.
            pieces.append(token.text)

    # Without a match there is nothing to predict; keep returning ''.
    return ' '.join(pieces) if found else ''

def load_spacy_sbert():
    """Build the sentence encoder and the spaCy pipeline.

    Returns:
        A ``(SentenceTransformer('all-roberta-large-v1'),
        spacy.load("en_core_web_sm"))`` tuple, in that order.
    """
    return SentenceTransformer('all-roberta-large-v1'), spacy.load("en_core_web_sm")

def load_cambridge():
    """Load the two dictionary JSON files.

    Returns:
        ``(word2defs, def2guideword)``: the parsed contents of
        ``../data/words2defs.json`` and ``../data/def2guide.json``.
    """
    def _read_json(path):
        # Parse one JSON file and close it again.
        with open(path) as handle:
            return json.load(handle)

    return _read_json('../data/words2defs.json'), _read_json('../data/def2guide.json')

def load_map():
    """Load the JSON mapping and the pickled topic embeddings.

    Returns:
        ``(data, emb_map)``: the parsed ``../data/orig_new.json`` and the
        object unpickled from ``../data/topic_embs.pickle``.
    """
    with open('../data/orig_new.json') as json_file:
        orig_new = json.load(json_file)

    # pickle.load can execute arbitrary code: only use a trusted pickle file.
    with open('../data/topic_embs.pickle', 'rb') as pickle_file:
        topic_embeddings = pickle.load(pickle_file)

    return orig_new, topic_embeddings

def disambiguate(sent, targetword, token_score, word2defs, def2guideword, cat_map, emb_map):
    """Rank the dictionary definitions of ``targetword`` for its use in ``sent``.

    Uses the module-level ``nlp`` (spaCy) and ``sbert`` (sentence encoder).

    Args:
        sent: sentence containing the target word; any ``[MASK]`` in it is
            replaced by ``targetword`` before encoding.
        targetword: word to disambiguate; its lemma is the key into ``word2defs``.
        token_score: mapping of predicted topic -> score for that topic.
        word2defs: mapping of lemma -> list of definition strings.
        def2guideword: not used by this function.
        cat_map: not used by this function.
        emb_map: mapping of topic -> embeddings compared against the
            definition embeddings.

    Returns:
        A list of strings, each the ``str()`` of a ``[definition, score]``
        pair, ordered by descending averaged score.

    Raises:
        KeyError: if the lemma of ``targetword`` is not in ``word2defs``.
    """
    definitions = word2defs[nlp(targetword)[0].lemma_]

    # Similarity between the sentence (row 0) and each definition.
    sentence_defs = [sent.replace('[MASK]', targetword)]
    sentence_defs.extend(definitions)
    embs = sbert.encode(sentence_defs, convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(embs, embs)

    defs_score = {}
    for j in range(1, len(cos_scores)):
        defs_score[sentence_defs[j]] = cos_scores[0][j]

    # For every predicted topic known to emb_map, find the definition with
    # the single highest similarity to any of the topic's embeddings.
    topic_defs = {}
    def_embs = None  # encoded once, on first use, instead of once per topic
    for topic in token_score:
        if topic not in emb_map:
            continue
        if def_embs is None:
            def_embs = sbert.encode(definitions, convert_to_tensor=True)
        cosine_scores = util.pytorch_cos_sim(def_embs, emb_map[topic])

        best_idx, best_score = None, None
        for i in range(len(definitions)):
            for j in range(len(emb_map[topic])):
                score = cosine_scores[i][j]
                # Strict '>' keeps the first maximum, like the stable
                # descending sort used previously.
                if best_score is None or score > best_score:
                    best_idx, best_score = i, score
        if best_idx is not None:  # no pairs when either side is empty
            topic_defs[topic] = [definitions[best_idx], best_score]

    # Combine the topic score, topic/definition similarity and
    # sentence/definition similarity.
    # NOTE(review): by operator precedence only defs_score is halved; if a
    # mean of the two terms was intended the sum needs parentheses -- confirm.
    result = []
    for topic in token_score:
        if topic in topic_defs:
            sense, topic_sim = topic_defs[topic]
            result.append([sense, (token_score[topic] * topic_sim) + defs_score[sense]/2])
    result = sorted(result, key=lambda x: x[1], reverse=True)

    # Several topics may select the same definition: average their scores.
    ans_score = {}
    for sense, score in result:
        ans_score.setdefault(sense, []).append(score)

    confidence = [[sense, sum(scores)/len(scores)] for sense, scores in ans_score.items()]
    confidence = sorted(confidence, key=lambda x: x[1], reverse=True)
    return [str(pair) for pair in confidence]


In [None]:
# Quick check of the masking helper on a sentence with a plural form of the head word.
example_sentence = "They include changing public tastes, high operating costs, and public battles with animal rights groups. "
handle_examples('taste', example_sentence)

In [None]:
# File-backed resources first, then the sentence encoder.
voa_sentence = load_voa()
word2defs, def2guideword = load_cambridge()
orig_new_map, emb_map = load_map()
# NOTE(review): `spacy_model` is not referenced by the visible cells; the
# helper functions use the global `nlp` instead.
sbert, spacy_model = load_spacy_sbert()

In [None]:
# For every VOA sentence: mask the head word, predict topics with the
# fill-mask model, rank the senses, and append one TSV row.
# NOTE: the file is opened in append mode, so re-running this cell adds
# the rows again.
with open('./results/voa_evaluation_10.tsv', 'a') as f:
    for k, v in voa_sentence.items():
        for sent in v:
            masked_sent = handle_examples(k, sent)
            try:
                results = model(masked_sent)
            except Exception:
                # Report the failing sentence and skip it. Falling through
                # would reuse `results` from the previous sentence, or hit a
                # NameError on the very first one.
                print(sent)
                print(masked_sent)
                continue
            # [1:-1] drops the first and last character of each predicted token.
            token_score = {line['token_str'][1:-1]: line['score'] for line in results}
            token = '\t'.join(token_score.keys())
            senses = disambiguate(sent, k, token_score, word2defs, def2guideword, orig_new_map, emb_map)
            senses = '\t'.join(str(i) for i in senses)
            f.write(sent.strip() + '\t' + token + '\t' + senses + '\n')

In [2]:
import json

In [6]:
with open('./data/voa_sentences.json') as f:
    voa_sentences = json.loads(f.read())

# Pick up to 10 mid-length sentences per head word.
voa_sample = {}
for k, v in voa_sentences.items():
    if len(v) > 10:
        # dict.fromkeys removes duplicates but keeps file order. A set would
        # make the order of equal-length sentences (and so the slice below)
        # change from one kernel run to the next.
        unique = list(dict.fromkeys(v))
        by_length = sorted(unique, key=lambda x: len(x.split()), reverse=True)
        # Keep sentences of 11-19 words, then skip the 5 longest of those.
        mid_length = [line for line in by_length if 10 < len(line.split()) < 20]
        sent = mid_length[5:15]
    else:
        sent = v
    voa_sample[k] = sent

In [4]:
# with open('./data/voa_sample.json', 'w') as f:
#     f.write(json.dumps(voa_sample))

In [7]:
# Inspect the sentences sampled for the head word 'bank'.
voa_sample['bank']

['A current employee said the bank could soon test facial recognition software on people as they enter a bank.\n',
 'The delegates have promised to work on issues including water and property rights, central bank independence and labor policies.\n',
 'But Vitez’s biggest concern is whether some schools may urge students towards banks that may harm them financially.\n',
 'He said a big question for banks with this technology is how the public will react to it.\n',
 'The man told Schuster that the Air Force would not let him remove money from his bank account.\n',
 'Treasury Department said it will work with banks and technology companies to avoid becoming victims of ransomware attacks.\n',
 'The newspaper said Borghese wrote checks to people although he did not have enough money in the bank.\n',
 'They said the government acted because of fears that the crisis would cause a run on the banks.\n',
 'Or, they may be charged fees if they spend more money than they have in their bank account

In [2]:
import json
from collections import defaultdict

In [88]:
# MED dictionary data; later cells index it as med[word]['noun']['SENSE'].
with open('./data/MED.json') as f:
    med = json.load(f)

# The following cells read `sample[word]` and `voa.keys()`. With these loads
# commented out, a fresh kernel raises NameError on `voa`, and `sample`
# resolves to `random.sample` from the import cell.
with open('./data/evaluation_sample.json') as f:
    sample = json.load(f)

with open('./data/voa_sentences.json') as f:
    voa = json.load(f)

In [64]:
# Collects the words that have more than 10 MED example sentences
# (appended to while `evaluation` is built).
over = []

In [65]:
# Gather the MED noun example sentences of every VOA head word.
evaluation = defaultdict(list)
for word in voa:
    n_examples = 0
    for sense in med[word]['noun']['SENSE']:
        # The last field of a sense holds its examples; the first item of
        # each example entry is what gets collected.
        sense_examples = [entry[0] for entry in sense[-1]]
        evaluation[word] += sense_examples
        n_examples += len(sense_examples)
    if n_examples > 10:
        over.append(word)

# Top up words that have fewer than 10 sentences from `sample`.
for word, sentences in evaluation.items():
    missing = 10 - len(sentences)
    if missing > 0:
        sentences.extend(sample[word][:missing])

In [66]:
# For the words in `over`, keep the example sentences separated by sense index.
sense_sep = defaultdict(dict)
for word in over:
    noun_senses = med[word]['noun']['SENSE']
    for position, sense in enumerate(noun_senses):
        sense_sep[word][position] = [entry[0] for entry in sense[-1]]

In [67]:
# For words with more than 10 examples, pick 10 by cycling through the
# senses and taking the longest remaining sentence of each sense in turn.
for word in sense_sep:
    print(word)
    # Sorted copies: the selected sentence is the one that gets removed,
    # and sense_sep itself stays intact so the cell can be re-run.
    queues = [
        sorted(sents, key=lambda x: len(x.split()), reverse=True)
        for sents in sense_sep[word].values()
    ]
    picked = []
    # any(queues) ends the loop when every sense is exhausted, so a word
    # with fewer than 10 examples cannot loop forever.
    while len(picked) < 10 and any(queues):
        for queue in queues:
            if queue:
                picked.append(queue.pop(0))
                if len(picked) == 10:
                    break
    evaluation[word] = picked

taste
interest
star


In [68]:
# Show each word followed by its number of evaluation sentences.
for word, sentences in evaluation.items():
    print(word, len(sentences), sep='\n')

taste
10
issue
10
interest
10
star
10
duty
10
sentence
10
cone
10
bow
9
mole
10
slug
10
bass
5
bank
10


In [60]:
# One JSON object per line.
with open('../data/0.0_top3_brt_map_cam.jsonl') as f:
    data = [json.loads(raw_line) for raw_line in f]

# Print every record whose word_id contains 'cone.noun', each followed by a blank line.
cone_entries = (entry for entry in data if 'cone.noun' in entry['word_id'])
for line in cone_entries:
    print(line, end='\n\n')

{'super_group': 'Foundations of Knowledge', 'category': 'Mathematics', 'pos': 'noun', 'brt_word': 'cone', 'group': '43', 'word_id': 'cone.noun.01', 'en_def': 'a shape with a flat, round or oval base and a top that becomes narrower until it forms a point', 'score': 0.449418306350708}

{'super_group': 'Living Things', 'category': 'Human Body', 'pos': 'noun', 'brt_word': 'cone', 'group': '9', 'word_id': 'cone.noun.01', 'en_def': 'a shape with a flat, round or oval base and a top that becomes narrower until it forms a point', 'score': 0.4992419183254242}

{'super_group': 'Living Things', 'category': 'Trees', 'pos': 'noun', 'brt_word': 'pine cone', 'group': '2', 'word_id': 'pine-cone.noun.01', 'en_def': 'the hard, egg-shaped part of the pine tree that opens and releases seeds', 'score': 'one_sense'}

{'super_group': 'Living Things', 'category': 'Trees', 'pos': 'noun', 'brt_word': 'cone', 'group': '2', 'word_id': 'cone.noun.02', 'en_def': 'the hard oval-shaped fruit of a conifer', 'score': 0

In [69]:
# Persist the evaluation sentences as JSON.
with open('./data/med_voa_sample.json', 'w') as f:
    json.dump(evaluation, f)

In [None]:
# Parse the MED-to-Cambridge sense file into `med2cam`.
with open('./data/med2cam_sense.noun.json') as f:
    med2cam = json.load(f)

In [None]:
# Attach the MED example sentences to every sense entry.
# NOTE(review): `sense2examples` is built in the cell that follows this one;
# on a fresh kernel that cell has to run first.
# NOTE(review): entries printed further down from this same file carry the
# key 'med_sense', not 'med_def' -- confirm which key the file uses.
for senses in med2cam.values():
    for sense in senses:
        sense['examples'] = sense2examples[sense['med_def']]

In [None]:
# Map each MED noun definition (first field of a sense) to its example
# sentences. If two senses share the same definition text, the later one wins.
sense2examples = {
    sense[0]: [entry[0] for entry in sense[-1]]
    for med_entry in med.values()
    if 'noun' in med_entry
    for sense in med_entry['noun']['SENSE']
}

In [92]:
# List the MED noun senses recorded for 'bass'.
bass_senses = med['bass']['noun']['SENSE']
for sense in bass_senses:
    print(sense)

['a fish that lives in rivers and the sea', '鱸魚', []]


In [107]:
# Parse the VOA sentence file into `voa`.
with open('./data/voa_sentences.json') as f:
    voa = json.load(f)

In [108]:
# Head words present in the VOA sentence file.
voa.keys()

dict_keys(['taste', 'issue', 'interest', 'star', 'duty', 'sentence', 'cone', 'bow', 'mole', 'slug', 'bass', 'bank'])

In [None]:
# Show every mapped sense of the VOA head words, then its example sentences.
for word in voa:
    word_senses = med2cam[word]
    for sense in word_senses:
        print(sense)
        print(sense['examples'])

In [80]:
# Parse the MED-to-Cambridge sense file into `data`.
with open('./data/med2cam_sense.noun.json') as f:
    data = json.load(f)

In [93]:
# Print each sense entry for 'bank', separated by a blank line.
for bank_sense in data['bank']:
    print(bank_sense, end='\n\n')

{'med_sense': 'a financial institution that people or businesses can keep their money in or borrow money from. The main banks used by ordinary people are called high-street banks', 'cam_sense': 'an organization where people and businesses can invest or borrow money, change it to foreign money, etc., or a building where these services are offered', 'examples': ['Marge works for the Royal Bank of Scotland.', 'a New York investment bank'], 'avg_score': 0.3191983103752136}

{'med_sense': 'a raised area of land along the side of a river', 'cam_sense': 'sloping raised land, especially along the sides of a river', 'examples': ['A man was fishing on the opposite bank.', 'The village lies on the east bank of the river Derwent.'], 'avg_score': 0.4779926836490631}

{'med_sense': 'a large number of things in a row, especially pieces of equipment', 'cam_sense': 'a row of similar things, especially machines or parts of machines', 'examples': ['a bank of TV monitors'], 'avg_score': 0.4837077260017395