# TF-update

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/giuliarambelli/Event_Knowledge_Model_Comparison/blob/master/TF-update.ipynb) 

In [1]:
# load libraries
import copy
import pandas as pd
import os
import logging
import sys
import numpy as np
from collections import defaultdict
import gzip
import copy
from scipy.spatial import distance

In [2]:
# data_utils

def load_MIT_data(inpath):
    """
    Read a lemmatized dataset from a tab-separated text file.

    The first column of every line is the sentence ID; the remaining columns
    are items in the form lemma@POS@RELATION, the last of which is the target.

    :param inpath: path of the file to read
    :return: list of (id, [context items], target) tuples, one per line
    """
    def _parse(row):
        # First field is the ID, last field is the target, the rest is context.
        fields = row.strip().split('\t')
        return fields[0], fields[1:-1], fields[-1]

    with open(inpath, 'r') as handle:
        return [_parse(row) for row in handle]


def load_mapping(fpath):
    """
    Load a whitespace-separated file mapping our relations to UD relation(s).

    Each non-empty line has the form 'label ud_label[,ud_label...]'.
    Blank lines (e.g. a trailing newline at the end of the file) are skipped.

    :param fpath: file mapping role name to UD relation labels
    :return: {label:[ud_label(s)]} and {ud_label: label} dictionaries
    """
    mapping = {}
    with open(fpath) as fin:
        for line in fin:
            fields = line.split()
            if not fields:
                # Empty line: nothing to map.
                continue
            mapping[fields[0]] = fields[1].split(",")
    # Invert the mapping; if a UD label is listed under several labels,
    # the label that appears last in the file wins.
    inv_mapping = {ud: label for label, uds in mapping.items() for ud in uds}
    return mapping, inv_mapping


def load_lemma_freqs(fpath):
    """
    Read a gzip-compressed lemma frequency list.
    Every line is expected to look like 'lemma POS\tfreq'.

    @param fpath: input file
    @type fpath: str
    @return: dictionary {(lemma,pos):freq}, with frequencies as floats
    @rtype: dict
    """
    def _entries(rows):
        for row in rows:
            key, count = row.strip().split("\t")
            lemma, pos = key.split(' ')
            yield (lemma, pos), float(count)

    with gzip.open(fpath, 'rt') as handle:
        return dict(_entries(handle))


def load_bigr_freqs(fpath,mapping):
    """
    Read bigram frequencies from a gzip-compressed file whose lines look like
    'arg1 arg2\tfrequency', each argument being lemma@POS@RELATION.

    The relation of both arguments is replaced by mapping[relation]; a line
    containing a relation that is missing from the mapping is skipped.

    :param fpath: file path
    :param mapping: dictionary from the relation labels found in the file to
                    the labels to store
    :return: a {(arg1,arg2): float(freq)} dictionary
    """
    frequencies = {}
    with gzip.open(fpath, 'rt') as handle:
        for raw in handle:
            try:
                pair, count = raw.strip().split("\t")
                first, second = pair.split(" ")
                relabelled = []
                for arg in (first, second):
                    lemma, pos, rel = arg.split('@')
                    relabelled.append('@'.join([lemma, pos, mapping[rel]]))
                frequencies[tuple(relabelled)] = float(count)
            except KeyError:
                # Relation not covered by the mapping: ignore this bigram.
                continue
    return frequencies


# Load FastText vectors
class VectorsDict(dict):
    """
    Dictionary of word vectors that is read with (form, PoS) pairs.

    Entries are stored under plain string keys; item lookup receives a
    two-element (form, pos) item and turns it into the stored key.
    """

    def __init__(self, withPoS=False):
        # When True the stored keys are expected to be "form/pos" strings.
        self.withPoS = withPoS

    def __getitem__(self, item):
        form, pos = item
        # TODO: allow for different composition functions
        key = form + "/" + pos if self.withPoS else form
        return super().__getitem__(key)


def _load_vocab(fpath):
    ret = []
    with open(fpath) as fin:
        for line in fin:
            line = line.strip().split()
            for el in line:
                ret.append(el)

    return ret


def _load_vectors_npy(vectors_fpath, withPoS, noun_set, len_vectors):
    """
    Load vectors from a .npy matrix and its companion vocabulary file.

    The vocabulary is read from the path obtained by replacing the last four
    characters of vectors_fpath with ".vocab"; words and matrix rows are
    paired in order.

    :param vectors_fpath: path of the .npy file
    :param withPoS: flag forwarded to VectorsDict
    :param noun_set: words to keep; an empty collection keeps every word
    :param len_vectors: number of leading components to keep, or -1 for all
    :return: VectorsDict with the selected vectors
    """
    vocab_fpath = vectors_fpath[:-4] + ".vocab"

    matrix = np.load(vectors_fpath)
    words = _load_vocab(vocab_fpath)

    selected = VectorsDict(withPoS)
    for word, row in zip(words, matrix):
        if word not in noun_set and len(noun_set):
            continue
        selected[word] = row[:len_vectors] if len_vectors > -1 else row

    return selected


def _load_vectors_from_text(vectors_fpath, withPoS, noun_set, len_vectors):
    """
    Load vectors from a text file.

    The first line is a header 'n_words dimensions'. Every following line is
    a word (which may itself contain spaces) followed by `dimensions` numbers.

    :param vectors_fpath: path of the text file
    :param withPoS: flag forwarded to VectorsDict
    :param noun_set: words to keep; an empty collection keeps every word
    :param len_vectors: number of leading components to keep, or -1 to keep
                        the dimensionality declared in the header
    :return: VectorsDict with the selected vectors
    """
    noun_vectors = VectorsDict(withPoS)

    # NOTE(review): the file is read as UTF-8 (the encoding fastText uses for
    # its .vec files) instead of the platform default - confirm for other inputs.
    with open(vectors_fpath, encoding="utf-8") as fin_model:

        _n_words, len_from_file = fin_model.readline().strip().split()
        len_from_file = int(len_from_file)
        if len_vectors == -1:
            len_vectors = len_from_file

        for line in fin_model:
            fields = line.strip().split()
            if not fields:
                # Blank line: there is no word and no vector to store.
                continue

            # Everything before the last `len_from_file` fields is the word.
            split_at = max(len(fields) - len_from_file, 0)
            word = " ".join(fields[:split_at])

            if word in noun_set or not len(noun_set):
                try:
                    # Keep the FIRST len_vectors components, as the .npy
                    # loader does with value[:len_vectors].
                    vector = [float(x) for x in fields[split_at:][:len_vectors]]
                except ValueError:
                    print("problem with vector for word {}".format(word))
                else:
                    noun_vectors[word] = np.array(vector)

    print("loaded {} vectors".format(len(noun_vectors)))
    return noun_vectors


def load_vectors(vectors_fpath, withPoS=False, noun_set=set(), len_vectors=-1):
    """
    Load a vector space, choosing the reader from the file extension.

    Paths ending in ".npy" are read with _load_vectors_npy, everything else
    with _load_vectors_from_text.

    :param vectors_fpath: path of the vectors file
    :param withPoS: flag forwarded to the reader
    :param noun_set: words to keep, forwarded to the reader
    :param len_vectors: vector length to keep, forwarded to the reader
    :return: whatever the selected reader returns
    """
    print("Load vectors from {}".format(vectors_fpath))
    reader = _load_vectors_npy if vectors_fpath.endswith(".npy") else _load_vectors_from_text
    return reader(vectors_fpath, withPoS=withPoS, noun_set=noun_set, len_vectors=len_vectors)

# vector_utils.py
def get_centroid(vecs, n):
    """
    Compute the centroid (arithmetic mean) of the first N vectors.

    At most `n` vectors are used and the sum is divided by the number of
    vectors actually summed, so the result is a true mean even when fewer
    than `n` vectors are available. At least one vector is always used when
    `vecs` is not empty. The input vectors are never modified.

    :param vecs: list of arrays
    :param n: int, maximum number of vectors to average
    :return: array, or None when vecs is empty
    """
    if len(vecs) == 0:
        return None

    selected = vecs[:max(n, 1)]
    if len(selected) == 1:
        # A single vector is its own centroid; return an independent copy.
        return copy.deepcopy(selected[0])

    total = copy.deepcopy(selected[0])
    for vector in selected[1:]:
        # Out-of-place addition: avoids dtype-casting errors of `+=`.
        total = total + vector
    return total / len(selected)


def cosine(vec1, vec2):
    """
    Cosine similarity of two vectors, computed as 1 minus scipy's cosine
    distance.

    :param vec1: array
    :param vec2: array
    :return: similarity score, or None (after printing both vectors) when
             scipy raises ValueError
    """
    try:
        dissimilarity = distance.cosine(vec1, vec2)
    except ValueError:
        print(vec1)
        print(vec2)
        return None
    return 1-dissimilarity

In [3]:
# math_utils
def c_prob(bigr1_f, bigr2_f, lemma_f, N):
    """
    Return the product of the two bigram frequencies, divided by lemma_f and
    then multiplied by N.

    NOTE(review): N multiplies the quotient (it is not part of the divisor);
    confirm that this is the intended formula.
    """
    joint = bigr1_f*bigr2_f
    ratio = joint/lemma_f
    return ratio*N

def c_prob_simple(bigr_f, lemma_f):
    """Return the bigram frequency divided by the lemma frequency."""
    ratio = bigr_f/lemma_f
    return ratio

def mi(ab, a, b, n):
    """
    Pointwise Mutual Information (Church & Hanks, 1990):
    log2 of (ab * n) / (a * b).

    A ZeroDivisionError raised by the division prints a message and
    terminates the interpreter with exit status 2.

    :param float ab: joint frequency between two words a and b
    :param  float a: frequency of word a
    :param  float b: frequency of word b
    :param float n: total frequencies
    :return: mutual association measure
    :rtype: float
    """
    observed = ab * n
    expected = a * b
    try:
        ratio = observed/expected
    except ZeroDivisionError:
        print("Division by zero is not permitted")
        sys.exit(2)
    return np.log2(ratio)


def _ppmi(ab, a, b, n):
    """
    Positive Pointwise Mutual Information: the value of mi() when it is
    greater than zero, otherwise 0.

    :param float ab: joint frequency between two words a and b
    :param  float a: frequency of word a
    :param  float b: frequency of word b
    :param float n: total frequencies
    :return: mutual association measure
    :rtype: float
    """
    score = mi(ab, a, b, n)
    return score if score > 0 else 0


def _lmi(ab, a, b, n):
    """
    Local Mutual Information: the value of mi() multiplied by the joint
    frequency.

    :param float ab: joint frequency between two words a and b
    :param  float a: frequency of word a
    :param  float b: frequency of word b
    :param float n: total frequencies
    :return: mutual association measure
    :rtype: float
    """
    pointwise = mi(ab, a, b, n)
    return pointwise * ab

# Association measures selectable by name.
possible_functions = {
    'lmi': _lmi,
    'ppmi': _ppmi,
}

In [4]:
def compute_and_print_scores(dic_res, data, original_data, vec_space, outpath):
    """
    Save results in a file
    :param dic_res: {(id, [contexts], target): {'most_associated':[(word,assoc_score), 'centroid': array}}
    :param data: list of tuples (id, [contexts], target)
    :param original_data: original dataset
    :param vec_space: VectorsDict object
    :param outpath: file path
    :return:
    """
    data_sent = pd.read_csv(original_data, sep='\t',
                                header=None)
    with open(outpath, 'w') as fout:
        for id in data_sent[0]:
            sent = data_sent.iloc[id][1]

            _, contexts, target = data[id]
            t_w, t_pos, t_role = target.split('@')

            # compute cosine_sim
            centroid = dic_res[tuple(contexts + [t_role.split(':')[0]])]['centroid']
            if centroid is not None:
                try:
                    target_vec = vec_space[(t_w, t_pos)]
                    cos_sim = cosine(target_vec, centroid)
                except TypeError:
                    cos_sim = None
                except KeyError:
                    cos_sim = None
                print('{}\t{}\t{}'.format(id, sent, cos_sim), file=fout)
            else:
                print('{}\t{}\t{}'.format(id,sent, '0.0'), file=fout)
        

def print_most_assoc(dic_res, data, outpath):
    """
    Write the fillers used to build each prototype, one line per item:
    id, contexts and target (tab-separated), then the comma-separated fillers
    as 'word (score rounded to 2 decimals)', then the operation label.

    :param dic_res: {(context_1, ..., context_k, target_role): {'most_associated': [(word, assoc_score)], 'operation': str, ...}}
    :param data: list of tuples (id, [contexts], target)
    :param outpath: file path
    :return: None
    """
    with open(outpath, 'w') as fout:
        for item_id, contexts, target in data:
            _, _, role = target.split('@')

            # The results are keyed by the contexts plus the role, with any
            # ':' suffix of the role removed.
            entry = dic_res[tuple(contexts + [role.split(':')[0]])]
            fillers = ['{} ({})'.format(word, round(score, 2))
                       for word, score in entry['most_associated']]
            item = '\t'.join([item_id] + contexts + [target])
            print('{}\t{}\t{}'.format(item, ','.join(fillers), entry['operation']), file=fout)

In [5]:
class TFmodel(object):
    """
    Thematic-fit model: for every item it retrieves the fillers most
    associated with each context word in the target role, intersects the
    filler lists, and builds a prototype (centroid) from the vectors of the
    best fillers.
    """

    def __init__(self, lemma_f, bigram_f, vectors, n, m, weight_function, update_function):
        """
        Initialize object with all parameters
        :param lemma_f: {(lemma, pos): freq} dictionary
        :param bigram_f: {(arg1,arg2):freq} dictionary, arguments being lemma@pos@rel strings
        :param vectors: VectorsDict space
        :param n: length to which each filler list is cut before the lists are intersected
        :param m: maximum number of fillers used to compute the centroid
        :param weight_function: callable (joint_freq, freq_arg1, freq_arg2, total) -> association score
        :param update_function: 'add' or 'prod', how the scores of a shared filler are combined
        """
        self.lemmas_freq = lemma_f
        self.bigram_freq = bigram_f
        # Every lemma@pos@rel string that occurs in at least one bigram.
        self.rels = set()
        for x, y in self.bigram_freq.keys():
            self.rels.add(x)
            self.rels.add(y)

        # Total frequency, restricted to lemmas whose PoS is 'V'.
        self.N_all = sum([f for tup, f in lemma_f.items() if tup[1] == 'V'])
        self.vec_space = vectors
        self.N = n
        self.M = m
        self.weight_function = weight_function
        self.update_function = update_function

    def get_most_assoc(self, word_pos_rel, target_role):
        """
        Get the fillers that co-occur with a word in a given role.
        :param word_pos_rel: lemma@pos@rel string
        :param target_role: relation string; a filler matches when its relation starts with it
        :return: [(lemma@pos, assoc_score)] list, sorted by decreasing score
        """
        most_assoc = []
        for (a1, a2), freq in self.bigram_freq.items():
            # The filler is whichever side of the bigram is not the query word.
            if word_pos_rel == a1:
                other = a2
            elif word_pos_rel == a2:
                other = a1
            else:
                continue

            *other_lemma_pos, other_rel = other.split('@')
            if not other_rel.startswith(target_role):
                continue

            w = self.weight_function(freq,
                                     self.lemmas_freq[tuple(a1.split('@')[:-1])],
                                     self.lemmas_freq[tuple(a2.split('@')[:-1])],
                                     self.N_all)
            most_assoc.append(('@'.join(other_lemma_pos), w))
        return sorted(most_assoc, key=lambda x: x[1], reverse=True)

    def update_lists(self, list_of_lists):
        """
        Take two or more lists of fillers, cut each one to its first N items
        and keep the fillers shared by all of them, combining their scores
        with the update function ('add' sums them, 'prod' multiplies them).
        Any other update function yields an empty list.
        :param list_of_lists: list of [(word, assoc_score)] lists
        :return: [(word, assoc_score)] list, sorted by decreasing score
        """
        truncated = [dict(sublist[:self.N]) for sublist in list_of_lists]
        shared = set(truncated[0]).intersection(*truncated[1:])

        if self.update_function == "add":
            combine = sum
        elif self.update_function == "prod":
            combine = np.prod
        else:
            return []

        res = [(w, combine([weights[w] for weights in truncated])) for w in shared]
        res.sort(key=lambda x: x[1], reverse=True)
        return res

    def get_first_M(self, l):
        """
        Get first M items in a list that are also attested in the vector space vocabulary.
        :param l: [(lemma@pos, assoc_score)] list
        :return: list of arrays (copies of the stored vectors), list of the (lemma@pos, assoc_score) items kept
        """
        res = []
        vecs = []
        for w, score in l:
            if len(vecs) >= self.M:
                break
            lemma, pos = w.split('@')
            try:
                vector = self.vec_space[(lemma, pos)]
            except KeyError:
                print(w, 'not in space')
                continue
            vecs.append(copy.deepcopy(vector))
            res.append((w, score))
        return vecs, res

    def tf_estimation_update(self, data, operation=None,list_length=None, n_centroid=None):
        """
        Run TF-update: compute the prototype of the target role for every
        item, given its context words (with updated expectations when more
        than one context word has fillers).
        :param data: [(id, [contexts], target)] list
        :param operation: 'add' or 'prod'; replaces the update function when given
        :param list_length: N; replaces the filler list length when given
        :param n_centroid: M; replaces the number of centroid fillers when given
        :return: {(context_1, ..., context_k, target_role): {'centroid': array or None,
                  'most_associated': [(word, assoc_score)], 'operation': str}}
        """
        # update parameters (no need to rebuild the class if one of them changes)
        if operation: self.update_function = operation
        if list_length: self.N = list_length
        if n_centroid: self.M = n_centroid

        print('Update model')
        dic_res = {}
        for c, (_, contexts, target) in enumerate(data):
            print(c, contexts, target)
            _, _, t_role = target.split('@')
            t_role = t_role.split(':')[0]

            lists_of_most_assoc = []
            for word in contexts:
                # Unpacking checks that the context has the lemma@pos@rel form.
                _tok, _pos, _rel = word.split('@')
                if word in self.rels:
                    l = self.get_most_assoc(word, t_role)
                    lists_of_most_assoc.append(l)
                    print(word, t_role, len(l))
                else:
                    print('NO {} for {}'.format(t_role, word))

            if len(lists_of_most_assoc) == 0:
                print('Item {}: no relations '.format(c))
                # No fillers at all: there is no prototype. target_list is set
                # explicitly so that the summary print below always has a
                # value that belongs to this item.
                target_list = []
                centroid = None
                label = 'no_fillers'
            else:
                if len(lists_of_most_assoc) == 1:
                    print('no update')
                    target_vecs, target_list = self.get_first_M(lists_of_most_assoc[0])
                    label = 'one_filler_list'
                else:
                    updated_expectations = self.update_lists(lists_of_most_assoc)
                    if len(updated_expectations) > 0:
                        print('update', updated_expectations[:10])
                        target_vecs, target_list = self.get_first_M(updated_expectations)
                        label = 'update'
                        if len(target_list) == 0:
                            # Few items in the intersection, none in the vector
                            # space: fall back to the second context's fillers.
                            print('fillers not in space')
                            target_vecs, target_list = self.get_first_M(lists_of_most_assoc[1])
                            label = 'intersection_empty_get_verbfillers'
                    else:
                        print('empty intersection, get verb fillers')
                        target_vecs, target_list = self.get_first_M(lists_of_most_assoc[1])
                        label = 'intersection_empty_get_verbfillers'
                centroid = get_centroid(target_vecs, self.M)

            dic_res[tuple(contexts + [t_role])] = {'centroid': centroid,
                                                   'most_associated': target_list,
                                                   'operation': label}
            print('Prototype fillers:', len(target_list), target_list)
        return dic_res

## Run

In [6]:
# Load the fastText vector space (keys without PoS, full-length vectors).
vecs = load_vectors(
    '/home/giulia.rambelli/to_backup/spaces/wiki-news-300d-1M.vec',
    withPoS=False,
    len_vectors=-1,
)


Load vectors from /home/giulia.rambelli/to_backup/spaces/wiki-news-300d-1M.vec
loaded 999994 vectors


In [8]:
# Lemma frequencies shared by every dataset.
lemma_f = load_lemma_freqs(
    'relations4TFupdate/lemmaNV-freqs-min60.gz'
)
# Earlier source of the bigram frequencies, kept for reference:
#rels_f = load_bigr_freqs('/home/giulia.rambelli/to_backup/events-freqs.events2degree.gz')
# Association measure used to weight the fillers.
weight_function = possible_functions[
    'lmi'
]

In [10]:
# Run TF-update on every dataset file and save scores and prototype fillers.
#files = [f for f in os.listdir('datasets/baseline_format/')]
out_dir = os.path.join('results/TF-update')
# Create the output directory up front so that writing the results cannot
# fail after the whole model has been run.
os.makedirs(out_dir, exist_ok=True)

# NOTE(review): the original call started with a dangling unary `+`
# (`load_mapping(+f.split('.')[0]+...)`), which raises TypeError on a str; the
# directory prefix that preceded it is not recoverable from this file. Set it
# here; the empty string resolves the mapping file in the working directory.
roles_mapping_dir = ''

# Model settings, named once so that the model and the output file name
# cannot drift apart.
update_operation = 'prod'
list_length = 200   # N: fillers kept from each list before the intersection
n_centroid = 20     # M: fillers used to compute the centroid

files = ['DTFit_vassallo_deps.txt']
for f in sorted(files):
    print(f)
    dataset_name = f.split('.')[0]
    maps, inv_maps = load_mapping(os.path.join(roles_mapping_dir, dataset_name+'.roles_mapping.txt'))
    print('Load events..')
    rels_f = load_bigr_freqs('relations4TFupdate/datarel_all_{}.gz'.format(dataset_name), inv_maps)
    tf = TFmodel(lemma_f, rels_f, vecs, list_length, n_centroid, weight_function, update_operation)

    data = load_MIT_data('datasets/baseline_format/'+f)
    res = tf.tf_estimation_update(data)
    outpath = os.path.join(out_dir, '{}.update-model.TF-{}.n{}.txt'.format(dataset_name, update_operation, list_length))
    compute_and_print_scores(res, data, 'datasets/'+f, vecs, outpath)
    print_most_assoc(res, data, outpath.split('.txt')[0]+'.most_assoc.txt')

DTFit_vassallo_deps.txt
Load events..
Update model
0 ['actor@N@NSUBJ', 'win@V@ROOT'] battle@N@OBJ
actor@N@NSUBJ OBJ 3746
win@V@ROOT OBJ 15494
update [('plaudit@N', 74413.73256441773), ('emmys@N', 35345.98415481984)]
Prototype fillers: 2 [('plaudit@N', 74413.73256441773), ('emmys@N', 35345.98415481984)]
1 ['actor@N@NSUBJ', 'win@V@ROOT'] award@N@OBJ
actor@N@NSUBJ OBJ 3746
win@V@ROOT OBJ 15494
update [('plaudit@N', 74413.73256441773), ('emmys@N', 35345.98415481984)]
Prototype fillers: 2 [('plaudit@N', 74413.73256441773), ('emmys@N', 35345.98415481984)]
2 ['anchorman@N@NSUBJ', 'tell@V@ROOT'] parable@N@OBJ
anchorman@N@NSUBJ OBJ 33
tell@V@ROOT OBJ 14483
update [('overture@N', 8879.83512906111), ('bit@N', 5094.942893861944), ('people@N', -78414.31940939861), ('story@N', -216285.03875627136)]
Prototype fillers: 4 [('overture@N', 8879.83512906111), ('bit@N', 5094.942893861944), ('people@N', -78414.31940939861), ('story@N', -216285.03875627136)]
3 ['anchorman@N@NSUBJ', 'tell@V@ROOT'] news@N@OBJ
