# Baselines
Compute the structured and unstructured PPMI baselines.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/giuliarambelli/Event_Knowledge_Model_Comparison/blob/master/baselines_PPMI_structured_and_unstructured.ipynb) 

In [1]:
# load libraries
import collections
import gzip
import os
import itertools
import numpy as np
import pandas as pd

In [2]:
# utils functions

def laplace(ab, a, b, n, v):
    """
    Smoothed Positive PMI of a pair, using additive (Laplace-style) smoothing.

    Both marginal counts are incremented by 1 and the total by the vocabulary
    size; the joint count is incremented by 2.
    NOTE(review): classic add-one smoothing would use ``ab + 1``; confirm that
    ``ab + 2`` is intended before changing it, since it affects every score.

    :param ab: joint frequency of the pair
    :param a: frequency of the first item
    :param b: frequency of the second item
    :param n: total frequency of all pairs
    :param v: vocabulary dimension
    :return: smoothed PMI in bits, floored at 0
    """
    # expected (smoothed) co-occurrence under independence
    expected = np.true_divide((a + 1) * (b + 1), n + v)
    smoothed_pmi = np.log2(np.true_divide(ab + 2, expected))
    # keep only positive associations
    return smoothed_pmi if smoothed_pmi > 0 else 0


def mi(ab, a, b, n):
    """
    Pointwise Mutual Information (Church & Hanks, 1990), in bits.

    The divisions go through NumPy, so a zero denominator does not raise:
    the result is then a non-finite value (e.g. ``inf``) and NumPy may emit a
    RuntimeWarning. A zero joint frequency yields ``-inf``.

    :param ab: joint frequency of the pair
    :param a: frequency of the first item
    :param b: frequency of the second item
    :param n: total frequency of all pairs
    :return: log2 of observed over expected co-occurrence
    """
    # co-occurrence count expected if a and b were independent
    expected = np.true_divide(a * b, n)
    observed_over_expected = np.true_divide(ab, expected)
    return np.log2(observed_over_expected)


def ppmi(ab, a, b, n):
    """
    Positive Pointwise Mutual Information: PMI floored at zero.

    :param ab: joint frequency of the pair
    :param a: frequency of the first item
    :param b: frequency of the second item
    :param n: total frequency of all pairs
    :return: the PMI when it is strictly positive, otherwise 0
             (negative, ``-inf`` and ``nan`` values all map to 0)
    """
    score = mi(ab, a, b, n)
    return score if score > 0 else 0


def load_mapping(fpath):
    """
    Load a whitespace-separated file mapping our role labels to UD relation(s).

    Each non-empty line holds a role label followed by a comma-separated list
    of UD relation labels, e.g. ``AGENT nsubj,nsubj:pass``. Blank lines are
    ignored. If the same UD label is listed under several roles, the role read
    last wins in the inverse mapping.

    :param fpath: file mapping role name to UD relation labels
    :return: {label: [ud_label(s)]} and {ud_label: label} dictionaries
    :raises IndexError: if a non-empty line has a label but no UD relations
    """
    label2ud = {}
    with open(fpath) as fin:
        for line in fin:
            fields = line.split()
            if not fields:
                # tolerate blank / trailing empty lines
                continue
            label2ud[fields[0]] = fields[1].split(",")
    # invert: every UD relation points back to the role it was listed under
    ud2label = {ud: label for label, uds in label2ud.items() for ud in uds}
    return label2ud, ud2label


def load_formatted(path, b):
    """
    Load a datafile where each sentence is given in lemma@pos@rel format
    (only the verb and noun items of the sentence), one sentence per line:
    an integer ID followed by tab-separated items.
    'The cop is arresting the criminal' -> cop@N@NSUBJ	arrest@V@ROOT	criminal@N@OBJ

    Blank lines are skipped.

    :param path: file path
    :param b: baseline type; 1 keeps each item as a tuple of its '@'-separated
              parts, any other value keeps only the part before the first '@'
    :return: dictionary where keys are sentence IDs (int) and values are the
             list of the sentence's items
    :raises ValueError: if the first field of a non-empty line is not an integer
    """
    d = {}
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # an empty (e.g. trailing) line carries no sentence
                continue
            sent_id, *fields = line.split('\t')
            if b == 1:
                # {0: [(actor, N, NSUBJ), (win, V, ROOT), (battle, N, OBJ)]}
                items = [tuple(field.split('@')) for field in fields]
            else:
                # {0: [actor, win, battle]}
                items = [field.split('@')[0] for field in fields]
            d[int(sent_id)] = items
    return d


def load_events(path, map, e_freq=0):
    """
    Read the <w1, w2, relation> triples used by baseline 1 from a gzipped file.

    Every line is ``w1 w2 rel<TAB>freq``. A triple is kept when its frequency
    is at least ``e_freq`` and its relation is a key of ``map``; the relation
    is stored under its mapped label. Triples with an unmapped relation are
    silently dropped.

    :param path: path of the gzipped events file
    :param map: dictionary mapping UD relation labels to our labels
    :param e_freq: minimum frequency required
    :return: {(w1, w2, mapped_label): freq} dictionary
    """
    print('Load events from: {}'.format(path))
    triples = {}
    with gzip.open(path, 'rt') as handle:
        for raw in handle:
            item, freq = raw.strip().split('\t')
            count = float(freq)
            if not count >= e_freq:
                continue
            w1, w2, rel = item.split(' ')
            try:
                role = map[rel]
            except KeyError:
                # relation not covered by the mapping: ignore the event
                continue
            triples[(w1, w2, role)] = count
    return triples


def load_lemmas(lemmas_freqs_file):
    """
    Read lemma frequencies for baseline 2 from a gzipped ``word<TAB>freq`` file.

    Entries containing a space are stored under a tuple of their
    space-separated parts; frequencies of repeated entries are summed.

    :param lemmas_freqs_file: path of the gzipped frequency file
    :return: {word: freq} dictionary (a defaultdict) and N, the sum of all
             frequencies
    """
    counts = collections.defaultdict(int)
    total = 0
    with gzip.open(lemmas_freqs_file, "rt") as handle:
        for raw in handle:
            word, freq = raw.strip().split('\t')
            value = float(freq)
            key = tuple(word.split(' ')) if ' ' in word else word
            counts[key] += value
            total += value
    return counts, total


def load_events2(path, lemmas, words):
    """
    Read the word tuples used by baseline 2 from a gzipped file.

    Every line is ``w1 w2 ...<TAB>freq``. An entry is kept only when each of
    its space-separated words belongs to both ``lemmas`` and ``words``.

    :param path: path of the gzipped events file
    :param lemmas: collection of known lemmas (membership-tested)
    :param words: collection of words to restrict the events to
                  (membership-tested)
    :return: {tuple_of_words: freq} dictionary
    """
    print('Load events from: {}'.format(path))
    selected = {}
    with gzip.open(path, 'rt') as handle:
        for raw in handle:
            item, freq = raw.strip().split('\t')
            tokens = tuple(item.split(' '))
            known = all(tok in lemmas for tok in tokens)
            if known and all(tok in words for tok in tokens):
                selected[tokens] = float(freq)
    return selected


def events_bigram(events_dict):
    """
    Derive word-relation frequencies and the grand total from event triples.

    For each triple, its frequency is added to the count of both
    (first word, relation) and (second word, relation).

    :param events_dict: {(w1, w2, rel): freq} dictionary
    :return: {(w1, w2, rel): freq} dictionary, {(w, rel): freq} dictionary and
             the sum of all triple frequencies
    """
    pair_freqs = collections.defaultdict(float)
    word_rel_freqs = collections.defaultdict(float)
    total = 0
    for triple, freq in events_dict.items():
        head, dep, rel = triple
        for word in (head, dep):
            word_rel_freqs[(word, rel)] += freq
        pair_freqs[triple] = freq
        total += freq
    return pair_freqs, word_rel_freqs, total


def get_lemma(w, dict):
    """
    Return the frequency stored for ``w``, or 0 when ``w`` is unknown.

    The lookup goes through ``.get`` rather than ``dict[w]`` on purpose: the
    frequency tables built in this file are ``defaultdict`` objects, and
    indexing a defaultdict with a missing key inserts that key. That would
    silently grow the table (and hence any ``len(table)`` used as vocabulary
    size) each time an out-of-vocabulary item is looked up.

    :param w: key to look up (a word, or a (word, relation) tuple)
    :param dict: frequency dictionary
    :return: stored frequency, or 0 if the key is absent
    """
    return dict.get(w, 0)

## 1. Baseline 1
**PPMI (structured input, input annotated with grammatical roles)**

The score of a sentence is the sum of the PPMIs of its syntactic relations `<head, dependent, role>`.

Frequencies are extracted from the ukWaC + Wikipedia 2018 corpora (minimum frequency = 2).

In [3]:
class Baseline1(object):
    """
    Structured PPMI baseline: a sentence is scored through the (smoothed) PPMI
    of each <verb, dependent, role> relation it contains.
    """

    def __init__(self, events_file, map):
        """
        :param events_file: gzipped events file, read with ``load_events``
        :param map: dictionary mapping UD relation labels to our labels
        """
        events = load_events(events_file, map)
        # triple frequencies, (word, relation) frequencies, total frequency
        self.events, self.wrel, self.N = events_bigram(events)

    def _relation_score(self, head, dep, rel, smooth):
        """
        Score one <head, dep, rel> relation.

        :return: (frequency of the triple, its PPMI or smoothed PPMI)
        """
        # .get never inserts, so unseen triples do not grow the events table
        freq = self.events.get((head, dep, rel), 0)
        head_freq = get_lemma((head, rel), self.wrel)
        dep_freq = get_lemma((dep, rel), self.wrel)
        if smooth:
            score = laplace(freq, head_freq, dep_freq, self.N, len(self.events))
        else:
            score = ppmi(freq, head_freq, dep_freq, self.N)
        return freq, score

    def run_baseline(self, data_path, smooth=False, log_path='.'):
        """
        Score every sentence of a formatted data file.

        Each sentence is expected to list the subject first, the verb second
        and any further arguments afterwards; every dependent is scored
        against the verb. A log line per sentence is written to
        ``<log_path>/<data file stem>.baseline1.log``, tagged ``zero_score``
        when none of its relations was found in the events, ``score`` otherwise.

        :param data_path: file in lemma@pos@rel format (see ``load_formatted``)
        :param smooth: apply Laplace smoothing instead of plain PPMI
        :param log_path: directory where the log file is written
        :return: {sentence_id: [score of each relation]} dictionary
        :raises IndexError: if a sentence has fewer than two items
        """
        data = load_formatted(data_path, 1)
        log_name = '{}/{}.baseline1.log'.format(log_path, os.path.basename(data_path).split('.')[0])
        res = {}

        # the context manager guarantees the log is flushed and closed
        with open(log_name, 'w') as log_out:
            for sent_id, item in data.items():
                # (lemma, pos, rel) -> ('lemma@pos', rel)
                item = [(i[0] + '@' + i[1], i[2]) for i in item]
                v = item[1][0]

                log_res = []
                ppmis = []
                total_freq = 0
                # subject first, then the arguments following the verb
                for dep, rel in [item[0]] + item[2:]:
                    freq, score = self._relation_score(v, dep, rel, smooth)
                    log_res.append('{}-{}-{} freq:{} ppmi:{}'.format(v, dep, rel, freq, score))
                    total_freq += freq
                    ppmis.append(score)

                tag = 'zero_score' if total_freq == 0 else 'score'
                print(sent_id, '\t'.join(log_res), tag, file=log_out)
                res[sent_id] = ppmis
        return res

    def print_result(self, ppmis_dic, original_data, out_dir, print_all=True):
        """
        Write sentence scores to ``<out_dir>/<data file stem>.scores_baseline1.txt``.

        The sentence text is taken from the same row as its ID, so IDs do not
        have to coincide with row positions in ``original_data``.

        :param ppmis_dic: {sentence_id: [scores]} as returned by ``run_baseline``
        :param original_data: tab-separated file without header; column 0 is
                              the sentence ID, column 1 the sentence
        :param out_dir: output directory (must exist)
        :param print_all: also write the comma-separated individual scores
        :raises KeyError: if an ID of ``original_data`` is missing from ``ppmis_dic``
        """
        fname = os.path.basename(original_data).split('.')[0]
        data_sent = pd.read_csv(original_data, sep='\t', header=None)
        outpath = os.path.join(out_dir, fname + '.scores_baseline1.txt')
        print('Write output: ', outpath)
        with open(outpath, 'w') as fout:
            for sent_id, sent in zip(data_sent[0], data_sent[1]):
                scores = ppmis_dic[sent_id]
                if print_all:
                    print('{}\t{}\t{}\t{}'.format(sent_id, sent, ','.join(str(i) for i in scores), sum(scores)),
                          file=fout)
                else:
                    print('{}\t{}\t{}'.format(sent_id, sent, sum(scores)), file=fout)


## 2. Baseline 2
**ngram sentence surprisal**

The score of a sentence is the sum of the PPMIs of the bigrams in the sentence.

Frequencies are extracted from the ukWaC + Wikipedia 2018 corpora (minimum frequency = 5). Bigrams are counted within a window of ±10.

In [4]:
def baseline2(data_files, events_file, lemmas_freqs_file, out_dir, smooth=False):
    """
    Unstructured PPMI baseline: score each sentence through the (smoothed)
    PPMI of the word pairs <first item, second item> and
    <second item, each following item>.

    For every data file, one line ``id<TAB>summed score`` per sentence is
    written to ``<out_dir>/<data file stem>.scores_baseline2.txt``.

    :param data_files: iterable of files in lemma@pos@rel format (see
                       ``load_formatted``); a single path is also accepted
    :param events_file: gzipped pair-frequency file (see ``load_events2``)
    :param lemmas_freqs_file: gzipped lemma-frequency file (see ``load_lemmas``)
    :param out_dir: output directory (must exist)
    :param smooth: apply Laplace smoothing instead of plain PPMI
    :raises IndexError: if a sentence has fewer than two items
    """
    # a bare path would otherwise be iterated character by character
    if isinstance(data_files, (str, os.PathLike)):
        data_files = [data_files]

    lemmas_freq, N = load_lemmas(lemmas_freqs_file)
    # fixed once: the vocabulary dimension must not depend on later lookups
    vocab_size = len(lemmas_freq)

    def pair_score(events, left, right):
        # .get is used because lemmas_freq is a defaultdict: indexing it with
        # an unknown word would insert the word and pollute the vocabulary
        freq = events.get((left, right), 0)
        left_freq = lemmas_freq.get(left, 0)
        right_freq = lemmas_freq.get(right, 0)
        if smooth:
            return laplace(freq, left_freq, right_freq, N, vocab_size)
        return ppmi(freq, left_freq, right_freq, N)

    for data_file in data_files:
        print('Reading:', data_file)
        data = load_formatted(data_file, 2)
        # only pairs made of words of this dataset are loaded
        words = set(itertools.chain.from_iterable(data.values()))
        events = load_events2(events_file, lemmas_freq.keys(), words)

        res = {}
        for sent_id, item in data.items():
            pivot = item[1]
            scores = [pair_score(events, item[0], pivot)]
            scores.extend(pair_score(events, pivot, arg) for arg in item[2:])
            res[sent_id] = scores

        fname = os.path.basename(data_file).split('.')[0]
        with open(os.path.join(out_dir, fname + '.scores_baseline2.txt'), 'w') as fout:
            for sent_id in sorted(data):
                print('{}\t{}'.format(sent_id, sum(res[sent_id])), file=fout)

## Run script


### a. Baseline 1

In [5]:
# baseline 1: score every dataset found in datasets/baseline_format/
event_path = 'events_baseline1-freqs.2.filtered.gz'  # change path according to where the file is stored

# Parameters
smooth = True  # True: Laplace-smoothed PPMI, False: plain PPMI
out_dir = 'results/baseline1_smoothed/'
os.makedirs(out_dir, exist_ok=True)

files = os.listdir('datasets/baseline_format/')
for f in files:
    print(f)
    dataset_name = f.split('.')[0]
    # each dataset has its own mapping from UD labels to our role labels
    _, ud2lab = load_mapping(dataset_name + '.roles_mapping.txt')
    b1_model = Baseline1(event_path, ud2lab)
    # logs are written next to the scores, in out_dir
    res_dic = b1_model.run_baseline('datasets/baseline_format/' + f, smooth, out_dir)
    b1_model.print_result(res_dic, 'datasets/' + f, out_dir)

newsentences_EventsAdapt.txt
Load events from: events_baseline1-freqs.2.filtered.gz
Write output:  results/baseline1_smoothed/newsentences_EventsAdapt.scores_baseline1.txt
ev1_deps.txt
Load events from: events_baseline1-freqs.2.filtered.gz
Write output:  results/baseline1_smoothed/ev1_deps.scores_baseline1.txt
DTFit_vassallo_deps.txt
Load events from: events_baseline1-freqs.2.filtered.gz
Write output:  results/baseline1_smoothed/DTFit_vassallo_deps.scores_baseline1.txt


In [None]:
# baseline 2
lempath = 'lemma-freqs.50.filtered.gz'  # change path according to where the file is stored
event_path = 'events_baseline2-freqs.5.filtered.gz'  # change path according to where the file is stored
# baseline2 loops over its first argument, so it needs the list of dataset
# paths (with their directory), not the last bare filename left in `f`
data_files = ['datasets/baseline_format/' + name for name in files]
# NOTE(review): out_dir and smooth still hold the values set in the baseline 1
# cell, so these scores land in the baseline 1 results directory - confirm
baseline2(data_files, event_path, lempath, out_dir, smooth)

**NOTE**

The lemma and event frequency files are not included in the GitHub repository because of their size. Please contact the authors to obtain them.