# Baselines
Compute the structured and unstructured PPMI baselines.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/giuliarambelli/Event_Knowledge_Model_Comparison/blob/master/baselines_PPMI_structured_and_unstructured.ipynb) 

In [1]:
import collections
import gzip
import os
import itertools
import numpy as np
import pandas as pd

In [9]:
# utils.py

def laplace(ab, a, b, n, v):
	'''
	Smoothed, PPMI-like association score, floored at zero.

	ab is the joint count, a and b are the two marginal counts, n is the
	total count and v is the extra mass added to n in the denominator.
	The expected count is (a + 1) * (b + 1) / (n + v) and the observed
	count is ab + 2; the score is log2(observed / expected), or 0 when
	that value is negative.
	'''
	expected = np.true_divide((a + 1) * (b + 1), n + v)
	ratio = np.true_divide(ab + 2, expected)
	return max(0, np.log2(ratio))


def mi(ab, a, b, n):
	'''
	Pointwise Mutual Information (Church & Hanks, 1990).

	Returns log2(ab / (a * b / n)), where ab is the joint count, a and b
	the marginal counts and n the total count. The divisions go through
	numpy, so under numpy's default error settings a zero denominator or a
	zero joint count yields inf, -inf or nan (with a RuntimeWarning)
	instead of raising.
	'''
	expected = np.true_divide(a * b, n)
	return np.log2(np.true_divide(ab, expected))

def ppmi(ab, a, b, n):
	'''
	Positive PMI: the mi() score with negative values replaced by 0.

	Because the comparison is max(0, score), a nan score also comes out as 0.
	'''
	return max(0, mi(ab, a, b, n))

# Upper-case dependency labels mapped to lower-case relation names.
synrel_dic = {
	'NSUBJ': 'nsubj',
	'OBJ': 'dobj',
	'OBL': 'nmod',
}

def load_formatted(path, b):
	'''
	Read a tab-separated dataset file into a dict keyed by sentence id.

	Every non-blank line holds an integer id followed by tab-separated
	items; each item is a '@'-separated string such as actor@N@NSUBJ.

	path: file to read.
	b: when equal to 1, every item is kept as the tuple of its '@' fields,
		e.g. {0: [('actor', 'N', 'NSUBJ'), ('win', 'V', 'ROOT'), ('battle', 'N', 'OBJ')]};
		for any other value only the first field is kept,
		e.g. {0: ['actor', 'win', 'battle']}.

	Returns the resulting dict. Blank lines are skipped.
	Raises ValueError when the first column of a line is not an integer.
	'''
	d = {}
	with open(path, 'r') as f:
		for line in f:
			line = line.strip()
			if not line:
				# a blank (typically trailing) line carries no sentence;
				# skip it rather than failing on int('')
				continue
			sent_id, *items = line.split('\t')
			sent_id = int(sent_id)
			if b == 1:
				d[sent_id] = [tuple(i.split('@')) for i in items]
			else:
				d[sent_id] = [i.split('@')[0] for i in items]
	return d

def load_events(path, e_freq=20):
	'''
	Load event frequencies from a gzip-compressed, tab-separated file.

	Each line is "<space-separated event>TAB<frequency>". The event is
	split on single spaces into a tuple that becomes the key; the
	frequency is stored as a float.

	path: gzip file to read.
	e_freq: accepted for compatibility but not used; no frequency
		threshold is applied.

	Returns a dict {event tuple: frequency}.
	'''
	print('Load events from: {}'.format(path))
	with gzip.open(path, 'rt') as handle:
		rows = (raw.strip().split('\t') for raw in handle)
		return {tuple(key.split(' ')): float(count) for key, count in rows}

def load_events2(path, lemmas, words):
	'''
	Load event frequencies, keeping only events covered by two vocabularies.

	Each line of the gzip file is "<space-separated event>TAB<frequency>".
	An event is kept only if every one of its tokens is contained in
	lemmas and every one is contained in words.

	path: gzip file to read.
	lemmas, words: containers supporting the "in" test.

	Returns a dict {event tuple: frequency as float}.
	'''
	print('Load events from: {}'.format(path))
	kept = {}
	with gzip.open(path, 'rt') as handle:
		for raw in handle:
			key, count = raw.strip().split('\t')
			tokens = key.split(' ')
			if not all(tok in lemmas for tok in tokens):
				continue
			if not all(tok in words for tok in tokens):
				continue
			kept[tuple(tokens)] = float(count)
	return kept


def load_lemmas(lemmas_freqs_file):
	'''
	Load lemma frequencies from a gzip-compressed, tab-separated file.

	Each line is "<lemma>TAB<frequency>". A lemma containing a space is
	stored under the tuple of its space-separated parts, any other lemma
	under the string itself. Frequencies of repeated lemmas are added up.

	Returns (frequencies, total): a collections.defaultdict(int) of summed
	frequencies and the sum of all frequencies in the file.
	'''
	freqs = collections.defaultdict(int)
	total = 0
	with gzip.open(lemmas_freqs_file, "rt") as handle:
		for raw in handle:
			lemma, count = raw.strip().split('\t')
			count = float(count)
			key = tuple(lemma.split(' ')) if ' ' in lemma else lemma
			freqs[key] += count
			total += count
	return freqs, total


def check_coverage(words, all_lemmas):
	'''
	Print, one per line, every element of words that is not in all_lemmas.
	'''
	uncovered = (w for w in words if w not in all_lemmas)
	for w in uncovered:
		print(w)

def events_bigram(events_dict):
	'''
	Derive pair counts, (word, relation) counts and the total from events.

	events_dict maps (head, dependent, relation) triples to frequencies.

	Returns (pairs, word_rel, total):
	- pairs: defaultdict(float) with the same triples and frequencies as
	  events_dict (the relation label is kept exactly as given);
	- word_rel: defaultdict(float) summing, for each (word, relation), the
	  frequencies of the events where the word is head or dependent; here
	  any relation starting with 'nsubj' is counted as 'nsubj';
	- total: the sum of all event frequencies.
	'''
	pairs = collections.defaultdict(float)
	word_rel = collections.defaultdict(float)
	total = 0
	for triple, freq in events_dict.items():
		head, dependent, relation = triple
		# collapse nsubj subtypes for the marginal counts only
		relation = 'nsubj' if relation.startswith('nsubj') else relation
		for word in (head, dependent):
			word_rel[(word, relation)] += freq
		pairs[triple] = freq
		total += freq
	return pairs, word_rel, total


def get_lemma(w, dict):
	'''
	Return the frequency stored for w in dict, or 0 when w is absent.

	w: key to look up.
	dict: mapping from keys to frequencies.

	The lookup goes through .get() so that it never inserts a key: the
	frequency tables in this file are collections.defaultdict objects, for
	which dict[w] would silently add w with a zero count and so change
	len(dict).
	'''
	return dict.get(w, 0)

## 1. Baseline 1
**PPMI (structured input, input annotated with grammatical roles)**

The score of a sentence is the sum of the PPMIs of its syntactic relations `<head, dependent, role>`.
Frequencies are taken from the ukwac+wiki2018 corpora (minimum frequency = 2).

In [27]:
def baseline1(data_files, events_file, out_dir, smooth=False):
	'''
	Score every sentence of each dataset with the structured baseline.

	Event frequencies are loaded once from events_file and turned into
	pair counts, (word, relation) counts and a total by events_bigram().
	In each sentence the second item is taken as the verb and the first as
	its subject. The subject is scored under the 'nsubj' relation; every
	item from the third on is scored under its own label after mapping
	OBJ to dobj and OBL:<prep> to nmod:<prep> (nmod:by when the label has
	no ':' part). Other labels are used unchanged. The sentence score is
	the sum of these association values.

	data_files: paths of datasets in the format read by load_formatted().
	events_file: gzip file of event frequencies (see load_events()).
	out_dir: directory that receives <name>.scores_baseline1.txt with one
		"id TAB sentence TAB score" line per sentence; the sentence text is
		column 1 of datasets/<name>.txt, addressed by row position = id.
	smooth: when true, use laplace() with the number of distinct events as
		its last argument; otherwise use ppmi().
	'''
	pair_freq, role_freq, total = events_bigram(load_events(events_file))

	def association(joint, first_key, second_key):
		# first_key / second_key are (word, relation) keys into role_freq
		first = get_lemma(first_key, role_freq)
		second = get_lemma(second_key, role_freq)
		if smooth:
			return laplace(joint, first, second, total, len(pair_freq))
		return ppmi(joint, first, second, total)

	def corpus_relation(label):
		# translate a dataset role label into the relation name used in the events
		if label == 'OBJ':
			return 'dobj'
		if label.startswith('OBL'):
			pieces = label.split(':')
			return 'nmod:' + pieces[1] if len(pieces) > 1 else 'nmod:by'
		return label

	for data_file in data_files:
		print('Reading:', data_file)
		sentences = load_formatted(data_file, 1)

		scores = {}
		for sent_id, fields in sentences.items():
			# (lemma@pos, role) pairs
			tokens = [(fld[0] + '@' + fld[1], fld[2]) for fld in fields]
			verb = tokens[1][0]
			subj = tokens[0][0]

			parts = [association(pair_freq.get((verb, subj, 'nsubj'), 0),
								(subj, 'nsubj'), (verb, 'nsubj'))]
			for dependent, label in tokens[2:]:
				relation = corpus_relation(label)
				parts.append(association(pair_freq.get((verb, dependent, relation), 0),
										(verb, relation), (dependent, relation)))
			scores[sent_id] = sum(parts)

		stem = os.path.basename(data_file).split('.')[0]
		raw_sentences = pd.read_csv(os.path.join('datasets', stem+'.txt'), sep='\t', header=None)
		out_path = os.path.join(out_dir, stem+'.scores_baseline1.txt')
		with open(out_path, 'w') as fout:
			for sent_id in sorted(sentences):
				text = raw_sentences.iloc[sent_id][1]
				fout.write('{}\t{}\t{}\n'.format(sent_id, text, scores[sent_id]))

## 2. Baseline 2
**ngram sentence surprisal**

The score of a sentence is the sum of the PPMIs of the bigrams in the sentence.
Frequencies are taken from the ukwac+wiki2018 corpora (minimum frequency = 5). Bigrams are counted within a window of ±10 words.

In [29]:
def baseline2(data_files, events_file, lemmas_freqs_file, out_dir, smooth=False):
	'''
	Score every sentence of each dataset with the unstructured baseline.

	Lemma frequencies are loaded once. For each dataset the bigram events
	are loaded with load_events2(), restricted to events whose words all
	occur in the lemma table and in that dataset. In each sentence the
	pair (first word, second word) and every pair (second word, later
	word) is scored; the sentence score is the sum of these values.

	data_files: paths of datasets in the format read by load_formatted().
	events_file: gzip file of bigram frequencies.
	lemmas_freqs_file: gzip file of lemma frequencies (see load_lemmas()).
	out_dir: directory that receives <name>.scores_baseline2.txt with one
		"id TAB score" line per sentence.
	smooth: when true, use laplace() with the number of distinct lemmas as
		its last argument; otherwise use ppmi().
	'''
	lemmas_freq, N = load_lemmas(lemmas_freqs_file)
	# load_lemmas() returns a defaultdict, where looking up an unseen word
	# with [] inserts it with a zero count. Freeze it into a plain dict and
	# take the vocabulary size once, so that neither the table nor the
	# smoothing term drifts while sentences are being scored.
	lemmas_freq = dict(lemmas_freq)
	vocab_size = len(lemmas_freq)

	def association(events, first, second):
		# joint frequency of the ordered pair, 0 when the pair was never seen
		joint = events.get((first, second), 0)
		freq_first = get_lemma(first, lemmas_freq)
		freq_second = get_lemma(second, lemmas_freq)
		if smooth:
			return laplace(joint, freq_first, freq_second, N, vocab_size)
		return ppmi(joint, freq_first, freq_second, N)

	for data_file in data_files:
		print('Reading:', data_file)
		data = load_formatted(data_file, 2)
		words = set(itertools.chain.from_iterable(data.values()))
		events = load_events2(events_file, lemmas_freq.keys(), words)

		res = {}
		for sent_id, item in data.items():
			ppmis = [association(events, item[0], item[1])]
			ppmis.extend(association(events, item[1], arg) for arg in item[2:])
			res[sent_id] = ppmis

		fname = os.path.basename(data_file).split('.')[0]
		with open(os.path.join(out_dir, fname+'.scores_baseline2.txt'), 'w') as fout:
			for sent_id in sorted(data):
				print('{}\t{}'.format(sent_id, sum(res[sent_id])), file=fout)

### Run script


In [13]:
# Define parameters
smooth = True # True: score with laplace(); False: score with ppmi()

# Dataset paths, in the tab-separated format read by load_formatted()
f= ['datasets/parsed/new-EventsAdapt-sentences.txt']

# Directory that receives the score files; created if it does not exist.
# NOTE(review): the directory name says "smoothed" whatever the value of
# `smooth` above - change it by hand when running with smooth = False.
out_dir = 'baseline_res/smoothed/'
os.makedirs(out_dir, exist_ok=True)

In [28]:
# baseline 1: structured scores computed from the event frequency file
# (baseline1() takes no lemma-frequency file; the path below is unused)
#lem_path = 'freqs/lempos-freqs.50.filtered.gz'
event_path = 'freqs/events_baseline1-freqs.2.filtered.gz'
# writes <dataset>.scores_baseline1.txt into out_dir
baseline1(f, event_path, out_dir, smooth)


Load events from: freqs/events_baseline1-freqs.2.filtered.gz
Reading: datasets/parsed/new-EventsAdapt-sentences.txt


In [30]:
# baseline 2: unstructured scores computed from lemma and bigram frequencies
lempath = 'freqs/lemma-freqs.50.filtered.gz'
event_path = 'freqs/events_baseline2-freqs.5.filtered.gz'
# writes <dataset>.scores_baseline2.txt into out_dir
baseline2(f, event_path, lempath, out_dir, smooth)

Reading: datasets/parsed/new-EventsAdapt-sentences.txt
Load events from: freqs/events_baseline2-freqs.5.filtered.gz


**NOTE**

The lemma and event frequency files are not included in the GitHub repository for space reasons. Please contact the authors to obtain them.