In [1]:
import math
import numpy as np
import os
import pandas as pd

from lxml import etree
from nltk.corpus import wordnet
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.translate.meteor_score import single_meteor_score
from PyMultiDictionary import MultiDictionary
from sacrebleu.metrics import BLEU, CHRF
from tqdm import tqdm


#### Preprocessing

First, we start by parsing the trees generated using UCCA

In [2]:
def parse_words(father):
    words = {}
    for node in father:
        if node.tag == "node":
            for attribute in node:
                words[node.attrib["ID"]] = attribute.attrib["text"]
    return words

def is_edge_remote(edge):
    for attribute in edge:
        if attribute.tag == "attributes" and "remote" in attribute.attrib.keys():
            return attribute.attrib["remote"]
    return False

def parse_edges(father):
    edges = {}
    for node in father:
        if node.tag == "node":
            id = node.attrib["ID"]
            for edge in node:
                if edge.tag == "edge":
                    toid = edge.attrib["toID"]

                    if is_edge_remote(edge):
                        continue

                    for category in edge:
                        if category.tag == "category":
                            tag = category.attrib["tag"]
                            edges[toid] = (id, tag)
    return edges

def parse_trees(trees_path, sentences_path, offset):
    sentences = None
    with open(sentences_path, "r") as file:
        sentences = [line.strip("\n") for line in file.readlines() if line != "\n"]

    trees = {}
    tree_files = os.listdir(trees_path)
    tree_files.sort(key=lambda x: int(x[offset:-4]))
    for i, file in enumerate(tree_files):
        tree_path = os.path.join(trees_path, file)

        parser = etree.XMLParser(recover=True, encoding="utf-8")
        tree = etree.parse(tree_path, parser=parser)

        words = {}
        edges = {}
        for child in tree.getroot():
            if child.tag == "layer" and child.attrib["layerID"] == "0":
                words = parse_words(child)
            if child.tag == "layer" and child.attrib["layerID"] == "1":
                edges = parse_edges(child)
        trees[sentences[i]] = (words, edges)
    return trees

Then we generate the core-words for each sentence.

In [3]:
def generate_core_words(trees):
    core_words_dict = {}
    for sentence, (words, edges) in trees.items():
        core_words = []
        for node, (parent, _) in edges.items():
            if node[0] == "0": # node is a leaf
                _, tag = edges[parent]
                if tag in "PSAC":
                    core_words.append(words[node])
        core_words_dict[sentence] = core_words
    return core_words_dict

Finally, for each sentence we calculate aditional information such as:
- the number of scenes 
- the number of nodes
- the number of critical edges

In [4]:
def generate_penalty_information(trees):
    penalties_dict = {}
    for sentence, (words, edges) in trees.items():
        scene_count = 0
        for _, tag in edges.values():
            if tag in "PS":
                scene_count += 1

        node_count = len(edges) - len(words) + 1

        critical_edge_count = 0
        for _, tag in edges.values():
            if tag in "PSA":
                critical_edge_count += 1

        penalties_dict[sentence] = (scene_count, node_count, critical_edge_count)
    return penalties_dict

This is a list of constant values, used for language control ("en" or "de")

In [5]:
LANGUAGE = "en"
REFERENCE_TREES = f"../data/{LANGUAGE}/{LANGUAGE}-references-trees/"
REFERENCE_SENTENCES = f"../data/{LANGUAGE}/{LANGUAGE}_refs.txt"
REFERENCE_OFFSET = 8
SYSOUT_TREES = f"../data/{LANGUAGE}/{LANGUAGE}-sysout-trees/"
SYSOUT_SENTENCES = f"../data/{LANGUAGE}/{LANGUAGE}_sysout.txt"
SYSOUT_OFFSET = 10


#### Parameters generation

Functions for calculating the precision, recall and f1 score

In [6]:
OMEGA = 0.5

def generate_synonims(word, stemmer):
    synonims = []
    if LANGUAGE == "en":
        for syn in wordnet.synsets(word):
            for l in syn.lemmas():
                synonims.append(l.name())
    else:
        dictionary = MultiDictionary()
        try:
            synonims = dictionary.synonym("de", word)
        except:
            synonims = []

    synonims = [stemmer.stem(word) for word in synonims]
    return set(synonims)

def compute_precision_recall(reference, predict, stemmer, include_synonims):
    matches = 0
    marked = [False for _ in range(len(reference))]
    for word_predict, stem_word_predict in predict:
        match_found = False
        for i, (_, stem_word_ref) in enumerate(reference):
            if not marked[i] and stem_word_ref == stem_word_predict:
                matches += 1
                marked[i] = True
                match_found = True
                break
        if not match_found and include_synonims:
            synonims = generate_synonims(word_predict, stemmer)
            for i, (_, stem_word_ref) in enumerate(reference):
                if not marked[i] and stem_word_ref in synonims:
                    matches += 1
                    marked[i] = True
                    break
    precision = (matches / len(predict)) if len(predict) > 0 else OMEGA
    recall = (matches / len(reference)) if len(reference) > 0 else OMEGA
    return precision, recall


def calculate_accuracy(reference, predict, include_synonims):
    if LANGUAGE == "en":
        stemmer = PorterStemmer()
    else:
        stemmer = SnowballStemmer("german")
    
    reference = list(zip(reference, [stemmer.stem(word) for word in reference]))
    predict = list(zip(predict, [stemmer.stem(word) for word in predict]))

    precision, recall = compute_precision_recall(reference, predict, stemmer, include_synonims)
    f1 = (2.0 * precision * recall / (precision + recall)) if max(precision, recall) > 0 else 0.0
    return precision, recall, f1

Penalties for a pair of (reference, predicted) sentences according to the paper

In [7]:
def calculate_penalties_for_pair(reference, predict):
    # reference = (state_count, node_count, critical_edges_count)
    # predict = (state_count, node_count, critical_edges_count)
    ps = 0 if min(reference[0], predict[0]) == 0 else 1 - min(reference[0], predict[0]) / max(reference[0], predict[0])
    pn = 0 if min(reference[1], predict[1]) == 0 else 1 - min(reference[1], predict[1]) / max(reference[1], predict[1])
    pe = 0 if min(reference[2], predict[2]) == 0 else 1 - min(reference[2], predict[2]) / max(reference[2], predict[2])

    return ps, pn, pe

Here we compute all the training values for each pair of (reference, predict) sentences such as:
- precision, recall, f1 score
- penalties for states, nodes and critical edges
- the average word count of a sentence pair
- the bleu, meteor and rhcf scores

In [8]:
def custom_tokenizer(text):
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens if token.isalnum()]
    return tokens
    
def count_words(text):
    words = word_tokenize(text)
    words = [word for word in words if word.isalnum()]
    return len(words)

def generate_parameters(df, trees, include_synonims=False):
    core_words = generate_core_words(trees)
    penalties = generate_penalty_information(trees)

    parameters = {
        "precision": [],
        "recall": [],
        "f1": [],
        "ps": [],
        "pn": [],
        "pe": [],
        "Len": [],
        "raw_score": [],
        "bleu_score": [],
        "meteor_score": [],
        "chrf_score": [],
    }

    bleu = BLEU(effective_order=True)
    chrf = CHRF()
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        reference = row["segreference"]
        predict = row["segpredict"]

        precision, recall, f1 = calculate_accuracy(core_words[reference], core_words[predict], include_synonims)
        parameters["precision"].append(precision)
        parameters["recall"].append(recall)
        parameters["f1"].append(f1)

        ps, pn, pe = calculate_penalties_for_pair(penalties[reference], penalties[predict])
        parameters["ps"].append(ps)
        parameters["pn"].append(pn)
        parameters["pe"].append(pe)

        Len = (count_words(reference) + count_words(predict)) / 2.0
        parameters["Len"].append(Len)

        parameters["raw_score"].append(row["raw_score"])
        parameters["bleu_score"].append(bleu.sentence_score(predict, [reference]).score)
        parameters["meteor_score"].append(single_meteor_score(custom_tokenizer(reference), custom_tokenizer(predict)))
        parameters["chrf_score"].append(chrf.sentence_score(predict, [reference]).score)

    return pd.DataFrame(parameters)
    

# Core

In [9]:
def load_dataset(path):
    train_df = pd.read_csv(os.path.join(path, "train.csv")).dropna()
    test_df = pd.read_csv(os.path.join(path, "train.csv")).dropna()
    return train_df, test_df

In [10]:

train_df, test_df = load_dataset(f"../data/{LANGUAGE}/")

trees = {
    **parse_trees(REFERENCE_TREES, REFERENCE_SENTENCES, REFERENCE_OFFSET),
    **parse_trees(SYSOUT_TREES, SYSOUT_SENTENCES, SYSOUT_OFFSET)
}
core_words = generate_core_words(trees)
penalties = generate_penalty_information(trees)

# Evaluation 

### Functions for evaluation purposes

In [11]:
def calculate_swss_augment(df, model, parameters):
    alpha1, alpha2, alpha3, alpha4, beta = parameters
    augmented_scores = []
    for _, row in df.iterrows():
        f1 = float(row["f1"])
        ps = float(row["ps"])
        pn = float(row["pn"])
        pe = float(row["pe"])
        Len = float(row["Len"])
        model_score = float(row[f"{model}_score"])

        score = f1 * math.exp(-alpha1 * ps - alpha2 * pn - alpha3 * pe - alpha4 * Len)
        augmented_score = model_score + beta * score
        augmented_scores.append(augmented_score)
    return augmented_scores

def evaluate(df, model, parameters):
    model_correlation = np.corrcoef(df.raw_score, df[f"{model}_score"])[0][1]

    augmented_scores = calculate_swss_augment(df, model, parameters)
    model_swss_correlation = np.corrcoef(df.raw_score, augmented_scores)[0][1]

    print(f"{model} only score:   [{model_correlation}]")
    print(f"{model} + swss score: [{model_swss_correlation}]")

# Results

### Without synonims

In [12]:
train_params = generate_parameters(train_df, trees, include_synonims=False)
test_params = generate_parameters(test_df, trees, include_synonims=False)

#alpha1, alpha2, alpha3, alpha4, beta
parameters = [0.2, 1, 0.1, 0.01, 0.2]
evaluate(test_params, "bleu", parameters)
evaluate(test_params, "meteor", parameters)
evaluate(test_params, "chrf", parameters)

bleu only score:   [0.34224190946694766]
bleu + swss score: [0.34261679790904326]
meteor only score:   [0.5289765358914779]
meteor + swss score: [0.5312975958919443]
chrf only score:   [0.5074722657623074]
chrf + swss score: [0.507587844010924]


### With synonims

In [13]:
train_params = generate_parameters(train_df, trees, include_synonims=True)
test_params = generate_parameters(test_df, trees, include_synonims=True)

#alpha1, alpha2, alpha3, alpha4, beta
parameters = [0.2, 1, 0.1, 0.01, 0.2]
evaluate(test_params, "bleu", parameters)
evaluate(test_params, "meteor", parameters)
evaluate(test_params, "chrf", parameters)

bleu only score:   [0.34224190946694766]
bleu + swss score: [0.342642984350138]
meteor only score:   [0.5289765358914779]
meteor + swss score: [0.5332413108644422]
chrf only score:   [0.5074722657623074]
chrf + swss score: [0.5076255010016812]
