In [None]:
import os
import io
import sys
import pandas as pd
import numpy as np
import scipy
from scipy.stats import spearmanr

## Функции

Источник: https://github.com/akutuzov/rushifteval_public/blob/main/evaluation/evaluate.py

Функции отредактированы с учетом наших задач.

In [None]:
def _parse_score_line(line, n_answers):
    """
    Split one tab-separated line into its lemma and its first `n_answers` scores.

    :param line: raw line read from an answer file
    :param n_answers: how many score columns to read after the lemma
    :return: (lemma, list of float scores), or None when the line is blank
    """
    stripped = line.strip()
    if not stripped:
        # Blank lines (typically a trailing newline at the end of the file) carry no data.
        return None
    fields = stripped.split("\t")
    return fields[0], [float(fields[1 + i]) for i in range(n_answers)]


def get_ys(model_answers, true_answers, n_answers=3):
    """
    Read a prediction file and a gold file and align their scores by lemma.

    The returned lists follow the row order of the gold file, one row per
    gold line; when a lemma occurs several times in the prediction file,
    its last occurrence is used.

    :param model_answers: path to tab-separated answer file (lemma + "\t" + tab-separated scores)
    :param true_answers: path to tab-separated gold answer file (lemma + "\t" + tab-separated scores)
    :param n_answers: how many scores (time period pairs) for each word?
    :return: a list of the model scores, and one for the true scores
    :raises SystemExit: if a gold lemma is missing from the prediction file
    :raises IndexError: if a non-blank line has fewer than `n_answers` scores
    :raises ValueError: if a score cannot be parsed as a float
    """
    y_hat_tmp = {}
    errors = 0
    # The files contain Cyrillic lemmas, so decode explicitly instead of
    # relying on the platform's default encoding.
    with open(model_answers, "r", encoding="utf-8") as f_in:
        for line in f_in:
            parsed = _parse_score_line(line, n_answers)
            if parsed is None:
                continue
            lemma, scores = parsed
            # NaN is the only float that differs from itself; checking the parsed
            # value catches every spelling float() accepts ("nan", "NaN", ...).
            errors += sum(1 for value in scores if value != value)
            y_hat_tmp[lemma] = scores
    if errors:
        print("Found %d NaN predictions" % errors, file=sys.stderr)

    y_hat, y = [], []
    with open(true_answers, "r", encoding="utf-8") as f_in:
        for line in f_in:
            parsed = _parse_score_line(line, n_answers)
            if parsed is None:
                continue
            lemma, scores = parsed
            try:
                predicted_answer = y_hat_tmp[lemma]
            except KeyError:
                raise SystemExit("Error: the word %s not found in the submission!" % lemma)
            assert len(predicted_answer) == len(scores)
            y.append(scores)
            y_hat.append(predicted_answer)

    return y_hat, y

In [None]:
def evaluation(model_answers, true_answers):
    """
    Correlate predictions with gold annotations, one score column at a time.

    Column i of every row in `model_answers` is compared with column i of
    every row in `true_answers` using Spearman's rank correlation;
    spearmanr is called with nan_policy="omit".

    :param model_answers: list of per-word score lists produced by the model
    :param true_answers: list of per-word score lists annotated by humans
    :return: list with one (Spearman's rho, p-value) tuple per score column
    """
    assert len(model_answers[0]) == len(true_answers[0])

    def _column_correlation(idx):
        # Pull out the idx-th score of every word from both sides.
        predicted = [row[idx] for row in model_answers]
        annotated = [row[idx] for row in true_answers]
        rho, p_value = spearmanr(predicted, annotated, nan_policy="omit")
        return rho, p_value

    return [_column_correlation(idx) for idx in range(len(true_answers[0]))]

In [None]:
def save_scores(pred_file, truth_file, output_file, n_answers=3):
    """
    Evaluate a model and save the scores in a .txt file.

    Prints one Spearman rho / p-value line per score column plus the mean
    rho, and writes the rho values and their mean to `output_file`.

    :param pred_file: path to the tab-separated prediction file
    :param truth_file: path to the tab-separated gold file
    :param output_file: path of the text report to write (UTF-8)
    :param n_answers: how many score columns each file holds per word
    """
    predictions, gold = get_ys(pred_file, truth_file, n_answers=n_answers)
    res = evaluation(predictions, gold)

    # Average over however many columns were evaluated instead of assuming three.
    rhos = [rho for rho, _ in res]
    ave_score = sum(rhos) / len(rhos)

    for idx, (rho, p_value) in enumerate(res):
        print("Spearman rho score {}: {:.3f}  p: {:.3f}".format(idx, rho, p_value))
    print("Average score: {:.3f}".format(ave_score))

    with open(output_file, 'w', encoding="utf-8") as f:
        for idx, rho in enumerate(rhos):
            f.write("spearman{}: {:.3f}\n".format(idx, rho))
        f.write("ave_score: {:.3f}\n".format(ave_score))

## Скачиваем предсказания

In [None]:
!mkdir /content/predictions
!mkdir /content/targets
!mkdir /content/scores

mkdir: cannot create directory ‘/content/predictions’: File exists
mkdir: cannot create directory ‘/content/targets’: File exists
mkdir: cannot create directory ‘/content/scores’: File exists


In [None]:
# Download each model's predictions from the project repository.
# The remote files are named prediction_<model>.tsv; they are saved locally as predictions_<model>.tsv.
!wget -O /content/predictions/predictions_w2v.tsv https://raw.githubusercontent.com/eanor/nlp_project_2024/main/predictions/prediction_w2v.tsv
!wget -O /content/predictions/predictions_bert.tsv https://raw.githubusercontent.com/eanor/nlp_project_2024/main/predictions/prediction_bert.tsv
!wget -O /content/predictions/predictions_bert_tiny.tsv https://raw.githubusercontent.com/eanor/nlp_project_2024/main/predictions/prediction_bert_tiny.tsv
!wget -O /content/predictions/predictions_chat.tsv https://raw.githubusercontent.com/eanor/nlp_project_2024/main/predictions/prediction_chat.tsv

--2024-03-24 18:48:55--  https://raw.githubusercontent.com/eanor/nlp_project_2024/main/predictions/prediction_w2v.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8125 (7.9K) [text/plain]
Saving to: ‘/content/predictions/predictions_w2v.tsv’


2024-03-24 18:48:55 (60.1 MB/s) - ‘/content/predictions/predictions_w2v.tsv’ saved [8125/8125]

--2024-03-24 18:48:55--  https://raw.githubusercontent.com/eanor/nlp_project_2024/main/predictions/prediction_bert.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7318 (7.1K) [text/plain]
S

In [None]:
# Download the gold annotations: targets.tsv is annotated_testset.tsv from rushifteval_public,
# chat_targets.tsv comes from the project repository.
!wget -O /content/targets/targets.tsv https://raw.githubusercontent.com/akutuzov/rushifteval_public/main/annotated_testset.tsv
!wget -O /content/targets/chat_targets.tsv https://raw.githubusercontent.com/eanor/nlp_project_2024/main/models/chat_targets.tsv

--2024-03-24 18:48:55--  https://raw.githubusercontent.com/akutuzov/rushifteval_public/main/annotated_testset.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6509 (6.4K) [text/plain]
Saving to: ‘/content/targets/targets.tsv’


2024-03-24 18:48:55 (39.7 MB/s) - ‘/content/targets/targets.tsv’ saved [6509/6509]

--2024-03-24 18:48:55--  https://raw.githubusercontent.com/eanor/nlp_project_2024/main/models/chat_targets.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1294 (1.3K) [text/plain]
Saving to: ‘/content/targets/chat_tar

In [None]:
# Working directories for predictions, gold targets and score reports.
# Keep the trailing "/": file paths are built from these by plain string concatenation.
preds_dir = "/content/predictions/"
targets_dir = "/content/targets/"
scores_dir = "/content/scores/"

## Ансамбль

In [None]:
def ensemble(m1_preds_fp, m2_preds_fp, output_fp, coef=0.5):
    """
    Blend two prediction files into one as a weighted average of their scores.

    Every output score is ``coef * m1_score + (1 - coef) * m2_score``. The
    result is written to `output_fp` as a header-less tab-separated file
    (word + three scores), with rows in the order of `m2_preds_fp`.

    :param m1_preds_fp: path to the first model's tab-separated predictions
    :param m2_preds_fp: path to the second model's tab-separated predictions
    :param output_fp: path of the blended prediction file to write
    :param coef: weight of the first model; the second model gets 1 - coef
    :raises SystemExit: (from get_ys) if a word of the second file is missing from the first
    """
    # get_ys aligns both score lists to the row order of its SECOND file.
    m1_preds, m2_preds = get_ys(m1_preds_fp, m2_preds_fp)
    m1_preds, m2_preds = np.array(m1_preds), np.array(m2_preds)

    # The words must therefore be taken from the second file as well; reading
    # them from the first file pairs words with the wrong scores whenever the
    # two files differ in row order or content.
    with open(m2_preds_fp, "r", encoding="utf-8") as f_in:
        target_words = [line.strip().split("\t")[0] for line in f_in if line.strip()]

    new_preds = (m1_preds * coef) + (m2_preds * (1 - coef))
    assert len(target_words) == len(new_preds)

    new_preds_list = [
        {
            "word": word,
            "score1": scores[0],
            "score2": scores[1],
            "score3": scores[2],
        }
        for word, scores in zip(target_words, new_preds)
    ]

    # Write NaN as the literal "nan" (pandas' default is an empty field) so the
    # file can be read back by get_ys, which parses every field with float().
    pd.DataFrame(new_preds_list).to_csv(output_fp, sep="\t", index=False, header=False, na_rep="nan")

In [None]:
# Prediction files to blend, and the file the blended predictions are written to.
w2v_fp = f"{preds_dir}predictions_w2v.tsv"
bert_fp = f"{preds_dir}predictions_bert.tsv"
ensemble_fp = f"{preds_dir}predictions_ensemble.tsv"

# Weight given to the w2v scores in the blend; the BERT scores get 1 - coef.
coef = 0.9996

In [None]:
# Blend the w2v and BERT predictions and write the result to ensemble_fp.
ensemble(w2v_fp, bert_fp, ensemble_fp, coef=coef)

## Оценка

In [None]:
# Models to evaluate; each name selects predictions_<name>.tsv in preds_dir.
model_names = ["w2v", "bert", "bert_tiny", "ensemble", "chat"]

In [None]:
# Score every model against its gold file and store one report per model.
for model_name in model_names:

    print(model_name + " scores")

    # "chat" and "elmo" are scored against their own gold file; every other model uses targets.tsv.
    if model_name in ("chat", "elmo"):
        targets_fp = targets_dir + "chat_targets.tsv"
    else:
        targets_fp = targets_dir + "targets.tsv"

    preds_fp = "".join([preds_dir, "predictions_", model_name, ".tsv"])
    scores_fp = "".join([scores_dir, "scores_", model_name, ".txt"])

    save_scores(preds_fp, targets_fp, scores_fp)

    print("\n")

w2v scores
Spearman rho score 0: 0.230  p: 0.022
Spearman rho score 1: 0.343  p: 0.001
Spearman rho score 2: 0.236  p: 0.019
Average score: 0.269


bert scores
Spearman rho score 0: 0.208  p: 0.039
Spearman rho score 1: 0.250  p: 0.013
Spearman rho score 2: 0.020  p: 0.841
Average score: 0.159


bert_tiny scores
Spearman rho score 0: -0.226  p: 0.024
Spearman rho score 1: 0.104  p: 0.307
Spearman rho score 2: -0.161  p: 0.112
Average score: -0.094


ensemble scores
Spearman rho score 0: 0.260  p: 0.009
Spearman rho score 1: 0.369  p: 0.000
Spearman rho score 2: 0.239  p: 0.017
Average score: 0.290


chat scores
Spearman rho score 0: -0.166  p: 0.484
Spearman rho score 1: -0.104  p: 0.662
Spearman rho score 2: 0.189  p: 0.425
Average score: -0.027


