In [1]:
import os
import time
import json
import pandas as pd
# import bleu from nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score

In [2]:
def read_json(filename):
    try:
        with open(filename) as f:
            return json.load(f)
    except Exception as e:
        print(str(e))
        return None
    
def write_json(data, filename):
    with open(filename, 'w') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

def extract_string_qald(json_data, language):
    for question in json_data['question']:
        if question['language'] == language:
            return question['string']
        

def extract_question_by_id(qald_json, qald_id):
    for question in qald_json['questions']:
        if question['id'] == qald_id:
            return question
        
standard_columns = ['system', 'dataset', 'language', 'id', 'experiment_url']

In [3]:
ds_paths = {
    "qald10": "qald10.json",
    "qald9train-dbpedia": "qald-9-train-multilingual.json",
    "qald9test-dbpedia": "qald-9-test-multilingual.json",
    "qald9train-wikidata": "qald-9-train-multilingual.json",
    "qald9test-wikidata": "qald-9-test-multilingual.json"
}

datasets = ['qald10']
languages = ['de', 'ru', 'fr']

In [4]:
dir_path = "../translated_data/" # or "../translated_data/"
log_files = [file for file in os.listdir(dir_path) if ".log" in file] 

In [5]:
df_list = []
for file in log_files:
    df = pd.read_csv(dir_path + file, sep='\t')
    # find which dataset is in the file
    dataset = [ds for ds in datasets if ds in file.lower()][0]
    # find which language is in the file
    # language = [lang for lang in languages if lang in file.lower()][0]
    # read the original dataset
    ds = read_json("../original_data/" + ds_paths[dataset])
    ds_tr = read_json(dir_path + file.replace(".log", ""))
    
    bleu_values = []
    meteor_values = []
    for i, row in df.iterrows():
        q_obj = extract_question_by_id(ds, row['id'])
        q_string = extract_string_qald(q_obj, "en")

        q_tr_obj = extract_question_by_id(ds_tr, row['id'])
        q_tr_string = extract_string_qald(q_tr_obj, "en")
        
        bleu_values.append(sentence_bleu([q_string.split()], q_tr_string.split()))
        meteor_values.append(meteor_score([q_string], q_tr_string))
        # print("The bleu score for questions {} is {}".format(q_string + " and " + q_tr_string, bleu_values[-1]))

    df['bleu'] = bleu_values
    df['meteor'] = meteor_values

    df["bleu_cat"] = df.bleu.apply(lambda x: 0 if x <= df.bleu.median() else 1)
    df["meteor_cat"] = df.meteor.apply(lambda x: 0 if x <= df.meteor.median() else 1)
    df["macroF1_cat"] = df.macroF1.apply(lambda x: 2 if x <= df.macroF1.median() else 3)

    df_list.append(df)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

In [6]:
# combine the categories in pairs
pairs = [(0, 2), (0, 3), (1, 2), (1, 3)]
values = [0]*len(pairs)
# compute how many respective pairs are in the data

for df in df_list:
    for i, pair in enumerate(pairs):
        values[i] += len(df[(df.bleu_cat == pair[0]) & (df.macroF1_cat == pair[1])])

In [7]:
import plotly.graph_objects as go

fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = ["BLEU < median", "BLEU > median", "F1 < median", "F1 > median"],
      color = ["blue", "green", "blue", "green"]
    ),
    link = dict(
      source = [pair[0] for pair in pairs],
      target = [pair[1] for pair in pairs],
      value = values,
      color = ["lightblue", "lightgreen", "lightblue", "lightgreen"]
  ))])

fig.update_layout(title_text="Does low BLEU leads to low QA F1?", font_size=10)
fig.show()

In [8]:
# combine the categories in pairs
pairs = [(0, 2), (0, 3), (1, 2), (1, 3)]
values = [0]*len(pairs)
# compute how many respective pairs are in the data

for df in df_list:
    for i, pair in enumerate(pairs):
        values[i] += len(df[(df.meteor_cat == pair[0]) & (df.macroF1_cat == pair[1])])

In [9]:
import plotly.graph_objects as go

fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = ["Meteor < median", "Meteor > median", "F1 < median", "F1 > median"],
      color = ["blue", "green", "blue", "green"]
    ),
    link = dict(
      source = [pair[0] for pair in pairs],
      target = [pair[1] for pair in pairs],
      value = values,
      color = ["lightblue", "lightgreen", "lightblue", "lightgreen"]
  ))])

fig.update_layout(title_text="Does low Meteor leads to low QA F1?", font_size=10)
fig.show()

In [14]:
df.macroF1.value_counts()

0    198
1    197
Name: bleu_cat, dtype: int64

## Scores distribution

In [15]:
# join list of lists inline (macroF1) across the df_list
values = []
for df in df_list:
    values += df.macroF1.tolist()
# create histogram fro macroF1
fig = go.Figure()
fig.add_trace(go.Histogram(x=values, name="F1"))

fig.update_layout(title_text="Histogram of F1 scores", font_size=10)
fig.show()