In [1]:
%matplotlib inline
import os
import glob
import json
import tarfile
from collections import Counter, defaultdict

import urllib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Download the wmt19 data from https://www.computing.dcu.ie/~ygraham/newstest2019-humaneval.tar.gz
# (linked from http://www.statmt.org/wmt19/results.html)

url = 'https://www.computing.dcu.ie/~ygraham/newstest2019-humaneval.tar.gz'
outdir = os.path.join('..', 'data', 'annotated_datasets')
if not os.path.exists(outdir):
    os.makedirs(outdir)
outfile = os.path.join(outdir, 'newstest2019-humaneval.tar.gz')
urllib.request.urlretrieve(url, outfile)

('../data/annotated_datasets/newstest2019-humaneval.tar.gz',
 <http.client.HTTPMessage at 0x7fd00fab3690>)

In [10]:
# Extract a particular .tsv file from the downloaded tar.gz file
with tarfile.open(outfile, "r:gz") as tar:
    filepath = os.path.join('newstest2019-humaneval', 'appraise-doclevel-humaneval-newstest2019', 'analysis', 'ad-good-raw-redup.csv')
    df = pd.read_csv(tar.extractfile(filepath), sep='\t')

print(df.shape)
df.head()

(194625, 12)


Unnamed: 0,HITId,WorkerId,Input.src,Input.trg,Input.item,hit,sys_id,rid,type,sid,score,time
0,,engfine420,en,fi,ad,,HUMAN,,SYSTEM,rt.com.91335_8,100,20.122
1,,engfine420,en,fi,ad,,online-X.0,,SYSTEM,cnbc.com.6790_8,97,13.802
2,,engfine420,en,fi,ad,,online-Y.0,,SYSTEM,newsweek.51331_20,65,6.619
3,,engfine420,en,fi,ad,,online-Y.0,,SYSTEM,newsweek.51331_8,55,15.882
4,,engfine420,en,fi,ad,,online-Y.0,,SYSTEM,newsweek.51331_6,53,9.849


In [11]:
# Target language
print(set(df['Input.trg'].values))
target_counter = Counter(df['Input.trg'])
print(target_counter)

{'kk', 'ru', 'zh', 'gu', 'fi', 'de', 'lt', 'cs'}
Counter({'de': 49535, 'cs': 29207, 'zh': 28801, 'ru': 24441, 'fi': 22310, 'kk': 15039, 'lt': 14069, 'gu': 11223})


In [12]:
# Let's just use the largest subset for now (en->de)
subset = df[df['Input.trg'] == 'de']
print(subset.shape)

(49535, 12)


In [13]:
# Source language
print(set(df['Input.src'].values))

{'en'}


In [14]:
# Annotators
len(set(subset['WorkerId']))

286

In [15]:
# Translation systems (?)
len(set(subset['sys_id']))
sys_counter = Counter(subset['sys_id'])
print(sys_counter)

Counter({'HUMAN': 2440, 'UdS-DFKI.6871': 2288, 'MLLP-UPV.6651': 2274, 'NEU.6763': 2231, 'Microsoft-WMT19-sentence_document.6974': 2223, 'UCAM.6731': 2204, 'lmu-ctx-tf-single-en-de.6981': 2189, 'online-B.0': 2185, 'eTranslation.6823': 2174, 'dfki-nmt.6479': 2173, 'JHU.6819': 2168, 'TartuNLP-c.6508': 2154, 'online-G.0': 2151, 'online-A.0': 2135, 'online-X.0': 2119, 'Microsoft-WMT19-sentence-level.6785': 2115, 'online-Y.0': 2101, 'Facebook_FAIR.6862': 2091, 'PROMT_NMT_EN-DE.6674': 2071, 'Microsoft-WMT19-document-level.6808': 2064, 'en_de_task.6790': 2053, 'Helsinki-NLP.6820': 2029, 'MSRA.MADL.6926': 1903})


In [16]:
# Sentence ID (?)
print(len(set(df['sid'])))
print(len(set(subset['sid'])))

1997
1997


In [17]:
# not meaningful
print(set(df['type']))

{'SYSTEM'}


In [18]:
# Choose two models and assign numeric IDs
model_map = {'HUMAN': -0.5, 'UdS-DFKI.6871': 0.5}
models = subset['sys_id'].values
sel = [index for i, index in enumerate(subset.index) if models[i] in model_map]

In [19]:
# Take a subset
subset_comp = subset.loc[sel]
print(subset_comp.shape)
subset_comp.head()

(4728, 12)


Unnamed: 0,HITId,WorkerId,Input.src,Input.trg,Input.item,hit,sys_id,rid,type,sid,score,time
104,,engdeue428,en,de,ad,,UdS-DFKI.6871,,SYSTEM,dailymail.co.uk.298696_16,81,5.865
341,,engdeue42f,en,de,ad,,HUMAN,,SYSTEM,rt.com.91334_13,85,31.888
342,,engdeue42f,en,de,ad,,HUMAN,,SYSTEM,rt.com.91334_5,92,43.522
346,,engdeue42f,en,de,ad,,HUMAN,,SYSTEM,upi.176251_12,93,10.688
347,,engdeue42f,en,de,ad,,HUMAN,,SYSTEM,upi.176251_10,76,20.09


In [20]:
# Squeeze scores into [0, 1]
print(set(subset_comp['sys_id'].values))
subset_comp['model_code'] = [model_map[model] for model in subset_comp['sys_id'].values]
subset_comp['score_scaled'] = [s/100. for s in subset_comp['score'].values]

{'UdS-DFKI.6871', 'HUMAN'}


In [22]:
subset_comp.to_csv(os.path.join(outdir, 'wmt19.csv'))