In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from plotly import graph_objs as go
from plotly.offline import iplot

from tseval.feature_extraction import get_all_vectorizers
from tseval.qats import (ASPECTS, get_qats_train_data, get_qats_test_data, row_vectorize, evaluate_scoring_method_on_qats,
                         evaluate_regression_pipeline_on_qats, evaluate_classification_pipeline_on_qats, get_qats_results,
                         pearsonr_with_confidence_interval)

from tqdm import tqdm_notebook as tqdm  # Need this import to be last

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Label distribution

In [3]:
# Count label occurrences per aspect, separately for the train and test splits.
# Element [1] of each loader's return value is what gets counted
# (presumably the label column — confirm against tseval.qats).
counters = defaultdict(dict)
for aspect in ASPECTS:
    for split, load_split in (('train', get_qats_train_data), ('test', get_qats_test_data)):
        counters[aspect][split] = Counter(load_split(aspect)[1])

# One stacked bar segment per label value; colours and legend names are keyed by label.
labels = [0, 50, 100]
label_colors = {100: 'green', 50: 'orange', 0: 'red'}
label_names = {100: 'Good', 50: 'OK', 0: 'Bad'}

data = []
for label in labels:
    # Adding two Counters sums their counts, i.e. train + test occurrences.
    totals = [(counters[aspect]['train'] + counters[aspect]['test'])[label]
              for aspect in ASPECTS]
    bar = go.Bar(x=ASPECTS,
                 y=totals,
                 marker={'color': label_colors[label]},
                 name=label_names[label])
    data.append(bar)

layout = go.Layout(barmode='stack', autosize=False, width=460, height=410)
iplot(go.Figure(data=data, layout=layout))

In [4]:
# All feature functions under study; each one is called as
# vec(complex_sentence, simple_sentence).
vectorizers = get_all_vectorizers()


def vectorize_sentence_pair(complex_sentence, simple_sentence):
    """Return one feature value per vectorizer for a sentence pair.

    The values are listed in the same order as the module-level `vectorizers`.
    """
    features = []
    for vectorizer in vectorizers:
        features.append(vectorizer(complex_sentence, simple_sentence))
    return features


# Wrapped version of the pair vectorizer, used as the first pipeline step below.
feature_extractor = row_vectorize(vectorize_sentence_pair)
# Feature names, index-aligned with the output of vectorize_sentence_pair.
vectorizer_names = np.array([vec.__name__ for vec in vectorizers])

# Single features

In [5]:
%%time
# Score every single feature on every aspect and rank the features by the
# absolute value of their validation correlation.
single_feature_dfs = {}
metric_name = 'pearson'
for aspect in ['grammaticality', 'meaning_preservation', 'simplicity', 'overall']:
    df = pd.DataFrame(columns=['team', f'valid_{metric_name}', f'{metric_name}'])
    for vectorizer in tqdm(vectorizers):
        # Each call contributes one result row for this (aspect, feature) pair.
        df = df.append(evaluate_scoring_method_on_qats(aspect, vectorizer), ignore_index=True)
    # Derive the ranking column and sort once per aspect. These two statements
    # used to sit inside the inner loop, where they were recomputed after every
    # single feature without changing the final result.
    df[f'valid_{metric_name}_abs'] = df[f'valid_{metric_name}'].abs()
    single_feature_dfs[aspect] = df.sort_values(by=f'valid_{metric_name}_abs', ascending=False)

HBox(children=(IntProgress(value=0, max=66), HTML(value='')))

Loading fairseq language model...
| dictionary: 267744 types
Done.
Loading FastText embeddings...
Done.
Loading NLGEval models...
Done.
Computing TERp features on all QATS sentence pairs.
Done.
Computing QuEst features on all QATS sentence pairs.
Done.



HBox(children=(IntProgress(value=0, max=66), HTML(value='')))




HBox(children=(IntProgress(value=0, max=66), HTML(value='')))




HBox(children=(IntProgress(value=0, max=66), HTML(value='')))


CPU times: user 11min 31s, sys: 47 s, total: 12min 18s
Wall time: 1min 30s


In [6]:
# Single-feature ranking for grammaticality, ordered by |validation Pearson| (largest first).
print('grammaticality')
single_feature_dfs['grammaticality']

grammaticality


Unnamed: 0,team,valid_pearson,pearson,valid_conf_int_high,valid_conf_int_low,valid_p,valid_pearson_abs
27,nlgeval_METEOR,0.357566,0.388357,0.431363,0.279016,1.122385e-16,0.357566
32,nltkBLEU_method2,0.329751,0.343791,0.405344,0.249680,2.835198e-14,0.329751
34,nltkBLEU_method4,0.328759,0.337544,0.404413,0.248636,3.419055e-14,0.328759
37,nltkBLEU_method7,0.328638,0.344558,0.404300,0.248509,3.497754e-14,0.328638
33,nltkBLEU_method3,0.324548,0.340045,0.400462,0.244208,7.511127e-14,0.324548
35,nltkBLEU_method5,0.324099,0.346104,0.400041,0.243736,8.162851e-14,0.324099
36,nltkBLEU_method6,0.322887,0.340999,0.398903,0.242463,1.021055e-13,0.322887
31,nltkBLEU_method1,0.321963,0.339534,0.398036,0.241493,1.210143e-13,0.321963
26,nlgeval_Bleu_4,0.317196,0.338076,0.393559,0.236486,2.882516e-13,0.317196
30,nltkBLEU_method0,0.317195,0.338076,0.393557,0.236485,2.883162e-13,0.317195


In [7]:
# Single-feature ranking for meaning preservation, ordered by |validation Pearson| (largest first).
print('meaning_preservation')
single_feature_dfs['meaning_preservation']

meaning_preservation


Unnamed: 0,team,valid_pearson,pearson,valid_conf_int_high,valid_conf_int_low,valid_p,valid_pearson_abs
37,nltkBLEU_method7,0.588836,0.522980,0.643052,0.528748,1.885627e-48,0.588836
32,nltkBLEU_method2,0.587551,0.520681,0.641898,0.527331,3.379901e-48,0.587551
34,nltkBLEU_method4,0.586188,0.516174,0.640674,0.525828,6.257325e-48,0.586188
35,nltkBLEU_method5,0.583989,0.518289,0.638699,0.523405,1.680190e-47,0.583989
33,nltkBLEU_method3,0.581627,0.511795,0.636576,0.520802,4.815431e-47,0.581627
36,nltkBLEU_method6,0.579019,0.512754,0.634231,0.517931,1.524150e-46,0.579019
31,nltkBLEU_method1,0.578284,0.510670,0.633570,0.517121,2.105299e-46,0.578284
25,nlgeval_Bleu_3,0.574913,0.519681,0.630537,0.513412,9.159703e-46,0.574913
27,nlgeval_METEOR,0.574645,0.582689,0.630296,0.513118,1.028769e-45,0.574645
24,nlgeval_Bleu_2,0.572326,0.522042,0.628210,0.510568,2.797788e-45,0.572326


In [8]:
# Single-feature ranking for simplicity, ordered by |validation Pearson| (largest first).
print('simplicity')
single_feature_dfs['simplicity']

simplicity


Unnamed: 0,team,valid_pearson,pearson,valid_conf_int_high,valid_conf_int_low,valid_p,valid_pearson_abs
5,count_characters_per_sentence,-0.524111,-0.454583,-0.457792,-0.584630,5.631013e-37,0.524111
6,count_syllables_per_sentence,-0.518966,-0.485575,-0.452187,-0.579959,3.631159e-36,0.518966
4,count_words_per_sentence,-0.511017,-0.385900,-0.443539,-0.572735,6.077295e-35,0.511017
1,count_characters,-0.477266,-0.371695,-0.406958,-0.541952,4.322839e-30,0.477266
50,QuEst_nb_target_tokens,-0.471573,-0.287381,-0.400810,-0.536742,2.525412e-29,0.471573
0,count_words,-0.469456,-0.288663,-0.398526,-0.534804,4.826552e-29,0.469456
3,count_syllables_in_sentence,-0.459571,-0.416363,-0.387870,-0.525744,9.371226e-28,0.459571
65,QuEst_nb_target_punct,-0.416411,-0.311810,-0.341567,-0.486008,1.340936e-22,0.416411
47,TERp_NumWd,-0.381278,-0.215241,-0.304141,-0.453448,6.406164e-19,0.381278
49,QuEst_nb_source_tokens,-0.376916,-0.211450,-0.299512,-0.449392,1.711243e-18,0.376916


In [9]:
# Single-feature ranking for the overall score, ordered by |validation Pearson| (largest first).
print('overall')
single_feature_dfs['overall']

overall


Unnamed: 0,team,valid_pearson,pearson,valid_conf_int_high,valid_conf_int_low,valid_p,valid_pearson_abs
49,QuEst_nb_source_tokens,-0.433590,-0.215692,-0.359953,-0.501858,1.457161e-24,0.433590
47,TERp_NumWd,-0.432297,-0.219897,-0.358567,-0.500667,2.066743e-24,0.432297
1,count_characters,-0.306712,-0.077350,-0.225491,-0.383698,1.842269e-12,0.306712
3,count_syllables_in_sentence,-0.306259,-0.130305,-0.225017,-0.383272,1.992580e-12,0.306259
50,QuEst_nb_target_tokens,-0.294076,-0.019842,-0.212268,-0.371791,1.564298e-11,0.294076
0,count_words,-0.289823,-0.017346,-0.207823,-0.367777,3.140102e-11,0.289823
27,nlgeval_METEOR,0.281753,0.148445,0.360154,0.199401,1.140887e-10,0.281753
14,sentence_fkgl,-0.279873,-0.244636,-0.197440,-0.358377,1.531859e-10,0.279873
6,count_syllables_per_sentence,-0.274673,-0.178439,-0.192020,-0.353457,3.421647e-10,0.274673
46,TERp_NumEr,-0.273803,-0.144755,-0.191113,-0.352633,3.907637e-10,0.273803


# Bag of features

In [10]:
from sklearn import tree, ensemble, linear_model, neighbors, svm, neural_network
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn import decomposition


def get_sklearn_classifiers(probability=False):
    """Return fresh, unfitted instances of the classifiers compared in this notebook.

    Args:
        probability: Forwarded to ``svm.SVC(probability=...)``. The argument was
            previously accepted but ignored. The default (False) matches SVC's
            own default, so existing calls behave exactly as before.

    Returns:
        A list of scikit-learn classifier instances. The tree ensembles are
        seeded with ``random_state=42``; the remaining models use whatever
        randomness their defaults imply.
    """
    return [
        tree.DecisionTreeClassifier(max_depth=5),
        ensemble.RandomForestClassifier(random_state=42),
        ensemble.AdaBoostClassifier(random_state=42),
        ensemble.GradientBoostingClassifier(random_state=42),
        linear_model.LogisticRegression(),
        neighbors.KNeighborsClassifier(3),
        svm.SVC(kernel="linear", C=0.025, probability=probability),
        neural_network.MLPClassifier(alpha=1),
    ]


def get_sklearn_regressors():
    """Return fresh, unfitted instances of the regressors compared in this notebook.

    Every estimator is built with its library defaults. The list order is the
    order in which callers iterate over the models.
    """
    regressor_classes = (
        linear_model.LinearRegression,
        linear_model.Lasso,
        ensemble.AdaBoostRegressor,
        ensemble.GradientBoostingRegressor,
        ensemble.RandomForestRegressor,
        linear_model.Ridge,
        svm.LinearSVR,
        ensemble.BaggingRegressor,
    )
    return [regressor_class() for regressor_class in regressor_classes]


def get_regression_df_and_pipelines(aspects):
    """Evaluate every regressor from get_sklearn_regressors() on one QATS aspect.

    Args:
        aspects: Name of the single aspect to evaluate (e.g. 'grammaticality').
            The plural parameter name is kept for backward compatibility.

    Returns:
        A ``(df, pipelines)`` tuple. ``df`` holds one result row per model,
        sorted by validation Pearson (best first). ``pipelines`` lists the
        pipelines in evaluation order; the row labels of ``df`` are the
        positions of the corresponding pipelines in that list.
    """
    # The body used to read a module-level `aspect` and ignore its argument, so
    # it only worked when the caller's loop variable happened to be named
    # `aspect`. Bind the argument explicitly instead.
    aspect = aspects
    metric_name = 'pearson'
    df = pd.DataFrame(columns=['team', f'valid_{metric_name}', f'{metric_name}'])
    pipelines = []
    for model in tqdm(get_sklearn_regressors()):
        method_name = model.__class__.__name__
        print(method_name)
        pipeline = Pipeline(steps=[
            ('feature_extraction', FunctionTransformer(feature_extractor)),
            ('feature_scaling', StandardScaler()),
            #('feature_skewing', FeatureSkewer()),
            #('feature_selection', SelectFromModel(LinearRegression())),
            ('dimensionality_reduction', decomposition.PCA(n_components=25)),
            ('prediction', model),
        ])
        pipelines.append(pipeline)
        # ignore_index=True keeps row labels equal to positions in `pipelines`.
        df = df.append(evaluate_regression_pipeline_on_qats(aspect, pipeline, method_name), ignore_index=True)
    df = df.sort_values(by=f'valid_{metric_name}', ascending=False)
    return df, pipelines


def get_classification_df_and_pipelines(aspect):
    """Evaluate every classifier from get_sklearn_classifiers() on one QATS aspect.

    Args:
        aspect: Name of the aspect to evaluate.

    Returns:
        A ``(df, pipelines)`` tuple. ``df`` holds one result row per model,
        sorted by validation weighted F-score (best first). ``pipelines`` lists
        the pipelines in evaluation order; the row labels of ``df`` are the
        positions of the corresponding pipelines in that list.
    """
    metric_name = 'weighted_f_score'
    valid_column = f'valid_{metric_name}'
    df = pd.DataFrame(columns=['team', valid_column, f'{metric_name}'])
    pipelines = []
    for classifier in tqdm(get_sklearn_classifiers()):
        method_name = type(classifier).__name__
        print(method_name)
        # Extract features -> standardise -> reduce to 25 components -> predict.
        # (Feature skewing / feature selection steps are currently disabled.)
        steps = [
            ('feature_extraction', FunctionTransformer(feature_extractor)),
            ('feature_scaling', StandardScaler()),
            ('dimensionality_reduction', decomposition.PCA(n_components=25)),
            ('prediction', classifier),
        ]
        pipeline = Pipeline(steps=steps)
        pipelines.append(pipeline)
        scores = evaluate_classification_pipeline_on_qats(aspect, pipeline, method_name)
        df = df.append(scores, ignore_index=True)
    return df.sort_values(by=valid_column, ascending=False), pipelines


def analyse_results(df, pipelines, metric_name):
    """Describe the best pipeline and plot test score against validation score.

    Args:
        df: Results frame with ``valid_<metric_name>`` and ``<metric_name>``
            columns whose row labels are positions in ``pipelines``.
        pipelines: Pipelines in evaluation order.
        metric_name: Metric column to analyse (e.g. 'pearson').

    Returns:
        ``df`` unchanged, so the ranking is displayed as the cell output.
    """
    # The label of the best validation row doubles as an index into `pipelines`.
    argmax = int(df[f'valid_{metric_name}'].idxmax())
    best_steps = pipelines[argmax].named_steps
    if 'feature_selection' in best_steps:
        # The helper previously called here (`get_selected_features`) is not
        # defined or imported in this notebook. Use the selector's own boolean
        # mask instead. This assumes the selector sees one column per vectorizer,
        # i.e. it sits before any dimensionality reduction.
        selected_mask = best_steps['feature_selection'].get_support()
        print('Selected features: ', vectorizer_names[selected_mask])
    if 'dimensionality_reduction' in best_steps:
        pca = best_steps['dimensionality_reduction']
        print('PCA explained variance: ', pca.explained_variance_ratio_)

    data = [go.Scatter(x=df[f'valid_{metric_name}'].values, y=df[metric_name].values, mode='markers', marker={'size': 5})]
    layout = go.Layout(title='test = f(validation)',
                       xaxis={'title': f'valid_{metric_name}'},
                       yaxis={'title': metric_name})
    iplot(go.Figure(data=data, layout=layout))

    print('Ranking')
    return df

In [11]:
def to_latex(df, caption=None, label=None):
    r"""Render ``df`` as a LaTeX ``table*`` environment.

    Args:
        df: Frame to render; floats are formatted with two decimals and the
            index is omitted.
        caption: Optional text for ``\caption{...}``; inserted verbatim.
        label: Optional text for ``\label{...}``; inserted verbatim.

    Returns:
        The LaTeX source as one string. An omitted caption or label leaves an
        empty line in its place.
    """
    tabular = df.copy().to_latex(index=False, float_format='%.2f')
    caption_line = '' if caption is None else r'\caption{' + caption + '}'
    label_line = '' if label is None else r'\label{' + label + '}'
    parts = [r'\begin{table*}', tabular, caption_line, label_line, r'\end{table*}']
    return '\n'.join(parts)

# Show full cell contents when DataFrames are displayed (no column-width truncation).
# NOTE(review): -1 is the legacy spelling; newer pandas releases expect None here —
# confirm against the pandas version this notebook is pinned to before changing.
pd.set_option('display.max_colwidth', -1)

## Regression

In [12]:
%%time
# Evaluate all regressors once per aspect. Each value is the
# (results DataFrame, pipelines) pair returned by get_regression_df_and_pipelines.
regression_results = {}
for aspect in tqdm(['grammaticality', 'meaning_preservation', 'simplicity', 'overall']):
    regression_results[aspect] = get_regression_df_and_pipelines(aspect)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

LinearRegression
Lasso
AdaBoostRegressor
GradientBoostingRegressor
RandomForestRegressor
Ridge
LinearSVR
BaggingRegressor


HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

LinearRegression
Lasso
AdaBoostRegressor
GradientBoostingRegressor
RandomForestRegressor
Ridge
LinearSVR
BaggingRegressor


HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

LinearRegression
Lasso
AdaBoostRegressor
GradientBoostingRegressor
RandomForestRegressor
Ridge
LinearSVR
BaggingRegressor


HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

LinearRegression
Lasso
AdaBoostRegressor
GradientBoostingRegressor
RandomForestRegressor
Ridge
LinearSVR
BaggingRegressor

CPU times: user 1h 36min 44s, sys: 6min 44s, total: 1h 43min 29s
Wall time: 12min 58s


In [13]:
# Inspect the regression results for one aspect: prints PCA information for the
# pipeline with the best validation score, plots test vs. validation scores and
# displays the ranking.
aspect = 'grammaticality'
analyse_results(*regression_results[aspect], 'pearson')

PCA explained variance:  [0.37227626 0.15087662 0.09159073 0.04679639 0.041848   0.0314406
 0.02637758 0.02401313 0.02124521 0.02087028 0.0185782  0.01700259
 0.01555948 0.0135297  0.01252479 0.01095355 0.01033294 0.00980717
 0.00788858 0.00741996 0.00664606 0.00536601 0.00491738 0.00439922
 0.00373379]


Ranking


Unnamed: 0,team,valid_pearson,pearson,valid_conf_int
3,GradientBoostingRegressor,0.23478,0.136178,0.143852
1,Lasso,0.221512,0.327185,0.136472
5,Ridge,0.220793,0.338792,0.123523
0,LinearRegression,0.220645,0.338691,0.123364
7,BaggingRegressor,0.196564,0.321468,0.114759
2,AdaBoostRegressor,0.159074,0.303485,0.159531
4,RandomForestRegressor,0.153315,0.21198,0.147935
6,LinearSVR,0.063624,0.064631,0.164817


In [14]:
# Our methods in the QATS leaderboard (regression, Pearson correlation).
# For each aspect, the two models with the best validation score are merged into
# the published leaderboard and ranked by test score.
metric = 'pearson'
result_dfs = {aspect: df for aspect, (df, _) in regression_results.items()}
dfs = []
for aspect in ['grammaticality', 'meaning_preservation', 'simplicity', 'overall']:
    df_leaderboard = get_qats_results(aspect=aspect)[['team', metric]].dropna()
    # result_dfs[aspect] is sorted by validation score, so head(2) picks our two
    # best methods. Copy the slice so the column assignment below works on an
    # independent frame rather than triggering pandas' chained-assignment warning.
    df_ours = result_dfs[aspect][['team', metric]].head(2).copy()
    df_ours['team'] = df_ours['team'].apply(lambda team: f'**{team}**')  # Make our methods stand out
    df_leaderboard = df_leaderboard.append(df_ours, ignore_index=True)
    # drop=True: the old row labels are not needed, only the new rank order.
    df_leaderboard = df_leaderboard.sort_values(by=metric, ascending=False).reset_index(drop=True)
    df_leaderboard[aspect] = df_leaderboard[metric].map(lambda x: f'{x:.3f}   ') + df_leaderboard['team']
    dfs.append(df_leaderboard[aspect])
# One column per aspect, rows aligned by rank.
df_leaderboard = pd.concat(dfs, axis=1)
df_leaderboard
#print(to_latex(df_leaderboard))

Unnamed: 0,grammaticality,meaning_preservation,simplicity,overall
0,0.482 OSVCML1,0.588 IIT-Meteor,0.487 **Ridge**,0.423 **Ridge**
1,0.384 METEOR,0.585 OSVCML,0.457 **LinearSVR**,0.423 **LinearRegression**
2,0.344 BLEU,0.575 **Ridge**,0.382 OSVCML1,0.343 OSVCML2
3,0.340 OSVCML,0.573 OSVCML2,0.376 OSVCML2,0.334 OSVCML
4,0.327 **Lasso**,0.555 **Lasso**,0.339 OSVCML,0.232 SimpleNets-RNN2
5,0.323 TER,0.533 BLEU,0.320 SimpleNets-MLP,0.230 OSVCML1
6,0.308 SimpleNets-MLP,0.527 METEOR,0.307 SimpleNets-RNN3,0.205 UoLGP-emb
7,0.308 WER,0.513 TER,0.240 SimpleNets-RNN2,0.198 SimpleNets-MLP
8,0.256 UoLGP-emb,0.495 WER,0.123 UoLGP-combo,0.196 METEOR
9,0.256 UoLGP-combo,0.482 OSVCML1,0.120 UoLGP-emb,0.189 UoLGP-combo


In [15]:
# Confidence interval of the Lasso pipeline's Pearson correlation on the
# grammaticality test set.
df, pipelines = regression_results['grammaticality']
lasso_matches = [
    candidate for candidate in pipelines
    if type(candidate.named_steps['prediction']).__name__ == 'Lasso'
]
[lasso_pipeline] = lasso_matches  # unpacking fails unless exactly one pipeline wraps a Lasso model
test_sentences, test_labels = get_qats_test_data('grammaticality')
pred_labels = lasso_pipeline.predict(test_sentences)
score, p_value, conf_int_low, conf_int_high = pearsonr_with_confidence_interval(test_labels, pred_labels)
# Report the larger of the two distances to the interval bounds as the "+-" value.
lower_gap = score - conf_int_low
upper_gap = conf_int_high - score
uncertainty = np.max([lower_gap, upper_gap])
print('Confidence interval of the Lasso model on the test set')
print(f'Score ~ {score:.2f} +- {uncertainty:.2f}')
print(f'Confidence interval: {conf_int_low:.2f} < {score:.2f} < {conf_int_high:.2f}')

Confidence interval of the Lasso model on the test set
Score ~ 0.33 +- 0.17
Confidence interval: 0.16 < 0.33 < 0.47


## Classification

In [16]:
%%time
# Evaluate all classifiers once per aspect. Each value is the
# (results DataFrame, pipelines) pair returned by get_classification_df_and_pipelines.
classification_results = {}
for aspect in tqdm(['grammaticality', 'meaning_preservation', 'simplicity', 'overall']):
    classification_results[aspect] = get_classification_df_and_pipelines(aspect)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

DecisionTreeClassifier
RandomForestClassifier
AdaBoostClassifier
GradientBoostingClassifier
LogisticRegression
KNeighborsClassifier
SVC
MLPClassifier


HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

DecisionTreeClassifier
RandomForestClassifier
AdaBoostClassifier
GradientBoostingClassifier
LogisticRegression
KNeighborsClassifier
SVC
MLPClassifier


HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

DecisionTreeClassifier
RandomForestClassifier
AdaBoostClassifier
GradientBoostingClassifier
LogisticRegression
KNeighborsClassifier
SVC
MLPClassifier


HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

DecisionTreeClassifier
RandomForestClassifier
AdaBoostClassifier
GradientBoostingClassifier
LogisticRegression
KNeighborsClassifier
SVC
MLPClassifier

CPU times: user 1h 54min 7s, sys: 7min 45s, total: 2h 1min 52s
Wall time: 13min 40s


In [17]:
# Inspect the classification results for one aspect: prints PCA information for
# the pipeline with the best validation score, plots test vs. validation scores
# and displays the ranking.
aspect = 'grammaticality'
analyse_results(*classification_results[aspect], 'weighted_f_score')

PCA explained variance:  [0.37227626 0.15087662 0.09159073 0.04679639 0.041848   0.0314406
 0.02637758 0.02401313 0.02124521 0.02087028 0.0185782  0.01700259
 0.01555948 0.0135297  0.01252479 0.01095355 0.01033294 0.00980717
 0.00788858 0.00741996 0.00664606 0.00536601 0.00491738 0.00439922
 0.00373379]


Ranking


Unnamed: 0,team,valid_weighted_f_score,weighted_f_score,valid_conf_int
4,LogisticRegression,0.672138,0.704345,0.044422
3,GradientBoostingClassifier,0.67065,0.67598,0.031226
7,MLPClassifier,0.649749,0.716422,0.054547
1,RandomForestClassifier,0.64664,0.684533,0.041006
6,SVC,0.642613,0.658945,0.019637
0,DecisionTreeClassifier,0.63651,0.642173,0.056255
5,KNeighborsClassifier,0.627538,0.691141,0.075061
2,AdaBoostClassifier,0.620503,0.627691,0.04761


In [18]:
# Our methods in the QATS leaderboard (classification, weighted F-score).
# For each aspect, the two models with the best validation score are merged into
# the published leaderboard and ranked by test score.
metric = 'weighted_f_score'
result_dfs = {aspect: df for aspect, (df, _) in classification_results.items()}
dfs = []
for aspect in ['grammaticality', 'meaning_preservation', 'simplicity', 'overall']:
    df_leaderboard = get_qats_results(aspect=aspect)[['team', metric]].dropna()
    # result_dfs[aspect] is sorted by validation score, so head(2) picks our two
    # best methods. Copy the slice so the column assignments below work on an
    # independent frame rather than triggering pandas' chained-assignment warning.
    df_ours = result_dfs[aspect][['team', metric]].head(2).copy()
    # Our scores are fractions; scale by 100 to match the leaderboard values.
    df_ours[metric] = df_ours[metric] * 100
    df_ours['team'] = df_ours['team'].apply(lambda team: f'**{team}**')  # Make our methods stand out
    df_leaderboard = df_leaderboard.append(df_ours, ignore_index=True)
    # drop=True: the old row labels are not needed, only the new rank order.
    df_leaderboard = df_leaderboard.sort_values(by=metric, ascending=False).reset_index(drop=True)
    df_leaderboard[aspect] = df_leaderboard[metric].map(lambda x: f'{x:.2f}   ') + df_leaderboard['team']
    dfs.append(df_leaderboard[aspect])
# One column per aspect, rows aligned by rank.
df_leaderboard = pd.concat(dfs, axis=1)
df_leaderboard
#print(to_latex(df_leaderboard))

Unnamed: 0,grammaticality,meaning_preservation,simplicity,overall
0,71.84 SMH-RandForest,70.14 **SVC**,61.60 **SVC**,49.61 **LogisticRegression**
1,71.64 SMH-IBk,68.07 SMH-Logistic,56.95 **AdaBoostClassifier**,48.57 SMH-RandForest-b
2,70.43 **LogisticRegression**,65.60 MS-RandForest,56.42 SMH-RandForest-b,48.20 UoW
3,69.96 SMH-RandForest-b,64.40 SMH-RandForest,53.02 SMH-RandForest,47.54 SMH-Logistic
4,69.09 BLEU,63.74 TER,51.12 SMH-IBk,46.06 SimpleNets-RNN2
5,68.82 SimpleNets-MLP,63.54 SimpleNets-MLP,49.96 SimpleNets-RNN3,45.71 **AdaBoostClassifier**
6,68.36 TER,62.82 BLEU,49.81 SimpleNets-MLP,44.50 SMH-RandForest
7,67.60 **GradientBoostingClassifier**,62.72 MT-baseline,48.31 MT-baseline,40.94 METEOR
8,67.53 MS-RandForest,62.69 IIT-Meteor,47.84 MS-IBk-b,40.75 SimpleNets-RNN3
9,67.50 IIT-LM,61.71 MS-IBk-b,47.82 MS-RandForest,39.85 MS-RandForest


# LaTeX tables

In [19]:
from io import StringIO

# CSV table mapping each feature's function name ("Method name") to the short
# name and description shown in the LaTeX tables.
# Columns: Method name, Short name, Description.
# Every ', ' is collapsed to ',' before parsing so that column names and values
# carry no leading space; descriptions therefore must not contain ', '.
# NOTE(review): 'NBOutputPunt' looks like a typo for 'NBOutputPunct' — confirm
# before changing, since the string appears in the generated tables.
method_short_desc = '''
Method name, Short name, Description
QuEst_nb_source_punct, NBSourcePunct, Number of punctuation tokens in source (QuEst)
QuEst_nb_source_tokens, NBSourceWords, Number of source words (QuEst)
QuEst_nb_target_punct, NBOutputPunt, Number of punctuation tokens in output (QuEst)
QuEst_type_token_ratio, TypeTokenRatio, Type token ratio (QuEst)
TERp_Del, TERp_Del, Number of deletions (TERp component)
TERp_NumEr, TERp_NumEr, Number of total errors (TERp component)
TERp_Sub, TERp_Sub, Number of substitutions (TERp component)
TERp_TERp, TERp, TERp MT metric
average_concreteness, AvgConcreteness, Average word concreteness (Brysbaert list)
average_cosine, AvgCosineSim, Cosine similarity between source and output word embeddings
count_characters, NBOutputChars, Number of characters in the output sentence
count_characters_per_word, NBOutputCharsPerWord, Average number of characters per word in the output
count_syllables_in_sentence, NBOutputSyllables, Number of syllables in the output
count_syllables_per_word, NBOutputSyllablesPerWord, Average number of syllables per word in the output
count_words, NBOutputWords, Number of words in the output
count_characters_per_sentence, NBOutputCharsPerSent, Average number of characters per sentence in the output
count_words_per_sentence, NBOutputWordsPerSent, Average number of words per sentence in the output
count_syllables_per_sentence, NBOutputSyllablesPerSent, Average number of syllables per sentence in the output
average_sentence_lm_prob, AvgLMProbsOutput, Average probabilities of output words (Language Model)
min_sentence_lm_prob, MinLMProbsOutput, Min probability of output words (Language Model)
max_pos_in_freq_table, MaxPosInFreqTable, Maximum position of output words in the frequency table
min_concreteness, MinConcreteness, Minimum word concreteness according to Brysbaert concreteness list
nlgeval_Bleu_1, BLEU_1gram, BLEU MT metric with unigrams only
nlgeval_Bleu_2, BLEU_2gram, BLEU MT metric up to bigrams
nlgeval_Bleu_3, BLEU_3gram, BLEU MT metric up to trigrams
nlgeval_Bleu_4, BLEU_4gram, BLEU MT metric up to 4-grams
nlgeval_METEOR, METEOR, METEOR MT metric
nlgeval_ROUGE_L, ROUGE, ROUGE summarization metric
nltkBLEU_method7, BLEUSmoothed, BLEU MT metric with smoothing (method 7 from nltk)
sentence_fkgl, outputFKGL, Flesch-Kincaid Grade Level
sentence_fre, outputFRE, Flesch Reading Ease
word_intersection, WordsInCommon, Percentage of words in common between source and output
'''.replace(', ', ',')
# Parse the table; the first non-blank line supplies the column names.
df_desc = pd.read_csv(StringIO(method_short_desc))
# Emit the feature-description table as LaTeX, keeping only the reader-facing columns.
print('% Features description')
print(to_latex(df_desc[['Short name', 'Description']]))

% Features description
\begin{table*}
\begin{tabular}{ll}
\toprule
               Short name &                                                         Description \\
\midrule
 NBSourcePunct &  Number of punctuation tokens in source (QuEst) \\
 NBSourceWords &  Number of source words (QuEst) \\
 NBOutputPunt &  Number of punctuation tokens in output (QuEst) \\
 TypeTokenRatio &  Type token ratio (QuEst) \\
 TERp\_Del &  Number of deletions (TERp component) \\
 TERp\_NumEr &  Number of total errors (TERp component) \\
 TERp\_Sub &  Number of substitutions (TERp component) \\
 TERp &  TERp MT metric \\
 AvgConcreteness &  Average word concreteness (Brysbaert list) \\
 AvgCosineSim &  Cosine similarity between source and output word embeddings \\
 NBOutputChars &  Number of characters in the output sentence \\
 NBOutputCharsPerWord &  Average number of characters per word in the output \\
 NBOutputSyllables &  Number of syllables in the output \\
 NBOutputSyllablesPerWord &  Average number

In [20]:
def get_short_name(method_name):
    """Look up the short display name for a feature's method name.

    The mapping is read from the module-level ``df_desc`` table
    ('Method name' -> 'Short name') on every call.

    Returns:
        The short name, or None when ``method_name`` is not listed.
    """
    name_pairs = df_desc[['Method name', 'Short name']].values
    short_names = dict((full_name, short_name) for full_name, short_name in name_pairs)
    return short_names.get(method_name)


# Build one LaTeX table with the top-n single features per aspect, side by side.
processed_dfs = []
n = 15
for aspect in ['grammaticality', 'meaning_preservation', 'simplicity']:
    df = single_feature_dfs[aspect].copy()
    # Features without a short name map to None and are removed by dropna().
    df['team'] = df['team'].apply(get_short_name)
    df = df.dropna()
    df = df.head(n)[['team', 'valid_pearson', 'pearson']]
    # Pass the raw aspect name as the column group. DataFrame.to_latex already
    # escapes underscores; pre-escaping them here made the header render as
    # 'meaning\textbackslash \_preservation' (and relied on the invalid string
    # escape '\_').
    df.columns = [(aspect, 'Short name'), (aspect, 'Valid'), (aspect, 'Test')]
    # Renumber rows 0..len-1 so the per-aspect frames align by rank. Unlike
    # `df.index = range(n)`, this also works when fewer than n rows remain.
    df = df.reset_index(drop=True)
    processed_dfs.append(df)

concat_df = pd.concat(processed_dfs, axis=1)
concat_df.columns = pd.MultiIndex.from_tuples(concat_df.columns)
print(to_latex(concat_df))

\begin{table*}
\begin{tabular}{lrrlrrlrr}
\toprule
   grammaticality & \multicolumn{3}{l}{meaning\textbackslash \_preservation} & \multicolumn{3}{l}{simplicity} \\
       Short name & Valid &  Test &            Short name & Valid &  Test &                Short name & Valid &  Test \\
\midrule
 METEOR & 0.36 & 0.39 &  BLEUSmoothed & 0.59 & 0.52 &  NBOutputCharsPerSent & -0.52 & -0.45 \\
 BLEUSmoothed & 0.33 & 0.34 &  BLEU\_3gram & 0.57 & 0.52 &  NBOutputSyllablesPerSent & -0.52 & -0.49 \\
 BLEU\_4gram & 0.32 & 0.34 &  METEOR & 0.57 & 0.58 &  NBOutputWordsPerSent & -0.51 & -0.39 \\
 BLEU\_3gram & 0.31 & 0.34 &  BLEU\_2gram & 0.57 & 0.52 &  NBOutputChars & -0.48 & -0.37 \\
 TERp\_NumEr & -0.30 & -0.31 &  BLEU\_4gram & 0.57 & 0.51 &  NBOutputWords & -0.47 & -0.29 \\
 BLEU\_2gram & 0.30 & 0.34 &  WordsInCommon & 0.55 & 0.50 &  NBOutputSyllables & -0.46 & -0.42 \\
 TERp & -0.30 & -0.32 &  BLEU\_1gram & 0.55 & 0.52 &  NBOutputPunt & -0.42 & -0.31 \\
 ROUGE & 0.29 & 0.29 &  ROUGE & 0.55 & 0.47