In [9]:
# File system
import os
import shutil

import numpy as np
import time
from tqdm import tqdm
import regex

In [10]:
def load_dataset(data_source_filename):
    print("Loading %s... " % data_source_filename.split("/")[-1], end="")
    t = time.time()
    dataset = dict()
    with np.load(data_source_filename, allow_pickle=True) as source_file:
        for key in source_file.keys():
          # print(key)
          dataset[key] = source_file[key].tolist()
        print("done (%.1fs)" % (time.time()-t), flush=True)
        return dataset

base_path = "data"
assert os.path.exists(base_path)
    
data_source_filenames = [os.path.join(base_path, fn) for fn in os.listdir(base_path)
                            if os.path.isfile(os.path.join(base_path, fn)) and fn[-3:]=='npz']
data_source_filenames

['data/StClare_facs_danish.npz',
 'data/SDHK_Latin.npz',
 'data/StClare_facs_latin.npz',
 'data/StClare_dipl_danish.npz',
 'data/StClare_dipl_latin.npz',
 'data/SDHK_Swedish.npz',
 'data/Colonia.npz',
 'data/SemEval2015.npz']

In [11]:
data_source_filename = data_source_filenames[-1]
dataset = load_dataset(data_source_filename)

Loading SemEval2015.npz... done (0.2s)


In [12]:
dataset.keys()

dict_keys(['data', 'folds', 'feature_sets', 'linearsvc', 'multinomial_naive_bayes', 'gaussian_naive_bayes', 'svc', 'gaussianprocess', 'temporallm', 'baseline'])

In [13]:
import pandas as pd
from sklearn.metrics import accuracy_score

In [17]:
def results_table(data_source_filenames, model, metric):
    datasets = iter(load_dataset(data_source) for data_source in data_source_filenames)
    datasets_names = list(map(lambda x: x[x.rfind("/")+1:x.rfind(".")], data_source_filenames))

    results = None

    for i, (dataset, name) in enumerate(zip(datasets, datasets_names)):
        if not isinstance(results, pd.DataFrame):
            columns = [[model]*len(dataset[model]), sorted(dataset[model])]
            columns = list(zip(*columns))
            columns = pd.MultiIndex.from_tuples(columns)
            results = pd.DataFrame(index=sorted(datasets_names), columns=columns)

        for config, result in dataset[model].items():
            try:
                results.loc[name, (model, config)] = metric(result["y_true"], result["y_pred"])
            except KeyError:
                try:
                    results.loc[name, (model, config)] = metric(result["y_test"], result["y_pred"])
                except KeyError:
                    results.loc[name, (model, config)] = result["accuracy"]
                    

    return results

## Latex

In [18]:
table_config = {"float_format": lambda x: "{:0.2f}".format(x*100),
                "multicolumn":True,
                "multicolumn_format":"c",
                "na_rep":"-"}

## Baselines

In [19]:
results = results_table(data_source_filenames, "baseline", accuracy_score)
results

Loading StClare_facs_danish.npz... done (0.0s)
Loading SDHK_Latin.npz... done (1.5s)
Loading StClare_facs_latin.npz... done (0.2s)
Loading StClare_dipl_danish.npz... done (0.0s)
Loading StClare_dipl_latin.npz... done (0.1s)
Loading SDHK_Swedish.npz... done (0.5s)
Loading Colonia.npz... done (5.6s)
Loading SemEval2015.npz... done (0.2s)


Unnamed: 0_level_0,baseline,baseline,baseline
Unnamed: 0_level_1,mle,uniform,weighted
Colonia,26.3158,5.55121,11.8936
SDHK_Latin,26.288,5.88046,17.7213
SDHK_Swedish,69.0438,7.13821,54.2381
SemEval2015,25.5193,8.33245,11.8948
StClare_dipl_danish,15.7895,12.5164,13.8824
StClare_dipl_latin,19.7183,8.33151,14.6449
StClare_facs_danish,15.7895,12.4936,13.9081
StClare_facs_latin,19.7183,8.33917,14.6829


In [20]:
print(results.to_latex(**table_config))

\begin{tabular}{llll}
\toprule
{} & \multicolumn{3}{c}{baseline} \\
{} &      mle & uniform & weighted \\
\midrule
Colonia             &  2631.58 &  555.12 &  1189.36 \\
SDHK\_Latin          &  2628.80 &  588.05 &  1772.13 \\
SDHK\_Swedish        &  6904.38 &  713.82 &  5423.81 \\
SemEval2015         &  2551.93 &  833.25 &  1189.48 \\
StClare\_dipl\_danish &  1578.95 & 1251.64 &  1388.24 \\
StClare\_dipl\_latin  &  1971.83 &  833.15 &  1464.49 \\
StClare\_facs\_danish &  1578.95 & 1249.36 &  1390.81 \\
StClare\_facs\_latin  &  1971.83 &  833.92 &  1468.29 \\
\bottomrule
\end{tabular}



## Temporal Language Models

In [21]:
model = "temporallm"
results_lm = results_table(data_source_filenames, model, accuracy_score)
results_lm

Loading StClare_facs_danish.npz... done (0.1s)
Loading SDHK_Latin.npz... done (1.3s)
Loading StClare_facs_latin.npz... done (0.1s)
Loading StClare_dipl_danish.npz... done (0.0s)
Loading StClare_dipl_latin.npz... done (0.1s)
Loading SDHK_Swedish.npz... done (0.5s)
Loading Colonia.npz... done (5.5s)
Loading SemEval2015.npz... done (0.2s)


Unnamed: 0_level_0,temporallm,temporallm,temporallm,temporallm,temporallm,temporallm,temporallm,temporallm,temporallm,temporallm,temporallm,temporallm,temporallm,temporallm,temporallm,temporallm
Unnamed: 0_level_1,char_1_25,char_1_50,char_2_25,char_2_50,char_3_25,char_3_50,char_5_25,char_5_50,word_1_25,word_1_50,word_2_25,word_2_50,word_3_25,word_3_50,word_5_25,word_5_50
Colonia,0.0526316,0.315789,0.0526316,0.368421,0.0526316,0.473684,0.0526316,0.578947,0.0526316,0.105263,0.0526316,0.263158,0.0526316,0.315789,0.0526316,0.315789
SDHK_Latin,,,,,,,,,,,,,,,,
SDHK_Swedish,0.0162075,0.0324149,0.943274,0.940032,0.95624,0.95624,0.954619,0.957861,0.00324149,0.00162075,0.0113452,0.00648298,0.0567261,0.0518639,0.0615883,0.0567261
SemEval2015,0.221068,0.400593,0.489614,0.667656,0.597923,0.721068,0.62908,0.78635,0.0281899,0.0252226,0.158754,0.15727,0.15727,0.167656,0.15727,0.167656
StClare_dipl_danish,0.157895,0.631579,0.368421,0.736842,0.315789,0.684211,0.421053,0.578947,0.0,0.157895,0.210526,0.421053,0.210526,0.421053,0.210526,0.421053
StClare_dipl_latin,0.0140845,0.464789,0.521127,0.732394,0.535211,0.746479,0.619718,0.71831,0.0,0.0140845,0.267606,0.535211,0.295775,0.591549,0.295775,0.591549
StClare_facs_danish,0.421053,0.578947,0.526316,0.789474,0.526316,0.736842,0.473684,0.736842,0.0526316,0.105263,0.105263,0.157895,0.105263,0.157895,0.105263,0.157895
StClare_facs_latin,0.056338,0.492958,0.690141,0.859155,0.71831,0.84507,0.746479,0.830986,0.0,0.0,0.028169,0.0704225,0.028169,0.0704225,0.028169,0.0704225


## Multinomial naive bayes

In [22]:
model = "multinomial_naive_bayes"
results_mnb = results_table(data_source_filenames, model, accuracy_score)
results_mnb

Loading StClare_facs_danish.npz... done (0.0s)
Loading SDHK_Latin.npz... done (1.4s)
Loading StClare_facs_latin.npz... done (0.1s)
Loading StClare_dipl_danish.npz... done (0.0s)
Loading StClare_dipl_latin.npz... done (0.1s)
Loading SDHK_Swedish.npz... done (0.5s)
Loading Colonia.npz... done (5.4s)
Loading SemEval2015.npz... done (0.2s)


Unnamed: 0_level_0,multinomial_naive_bayes,multinomial_naive_bayes,multinomial_naive_bayes,multinomial_naive_bayes,multinomial_naive_bayes,multinomial_naive_bayes,multinomial_naive_bayes,multinomial_naive_bayes,multinomial_naive_bayes,multinomial_naive_bayes,multinomial_naive_bayes,multinomial_naive_bayes
Unnamed: 0_level_1,bow_words,character_ngram_1,character_ngram_2,character_ngram_3,tfidf_words,word_ngram_1,word_ngram_2,word_ngram_3,bow_pos,bow_words_pos,tfidf_pos,tfidf_words_pos
Colonia,0.421053,0.263158,0.263158,0.263158,0.263158,0.263158,0.263158,0.263158,0.263158,0.421053,0.263158,0.263158
SDHK_Latin,0.544254,0.26354,0.266843,0.312417,0.404888,0.393659,0.404227,0.379128,,,,
SDHK_Swedish,0.875203,0.690438,0.690438,0.690438,0.703404,0.690438,0.690438,0.690438,,,,
SemEval2015,0.47181,0.255193,0.255193,0.255193,0.256677,0.255193,0.255193,0.255193,,,,
StClare_dipl_danish,0.157895,0.157895,0.210526,0.157895,0.210526,0.263158,0.210526,0.105263,,,,
StClare_dipl_latin,0.521127,0.197183,0.197183,0.197183,0.43662,0.225352,0.211268,0.197183,,,,
StClare_facs_danish,0.473684,0.263158,0.263158,0.263158,0.157895,0.263158,0.210526,0.263158,,,,
StClare_facs_latin,0.661972,0.253521,0.253521,0.197183,0.478873,0.253521,0.197183,0.197183,,,,


## Gaussian naive bayes

In [23]:
model = "gaussian_naive_bayes"
results_gnb = results_table(data_source_filenames, model, accuracy_score)
results_gnb

Loading StClare_facs_danish.npz... done (0.0s)
Loading SDHK_Latin.npz... done (1.4s)
Loading StClare_facs_latin.npz... done (0.1s)
Loading StClare_dipl_danish.npz... done (0.0s)
Loading StClare_dipl_latin.npz... done (0.1s)
Loading SDHK_Swedish.npz... done (0.6s)
Loading Colonia.npz... done (5.3s)
Loading SemEval2015.npz... done (0.2s)


Unnamed: 0_level_0,gaussian_naive_bayes,gaussian_naive_bayes,gaussian_naive_bayes,gaussian_naive_bayes,gaussian_naive_bayes,gaussian_naive_bayes,gaussian_naive_bayes,gaussian_naive_bayes,gaussian_naive_bayes,gaussian_naive_bayes,gaussian_naive_bayes,gaussian_naive_bayes
Unnamed: 0_level_1,bow_words,character_ngram_1,character_ngram_2,character_ngram_3,tfidf_words,word_ngram_1,word_ngram_2,word_ngram_3,bow_pos,bow_words_pos,tfidf_pos,tfidf_words_pos
Colonia,0.473684,0.263158,0.368421,0.421053,0.368421,0.368421,0.368421,0.473684,0.052632,0.421053,0.421053,0.368421
SDHK_Latin,0.601717,0.171731,0.401585,0.589828,0.574637,0.628137,,,,,,
SDHK_Swedish,0.857374,0.247974,0.871961,0.938412,0.850891,0.87034,0.896272,,,,,
SemEval2015,0.519288,0.158754,0.24184,0.425816,0.532641,0.569733,0.532641,0.430267,,,,
StClare_dipl_danish,0.315789,0.263158,0.315789,0.210526,0.315789,0.368421,0.473684,0.157895,,,,
StClare_dipl_latin,0.605634,0.15493,0.380282,0.549296,0.521127,0.549296,0.661972,0.690141,,,,
StClare_facs_danish,0.421053,0.315789,0.368421,0.368421,0.210526,0.368421,0.473684,0.368421,,,,
StClare_facs_latin,0.676056,0.56338,0.676056,0.704225,0.605634,0.633803,0.661972,0.591549,,,,


## LinearSVM

In [25]:
model = "linearsvc"
results_svc = results_table(data_source_filenames, model, accuracy_score)
results_svc

Loading StClare_facs_danish.npz... done (0.0s)
Loading SDHK_Latin.npz... done (1.5s)
Loading StClare_facs_latin.npz... done (0.1s)
Loading StClare_dipl_danish.npz... done (0.0s)
Loading StClare_dipl_latin.npz... done (0.1s)
Loading SDHK_Swedish.npz... done (0.5s)
Loading Colonia.npz... done (5.5s)
Loading SemEval2015.npz... done (0.2s)


Unnamed: 0_level_0,linearsvc,linearsvc,linearsvc,linearsvc,linearsvc,linearsvc,linearsvc,linearsvc,linearsvc,linearsvc,linearsvc,linearsvc
Unnamed: 0_level_1,bow_words,character_ngram_1,character_ngram_2,character_ngram_3,tfidf_words,word_ngram_1,word_ngram_2,word_ngram_3,bow_pos,bow_words_pos,tfidf_pos,tfidf_words_pos
Colonia,0.263158,0.315789,0.473684,0.421053,0.473684,0.421053,0.368421,0.315789,0.263158,0.263158,0.421053,0.421053
SDHK_Latin,0.26288,0.389696,0.535667,0.546235,0.346103,0.439894,0.347424,0.342801,,,,
SDHK_Swedish,0.690438,0.824959,0.902755,0.889789,0.669368,0.792545,0.698541,0.693679,,,,
SemEval2015,0.255193,0.310089,0.329377,0.354599,0.296736,0.329377,0.271513,0.274481,,,,
StClare_dipl_danish,0.157895,0.263158,0.526316,0.421053,0.105263,0.0526316,0.105263,0.105263,,,,
StClare_dipl_latin,0.197183,0.366197,0.478873,0.408451,0.323944,0.309859,0.197183,0.183099,,,,
StClare_facs_danish,0.157895,0.421053,0.315789,0.105263,0.0526316,0.0526316,0.105263,0.210526,,,,
StClare_facs_latin,0.197183,0.478873,0.478873,0.394366,0.225352,0.323944,0.183099,0.225352,,,,


## SVCrbf

In [26]:
model = "svc"
results_svcrbf = results_table(data_source_filenames, model, accuracy_score)
results_svcrbf

Loading StClare_facs_danish.npz... done (0.0s)
Loading SDHK_Latin.npz... done (1.4s)
Loading StClare_facs_latin.npz... done (0.1s)
Loading StClare_dipl_danish.npz... done (0.0s)
Loading StClare_dipl_latin.npz... done (0.1s)
Loading SDHK_Swedish.npz... done (0.5s)
Loading Colonia.npz... done (5.5s)
Loading SemEval2015.npz... done (0.2s)


Unnamed: 0_level_0,svc,svc,svc,svc,svc,svc,svc,svc,svc,svc,svc,svc
Unnamed: 0_level_1,bow_words,character_ngram_1,character_ngram_2,character_ngram_3,tfidf_words,word_ngram_1,word_ngram_2,word_ngram_3,bow_pos,bow_words_pos,tfidf_pos,tfidf_words_pos
Colonia,0.105263,0.368421,0.473684,0.368421,0.421053,0.421053,0.421053,0.315789,0.105263,0.105263,0.473684,0.421053
SDHK_Latin,0.00660502,0.470277,0.55284,0.577939,0.402906,0.488771,0.384412,0.10502,,,,
SDHK_Swedish,0.252836,0.889789,0.904376,0.901135,0.726094,0.828201,0.012966,0.0145867,,,,
SemEval2015,0.037092,0.307122,0.310089,0.318991,0.307122,0.29822,0.24184,0.0949555,,,,
StClare_dipl_danish,0.157895,0.105263,0.578947,0.210526,0.210526,0.105263,0.157895,0.210526,,,,
StClare_dipl_latin,0.0140845,0.422535,0.464789,0.507042,0.225352,0.380282,0.253521,0.197183,,,,
StClare_facs_danish,0.157895,0.368421,0.210526,0.157895,0.157895,0.210526,0.157895,0.263158,,,,
StClare_facs_latin,0.0140845,0.507042,0.492958,0.464789,0.225352,0.394366,0.169014,0.253521,,,,


## GP

In [27]:
model = "gaussianprocess"
results_gp = results_table(data_source_filenames, model, accuracy_score)
results_gp

Loading StClare_facs_danish.npz... done (0.0s)
Loading SDHK_Latin.npz... done (1.4s)
Loading StClare_facs_latin.npz... done (0.1s)
Loading StClare_dipl_danish.npz... done (0.0s)
Loading StClare_dipl_latin.npz... done (0.1s)
Loading SDHK_Swedish.npz... done (0.5s)
Loading Colonia.npz... done (5.5s)
Loading SemEval2015.npz... done (0.2s)


Unnamed: 0_level_0,gaussianprocess,gaussianprocess,gaussianprocess,gaussianprocess,gaussianprocess,gaussianprocess,gaussianprocess,gaussianprocess
Unnamed: 0_level_1,bow_words,character_ngram_1,character_ngram_2,character_ngram_3,tfidf_words,word_ngram_1,word_ngram_2,word_ngram_3
Colonia,0.0526316,,,,,,,
SDHK_Latin,0.270145,0.34214,0.360634,0.395641,0.275429,0.42074,0.286658,0.265522
SDHK_Swedish,0.0145867,0.820097,0.747164,0.786062,0.0291734,0.815235,0.0340357,0.0145867
SemEval2015,0.102374,0.132047,0.158754,0.167656,0.078635,0.114243,0.078635,0.078635
StClare_dipl_danish,0.210526,0.315789,0.263158,0.421053,0.210526,0.263158,0.263158,0.210526
StClare_dipl_latin,0.169014,0.225352,0.225352,0.169014,0.169014,0.253521,0.169014,0.169014
StClare_facs_danish,0.210526,0.263158,0.210526,0.210526,0.210526,0.210526,0.315789,0.210526
StClare_facs_latin,0.169014,0.549296,0.43662,0.43662,0.169014,0.338028,0.169014,0.169014


# Create final results table

In [28]:

def to_latex_table(tables, columns, names, alias, append_stats=True):
    finaltable = ""
    
    n_cols = len(alias)+1 if append_stats else len(alias)
    
    begin = "\\begin{table*}\n\\begin{tabular}{l"+"l"*n_cols+"}\n"
    end = "\\end{tabular}\n\end{table*}"
    
    
    for table, column, name in zip(tables, columns, names):
        column = pd.MultiIndex.from_tuples(list(zip(*[[name]*len(column), column])))
        out_column = pd.MultiIndex.from_tuples(list(zip(*[[name]*len(alias), alias])))

        table = table[column]
        table.columns=out_column
        
        if append_stats:
            def stats_col(means, stds):
                return ["{m:0.2f} ±{s:0.2f}".format(m=m, s=s) for m,s in zip(means, stds)]
            
            def stats_row(means, stds):
                data = dict()
                for (col, m), s in zip(means.iteritems(), stds):
                    data[col] = "{m:0.2f} ±{s:0.2f}".format(m=m, s=s)
                return pd.Series(data=data, name="")
            
            row_means = table.mean(axis=1)
            row_std = table.std(axis=1)
            
            
            col_means = table.mean(axis=0)
            col_std = table.std(axis=0)
            
            table["average"] = stats_col(row_means, row_std)
            table = table.append(stats_row(col_means, col_std))

                        

        table = table.to_latex(**table_config)
        table = table.strip()
        table = table[table.find("\n"):table.rfind("\n")]
        finaltable+=table



    finaltable = begin + finaltable
    finaltable += end 
    return finaltable

In [32]:
table_config = {"float_format": lambda x: "{:0.1f}".format(x*100),
                "multicolumn":True,
                "multicolumn_format":"c",
                "na_rep":"-"}

In [33]:
final = to_latex_table(
    tables=[results_lm, results_mnb, results_gnb, results_svc, results_svcrbf, results_gp],
    columns =[["char_1_25", "char_2_25", "char_3_25", "word_1_25", "word_2_25", "word_3_25"],
              ["character_ngram_1", "character_ngram_2", "character_ngram_3", "word_ngram_1", "word_ngram_2", "word_ngram_3"],
              ["character_ngram_1", "character_ngram_2", "character_ngram_3", "word_ngram_1", "word_ngram_2", "word_ngram_3"],              
              ["character_ngram_1", "character_ngram_2", "character_ngram_3", "word_ngram_1", "word_ngram_2", "word_ngram_3"],                            
              ["character_ngram_1", "character_ngram_2", "character_ngram_3", "word_ngram_1", "word_ngram_2", "word_ngram_3"],
              ["character_ngram_1", "character_ngram_2", "character_ngram_3", "word_ngram_1", "word_ngram_2", "word_ngram_3"],
             ],
    names = ["temporallm", "multinomial_naive_bayes", "gaussian_naive_bayes", "linearsvc", "svc", "gaussianprocess"],
    alias = ["Char(1)", "Char(2)", "Char(3)", "Word(1)", "Word(2)", "Word(3)"],
    append_stats = False,
    
)

In [34]:
print(final.replace("±", "\\textpm"))

\begin{table*}
\begin{tabular}{lllllll}

\toprule
{} & \multicolumn{6}{c}{temporallm} \\
{} &    Char(1) & Char(2) & Char(3) & Word(1) & Word(2) & Word(3) \\
\midrule
Colonia             &        5.3 &     5.3 &     5.3 &     5.3 &     5.3 &     5.3 \\
SDHK\_Latin          &          - &       - &       - &       - &       - &       - \\
SDHK\_Swedish        &        1.6 &    94.3 &    95.6 &     0.3 &     1.1 &     5.7 \\
SemEval2015         &       22.1 &    49.0 &    59.8 &     2.8 &    15.9 &    15.7 \\
StClare\_dipl\_danish &       15.8 &    36.8 &    31.6 &     0.0 &    21.1 &    21.1 \\
StClare\_dipl\_latin  &        1.4 &    52.1 &    53.5 &     0.0 &    26.8 &    29.6 \\
StClare\_facs\_danish &       42.1 &    52.6 &    52.6 &     5.3 &    10.5 &    10.5 \\
StClare\_facs\_latin  &        5.6 &    69.0 &    71.8 &     0.0 &     2.8 &     2.8 \\
\bottomrule
\toprule
{} & \multicolumn{6}{c}{multinomial\_naive\_bayes} \\
{} &                 Char(1) & Char(2) & Char(3) & Word(1) &

In [None]:
results

In [None]:
results.append({("baseline", "mle"): "test", 
                ("baseline", "uniform"): "test",
                ("baseline", "weighted"):"test"
               },ignore_index=True)

In [None]:
means = results.mean(axis=0)
stds = results.std(axis=0)


In [None]:
for m in means.iteritems():
    print(m)

In [None]:
m