# Baseline

In [1]:
import os
import os.path
import numpy as np
import time
from tqdm import tqdm

assert os.path.exists('data')

In [2]:
def load_dataset(data_source_filename):
    print("Loading %s... " % data_source_filename.split("/")[-1], end="")
    t = time.time()
    dataset = dict()
    with np.load(data_source_filename, allow_pickle=True) as source_file:
        for key in source_file.keys():
            dataset[key] = source_file[key].tolist()
    print("done (%.1fs)" % (time.time()-t), flush=True)
    return dataset

data_source_filenames = [os.path.join('data', fn) for fn in os.listdir('data')
                        if os.path.isfile(os.path.join('data', fn)) and fn[-3:]=='npz']
data_source_filenames

['data/StClare_facs_danish.npz',
 'data/SDHK_Latin.npz',
 'data/StClare_facs_latin.npz',
 'data/StClare_dipl_danish.npz',
 'data/StClare_dipl_latin.npz',
 'data/SDHK_Swedish.npz',
 'data/Colonia.npz',
 'data/SemEval2015.npz']

In [3]:
bin_width = 25

In [5]:
#dataset.keys()

In [20]:
from collections import Counter

#np.random.seed(42)

for data_source_filename in data_source_filenames:
    dataset = load_dataset(data_source_filename)
    
    results = {}
    if 'baseline' in dataset.keys():
        dataset.pop('baseline')
        

    y = np.asarray([np.mean(item['date']) if type(item['date']) is tuple else item['date'] for item in dataset['data']], np.int)
    y = bin_width*(y//bin_width)


    train, dev, test = dataset["folds"]["train"], dataset["folds"]["val"], dataset["folds"]["test"]

    print(" Classes in data: %s" % np.unique(y))
    print(" Classes in train and dev: %s" % np.unique(y[train+dev]))        
    print(" Classes in test: %s" % np.unique(y[test]))                

    most_common, _ = Counter(y[train+dev]).most_common(1)[0]
    mle_pred = [most_common]*len(test)
    mle=100*sum(mle_pred==y[test])/len(test)

    uniform_pred = np.random.choice(np.unique(y[dev+train]), size=len(test), replace=True)

    uniform = 100*np.mean([np.sum(np.random.choice(np.unique(y[dev+train]), size=len(test), replace=True)==y[test])/len(test) for i in range(100000)])
    #uniform = 100*np.sum(uniform_pred==y[test])/len(test)

    weighted_pred = np.random.choice(y[train+dev], size=len(test), replace=False)
    weighted = 100*np.mean([np.sum(np.random.choice(y[train+dev], size=len(test), replace=False)==y[test])/len(test) for i in range(100000)])
    #weighted = 100*np.sum(weighted_pred==y[test])/len(test)

    print("  MLE choice: %.1f%%" % mle)
    print("  Uniform: %.1f%%" % uniform)
    print("  Weighted: %.1f%%" % weighted)

    print(" Saving %s... " % data_source_filename.split("/")[-1], end="")

    results["mle"] = {
        "accuracy":mle,
        #"y_pred":mle_pred,
        "y_true":y[test],        
    }
    results["uniform"] = {
        "accuracy":uniform,
        #"y_pred":uniform_pred,
        "y_true":y[test],        
    }
    results["weighted"] = {
        "accuracy":weighted,
        #"y_pred":weighted_pred,
        "y_true":y[test],        
    }
    print()
    np.savez_compressed(data_source_filename, **dataset, baseline=results)

Loading StClare_facs_danish.npz... done (0.0s)
 Classes in data: [1400 1425 1450 1475 1500 1525 1550 1575]
 Classes in train and dev: [1400 1425 1450 1475 1500 1525 1550 1575]
 Classes in test: [1400 1425 1450 1475 1500 1525 1550 1575]
  MLE choice: 15.8%
  Uniform: 12.5%
  Weighted: 13.9%
Saving StClare_facs_danish.npz... 
Loading SDHK_Latin.npz... done (1.4s)
 Classes in data: [1100 1125 1150 1175 1200 1225 1250 1275 1300 1325 1350 1375 1400 1425
 1450 1475 1500]
 Classes in train and dev: [1100 1125 1150 1175 1200 1225 1250 1275 1300 1325 1350 1375 1400 1425
 1450 1475 1500]
 Classes in test: [1125 1150 1175 1200 1225 1250 1275 1300 1325 1350 1375 1400 1425 1450
 1475 1500]
  MLE choice: 26.3%
  Uniform: 5.9%
  Weighted: 17.7%
Saving SDHK_Latin.npz... 
Loading StClare_facs_latin.npz... done (0.1s)
 Classes in data: [1250 1275 1300 1325 1350 1375 1400 1425 1450 1475 1500 1525]
 Classes in train and dev: [1250 1275 1300 1325 1350 1375 1400 1425 1450 1475 1500 1525]
 Classes in test: [