# Analysis

In this notebook, I analyse the results of our hyperparameter search and the errors the different models make. The notebook is structured as:

1. Results
2. Error analysis

In [82]:
%load_ext blackcellmagic
import json
import os
import pandas as pd

from filenames import ROOT
os.chdir(ROOT)

The blackcellmagic extension is already loaded. To reload it, use:
  %reload_ext blackcellmagic


## Results

### Preliminary results
- 211 models trained so far
- Best single model: 0.953
- Best averaged model: 0.949
- Strong positive correlation between performance and token embedding size
- Weak positive correlation between performance and character embedding size
- Models take on average about 5 minutes to train
- All models achieve above 0.99 top 3 accuracy

In [83]:
# Load relevant results into a pandas DataFrame.
#
# Every sub-directory of DIR is expected to be named with seven "-"-separated
# fields; the first six are the hyperparameters listed below and the last one
# is discarded. Directories without a metrics.json are skipped.
DIR = "models/pos"
subdirnames = os.listdir(DIR)
records = []
for subdirname in subdirnames:
    try:
        filename = os.path.join(DIR, subdirname, "metrics.json")
        with open(filename) as file:
            metrics = json.load(file)
    except FileNotFoundError:  # this model may still be training
        continue
    # Unpack explicitly (instead of eval-ing variable names) so a malformed
    # directory name still fails loudly with a ValueError.
    fold, token, char, hidden, batch, pretrained, _ = subdirname.split("-")
    metrics.update(
        fold=fold,
        token=token,
        char=char,
        hidden=hidden,
        batch=batch,
        pretrained=pretrained,
    )
    records.append(metrics)

columns_to_keep = [
    "fold",
    "token",
    "char",
    "hidden",
    "batch",
    "pretrained",
    "best_epoch",
    "training_duration",
    "validation_accuracy",
    "validation_accuracy3",
]
columns_to_int = ["fold", "token", "char", "hidden", "batch"]
# astype(dict) returns a fresh frame, so no column is ever assigned onto a
# slice of another frame.
results = pd.DataFrame(records)[columns_to_keep].astype(
    {column: int for column in columns_to_int}
)
# Assign the replaced column back: an in-place replace on a column selection
# is a chained assignment and is not guaranteed to modify `results`.
results["pretrained"] = results["pretrained"].replace({"true": True, "false": False})
results["training_duration"] = pd.to_timedelta(results["training_duration"])
results.head()

Unnamed: 0,fold,token,char,hidden,batch,pretrained,best_epoch,training_duration,validation_accuracy,validation_accuracy3
0,4,25,5,25,8,True,4,00:04:50.368217,0.939985,0.994044
1,8,200,20,25,8,True,2,00:03:03.520337,0.944656,0.99637
2,5,100,20,25,8,True,2,00:03:31.156595,0.948702,0.995534
3,3,100,20,25,8,True,3,00:04:11.575619,0.941062,0.995574
4,3,25,5,25,8,True,4,00:05:00.249385,0.939276,0.99406


In [84]:
# Number of trained models found (one row per model)
results.shape[0]

211

In [85]:
# Highest validation accuracy reached by any single model
results.loc[:, "validation_accuracy"].max()

0.952930482551476

In [86]:
# Which model was that?
# idxmax() returns an index *label*, so the row is looked up with the
# label-based .loc; the positional .iloc only agrees with it while the frame
# keeps a default 0..n-1 index.
results.loc[results["validation_accuracy"].idxmax()]

fold                                         5
token                                      300
char                                        20
hidden                                      25
batch                                        8
pretrained                                True
best_epoch                                   2
training_duration       0 days 00:03:34.972703
validation_accuracy                    0.95293
validation_accuracy3                  0.996562
Name: 189, dtype: object

In [87]:
# Correlation of each integer hyperparameter with validation accuracy,
# listed from most positive to most negative
columns = ["fold", "token", "char", "hidden", "batch", "validation_accuracy"]
(
    results.loc[:, columns]
    .corr()
    .loc[:, "validation_accuracy"]
    .sort_values(ascending=False)
)

validation_accuracy    1.000000
token                  0.775576
char                   0.040399
fold                  -0.017889
hidden                -0.432770
batch                       NaN
Name: validation_accuracy, dtype: float64

In [88]:
# Summary statistics of the top 3 validation accuracy across all models
results.validation_accuracy3.describe()

count    211.000000
mean       0.994686
std        0.001605
min        0.990294
25%        0.993612
50%        0.994993
75%        0.996061
max        0.997219
Name: validation_accuracy3, dtype: float64

In [89]:
# Mean validation accuracy per hyperparameter setting (i.e. averaged over the
# cross-validation folds), sorted from worst to best
hyperparams = ["token", "char", "hidden", "batch", "pretrained"]
(
    results.groupby(hyperparams)[["validation_accuracy"]]
    .mean()
    .sort_values("validation_accuracy")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,validation_accuracy
token,char,hidden,batch,pretrained,Unnamed: 5_level_1
10,5,25,8,True,0.925274
10,20,25,8,True,0.930512
10,10,25,8,True,0.930624
10,5,50,8,True,0.931418
10,20,50,8,True,0.932949
10,10,50,8,True,0.933555
25,10,25,8,True,0.938464
25,5,25,8,True,0.939472
25,20,25,8,True,0.939644
50,20,25,8,True,0.94212


In [90]:
# Summary statistics of the per-model training time
results.training_duration.describe()

count                       211
mean     0 days 00:04:56.322296
std      0 days 00:01:35.685346
min      0 days 00:02:11.441762
25%      0 days 00:03:35.655628
50%      0 days 00:04:25.496430
75%      0 days 00:06:06.600773
max      0 days 00:09:21.750568
Name: training_duration, dtype: object

In [91]:
# Summary statistics of the epoch at which each model performed best
results.best_epoch.describe()

count    211.000000
mean       3.654028
std        1.820183
min        1.000000
25%        2.000000
50%        3.000000
75%        5.000000
max        8.000000
Name: best_epoch, dtype: float64

## Error analysis

In [20]:
from tqdm import tqdm
from pos import load_model, predict_from_text

In [43]:
def read_fold_validation(fold):
    """Return the lines of the unprocessed validation file for ``fold``.

    The file ``data/evalatin/processed/pos/{fold}-valid-unprocessed.txt`` is
    resolved relative to the current working directory. Lines are returned
    as a list, each keeping its trailing newline.
    """
    path = f"data/evalatin/processed/pos/{fold}-valid-unprocessed.txt"
    with open(path) as handle:
        return list(handle)

In [None]:
# Collect the mis-tagged tokens of a handful of trained models.
#
# For each model directory the model is loaded, the validation sentences of
# its fold are re-tagged, and every token whose predicted tag differs from
# the gold tag is kept together with the model's hyperparameters.
DIR = "models/pos"
# Stop after this many successfully analysed models (presumably to keep the
# run time of this cell manageable -- raise it for a full analysis).
MAX_MODELS = 4
subdirnames = os.listdir(DIR)
error_frames = []
models_analysed = 0
for subdirname in tqdm(subdirnames):
    # Only the loading steps are guarded: a missing file means the model is
    # skipped entirely, never half-analysed.
    try:
        serialization_dir = os.path.join(DIR, subdirname)
        model = load_model(serialization_dir)
        fold, token, char, hidden, batch, pretrained, _ = subdirname.split("-")
        validation_data = read_fold_validation(fold)
    except FileNotFoundError:  # this model may still be training
        continue
    for sentence in validation_data:
        # Each whitespace-separated item is "<word>/<tag>"; split it once.
        pairs = [pair.split("/") for pair in sentence.split()]
        if not pairs:  # blank line: nothing to tag, so no errors to record
            continue
        words = [pair[0] for pair in pairs]
        tags = [pair[1] for pair in pairs]
        # predictions is used as a DataFrame with one row per token and a
        # "tag" column holding the predicted tag.
        predictions = predict_from_text(model, " ".join(words))
        predictions["true_tag"] = tags
        is_error = predictions["tag"] != predictions["true_tag"]
        # Hyperparameters are attached as the strings parsed from the
        # directory name; assign() returns a new frame.
        errors = predictions[is_error].assign(
            fold=fold,
            token=token,
            char=char,
            hidden=hidden,
            batch=batch,
            pretrained=pretrained,
        )
        error_frames.append(errors)
    models_analysed += 1
    if models_analysed >= MAX_MODELS:
        break
results = pd.concat(error_frames, ignore_index=True)
results.head()

In [75]:
# The 30 most frequent (predicted tag, true tag) confusions
(
    results.groupby(["tag", "true_tag"])
    .size()
    .to_frame("count")
    .sort_values(by="count", ascending=False)
    .head(30)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
tag,true_tag,Unnamed: 2_level_1
SCONJ,ADV,327
ADJ,NOUN,300
NOUN,ADJ,285
ADJ,VERB,222
VERB,AUX,187
ADV,PRON,142
VERB,ADJ,141
VERB,NOUN,139
PRON,SCONJ,131
ADV,SCONJ,122


In [81]:
# The 20 word forms that are mis-tagged most often
results["form"].value_counts().iloc[:20]

ut         252
quod       203
cum        140
quam       107
est         74
quo         68
ne          44
esse        42
ubi         38
tamquam     32
uero        29
eo          29
tantum      25
sunt        22
uel         21
una         19
sui         19
quantum     18
plus        18
Ut          17
Name: form, dtype: int64