# Analysis

In this notebook, I analyse the results of our hyperparameter search and the errors the different models make. The notebook is structured as:

1. Results
2. Error analysis

In [1]:
%load_ext blackcellmagic
import json
import os
import pandas as pd

from filenames import ROOT
# Work from ROOT (imported from the project's `filenames` module) so that the
# relative paths used in later cells, e.g. "models/pos", resolve.
os.chdir(ROOT)

## Results

- 1440 models trained
- Best single model: 0.957
- Best averaged model: 0.953
- Strong positive correlation between performance and token embedding size (r ≈ 0.72)
- Medium positive correlation between performance and hidden size (r ≈ 0.25)
- Weak positive correlation between performance and character embedding size (r ≈ 0.04)
- Models take on average 3.5 minutes to train
- Almost all models achieve above 0.99 top-3 accuracy (minimum 0.988)

In [2]:
# Load relevant results into a pandas DataFrame
#
# Every sub-directory of DIR is one training run. Its name encodes the
# hyperparameters as seven dash-separated fields (the last one is unused here)
# and it contains a metrics.json once training has finished.
DIR = "models/pos"
subdirnames = os.listdir(DIR)
results = []
for subdirname in subdirnames:
    filename = os.path.join(DIR, subdirname, "metrics.json")
    try:
        with open(filename) as file:
            metrics = json.load(file)
    except FileNotFoundError:  # this model may still be training
        continue
    fold, token, char, hidden, batch, pretrained, _ = subdirname.split("-")
    # Attach the hyperparameters explicitly instead of looking them up by name
    # with eval().
    metrics.update(
        fold=fold,
        token=token,
        char=char,
        hidden=hidden,
        batch=batch,
        pretrained=pretrained,
    )
    results.append(metrics)
results = pd.DataFrame(results)
columns_to_keep = [
    "fold",
    "token",
    "char",
    "hidden",
    "batch",
    "pretrained",
    "best_epoch",
    "training_duration",
    "validation_accuracy",
    "validation_accuracy3",
]
columns_to_int = ["fold", "token", "char", "hidden", "batch"]
# Select and cast in one step: astype() returns a new frame, so the column
# assignments below never write into a slice of the full metrics frame.
results = results[columns_to_keep].astype({column: int for column in columns_to_int})
# The flag arrives as text taken from the directory name. Assign the converted
# column back (a chained `replace(..., inplace=True)` on a column selection does
# not update the frame under pandas Copy-on-Write) and compare
# case-insensitively so both "true" and "True" become real booleans.
# Any other spelling becomes NaN.
results["pretrained"] = (
    results["pretrained"].str.lower().map({"true": True, "false": False})
)
results["training_duration"] = pd.to_timedelta(results["training_duration"])
results.head()

Unnamed: 0,fold,token,char,hidden,batch,pretrained,best_epoch,training_duration,validation_accuracy,validation_accuracy3
0,1,200,20,25,16,True,3,00:03:14.223714,0.947396,0.995974
1,3,100,10,200,16,True,2,00:02:38.260382,0.949402,0.996835
2,9,300,20,25,16,True,2,00:02:38.560658,0.947312,0.996128
3,4,100,5,50,16,True,1,00:01:49.712467,0.94854,0.996238
4,3,200,5,100,16,True,2,00:02:47.963767,0.952528,0.996411


In [3]:
# Number of trained models, i.e. rows in the results table
results.shape[0]

1440

In [4]:
# Highest validation accuracy reached by any single model
results.loc[:, "validation_accuracy"].max()

0.9570418296719831

In [5]:
# Which model was that?
# idxmax() returns an index *label*, so the row has to be looked up with the
# label-based .loc; the positional .iloc only agrees while the index happens to
# be the default 0..n-1 range.
results.loc[results["validation_accuracy"].idxmax()]

fold                                         7
token                                      300
char                                        20
hidden                                     200
batch                                       16
pretrained                                True
best_epoch                                   1
training_duration       0 days 00:01:52.333726
validation_accuracy                   0.957042
validation_accuracy3                  0.997179
Name: 360, dtype: object

In [6]:
# Pearson correlation of each numeric hyperparameter with validation accuracy,
# strongest positive correlation first
columns = ["fold", "token", "char", "hidden", "batch", "validation_accuracy"]
results.loc[:, columns].corr().loc[:, "validation_accuracy"].sort_values(
    ascending=False
)

validation_accuracy    1.000000
token                  0.724698
hidden                 0.253332
char                   0.042993
fold                   0.024039
batch                 -0.060234
Name: validation_accuracy, dtype: float64

In [7]:
# Summary statistics of top-3 validation accuracy across all models
results.loc[:, "validation_accuracy3"].describe()

count    1440.000000
mean        0.995489
std         0.001528
min         0.988242
25%         0.994497
50%         0.995773
75%         0.996686
max         0.998022
Name: validation_accuracy3, dtype: float64

In [8]:
# Mean validation accuracy per hyperparameter combination (averaged over the
# cross-validation folds); the 30 best combinations, best one last
hyperparams = ["token", "char", "hidden", "batch", "pretrained"]
(
    results.groupby(hyperparams)[["validation_accuracy"]]
    .mean()
    .sort_values(by="validation_accuracy")
    .tail(30)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,validation_accuracy
token,char,hidden,batch,pretrained,Unnamed: 5_level_1
100,20,200,16,True,0.950853
100,10,200,8,True,0.950932
300,10,50,8,True,0.950999
100,20,200,8,True,0.951255
200,10,100,16,True,0.951389
300,5,50,8,True,0.951436
300,20,50,8,True,0.951498
200,10,100,8,True,0.951569
200,20,100,8,True,0.952057
300,10,100,16,True,0.952066


In [9]:
# Summary statistics of the time each model took to train
results.loc[:, "training_duration"].describe()

count                      1440
mean     0 days 00:03:31.613191
std      0 days 00:01:18.955659
min      0 days 00:01:27.071719
25%      0 days 00:02:36.832780
50%      0 days 00:03:11.746210
75%      0 days 00:04:10.158404
max      0 days 00:09:21.750568
Name: training_duration, dtype: object

In [10]:
# Summary statistics of the best epoch reported for each model
results.loc[:, "best_epoch"].describe()

count    1440.000000
mean        2.681944
std         1.774722
min         1.000000
25%         1.000000
50%         2.000000
75%         3.250000
max         9.000000
Name: best_epoch, dtype: float64

## Error analysis

In [11]:
from tqdm import tqdm
from pos import load_model, predict_from_text

In [12]:
def read_fold_validation(fold):
    """Return the lines of the unprocessed validation file for ``fold``.

    The file is looked up relative to the current working directory. Lines are
    returned in file order with their trailing newlines intact.
    """
    path = f"data/evalatin/processed/pos/{fold}-valid-unprocessed.txt"
    with open(path) as handle:
        return list(handle)

In [13]:
# Collect every mis-tagged token of the validation fold for each trained model.
#
# NOTE: this cell rebinds `results` (until now the metrics table) to the table
# of tagging errors; the cells below read the errors from that name.
DIR = "models/pos"
# Loading a model and tagging its validation fold is slow, so only the first
# few models that load successfully are analysed. Set to None to analyse all.
MAX_MODELS = 4
subdirnames = os.listdir(DIR)
results = []
models_analysed = 0
for subdirname in tqdm(subdirnames):
    try:
        serialization_dir = os.path.join(DIR, subdirname)
        model = load_model(serialization_dir)
        fold, token, char, hidden, batch, pretrained, _ = subdirname.split("-")
        validation_data = read_fold_validation(fold)
        for sentence in validation_data:
            # Each whitespace-separated token is "form/tag"; split it once.
            pairs = [pair.split("/") for pair in sentence.split()]
            words = [pair[0] for pair in pairs]
            tags = [pair[1] for pair in pairs]
            predictions = predict_from_text(model, " ".join(words))
            predictions["true_tag"] = tags
            # Keep only the tokens whose predicted tag differs from the gold
            # tag, labelled with the hyperparameters of the model that made
            # the error (explicit columns instead of eval() lookups).
            errors = predictions[predictions["tag"] != predictions["true_tag"]].assign(
                fold=fold,
                token=token,
                char=char,
                hidden=hidden,
                batch=batch,
                pretrained=pretrained,
            )
            results.append(errors)
    except FileNotFoundError:  # this model may still be training
        continue
    models_analysed += 1
    if MAX_MODELS is not None and models_analysed >= MAX_MODELS:
        break
results = pd.concat(results, ignore_index=True)
results.head()

  0%|          | 3/1440 [01:39<13:16:54, 33.27s/it]


Unnamed: 0,form,tag,NOUN,VERB,ADJ,ADV,PRON,DET,CCONJ,ADP,...,NUM,X,INTJ,true_tag,fold,token,char,hidden,batch,pretrained
0,necessarios,ADJ,0.001998,0.002018,0.994398,0.001078,5.708323e-07,3.461224e-05,7.435868e-07,2.1e-05,...,2e-05,8.779175e-07,3.9e-05,NOUN,1,200,20,25,16,True
1,uero,CCONJ,1.9e-05,0.001133,0.000889,0.182322,0.001020288,0.001093399,0.8094049,0.000205,...,0.001462,9.03302e-06,0.000831,ADV,1,200,20,25,16,True
2,sum,VERB,0.000382,0.86666,0.000182,0.000431,0.001282728,1.795294e-07,4.697729e-05,3.3e-05,...,1e-06,1.379804e-05,0.000412,AUX,1,200,20,25,16,True
3,remi,PROPN,0.004165,0.000175,0.018889,3.8e-05,9.876617e-07,2.350661e-05,1.347138e-05,0.000271,...,0.000506,5.992614e-05,7.4e-05,NOUN,1,200,20,25,16,True
4,quo,ADV,0.000389,0.004365,0.000101,0.671974,0.2409003,0.0002506337,0.0002201234,0.000368,...,9e-06,2.87734e-06,3.3e-05,SCONJ,1,200,20,25,16,True


In [14]:
# The 30 most frequent (predicted tag, true tag) confusions
(
    results.groupby(["tag", "true_tag"])
    .size()
    .rename("count")
    .to_frame()
    .sort_values(by="count", ascending=False)
    .iloc[:30]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
tag,true_tag,Unnamed: 2_level_1
NOUN,ADJ,488
ADJ,NOUN,472
ADJ,VERB,383
SCONJ,ADV,330
NOUN,VERB,299
VERB,ADJ,291
VERB,AUX,230
VERB,NOUN,221
PRON,SCONJ,216
ADJ,PROPN,184


In [15]:
# Which forms are the models most often getting wrong? (top 20)
results.loc[:, "form"].value_counts().iloc[:20]

quod        275
ut          255
cum         178
est         110
quam        107
quo          80
ne           63
esse         56
tantum       38
eo           37
ubi          35
quamquam     35
uero         33
tamquam      32
uel          28
sunt         27
quantum      27
esset        25
siue         24
una          22
Name: form, dtype: int64