# Analysis

In this notebook, I analyse the results of our hyperparameter search and the errors the different models make. The notebook is structured as:

1. Results
2. Error analysis

In [1]:
%load_ext blackcellmagic
import json
import os
import pandas as pd

from filenames import ROOT
# Switch the working directory to ROOT so that the relative paths used in later
# cells (e.g. "models/pos") are resolved against it.
os.chdir(ROOT)

## Results

### Preliminary results
- 201 models trained so far
- Best single model: 0.953
- Best averaged model: 0.949
- Strong positive correlation between performance and token embedding size
- Weak positive correlation between performance and character embedding size
- Models take on average about 4.9 minutes to train (the best epoch is reached after about 3.6 epochs on average)
- All models achieve above 0.99 top 3 accuracy

In [2]:
# Load relevant results into a pandas DataFrame.
# Every subdirectory of DIR is one training run: its "-"-separated name encodes the
# hyperparameters and its metrics.json holds the metrics of that run.
DIR = "models/pos"
subdirnames = os.listdir(DIR)
records = []
for subdirname in subdirnames:
    filename = os.path.join(DIR, subdirname, "metrics.json")
    try:
        with open(filename) as file:
            metrics = json.load(file)
    except FileNotFoundError:  # this model may still be training
        continue
    # The name must split into exactly seven fields; the last one is not used here.
    fold, token, char, hidden, batch, pretrained, _ = subdirname.split("-")
    # Attach the hyperparameters explicitly instead of reading locals through eval().
    metrics.update(
        fold=fold,
        token=token,
        char=char,
        hidden=hidden,
        batch=batch,
        pretrained=pretrained,
    )
    records.append(metrics)
columns_to_keep = [
    "fold",
    "token",
    "char",
    "hidden",
    "batch",
    "pretrained",
    "best_epoch",
    "training_duration",
    "validation_accuracy",
    "validation_accuracy3",
]
columns_to_int = ["fold", "token", "char", "hidden", "batch"]
# Select and convert in one expression so the later column assignments operate on a
# fresh frame rather than on a slice of another frame.
results = pd.DataFrame(records)[columns_to_keep].astype(
    {column: int for column in columns_to_int}
)
# Assign the converted column back: replace(..., inplace=True) on a column selection
# is chained assignment and does not reliably update the frame.
results["pretrained"] = results["pretrained"].replace({"true": True, "false": False})
results["training_duration"] = pd.to_timedelta(results["training_duration"])
results.head()

Unnamed: 0,fold,token,char,hidden,batch,pretrained,best_epoch,training_duration,validation_accuracy,validation_accuracy3
0,4,25,5,25,8,True,4,00:04:50.368217,0.939985,0.994044
1,8,200,20,25,8,True,2,00:03:03.520337,0.944656,0.99637
2,5,100,20,25,8,True,2,00:03:31.156595,0.948702,0.995534
3,3,100,20,25,8,True,3,00:04:11.575619,0.941062,0.995574
4,3,25,5,25,8,True,4,00:05:00.249385,0.939276,0.99406


In [3]:
# Number of trained models = number of rows in the results table
results.shape[0]

201

In [4]:
# Highest validation accuracy reached by any single model
best_score = results["validation_accuracy"].max()
best_score

0.952930482551476

In [5]:
# Which model was that?
# idxmax() returns an index *label*, so the row is looked up with the label-based
# .loc rather than the position-based .iloc.
results.loc[results["validation_accuracy"].idxmax()]

fold                                         5
token                                      300
char                                        20
hidden                                      25
batch                                        8
pretrained                                True
best_epoch                                   2
training_duration       0 days 00:03:34.972703
validation_accuracy                    0.95293
validation_accuracy3                  0.996562
Name: 182, dtype: object

In [6]:
# Correlation of each numeric hyperparameter with validation accuracy, highest first
columns = ["fold", "token", "char", "hidden", "batch", "validation_accuracy"]
correlations = results[columns].corr()
correlations["validation_accuracy"].sort_values(ascending=False)

validation_accuracy    1.000000
token                  0.772733
char                   0.116105
fold                  -0.001672
hidden                -0.387754
batch                       NaN
Name: validation_accuracy, dtype: float64

In [7]:
# Summary statistics of the top 3 validation accuracy over all models
results.loc[:, "validation_accuracy3"].describe()

count    201.000000
mean       0.994742
std        0.001612
min        0.990294
25%        0.993712
50%        0.995030
75%        0.996131
max        0.997219
Name: validation_accuracy3, dtype: float64

In [8]:
# Mean validation accuracy per hyperparameter setting (averaged over the folds),
# sorted in ascending order
hyperparams = ["token", "char", "hidden", "batch", "pretrained"]
fold_means = results.groupby(hyperparams)[["validation_accuracy"]].mean()
fold_means.sort_values(by="validation_accuracy")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,validation_accuracy
token,char,hidden,batch,pretrained,Unnamed: 5_level_1
10,5,25,8,True,0.925274
10,20,25,8,True,0.930512
10,10,25,8,True,0.930624
10,5,50,8,True,0.931418
10,10,50,8,True,0.933555
10,20,50,8,True,0.937157
25,10,25,8,True,0.938464
25,5,25,8,True,0.939472
25,20,25,8,True,0.939644
50,20,25,8,True,0.94212


In [9]:
# Summary statistics of the training time per model
durations = results["training_duration"]
durations.describe()

count                       201
mean     0 days 00:04:53.046505
std      0 days 00:01:35.293417
min      0 days 00:02:11.441762
25%      0 days 00:03:34.949977
50%      0 days 00:04:20.673414
75%      0 days 00:05:58.602896
max      0 days 00:09:21.750568
Name: training_duration, dtype: object

In [10]:
# Summary statistics of the epoch at which each model performed best
results.loc[:, "best_epoch"].describe()

count    201.000000
mean       3.572139
std        1.801671
min        1.000000
25%        2.000000
50%        3.000000
75%        5.000000
max        8.000000
Name: best_epoch, dtype: float64

## Error analysis