In [1]:
from pathlib import Path
import fasttext
from fasttext import util as ftu

In [2]:
# Pretrained 300-dimensional Latin fastText model, expected under
# ~/cltk_data. The home directory is resolved at runtime instead of
# hard-coding one user's absolute path, which keeps this cell consistent
# with how the other cltk_data paths in this notebook are built.
vec_file = Path.home() / "cltk_data" / "cc.la.300.bin"

In [3]:
# Sanity check: the pretrained model file must be present before loading it.
vec_file.exists()

True

In [4]:
# Load the binary fastText model; load_model is given a plain string path.
embeddings = fasttext.load_model(str(vec_file))

In [5]:
# Vector dimensionality of the model as loaded (before any reduction).
embeddings.get_dimension()

300

In [6]:
# Shrink the embeddings to 100 dimensions. `embeddings` itself is modified:
# the get_dimension() call in the next cell reports the new size.
ftu.reduce_model(embeddings, 100)

<fasttext.FastText._FastText at 0x1054d5af0>

In [7]:
# Confirm that the reduction took effect on the loaded model.
embeddings.get_dimension()

100

In [8]:
# Save the reduced model next to the original, e.g. cc.la.300.bin ->
# cc.la.100.bin. Only the file name is rewritten: a str.replace on the full
# path would also change any "300" that occurs in a parent directory name.
embeddings.save_model(
    str(vec_file.with_name(vec_file.name.replace("300", "100")))
)

In [9]:
from latin_author_learning.fasttext_wrapper import model_to_vec_str



In [10]:
# Serialise the reduced model with the project helper; the resulting string
# is written to a .vec file below and later passed to fastText as
# pretrainedVectors.
# NOTE(review): presumably the textual word2vec/.vec layout -- confirm in
# latin_author_learning.fasttext_wrapper.
vec_str = model_to_vec_str(embeddings)

In [11]:
from pathlib import Path
import os

In [12]:
# Root directory of the Latin text corpora inside the user's cltk_data tree.
# Path.home() is used rather than os.environ["HOME"], which raises KeyError
# on platforms where HOME is not set (e.g. Windows).
cltk_path = Path.home() / "cltk_data" / "lat" / "text"

In [13]:
# Directory holding the Perseus Latin texts.
perseus_path = cltk_path.joinpath("lat_text_perseus")

In [14]:
# Destination of the 100-dimensional vectors in text form.
vec_path = cltk_path.joinpath("cc.la.100.vec")

In [15]:
# Write the serialised vectors to disk. The encoding is pinned to UTF-8 so
# that non-ASCII tokens do not depend on the platform's locale default;
# fastText expects UTF-8 input.
with open(vec_path, "w", encoding="utf-8") as f:
    f.write(vec_str)

In [16]:
from latin_author_learning.corpus import Corpus

In [17]:
# Create the corpus container; it is filled from disk in the next cell.
perseus = Corpus("lat_text_perseus")

In [18]:
# Populate the corpus from the Perseus directory, restricted to files whose
# name contains "_lat.".
# NOTE(review): meta_keys / meta_key_prefix presumably select TEI header
# fields to keep as metadata -- confirm against Corpus.add_data_from_files.
perseus.add_data_from_files(
    perseus_path,
    meta_key_prefix="@",
    meta_keys=["fileDesc", "teiHeader"],
    filename_contains="_lat.",
)

In [19]:
from latin_author_learning.fasttext_wrapper import DatasetWrapper

In [20]:
# Wrap the corpus for fastText training.
# NOTE(review): presumably 20% of the data is held out and texts are split
# into chunks of 100 units -- confirm units and semantics in DatasetWrapper.
ds = DatasetWrapper(
    corpus=perseus,
    chunksize=100,
    fraction_for_test=0.2,
)

In [21]:
# Training split location; this file is later passed as `input` to
# fasttext.train_supervised.
# NOTE(review): "perseues" looks like a typo for "perseus"; the name is kept
# unchanged so files already on disk are still found.
trainfile = cltk_path.joinpath("perseues_train.txt")
ds.get_training_data(trainfile)

In [22]:
# Validation split location; used for autotuning and for every .test() call
# further down.
# NOTE(review): "perseues" looks like a typo for "perseus"; the name is kept
# unchanged so files already on disk are still found.
valid_file = cltk_path.joinpath("perseues_valid.txt")
ds.get_validation_data(valid_file)

In [None]:
# Hand-picked baseline: supervised author classifier initialised from the
# reduced 100-dimensional vectors (dim must match the pretrained vectors).
# NOTE(review): this cell has no execution count and its captured log stops
# at 3.5%, so the run appears to have been interrupted -- confirm which
# training run produced the `author_model` that is evaluated afterwards.
author_model = fasttext.train_supervised(
    input=str(trainfile),
    pretrainedVectors=str(vec_path),
    dim=100,
    lr=1.0,
    epoch=200,
    wordNgrams=2,
    minn=3,
    maxn=6,
    thread=7,
)

Read 3M words
Number of words:  392447
Number of labels: 23
Progress:   3.5% words/sec/thread:  207449 lr:  0.964639 avg.loss:  0.197098 ETA:   0h10m19s

In [24]:
# Evaluate on the validation file. fastText's test() returns
# (number of samples, precision@k, recall@k), with k=1 by default.
author_model.test(str(valid_file))

(9246, 0.8822193380921479, 0.8822193380921479)

In [23]:
# Hyper-parameter search: fastText autotunes against the validation file and
# then retrains with the best arguments found (see the log output below).
# dim and epoch are passed explicitly; dim matches the pretrained vectors.
author_model = fasttext.train_supervised(
    input=str(trainfile),
    pretrainedVectors=str(vec_path),
    dim=100,
    epoch=200,
    thread=7,
    autotuneValidationFile=str(valid_file),
    autotuneDuration=90 * 60,  # seconds, i.e. 90 minutes
)

Progress: 100.0% Trials:    7 Best score:  0.880813 ETA:   0h 0m 0s
Training again with best arguments
Read 3M words
Number of words:  392447
Number of labels: 23

Aborting autotune...
Progress: 100.0% words/sec/thread:  117008 lr:  0.000000 avg.loss:  0.007705 ETA:   0h 0m 0s 41.6% words/sec/thread:  178615 lr:  0.844785 avg.loss:  0.016640 ETA:   0h 7m15s 89.6% words/sec/thread:  112343 lr:  0.149740 avg.loss:  0.008444 ETA:   0h 2m 2s


In [25]:
# Top-1 evaluation of the current `author_model` on the validation file:
# (number of samples, precision@1, recall@1).
author_model.test(str(valid_file))

(9246, 0.8778931429807484, 0.8778931429807484)

In [28]:
# Same evaluation with k=3. In the output below precision@3 is about one
# third of recall@3, which is consistent with one true label per sample.
author_model.test(str(valid_file), 3)

(9246, 0.32244574230297784, 0.9673372269089336)

In [29]:
# Same evaluation with k=2 (precision@2, recall@2).
author_model.test(str(valid_file), 2)

(9246, 0.4724745836037205, 0.944949167207441)

In [30]:
# Inspect the model's instance attributes, which include the hyper-parameters
# the model was trained with.
vars(author_model)

{'f': <fasttext_pybind.fasttext at 0x10563f5f0>,
 '_words': None,
 '_labels': None,
 'lr': 1.4455293468027774,
 'dim': 100,
 'ws': 5,
 'epoch': 200,
 'minCount': 1,
 'minCountLabel': 0,
 'minn': 3,
 'maxn': 6,
 'neg': 5,
 'wordNgrams': 5,
 'loss': <loss_name.softmax: 3>,
 'bucket': 186020,
 'thread': 7,
 'lrUpdateRate': 100,
 't': 0.0001,
 'label': '__label__',
 'verbose': 2,
 'pretrainedVectors': '/Users/stm/cltk_data/lat/text/cc.la.100.vec'}

In [31]:
# Grab the model's attribute dict. This is the live dict, not a copy; the
# next cell rebinds `parameters` to a filtered copy before anything is
# modified.
parameters = vars(author_model)

In [34]:
# Keep only entries that can be passed back to train_supervised as keyword
# arguments: drop private attributes (leading underscore) and "f", "label"
# and "loss". In the dump above "f" is the pybind handle and "loss" is an
# enum object rather than a plain value.
parameters = {
    name: value
    for name, value in parameters.items()
    if not (name.startswith("_") or name in ("f", "label", "loss"))
}

In [36]:
# Double the number of epochs (200 -> 400) for the retraining run.
parameters.update(epoch=400)

In [37]:
# Retrain with the filtered hyper-parameters of `author_model` plus the
# longer epoch budget; only the training input is supplied separately.
improved_model = fasttext.train_supervised(
    input=str(trainfile),
    **parameters,
)

Read 3M words
Number of words:  392447
Number of labels: 23
Progress: 100.0% words/sec/thread:   55097 lr:  0.000000 avg.loss:  0.004249 ETA:   0h 0m 0s 36.6% words/sec/thread:  169654 lr:  0.915933 avg.loss:  0.009465 ETA:   0h16m34s 43.7% words/sec/thread:  169089 lr:  0.813669 avg.loss:  0.008143 ETA:   0h14m46s 54.0% words/sec/thread:  169482 lr:  0.664525 avg.loss:  0.006825 ETA:   0h12m 2s 59.4% words/sec/thread:  168304 lr:  0.587108 avg.loss:  0.006320 ETA:   0h10m42s 68.7% words/sec/thread:  167034 lr:  0.451791 avg.loss:  0.005626 ETA:   0h 8m18s 79.7% words/sec/thread:  166604 lr:  0.293029 avg.loss:  0.005020 ETA:   0h 5m24s


In [38]:
# Top-1 evaluation of the retrained model:
# (number of samples, precision@1, recall@1).
improved_model.test(str(valid_file))

(9246, 0.8837335063811378, 0.8837335063811378)

In [39]:
# Evaluation of the retrained model with k=2 (precision@2, recall@2).
improved_model.test(str(valid_file), 2)

(9246, 0.47350205494267794, 0.9470041098853559)

In [40]:
# Evaluation of the retrained model with k=3 (precision@3, recall@3).
improved_model.test(str(valid_file), 3)

(9246, 0.3224817939289062, 0.9674453817867186)