In [1]:
import openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Download OpenML dataset 42078 as pandas objects; the two extra return
# values of get_data are not used in this notebook.
dataset = openml.datasets.get_dataset(42078)
X, y, _, _ = dataset.get_data(
    target=dataset.default_target_attribute,
    dataset_format="dataframe",
)

# Replace the raw target values with integer class codes.
y = LabelEncoder().fit_transform(y)

# Only the object-dtype columns are used as classifier input.
str_cols = [name for name, dtype in X.dtypes.items() if dtype == "O"]

# Seeded, stratified split into 100k training rows and 20k test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X[str_cols],
    y,
    stratify=y,
    train_size=100_000,
    test_size=20_000,
    random_state=0,
    shuffle=True,
)

In [2]:
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, log_loss, precision_score, recall_score

def evaluate_clf(clf):
  """Fit ``clf`` on the training split and print its test-set metrics.

  Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
  ``y_test``. Prints the fit duration, then accuracy and macro-averaged
  precision/recall/F1 computed from ``clf.predict``, then one-vs-rest
  macro ROC AUC and log loss computed from ``clf.predict_proba``.

  Args:
    clf: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.

  Returns:
    None. All results are printed.
  """
  # Imported here so the function works even when no earlier cell has
  # imported ``time`` (the only other import lives in a later cell).
  import time

  # perf_counter is a monotonic clock, so the measured duration cannot be
  # distorted by system clock adjustments during the fit.
  start = time.perf_counter()
  clf.fit(X_train, y_train)
  print("Model fit time:", time.perf_counter() - start, "seconds")

  y_pred = clf.predict(X_test)
  y_probs = clf.predict_proba(X_test)

  print("###### Label Predictions #######")
  print("accuracy:", accuracy_score(y_test, y_pred))
  print("precision:", precision_score(y_test, y_pred, average="macro"))
  print("recall:", recall_score(y_test, y_pred, average="macro"))
  print("f1:", f1_score(y_test, y_pred, average="macro"))

  # NOTE(review): both probability metrics interpret column i of y_probs as
  # the score of the i-th label in sorted order. Presumably predict_proba
  # returns its columns in that order -- TODO confirm, since a mismatch
  # would silently degrade both scores without affecting the label metrics.
  print("###### Label Probabilities #######")
  print("roc_auc:", roc_auc_score(y_test, y_probs, average="macro", multi_class="ovr"))
  print("Log loss:", log_loss(y_test, y_probs))

# FastText without pretrained model

In [3]:
from gama.configuration.fasttextclassifier import FastTextClassifier
import time

# Baseline run: no pretrained vectors are passed to the classifier.
clf = FastTextClassifier(
    epoch=15,
    lr=0.2,
)
evaluate_clf(clf)

Read 0M words
Number of words:  25698
Number of labels: 104
Progress: 100.0% words/sec/thread:  561968 lr:  0.002112 avg.loss:  0.206068 ETA:   0h 0m 0s

Model fit time: 7.3589348793029785 seconds


Progress: 100.0% words/sec/thread:  557209 lr:  0.000000 avg.loss:  0.204000 ETA:   0h 0m 0s


###### Label Predictions #######
accuracy: 0.94105
precision: 0.9231602714157237
recall: 0.9123943362459989
f1: 0.9163341401263136
###### Label Probabilities #######
roc_auc: 0.6338957541492822
Log loss: 10.383634787919807


# FastText with pretrained model, dim=100

In [4]:
from gama.configuration.fasttextclassifier import FastTextClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, log_loss, precision_score, recall_score

# Run initialised from the vectors in "100.vec", with dim=100.
clf = FastTextClassifier(
    pretrainedVectors="100.vec",
    dim=100,
    epoch=15,
    lr=0.2,
)
evaluate_clf(clf)

Read 0M words
Number of words:  25698
Number of labels: 104
Progress: 100.0% words/sec/thread:  562147 lr:  0.000000 avg.loss:  0.056791 ETA:   0h 0m 0s


Model fit time: 109.47481489181519 seconds




###### Label Predictions #######
accuracy: 0.94395
precision: 0.9295523875948072
recall: 0.9191230299800558
f1: 0.9235064620124229
###### Label Probabilities #######
roc_auc: 0.6302089306763654
Log loss: 10.563789094786433


# FastText with pretrained model, dim=20

In [5]:
from gama.configuration.fasttextclassifier import FastTextClassifier

# Run initialised from the vectors in "20.vec", with dim=20.
clf = FastTextClassifier(
    pretrainedVectors="20.vec",
    dim=20,
    epoch=15,
    lr=0.2,
)
evaluate_clf(clf)

Read 0M words
Number of words:  25698
Number of labels: 104
Progress: 100.0% words/sec/thread: 1018112 lr:  0.000000 avg.loss:  0.074384 ETA:   0h 0m 0s 0m 0s


Model fit time: 30.42029595375061 seconds




###### Label Predictions #######
accuracy: 0.93395
precision: 0.9173079778281257
recall: 0.9005298518877918
f1: 0.9068466232696917
###### Label Probabilities #######
roc_auc: 0.6327610671750777
Log loss: 10.558693600697756


# FastText with pretrained model, dim=10

In [6]:
from gama.configuration.fasttextclassifier import FastTextClassifier

# Run initialised from the vectors in "10.vec", with dim=10.
clf = FastTextClassifier(
    pretrainedVectors="10.vec",
    dim=10,
    epoch=15,
    lr=0.2,
)
evaluate_clf(clf)

Read 0M words
Number of words:  25698
Number of labels: 104
Progress: 100.0% words/sec/thread: 1391374 lr:  0.000000 avg.loss:  0.141932 ETA:   0h 0m 0s


Model fit time: 19.026647090911865 seconds




###### Label Predictions #######
accuracy: 0.9168
precision: 0.8923570240463263
recall: 0.8788533308737339
f1: 0.8842424539885628
###### Label Probabilities #######
roc_auc: 0.6134300841898107
Log loss: 10.57638846292496


# Results
We experimented with pretrained models of varying sizes and found that the largest pretrained model (`dim=100`) brings slight improvements in accuracy, precision, recall, and F1 (e.g. accuracy 0.9440 vs. 0.9411), but at the cost of a roughly 15 times longer fit time (about 109 s vs. 7 s). The experiments with the smaller models (`dim=20` and `dim=10`) yield worse results than not using a pretrained model at all.
Using a pretrained model therefore at best marginally improves the performance of FastTextClassifier on the evaluated metrics, while greatly increasing the training time of the classifier due to the loading time of the pretrained models.

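Surprisingly, the usage of pretrained models even **worsens** the scores that are calculated from class probabilities: *ROC AUC* decreases slightly and *log loss* increases. We suspect that using pretrained models lowers the confidence of the classification model due to activations of word vectors from the pretrained model. The usage of smaller pretrained models therefore causes underfitting in the downstream classification models. Note, however, that the probability-based scores are unexpectedly poor in every run, including the baseline (ROC AUC ≈ 0.63 and log loss ≈ 10.4 at 94% accuracy), so the correspondence between the columns returned by `predict_proba` and the encoded labels should be verified before drawing conclusions from these two metrics.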

# Conclusion
We conclude that the usage of pretrained models **does not meaningfully improve** the performance of the FastTextClassifier on the beer review dataset, and is in some cases even detrimental to it. It also increases the training time significantly due to the loading time of the pretrained models.

# Further work
We have observed that the use of pretrained models does not bring any meaningful improvement to the classifier on this dataset. The next step is to test whether this observation also holds on other datasets.

In [7]:
# Reduce the size of the pretrained model

# import fasttext as ft
# import fasttext.util

# m = ft.load_model("cc.en.300.bin")
# m.get_dimension()
# fasttext.util.reduce_model(m, 10)
# m.save_model("cc.en.10.bin")
# m.get_dimension()