In [1]:
import time

import openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Fetch OpenML dataset 42078 and separate the features from its default target.
dataset = openml.datasets.get_dataset(42078)
X, y, _, _ = dataset.get_data(
    target=dataset.default_target_attribute,
    dataset_format="dataframe",
)
# Replace the raw target labels with integer class codes.
y = LabelEncoder().fit_transform(y)

# Seeded, stratified split: 100,000 training rows and 20,000 test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    train_size=100_000,
    test_size=20_000,
    random_state=0,
    shuffle=True,
)
# Object-dtype columns; the dtype is read from the same frame whose columns are
# being iterated (the original looked it up on the unsplit `X`).
str_cols = [col for col in X_train.columns if X_train[col].dtype == "O"]

In [2]:
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, log_loss, precision_score, recall_score

def evaluate_clf(clf):
  """Fit `clf` on the training split and print its fit time and test metrics.

  Reads the notebook-level `X_train`, `y_train`, `X_test` and `y_test`.

  Args:
    clf: Classifier exposing `fit`, `predict` and `predict_proba`.

  Prints, in order: the fit duration in seconds; accuracy and macro-averaged
  precision, recall and F1 computed from `predict`; macro one-vs-rest ROC AUC
  and log loss computed from `predict_proba`. Nothing is returned.
  """
  # perf_counter() is monotonic, so the measured duration cannot be distorted
  # by system clock adjustments the way a time.time() difference can.
  fit_start = time.perf_counter()
  clf.fit(X_train, y_train)
  print("Model fit time:", time.perf_counter() - fit_start, "seconds")

  y_pred = clf.predict(X_test)
  y_probs = clf.predict_proba(X_test)

  print("###### Label Predictions #######")
  print("accuracy:", accuracy_score(y_test, y_pred))
  print("precision:", precision_score(y_test, y_pred, average="macro"))
  print("recall:", recall_score(y_test, y_pred, average="macro"))
  print("f1:", f1_score(y_test, y_pred, average="macro"))

  print("###### Label Probabilities #######")
  # NOTE(review): both metrics below pair probability columns with classes by
  # position, so they assume predict_proba returns its columns in sorted label
  # order -- confirm this holds for every classifier passed in.
  print("roc_auc:", roc_auc_score(y_test, y_probs, average="macro", multi_class="ovr"))
  print("Log loss:", log_loss(y_test, y_probs))

# FastText without pretrained model

In [3]:
from gama.configuration.fasttextclassifier import FastTextClassifier
import time

# Baseline configuration: no pretrained vectors are passed to the classifier.
clf = FastTextClassifier(
    epoch=15,
    lr=0.2,
)
evaluate_clf(clf)

Read 1M words
Number of words:  146740
Number of labels: 104
Progress: 100.0% words/sec/thread:  988445 lr:  0.000000 avg.loss:  0.264490 ETA:   0h 0m 0s


Model fit time: 9.409299850463867 seconds




###### Label Predictions #######
accuracy: 0.9471
precision: 0.9184304975591595
recall: 0.9115738264943514
f1: 0.9133281387538676
###### Label Probabilities #######
roc_auc: 0.6607982366742533
Log loss: 10.241884007488643


# FastText with pretrained model, dim=100

In [4]:
from gama.configuration.fasttextclassifier import FastTextClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, log_loss, precision_score, recall_score

# Same epoch/lr as the baseline run, plus the "100.vec" vectors (pretrainedDim=100).
clf = FastTextClassifier(
    pretrainedVectors="100.vec",
    pretrainedDim=100,
    epoch=15,
    lr=0.2,
)
evaluate_clf(clf)

Read 1M words
Number of words:  146740
Number of labels: 104
Progress: 100.0% words/sec/thread:  997758 lr:  0.000000 avg.loss:  0.101807 ETA:   0h 0m 0s 39.8% words/sec/thread: 1006802 lr:  0.120379 avg.loss:  0.245574 ETA:   0h 0m 6s


Model fit time: 106.84839177131653 seconds




###### Label Predictions #######
accuracy: 0.95085
precision: 0.9282907799286956
recall: 0.9216491119070563
f1: 0.9236953304790346
###### Label Probabilities #######
roc_auc: 0.6532018107974695
Log loss: 10.442995417574002


# FastText with pretrained model, dim=20

In [5]:
from gama.configuration.fasttextclassifier import FastTextClassifier

# Same epoch/lr as the baseline run, plus the "20.vec" vectors (pretrainedDim=20).
clf = FastTextClassifier(
    pretrainedVectors="20.vec",
    pretrainedDim=20,
    epoch=15,
    lr=0.2,
)
evaluate_clf(clf)

Read 1M words
Number of words:  146740
Number of labels: 104
Progress: 100.0% words/sec/thread: 1899770 lr:  0.000000 avg.loss:  0.123397 ETA:   0h 0m 0s


Model fit time: 29.275720834732056 seconds




###### Label Predictions #######
accuracy: 0.9478
precision: 0.9300823716726595
recall: 0.9166128604129699
f1: 0.9223670090719438
###### Label Probabilities #######
roc_auc: 0.646936175251893
Log loss: 10.416794373251498


# FastText with pretrained model, dim=10

In [9]:
from gama.configuration.fasttextclassifier import FastTextClassifier

# Same epoch/lr as the baseline run, plus the "10.vec" vectors (pretrainedDim=10).
clf = FastTextClassifier(
    pretrainedVectors="10.vec",
    pretrainedDim=10,
    epoch=15,
    lr=0.2,
)
evaluate_clf(clf)

Read 1M words
Number of words:  146740
Number of labels: 104
Progress: 100.0% words/sec/thread: 2483672 lr:  0.000000 avg.loss:  0.159481 ETA:   0h 0m 0s


Model fit time: 18.906963109970093 seconds




###### Label Predictions #######
accuracy: 0.9425
precision: 0.917707658423152
recall: 0.9055881704508555
f1: 0.9100104649580992
###### Label Probabilities #######
roc_auc: 0.6375738190918786
Log loss: 10.369919593889739


# Results
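Using a pretrained model barely improves the performance of FastTextClassifier on the label-prediction metrics (accuracy, precision, recall and F1 move by roughly one percentage point at most, and the dim=10 model scores slightly below the baseline), while greatly increasing the training time of the classifier due to the loading time of the pretrained models.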

Surprisingly, the usage of pretrained models even **worsens** the scores that are calculated from class probabilities: ROC AUC decreases and log loss increases. We suspect that using pretrained models lowers the confidence of the classification model due to activations of word vectors from the pretrained model.

# Conclusion
We conclude that the usage of pretrained models **does not meaningfully improve** the performance of the FastTextClassifier and is in some cases even detrimental to it. It also increases the training time significantly due to the loading time of the pretrained models.

In [7]:
# Reduce the size of the pretrained model
#
# Kept for reference only: every statement below is commented out, so running
# this cell does nothing. Uncommented, it would load `cc.en.300.bin`, shrink the
# model to 10 dimensions with `fasttext.util.reduce_model`, and save the result
# as `cc.en.10.bin`; the `get_dimension()` calls show the size before and after.
# NOTE(review): this writes a `.bin` model, whereas the experiment cells pass
# `.vec` files (`10.vec`, `20.vec`, `100.vec`) as pretrainedVectors -- the step
# that produces those `.vec` files is not shown here; confirm how they were made.

# import fasttext as ft
# import fasttext.util

# m = ft.load_model("cc.en.300.bin")
# m.get_dimension()
# fasttext.util.reduce_model(m, 10)
# m.save_model("cc.en.10.bin")
# m.get_dimension()