# Multi-Label Classifier

In [2]:
import os
from pathlib import Path

from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings, CharacterEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

# Directory holding the cooking.train / cooking.valid / cooking.test files.
# The COOKING_DATA_DIR environment variable overrides the author's local path,
# so the notebook can run on another machine without editing this cell.
data_path = os.environ.get(
    'COOKING_DATA_DIR',
    '/home/wohlg/itmo/misc/cooking_classification/simple_and_preprocessed',
)
data_dir = Path(data_path)
if not data_dir.is_dir():
    # Fail here with an actionable message instead of deep inside the loader.
    raise FileNotFoundError(
        'Corpus directory not found: {} '
        '(set COOKING_DATA_DIR to point at the data)'.format(data_dir)
    )

# Load the three splits as one classification corpus.
corpus = NLPTaskDataFetcher.load_classification_corpus(
    data_dir,
    train_file='cooking.train',
    dev_file='cooking.valid',
    test_file='cooking.test',
)

# Token-level embeddings fed to the document encoder. Only GloVe is active.
# Alternatives the author left disabled, kept here for reference:
#   - additionally stack FlairEmbeddings('news-forward-fast') and
#     FlairEmbeddings('news-backward-fast')
#   - use [CharacterEmbeddings()] on its own
word_embeddings = [WordEmbeddings('glove')]

# Document encoder: word vectors are reprojected to 256 dimensions and fed to
# a recurrent layer with hidden size 512 (see the printed module summary).
document_embeddings = DocumentLSTMEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

# Blank separator line, then the module structure of the encoder.
print()
print(document_embeddings)


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
2019-04-21 10:38:30,532 Reading data from /home/wohlg/itmo/misc/cooking_classification/simple_and_preprocessed
2019-04-21 10:38:30,533 Train: /home/wohlg/itmo/misc/cooking_classification/simple_and_preprocessed/cooking.train
2019-04-21 10:38:30,533 Dev: /home/wohlg/itmo/misc/cooking_classification/simple_and_preprocessed/cooking.valid
2019-04-21 10:38:30,534 Test: /home/wohlg/itmo/misc/cooking_classification/simple_and_preprocessed/cooking.test

DocumentLSTMEmbeddings(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings()
  )
  (word_reprojection_map): Linear(in_features=100, out_features=256, bias=True)
  (rnn): GRU(256, 512)
  (dropout): Dropout(p=0.5)
)




In [3]:
# One-line overview of the split sizes first ...
print(str(corpus))

# ... then the statistics report produced by the corpus itself.
corpus_stats = corpus.obtain_statistics()
print(corpus_stats)

TaggedCorpus: 7502 train + 1000 dev + 1000 test sentences
{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 7502,
        "number_of_documents_per_class": {
            "sauce": 327,
            "cheese": 227,
            "food-safety": 943,
            "storage-method": 359,
            "equipment": 649,
            "bread": 564,
            "baking": 1133,
            "substitutions": 710,
            "chocolate": 227,
            "oven": 223,
            "storage-lifetime": 252,
            "cake": 309,
            "flavor": 290,
            "beef": 190,
            "food-science": 220,
            "cookies": 179,
            "fruit": 211,
            "vegetables": 243,
            "meat": 327,
            "oil": 224,
            "chicken": 369,
            "eggs": 344,
            "pasta": 184,
            "frying": 227,
            "temperature": 211,
            "sugar": 206,
            "food-preservation": 192,
            "freezing": 281,
      

In [4]:
# Build the label vocabulary from the corpus, then a multi-label classifier
# on top of the document embeddings defined earlier in the notebook.
label_dictionary = corpus.make_label_dictionary()
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dictionary,
    multi_label=True,
)

# Training artefacts are written directly into /tmp; the prediction cell
# later loads /tmp/best-model.pt from there.
# NOTE(review): the recorded output reports acc/f1 of 0.0 for every class and
# only three entries per history list despite max_epochs=20 - the run appears
# to have ended early or the cell was edited after execution; confirm.
trainer = ModelTrainer(classifier, corpus)

# Kept as the cell's last expression so the returned result dict is displayed.
trainer.train('/tmp', max_epochs=20)

2019-04-21 10:39:46,377 ----------------------------------------------------------------------------------------------------
2019-04-21 10:39:46,378 Evaluation method: MICRO_F1_SCORE
2019-04-21 10:39:46,405 ----------------------------------------------------------------------------------------------------
2019-04-21 10:39:46,541 epoch 1 - iter 0/235 - loss 0.02209757
2019-04-21 10:39:48,754 epoch 1 - iter 23/235 - loss 0.01400830
2019-04-21 10:39:50,957 epoch 1 - iter 46/235 - loss 0.01073977
2019-04-21 10:39:53,534 epoch 1 - iter 69/235 - loss 0.00926511
2019-04-21 10:39:56,666 epoch 1 - iter 92/235 - loss 0.00849951
2019-04-21 10:39:59,865 epoch 1 - iter 115/235 - loss 0.00801554
2019-04-21 10:40:03,058 epoch 1 - iter 138/235 - loss 0.00768011
2019-04-21 10:40:06,447 epoch 1 - iter 161/235 - loss 0.00745158
2019-04-21 10:40:09,162 epoch 1 - iter 184/235 - loss 0.00726992
2019-04-21 10:40:11,618 epoch 1 - iter 207/235 - loss 0.00713704
2019-04-21 10:40:15,126 epoch 1 - iter 230/235 -

  result = unpickler.load()


2019-04-21 10:41:39,681 MICRO_AVG: acc 0.0 - f1-score 0.0
2019-04-21 10:41:39,682 MACRO_AVG: acc 0.0 - f1-score 0.0
2019-04-21 10:41:39,683 baking     tp: 0 - fp: 0 - fn: 146 - tn: 854 - precision: 0.0000 - recall: 0.0000 - accuracy: 0.0000 - f1-score: 0.0000
2019-04-21 10:41:39,683 beef       tp: 0 - fp: 0 - fn: 31 - tn: 969 - precision: 0.0000 - recall: 0.0000 - accuracy: 0.0000 - f1-score: 0.0000
2019-04-21 10:41:39,684 bread      tp: 0 - fp: 0 - fn: 56 - tn: 944 - precision: 0.0000 - recall: 0.0000 - accuracy: 0.0000 - f1-score: 0.0000
2019-04-21 10:41:39,684 cake       tp: 0 - fp: 0 - fn: 48 - tn: 952 - precision: 0.0000 - recall: 0.0000 - accuracy: 0.0000 - f1-score: 0.0000
2019-04-21 10:41:39,685 cheese     tp: 0 - fp: 0 - fn: 37 - tn: 963 - precision: 0.0000 - recall: 0.0000 - accuracy: 0.0000 - f1-score: 0.0000
2019-04-21 10:41:39,686 chicken    tp: 0 - fp: 0 - fn: 62 - tn: 938 - precision: 0.0000 - recall: 0.0000 - accuracy: 0.0000 - f1-score: 0.0000
2019-04-21 10:41:39,686 c

{'test_score': 0.0,
 'dev_score_history': [0.0, 0.0, 0.0],
 'train_loss_history': [0.007012629895901813,
  0.005934814311478558,
  0.005868137669353541],
 'dev_loss_history': [0.005862580146640539,
  0.0057755145244300365,
  0.00570960296317935]}

In [5]:
from flair.data import Sentence

# Checkpoint written by the training cell, and a sample question to classify.
model_file = '/tmp/best-model.pt'
example_text = 'where is the bacon is it in the oven.'

# Reload the saved classifier (this rebinds `classifier` to the loaded model).
classifier = TextClassifier.load_from_file(model_file)

# Wrap the raw text in a flair Sentence and run prediction on it.
sentence = Sentence(example_text)
classifier.predict(sentence)

# Show whatever labels are attached to the sentence after prediction.
# NOTE(review): the recorded output is an empty list, i.e. no label was
# assigned to this example.
print(sentence.labels)

2019-04-21 10:41:46,314 loading file /tmp/best-model.pt


  result = unpickler.load()


[]
