In [1]:
!pip install flair -U
!pip install allennlp==0.9.0 --no-deps
!pip install spacy
!pip uninstall lxml -y
!pip install lxml


Collecting flair
  Downloading flair-0.11.3-py3-none-any.whl (401 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.9/401.9 kB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece==0.1.95
  Downloading sentencepiece-0.1.95-cp39-cp39-manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m99.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting more-itertools
  Downloading more_itertools-8.14.0-py3-none-any.whl (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.2/52.2 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
Collecting mpld3==0.3
  Downloading mpld3-0.3.tar.gz (788 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m788.5/788.5 kB[0m [31m106.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting gensim>=3.4.0
  Downloading gensim-4.2.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.0 MB)
[2

In [1]:
import os
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings, ELMoEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

In [2]:
def segment_data(data_file):
    try:
        import pandas as pd
    except ImportError:
        raise
    
    data = pd.read_csv(data_file, encoding='latin-1').sample(frac=1).drop_duplicates()
    data = data[['classes', 'title']].rename(columns={"classes":"label", "title":"text"})
    data['label'] = '__label__' +data['label'].astype(str)
    data['text'] = data['text'].apply(lambda k: k.lower().strip())
    data.to_csv('./data/whole.csv', sep='\t', index = False, header = False)
    data.iloc[0:int(len(data)*0.8)].to_csv('./data/train.csv', sep='\t', index = False, header = False)
    data.iloc[int(len(data)*0.8):int(len(data)*0.9)].to_csv('./data/test.csv', sep='\t', index = False, header = False)
    data.iloc[int(len(data)*0.9):].to_csv('./data/dev.csv', sep='\t', index = False, header = False)
    return

In [3]:
def initialize_embeddings():
    """
    Summary:
        Stacks the list of pre-trained embedding vectors to be used as word representation (in concat.)
    Return:
        list: Returns list of pretrained embeddings vectors
    """
    word_embeddings = [
			WordEmbeddings('glove'),
			FlairEmbeddings('news-forward'),
			FlairEmbeddings('news-backward')
		]
    return word_embeddings
 
word_embeddings = initialize_embeddings()

In [None]:
import flair.datasets
corpus = flair.datasets.UD_ENGLISH()
ll = corpus.make_label_dictionary(label_type = 'upos')
document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=512, reproject_words=True, reproject_words_dimension=256, rnn_type='LSTM', rnn_layers=1, bidirectional=False)

classifier = TextClassifier(document_embeddings, label_dictionary=ll, multi_label=False, label_type = 'upos')

trainer = ModelTrainer(classifier, corpus)

trainer.train('./model', max_epochs=20, patience=5, mini_batch_size=32, learning_rate=0.1)

2022-10-06 01:34:52,115 https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-dev.conllu not found in cache, downloading to /tmp/tmplavk_8ha


1738438B [00:00, 108837039.44B/s]        

2022-10-06 01:34:52,149 copying /tmp/tmplavk_8ha to cache at /root/.flair/datasets/ud_english/en_ewt-ud-dev.conllu
2022-10-06 01:34:52,151 removing temp file /tmp/tmplavk_8ha
2022-10-06 01:34:52,248 https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-test.conllu not found in cache, downloading to /tmp/tmpzkxkpi_b



1738935B [00:00, 74259525.00B/s]         

2022-10-06 01:34:52,303 copying /tmp/tmpzkxkpi_b to cache at /root/.flair/datasets/ud_english/en_ewt-ud-test.conllu
2022-10-06 01:34:52,305 removing temp file /tmp/tmpzkxkpi_b





2022-10-06 01:34:52,403 https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-train.conllu not found in cache, downloading to /tmp/tmpx4kzj3tt


13686411B [00:00, 130812490.38B/s]        

2022-10-06 01:34:52,529 copying /tmp/tmpx4kzj3tt to cache at /root/.flair/datasets/ud_english/en_ewt-ud-train.conllu
2022-10-06 01:34:52,539 removing temp file /tmp/tmpx4kzj3tt
2022-10-06 01:34:52,541 Reading data from /root/.flair/datasets/ud_english
2022-10-06 01:34:52,542 Train: /root/.flair/datasets/ud_english/en_ewt-ud-train.conllu
2022-10-06 01:34:52,542 Dev: /root/.flair/datasets/ud_english/en_ewt-ud-dev.conllu
2022-10-06 01:34:52,543 Test: /root/.flair/datasets/ud_english/en_ewt-ud-test.conllu





2022-10-06 01:35:03,813 Computing label dictionary. Progress:


12543it [00:00, 31560.54it/s]

2022-10-06 01:35:04,213 Dictionary created for label 'upos' with 18 values: NOUN (seen 34761 times), PUNCT (seen 23620 times), VERB (seen 22946 times), PRON (seen 18589 times), ADP (seen 17730 times), DET (seen 16314 times), ADJ (seen 13167 times), AUX (seen 12440 times), PROPN (seen 12345 times), ADV (seen 9462 times), CCONJ (seen 6690 times), PART (seen 5745 times), SCONJ (seen 4554 times), NUM (seen 4119 times), X (seen 704 times), SYM (seen 698 times), INTJ (seen 694 times)





2022-10-06 01:35:04,525 ----------------------------------------------------------------------------------------------------
2022-10-06 01:35:04,623 Model: "TextClassifier(
  (decoder): Linear(in_features=512, out_features=18, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
  (locked_dropout): LockedDropout(p=0.0)
  (word_dropout): WordDropout(p=0.0)
  (loss_function): CrossEntropyLoss()
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings(
        'glove'
        (embedding): Embedding(400001, 100)
      )
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=300, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)

100%|██████████| 32/32 [00:04<00:00,  7.45it/s]


2022-10-06 01:35:43,865 Evaluating as a multi-label problem: False
2022-10-06 01:35:43,983 DEV : loss 0.03687284141778946 - f1-score (micro avg)  0.0
2022-10-06 01:35:44,104 BAD EPOCHS (no improvement): 0
2022-10-06 01:35:44,106 ----------------------------------------------------------------------------------------------------
2022-10-06 01:35:45,416 epoch 2 - iter 19/196 - loss 0.03538036 - samples/sec: 930.34 - lr: 0.100000
2022-10-06 01:35:46,652 epoch 2 - iter 38/196 - loss 0.03549523 - samples/sec: 985.66 - lr: 0.100000
2022-10-06 01:35:47,929 epoch 2 - iter 57/196 - loss 0.03511959 - samples/sec: 954.20 - lr: 0.100000
2022-10-06 01:35:49,247 epoch 2 - iter 76/196 - loss 0.03482265 - samples/sec: 923.55 - lr: 0.100000
2022-10-06 01:35:50,598 epoch 2 - iter 95/196 - loss 0.03456914 - samples/sec: 901.93 - lr: 0.100000
2022-10-06 01:35:51,857 epoch 2 - iter 114/196 - loss 0.03440604 - samples/sec: 967.22 - lr: 0.100000
2022-10-06 01:35:53,093 epoch 2 - iter 133/196 - loss 0.0342940

100%|██████████| 32/32 [00:01<00:00, 18.94it/s]


2022-10-06 01:35:59,169 Evaluating as a multi-label problem: False
2022-10-06 01:35:59,284 DEV : loss 0.0336775965988636 - f1-score (micro avg)  0.0
2022-10-06 01:35:59,410 BAD EPOCHS (no improvement): 0
2022-10-06 01:35:59,412 ----------------------------------------------------------------------------------------------------
2022-10-06 01:36:00,716 epoch 3 - iter 19/196 - loss 0.03270931 - samples/sec: 934.73 - lr: 0.100000
2022-10-06 01:36:02,056 epoch 3 - iter 38/196 - loss 0.03219619 - samples/sec: 908.87 - lr: 0.100000
2022-10-06 01:36:03,352 epoch 3 - iter 57/196 - loss 0.03242623 - samples/sec: 940.44 - lr: 0.100000
2022-10-06 01:36:04,618 epoch 3 - iter 76/196 - loss 0.03231119 - samples/sec: 962.21 - lr: 0.100000
2022-10-06 01:36:05,914 epoch 3 - iter 95/196 - loss 0.03228811 - samples/sec: 939.83 - lr: 0.100000
2022-10-06 01:36:07,233 epoch 3 - iter 114/196 - loss 0.03212000 - samples/sec: 923.95 - lr: 0.100000
2022-10-06 01:36:08,541 epoch 3 - iter 133/196 - loss 0.03221477

100%|██████████| 32/32 [00:01<00:00, 20.00it/s]


2022-10-06 01:36:14,880 Evaluating as a multi-label problem: False
2022-10-06 01:36:14,988 DEV : loss 0.03132303059101105 - f1-score (micro avg)  0.0
2022-10-06 01:36:15,108 BAD EPOCHS (no improvement): 0
2022-10-06 01:36:15,109 ----------------------------------------------------------------------------------------------------
2022-10-06 01:36:16,433 epoch 4 - iter 19/196 - loss 0.03072634 - samples/sec: 920.39 - lr: 0.100000
2022-10-06 01:36:17,746 epoch 4 - iter 38/196 - loss 0.03095300 - samples/sec: 928.12 - lr: 0.100000
2022-10-06 01:36:19,076 epoch 4 - iter 57/196 - loss 0.03107124 - samples/sec: 915.63 - lr: 0.100000
2022-10-06 01:36:20,380 epoch 4 - iter 76/196 - loss 0.03114262 - samples/sec: 934.63 - lr: 0.100000
2022-10-06 01:36:21,650 epoch 4 - iter 95/196 - loss 0.03104533 - samples/sec: 958.97 - lr: 0.100000
2022-10-06 01:36:22,940 epoch 4 - iter 114/196 - loss 0.03089644 - samples/sec: 944.76 - lr: 0.100000
2022-10-06 01:36:24,246 epoch 4 - iter 133/196 - loss 0.0308109

100%|██████████| 32/32 [00:01<00:00, 19.54it/s]


2022-10-06 01:36:30,467 Evaluating as a multi-label problem: False
2022-10-06 01:36:30,577 DEV : loss 0.029220497235655785 - f1-score (micro avg)  0.0
2022-10-06 01:36:30,696 BAD EPOCHS (no improvement): 0
2022-10-06 01:36:30,698 ----------------------------------------------------------------------------------------------------
2022-10-06 01:36:31,957 epoch 5 - iter 19/196 - loss 0.02973221 - samples/sec: 967.55 - lr: 0.100000
2022-10-06 01:36:33,230 epoch 5 - iter 38/196 - loss 0.02945966 - samples/sec: 957.38 - lr: 0.100000
2022-10-06 01:36:34,499 epoch 5 - iter 57/196 - loss 0.02918639 - samples/sec: 960.08 - lr: 0.100000
2022-10-06 01:36:35,777 epoch 5 - iter 76/196 - loss 0.02936975 - samples/sec: 952.95 - lr: 0.100000
2022-10-06 01:36:37,040 epoch 5 - iter 95/196 - loss 0.02931810 - samples/sec: 964.67 - lr: 0.100000
2022-10-06 01:36:38,365 epoch 5 - iter 114/196 - loss 0.02914773 - samples/sec: 919.54 - lr: 0.100000
2022-10-06 01:36:39,628 epoch 5 - iter 133/196 - loss 0.029113

100%|██████████| 32/32 [00:01<00:00, 19.59it/s]


2022-10-06 01:36:45,964 Evaluating as a multi-label problem: False
2022-10-06 01:36:46,080 DEV : loss 0.026699304580688477 - f1-score (micro avg)  0.0
2022-10-06 01:36:46,208 BAD EPOCHS (no improvement): 0
2022-10-06 01:36:46,211 ----------------------------------------------------------------------------------------------------
2022-10-06 01:36:47,527 epoch 6 - iter 19/196 - loss 0.02762813 - samples/sec: 926.17 - lr: 0.100000
2022-10-06 01:36:48,818 epoch 6 - iter 38/196 - loss 0.02729154 - samples/sec: 943.75 - lr: 0.100000
2022-10-06 01:36:50,074 epoch 6 - iter 57/196 - loss 0.02734169 - samples/sec: 969.83 - lr: 0.100000
2022-10-06 01:36:51,391 epoch 6 - iter 76/196 - loss 0.02727860 - samples/sec: 925.33 - lr: 0.100000
2022-10-06 01:36:52,777 epoch 6 - iter 95/196 - loss 0.02692910 - samples/sec: 879.16 - lr: 0.100000
2022-10-06 01:36:54,073 epoch 6 - iter 114/196 - loss 0.02686944 - samples/sec: 939.68 - lr: 0.100000
2022-10-06 01:36:55,368 epoch 6 - iter 133/196 - loss 0.026773

100%|██████████| 32/32 [00:01<00:00, 19.60it/s]


2022-10-06 01:37:01,817 Evaluating as a multi-label problem: False
2022-10-06 01:37:01,931 DEV : loss 0.024091824889183044 - f1-score (micro avg)  0.0
2022-10-06 01:37:02,054 BAD EPOCHS (no improvement): 0
2022-10-06 01:37:02,056 ----------------------------------------------------------------------------------------------------
2022-10-06 01:37:03,360 epoch 7 - iter 19/196 - loss 0.02513254 - samples/sec: 934.74 - lr: 0.100000
2022-10-06 01:37:04,654 epoch 7 - iter 38/196 - loss 0.02516564 - samples/sec: 941.37 - lr: 0.100000
2022-10-06 01:37:05,943 epoch 7 - iter 57/196 - loss 0.02462709 - samples/sec: 945.07 - lr: 0.100000
2022-10-06 01:37:07,211 epoch 7 - iter 76/196 - loss 0.02474032 - samples/sec: 961.69 - lr: 0.100000
2022-10-06 01:37:08,542 epoch 7 - iter 95/196 - loss 0.02473245 - samples/sec: 915.49 - lr: 0.100000
2022-10-06 01:37:09,863 epoch 7 - iter 114/196 - loss 0.02454133 - samples/sec: 922.14 - lr: 0.100000
2022-10-06 01:37:11,187 epoch 7 - iter 133/196 - loss 0.024454

100%|██████████| 32/32 [00:01<00:00, 19.71it/s]


2022-10-06 01:37:17,470 Evaluating as a multi-label problem: False
2022-10-06 01:37:17,581 DEV : loss 0.020406417548656464 - f1-score (micro avg)  0.0
2022-10-06 01:37:17,707 BAD EPOCHS (no improvement): 0
2022-10-06 01:37:17,709 ----------------------------------------------------------------------------------------------------
2022-10-06 01:37:18,963 epoch 8 - iter 19/196 - loss 0.02169285 - samples/sec: 971.51 - lr: 0.100000
2022-10-06 01:37:20,209 epoch 8 - iter 38/196 - loss 0.02178150 - samples/sec: 978.50 - lr: 0.100000
2022-10-06 01:37:23,385 epoch 8 - iter 57/196 - loss 0.02229920 - samples/sec: 383.07 - lr: 0.100000
2022-10-06 01:37:24,707 epoch 8 - iter 76/196 - loss 0.02255851 - samples/sec: 921.44 - lr: 0.100000
2022-10-06 01:37:26,002 epoch 8 - iter 95/196 - loss 0.02236575 - samples/sec: 940.90 - lr: 0.100000
2022-10-06 01:37:27,257 epoch 8 - iter 114/196 - loss 0.02221352 - samples/sec: 970.46 - lr: 0.100000
2022-10-06 01:37:28,533 epoch 8 - iter 133/196 - loss 0.022125

100%|██████████| 32/32 [00:01<00:00, 19.10it/s]


2022-10-06 01:37:34,837 Evaluating as a multi-label problem: False
2022-10-06 01:37:34,948 DEV : loss 0.01813221164047718 - f1-score (micro avg)  0.0
2022-10-06 01:37:35,067 BAD EPOCHS (no improvement): 0
2022-10-06 01:37:35,070 ----------------------------------------------------------------------------------------------------
2022-10-06 01:37:36,421 epoch 9 - iter 19/196 - loss 0.01861413 - samples/sec: 902.19 - lr: 0.100000
2022-10-06 01:37:37,732 epoch 9 - iter 38/196 - loss 0.01965456 - samples/sec: 928.81 - lr: 0.100000
2022-10-06 01:37:39,060 epoch 9 - iter 57/196 - loss 0.01955340 - samples/sec: 917.44 - lr: 0.100000
2022-10-06 01:37:40,313 epoch 9 - iter 76/196 - loss 0.01944323 - samples/sec: 972.80 - lr: 0.100000
2022-10-06 01:37:41,653 epoch 9 - iter 95/196 - loss 0.01903195 - samples/sec: 909.14 - lr: 0.100000
2022-10-06 01:37:42,929 epoch 9 - iter 114/196 - loss 0.01893997 - samples/sec: 954.62 - lr: 0.100000
2022-10-06 01:37:44,255 epoch 9 - iter 133/196 - loss 0.0188187

100%|██████████| 32/32 [00:01<00:00, 19.22it/s]


2022-10-06 01:37:50,525 Evaluating as a multi-label problem: False
2022-10-06 01:37:50,641 DEV : loss 0.014614132232964039 - f1-score (micro avg)  0.0
2022-10-06 01:37:50,768 BAD EPOCHS (no improvement): 0
2022-10-06 01:37:50,872 ----------------------------------------------------------------------------------------------------
2022-10-06 01:37:52,192 epoch 10 - iter 19/196 - loss 0.01594162 - samples/sec: 923.45 - lr: 0.100000


In [4]:
from flair.models import TextClassifier
from flair.data import Sentence
classifier = TextClassifier.load('./model/final-model.pt')

mapping = {
    'sad': '&#x1F61E',
    'smile': '&#x1F600',
    'food': '&#x1F37D',
    'heart': '&#10084;',
    'baseball': '&#x26be;'
}


def emoji(text):
        data = text
        if not len(data.strip()):
            return ''
        sentence = Sentence(data)
        classifier.predict(sentence)
        print (str(sentence.labels))
        if 'sad'in str(sentence.labels):
            return mapping['sad'] 
        elif 'smile' in str(sentence.labels):
            return mapping['smile'] 
        elif 'food' in str(sentence.labels):
            return mapping['food']
        elif 'heart' in str(sentence.labels):
            print (1)
            return mapping['heart']
        elif 'baseball' in str(sentence.labels):
            return mapping['baseball']

2022-10-06 01:38:14,249 loading file ./model/final-model.pt


In [None]:
print(emoji('i ran home so quickly and smiled the whole way!'))
print(emoji('the sad little boy walked home alone in the heavy rain.'))


['Sentence: "i ran home so quickly and smiled the whole way !"'/'PRON' (0.9784)]
&#x1F600
['Sentence: "the sad little boy walked home alone in the heavy rain ."'/'DET' (0.635)]
&#x1F61E
