In [95]:
import os
from typing import Iterator, List, Dict

import torch
import torch.optim as optim
import numpy as np

from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField, LabelField
from allennlp.data.dataset_readers import DatasetReader
from allennlp.common.file_utils import cached_path
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.data.iterators import BucketIterator, BasicIterator
from allennlp.training.trainer import Trainer
from allennlp.predictors import TextClassifierPredictor

In [122]:
class InsReader(DatasetReader):
    """Dataset reader for a pair of line-aligned ``<prefix>.text`` / ``<prefix>.labels`` files.

    Line *i* of the ``.text`` file is a whitespace-separated sentence and line
    *i* of the ``.labels`` file is its integer class label.

    Parameters
    ----------
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        Indexers attached to every ``TextField``. Defaults to
        ``{"tokens": SingleIdTokenIndexer()}``.
    lazy : ``bool``, optional (default = False)
        Forwarded to ``DatasetReader.__init__``.
    """

    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None, lazy: bool = False) -> None:
        super().__init__(lazy=lazy)
        # Create the indexers once and share them between all instances instead
        # of allocating a fresh indexer dict for every sentence.
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    def text_to_instance(self, sentence: str, label: int = None) -> Instance:
        """Build an ``Instance`` from a sentence and an optional label.

        Parameters
        ----------
        sentence :
            Either a raw string (split on whitespace) or an already tokenized
            sequence of words (list, tuple, ...), which is used as is.
        label : ``int``, optional
            Class id. It is stored with ``skip_indexing=True``, i.e. used
            directly as the label index. Omitted at prediction time.

        Returns
        -------
        ``Instance`` with a ``"tokens"`` field and, when ``label`` is given,
        a ``"label"`` field.
        """
        # Only raw strings need splitting; any other sequence is treated as tokens.
        if isinstance(sentence, str):
            sentence = sentence.split()

        sentence_field = TextField([Token(word) for word in sentence], self._token_indexers)
        fields = {"tokens": sentence_field}

        if label is not None:
            fields["label"] = LabelField(label=label, skip_indexing=True)

        return Instance(fields)

    def _read(self, file_path: str) -> Iterator[Instance]:
        """Yield one ``Instance`` per aligned line of ``file_path + '.text'`` and
        ``file_path + '.labels'``.

        Raises
        ------
        ValueError
            If a label line is not an integer, or if one file still contains
            non-blank lines after the other one is exhausted (the files are not
            line-aligned, so labels could be attached to the wrong sentences).
        """
        text_path = file_path + '.text'
        labels_path = file_path + '.labels'

        # Explicit encoding so the result does not depend on the platform default.
        with open(text_path, encoding='utf-8') as text_f, \
                open(labels_path, encoding='utf-8') as labels_f:
            while True:
                line_t = text_f.readline()
                line_l = labels_f.readline()
                # readline() returns '' only at end of file; a blank line is '\n'.
                if not line_t or not line_l:
                    break
                yield self.text_to_instance(line_t.strip(), int(line_l.strip()))

            # At least one file is exhausted. Trailing blank lines are tolerated,
            # but real content left on either side means the files are misaligned.
            leftover_text = line_t.strip() or any(rest.strip() for rest in text_f)
            leftover_labels = line_l.strip() or any(rest.strip() for rest in labels_f)
            if leftover_text or leftover_labels:
                raise ValueError(
                    "Line count mismatch between '{}' and '{}'".format(text_path, labels_path)
                )

In [123]:
# Directory containing the `insurance.{train,dev,test}.{text,labels}` files.
# Set the INSURANCE_DATA_DIR environment variable to point at your own copy;
# the fallback is the original author's local path.
# Joining with '' guarantees a trailing separator, so the plain string
# concatenations below stay valid whichever value is used.
data_path = os.path.join(
    os.environ.get(
        'INSURANCE_DATA_DIR',
        '/Users/fursovia/Documents/texar/examples/text_style_transfer/data/insurance_cropped/'
    ),
    ''
)
reader = InsReader()

# InsReader appends '.text' / '.labels' to each prefix passed to read().
train_dataset = reader.read(data_path + 'insurance.train')
dev_dataset = reader.read(data_path + 'insurance.dev')
test_dataset = reader.read(data_path + 'insurance.test')

266051it [00:13, 20256.82it/s]
57011it [00:08, 6536.81it/s] 
57012it [00:01, 51277.39it/s]


In [124]:
# Build the vocabulary from the training and dev instances; the test split is left out.
vocab = Vocabulary.from_instances(train_dataset + dev_dataset)

100%|██████████| 323062/323062 [00:01<00:00, 167660.77it/s]


In [125]:
# Batches of 64 instances; index_with() hands the iterator the vocabulary it
# uses to index the instances.
iterator = BasicIterator(batch_size=64)
iterator.index_with(vocab)

# Model

In [74]:
from allennlp.models.basic_classifier import BasicClassifier
from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder

In [73]:
# Size of each token embedding; also the input size of the encoder below.
# NOTE(review): no random seed is set in the visible cells, so the initial
# weights (and the exact metrics) will presumably differ between runs.
EMBEDDING_DIM = 16

# One embedding row per entry in the vocabulary's 'tokens' namespace.
token_embedding = Embedding(
    num_embeddings=vocab.get_vocab_size('tokens'),
    embedding_dim=EMBEDDING_DIM
)

# The "tokens" key matches the indexer name used by InsReader's TextField.
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
# Bag-of-embeddings encoder, passed to the classifier as its seq2vec encoder.
body = BagOfEmbeddingsEncoder(embedding_dim=EMBEDDING_DIM)

In [76]:
# Classifier made of the embedder, the bag-of-embeddings encoder and a final
# classification layer with two outputs.
# num_labels is passed explicitly: InsReader builds labels with
# skip_indexing=True, so presumably the vocabulary has no label namespace to
# infer the class count from — TODO confirm.
model = BasicClassifier(
    vocab=vocab, 
    text_field_embedder=word_embeddings, 
    seq2vec_encoder=body,
    num_labels=2
)

In [77]:
# Display the module tree to check the layers and their sizes.
model

BasicClassifier(
  (_text_field_embedder): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding()
  )
  (_seq2vec_encoder): BagOfEmbeddingsEncoder()
  (_classification_layer): Linear(in_features=16, out_features=2, bias=True)
  (_loss): CrossEntropyLoss()
)

In [92]:
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [93]:
# Train on the train split and validate on the dev split.
# num_epochs=10 is the upper bound; patience=3 enables early stopping once the
# validation metric has stopped improving for that many epochs.
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_dataset,
    validation_dataset=dev_dataset,
    patience=3,
    num_epochs=10
)

In [94]:
# Run training; the returned dict of final and best metrics is the cell output.
trainer.train()


  0%|          | 0/4158 [00:00<?, ?it/s][A
accuracy: 0.0469, loss: 0.7359 ||:   0%|          | 1/4158 [00:00<32:26,  2.14it/s][A
accuracy: 0.2168, loss: 0.7138 ||:   0%|          | 8/4158 [00:00<22:58,  3.01it/s][A
accuracy: 0.4023, loss: 0.6881 ||:   0%|          | 16/4158 [00:00<16:20,  4.23it/s][A
accuracy: 0.5787, loss: 0.6544 ||:   1%|          | 26/4158 [00:00<11:37,  5.92it/s][A
accuracy: 0.6759, loss: 0.6228 ||:   1%|          | 35/4158 [00:00<08:21,  8.22it/s][A
accuracy: 0.7480, loss: 0.5856 ||:   1%|          | 46/4158 [00:00<06:02, 11.36it/s][A
accuracy: 0.7855, loss: 0.5567 ||:   1%|▏         | 55/4158 [00:01<04:27, 15.35it/s][A
accuracy: 0.8156, loss: 0.5265 ||:   2%|▏         | 65/4158 [00:01<03:18, 20.57it/s][A
accuracy: 0.8387, loss: 0.4982 ||:   2%|▏         | 75/4158 [00:01<02:31, 26.90it/s][A
accuracy: 0.8559, loss: 0.4731 ||:   2%|▏         | 85/4158 [00:01<01:59, 34.18it/s][A
accuracy: 0.8685, loss: 0.4529 ||:   2%|▏         | 94/4158 [00:01<01:36, 41.

{'best_epoch': 2,
 'peak_cpu_memory_MB': 1576.292352,
 'training_duration': '0:02:55.976958',
 'training_start_epoch': 0,
 'training_epochs': 4,
 'epoch': 4,
 'training_accuracy': 0.9852471894486395,
 'training_loss': 0.05954500032598247,
 'training_cpu_memory_MB': 1576.292352,
 'validation_accuracy': 0.984827489431864,
 'validation_loss': 0.06498578472343074,
 'best_validation_accuracy': 0.9852133798740594,
 'best_validation_loss': 0.0647929108724642}

In [126]:
# Wrap the trained model together with the reader so raw sentences can be
# turned into instances and classified.
predictor = TextClassifierPredictor(model=model, dataset_reader=reader)

In [127]:
# Example input: whitespace-separated token ids, presumably in the same format
# as the lines of the .text files — TODO confirm.
sentence = "a_2030 a_710 a_1 a_1978 a_1688 a_1737"
# Returns a dict with the logits, the class probabilities and the predicted label.
predictor.predict(sentence)

{'logits': [2.871548652648926, -2.553658962249756],
 'probs': [0.9956151247024536, 0.004384840372949839],
 'label': '0'}