In [1]:
from allennlp.models.encoder_decoders.composed_seq2seq import ComposedSeq2Seq
from typing import Dict, Tuple

from allennlp.models.encoder_decoders.simple_seq2seq import SimpleSeq2Seq
 
import torch
import torch.nn as nn

from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.attention.additive_attention import AdditiveAttention
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding


from typing import Dict
import csv

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField, TextField, Field, MetadataField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers.token import Token
from allennlp.data.tokenizers.character_tokenizer import CharacterTokenizer
from allennlp.data.tokenizers import Tokenizer
from allennlp.data.iterators import BucketIterator, BasicIterator

from allennlp.models.encoder_decoders.simple_seq2seq import SimpleSeq2Seq

import torch
import torch.nn as nn

from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.attention.additive_attention import AdditiveAttention
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.training.trainer import Trainer
from allennlp.common.util import END_SYMBOL, START_SYMBOL

In [2]:
class LovelyModel(SimpleSeq2Seq):
    """
    ``SimpleSeq2Seq`` variant whose decoder step embeds the previous prediction with
    the *source* embedder's ``'tokens'`` token embedder, and whose ``forward``
    discards any extra keyword arguments before delegating to the parent.
    """

    def _prepare_output_projections(self,
                                    last_predictions: torch.Tensor,
                                    state: Dict[str, torch.Tensor]) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
        """
        Run a single decoder-cell step.

        Parameters
        ----------
        last_predictions : ``torch.Tensor``
            Passed to the source-side ``'tokens'`` embedder to form the step input.
        state : ``Dict[str, torch.Tensor]``
            Must contain ``encoder_outputs``, ``source_mask``, ``decoder_hidden`` and
            ``decoder_context``; the last two are overwritten in place.

        Returns
        -------
        The output-projection layer applied to the new hidden state, and ``state``.
        """
        # Read everything the step needs from the state before doing any work.
        enc_out = state["encoder_outputs"]
        src_mask = state["source_mask"]
        prev_hidden = state["decoder_hidden"]
        prev_context = state["decoder_context"]

        step_input = self._source_embedder._token_embedders['tokens'](last_predictions)
        if self._attention:
            # With attention, the attended encoder summary is prepended to the embedding.
            attended = self._prepare_attended_input(prev_hidden, enc_out, src_mask)
            step_input = torch.cat((attended, step_input), -1)

        new_hidden, new_context = self._decoder_cell(step_input, (prev_hidden, prev_context))
        state.update(decoder_hidden=new_hidden, decoder_context=new_context)

        return self._output_projection_layer(new_hidden), state

    def forward(self,  # type: ignore
                source_tokens: Dict[str, torch.LongTensor],
                target_tokens: Dict[str, torch.LongTensor] = None, **kwargs):
        """Delegate to the parent ``forward``; additional keyword arguments are dropped."""
        del kwargs
        return super().forward(source_tokens, target_tokens)


In [3]:
def get_baseline_model(vocab: Vocabulary,
                       emb_dim: int = 64,
                       hidden_dim: int = 32,
                       max_decoding_steps: int = 20) -> SimpleSeq2Seq:
    """
    Build the attention-based ``LovelyModel`` used as the seq2seq baseline.

    Parameters
    ----------
    vocab : ``Vocabulary``
        Its ``'tokens'`` namespace size determines the number of embeddings.
    emb_dim : ``int``, optional (default = 64)
        Token embedding size, also the input size of the encoder LSTM.
    hidden_dim : ``int``, optional (default = 32)
        Encoder LSTM hidden size; reused for both dimensions of the additive attention.
    max_decoding_steps : ``int``, optional (default = 20)
        Forwarded unchanged to the model constructor.

    Returns
    -------
    A freshly initialised ``LovelyModel`` (not moved to any device).
    """
    token_embedding = Embedding(
        num_embeddings=vocab.get_vocab_size('tokens'),
        embedding_dim=emb_dim
    )

    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(nn.LSTM(emb_dim, hidden_dim, batch_first=True))

    model = LovelyModel(
        vocab=vocab,
        source_embedder=word_embeddings,
        encoder=lstm,
        max_decoding_steps=max_decoding_steps,
        # Attention compares the decoder state with encoder outputs, both hidden_dim wide.
        attention=AdditiveAttention(vector_dim=hidden_dim, matrix_dim=hidden_dim)
    )

    return model

In [4]:
class MyReader(DatasetReader):
    """
    Reads a plain-text file with one whitespace-tokenised sequence per line and
    produces instances whose source and target are the same token field.
    """

    def _read(self, file_path):
        """Yield one instance per line of ``file_path`` (resolved via ``cached_path``)."""
        with open(cached_path(file_path), "r") as file:
            yield from (self.text_to_instance(line.strip()) for line in file)

    def text_to_instance(
        self,
        text: str
    ) -> Instance:
        """
        Split ``text`` on whitespace, wrap it in ``START_SYMBOL`` / ``END_SYMBOL`` and
        return an ``Instance`` with ``source_tokens`` and ``target_tokens``.

        Both keys refer to the very same ``TextField`` object.
        """
        words = [START_SYMBOL, *text.split(), END_SYMBOL]
        tokens_field = TextField(
            [Token(word) for word in words],
            {"tokens": SingleIdTokenIndexer()},
        )
        return Instance({"source_tokens": tokens_field, "target_tokens": tokens_field})

In [5]:
# Locations of the training and test text files, relative to the notebook's working directory.
train_path = '../../texar/examples/text_style_transfer/data/insurance_cropped/insurance.train.text'
test_path = '../../texar/examples/text_style_transfer/data/insurance_cropped/insurance.test.text'



In [6]:
# Reader that turns each line of a text file into one instance (source == target).
reader = MyReader()

In [6]:
# Load the training and test splits with the reader created above.
train_dataset = reader.read(train_path)
test_dataset = reader.read(test_path)

266051it [00:14, 18056.54it/s]
57012it [00:03, 15913.60it/s]


In [7]:
import os

# Reuse the saved vocabulary when it is present; otherwise build it from the
# loaded datasets and save it, so the notebook also runs from a fresh checkout
# instead of failing on a directory that only a commented-out cell creates.
vocab_dir = 'vocab_seq2seq/'
if os.path.isdir(vocab_dir):
    vocab = Vocabulary.from_files(vocab_dir)
else:
    vocab = Vocabulary.from_instances(train_dataset + test_dataset)
    vocab.save_to_files(vocab_dir)

In [8]:
# Batches of 256 instances; the vocabulary is attached so the iterator can index them.
iterator = BasicIterator(batch_size=256)
iterator.index_with(vocab)

In [8]:
# Build the baseline model; its embedding table is sized from the 'tokens' vocabulary.
model = get_baseline_model(vocab)

In [10]:
# Move the model to GPU index 2; the Trainer below is given the same device index.
model.cuda(2)

LovelyModel(
  (_source_embedder): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding()
  )
  (_encoder): PytorchSeq2SeqWrapper(
    (_module): LSTM(64, 32, batch_first=True)
  )
  (_attention): AdditiveAttention()
  (_target_embedder): Embedding()
  (_decoder_cell): LSTMCell(96, 32)
  (_output_projection_layer): Linear(in_features=32, out_features=2149, bias=True)
)

In [11]:
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [12]:
# Training setup: at most 10 epochs, with the test split used as the validation set.
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_dataset,
    validation_dataset=test_dataset,
    patience=2,  # early-stopping patience, counted in epochs
    num_epochs=10,
    cuda_device=2  # same GPU index the model was moved to with model.cuda(2)
)

In [13]:
# The training call is left commented out in this saved notebook; uncomment to train.
# results = trainer.train()

In [14]:
# Persistence step, also left commented out: it writes the model weights to
# "model_seq2seq.th" and the vocabulary to "vocab_seq2seq", the same paths that
# other cells in this notebook load from.
# with open("model_seq2seq.th", 'wb') as f:
#     torch.save(model.state_dict(), f)

# vocab.save_to_files("vocab_seq2seq")

# Predictions

In [1]:
import sys
sys.path.append('..')

from allennlp.predictors import Seq2SeqPredictor, TextClassifierPredictor
from adat.utils import load_weights
from adat.models import get_basic_classification_model, get_basic_seq2seq_model
from allennlp.data.vocabulary import Vocabulary

from adat.dataset import InsuranceReader, Seq2SeqReader

In [2]:
# Recreate the seq2seq model against the saved vocabulary, then load its weights.
# NOTE(review): `get_basic_seq2seq_model` and `load_weights` come from the `adat`
# package and are not shown here; presumably the architecture must match the
# checkpoint in 'model_seq2seq.th' — confirm.
seq2seq_reader = Seq2SeqReader()
seq2seq_vocab = Vocabulary.from_files('vocab_seq2seq')
seq2seq_model = get_basic_seq2seq_model(seq2seq_vocab)
load_weights(seq2seq_model, 'model_seq2seq.th')

In [3]:
# Same procedure for the classifier: saved vocabulary, fresh model, then weights.
# NOTE(review): 'vocab_classification' and 'model_classification.th' are not
# produced anywhere in this notebook — presumably created by a separate
# training run; confirm they exist before executing.
class_reader = InsuranceReader()
class_vocab = Vocabulary.from_files('vocab_classification')
class_model = get_basic_classification_model(class_vocab)
load_weights(class_model, 'model_classification.th')

In [4]:
# Pair each model with its dataset reader inside the matching AllenNLP predictor.
seq2seq_predictor = Seq2SeqPredictor(seq2seq_model, seq2seq_reader)
class_predictor = TextClassifierPredictor(class_model, class_reader)

In [5]:
! head {test_path}

head: cannot open '{test_path}' for reading: No such file or directory


In [6]:
# Ten sample inputs, one whitespace-tokenised sequence per line of the literal.
sequences = """a_1139
a_1943 a_1978 a_1 a_1149 a_1138 a_1286 a_1158 a_2001 a_1938
a_1 a_1667 a_340 a_1978 a_1669 a_2001
a_876 a_1213 a_1020 a_1129 a_1120 a_1121 a_1215
a_1119 a_1137 a_1139
a_737 a_734 a_1191 a_1111 a_644 a_1257 a_1128 a_19 a_1978 a_733 a_20 a_1 a_39 a_35 a_755
a_1656 a_2014 a_1257 a_2013 a_2013 a_340 a_549 a_340 a_1191 a_340 a_340 a_362 a_1 a_2014 a_1111
a_1
a_1257 a_1191 a_549 a_362 a_1191 a_1927 a_1138 a_1111 a_1929 a_1656 a_1920 a_1 a_2001 a_1257
a_1978 a_645""".splitlines()

In [7]:
def predict_sequence(sequence: str, seq_to_seq_predictor: Seq2SeqPredictor = seq2seq_predictor) -> str:
    """
    Run ``sequence`` through ``seq_to_seq_predictor`` and return the entries of its
    ``'predicted_tokens'`` output joined by single spaces.
    """
    prediction = seq_to_seq_predictor.predict(sequence)
    predicted_tokens = prediction['predicted_tokens']
    return ' '.join(predicted_tokens)

In [8]:
# Run every sample sequence through the seq2seq predictor and print each input
# next to the model's output for a visual comparison.
for seq in sequences:
    predicted_seq = predict_sequence(seq)
    print(f'Input = {seq}\nOutput = {predicted_seq}\n')

Input = a_1139
Output = a_1139

Input = a_1943 a_1978 a_1 a_1149 a_1138 a_1286 a_1158 a_2001 a_1938
Output = a_1943 a_1978 a_1 a_1149 a_1138 a_1286 a_1158 a_2001 a_1938

Input = a_1 a_1667 a_340 a_1978 a_1669 a_2001
Output = a_1 a_1667 a_340 a_1978 a_1669 a_2001

Input = a_876 a_1213 a_1020 a_1129 a_1120 a_1121 a_1215
Output = a_876 a_1213 a_1020 a_1129 a_1120 a_1121 a_1215

Input = a_1119 a_1137 a_1139
Output = a_1119 a_1137 a_1139

Input = a_737 a_734 a_1191 a_1111 a_644 a_1257 a_1128 a_19 a_1978 a_733 a_20 a_1 a_39 a_35 a_755
Output = a_737 a_734 a_1191 a_1111 a_644 a_1257 a_1128 a_1128 a_1111 a_733 a_20 a_1 a_39 a_35 a_35 a_810

Input = a_1656 a_2014 a_1257 a_2013 a_2013 a_340 a_549 a_340 a_1191 a_340 a_340 a_362 a_1 a_2014 a_1111
Output = a_1656 a_2014 a_1257 a_2013 a_2013 a_340 a_549 a_340 a_1191 a_340 a_340 a_362 a_1 a_2014 a_1111 a_1111

Input = a_1
Output = a_1

Input = a_1257 a_1191 a_549 a_362 a_1191 a_1927 a_1138 a_1111 a_1929 a_1656 a_1920 a_1 a_2001 a_1257
Output = a_1257

# Gradient test

In [10]:
from allennlp.data.iterators import BasicIterator

In [11]:
# Default-configured BasicIterator bound to the seq2seq vocabulary.
seq2seq_iterator = BasicIterator()
seq2seq_iterator.index_with(seq2seq_vocab)

In [12]:
# Default-configured BasicIterator bound to the classification vocabulary.
class_iterator = BasicIterator()
class_iterator.index_with(class_vocab)

In [13]:
# Sanity check: build a single Instance from raw text (the cell displays the returned object).
seq2seq_reader.text_to_instance('a_1978 a_645')

<allennlp.data.instance.Instance at 0x7fc94efab438>

In [None]:
# NOTE(review): unfinished cell (it was never executed). The model is called
# with no arguments; if its forward requires `source_tokens`, as
# `LovelyModel.forward` in this notebook does, this raises a TypeError.
# TODO: pass an indexed batch of tensors instead.
seq2seq_model()