In [3]:
from pytorch_pretrained_bert import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                if sys.version_info[0] == 2:
                    line = list(unicode(cell, 'utf-8') for cell in line)
                lines.append(line)
            return lines


class MrpcProcessor(DataProcessor):
    """Processor for the MRPC data set (GLUE version)."""

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "%s-%s" % (set_type, i)
            text_a = line[3]
            text_b = line[4]
            label = line[0]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples 

In [7]:
import csv
processor = MrpcProcessor()
examples = processor.get_dev_examples('./mrpc_data/')
examples[0]

<__main__.InputExample at 0x120241198>

In [8]:
examples[0].text_a

"He said the foodservice pie business doesn 't fit the company 's long-term growth strategy ."

In [9]:
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""

    label_map = {label : i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        label_id = label_map[example.label]

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=label_id))
    return features


In [10]:
label_list = processor.get_labels()
label_list

['0', '1']

In [13]:
features = convert_examples_to_features(examples, label_list, 256, tokenizer)
features[0]

<__main__.InputFeatures at 0x10f4da2b0>

In [12]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

In [14]:
features[0].input_ids

[101,
 2002,
 2056,
 1996,
 9440,
 2121,
 7903,
 2063,
 11345,
 2449,
 2987,
 1005,
 1056,
 4906,
 1996,
 2194,
 1005,
 1055,
 2146,
 1011,
 2744,
 3930,
 5656,
 1012,
 102,
 1000,
 1996,
 9440,
 2121,
 7903,
 2063,
 11345,
 2449,
 2515,
 2025,
 4906,
 2256,
 2146,
 1011,
 2744,
 3930,
 5656,
 1012,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 

In [15]:
from pytorch_pretrained_bert import BertModel
import torch.nn as nn
bert = BertModel.from_pretrained('bert-base-uncased')

In [22]:
import torch
input_id = torch.tensor([features[0].input_ids], dtype=torch.long)
segment_ids = torch.tensor([features[0].segment_ids], dtype=torch.long)
input_mask = torch.tensor([features[0].input_mask], dtype=torch.long)

input_id

tensor([[  101,  2002,  2056,  1996,  9440,  2121,  7903,  2063, 11345,  2449,
          2987,  1005,  1056,  4906,  1996,  2194,  1005,  1055,  2146,  1011,
          2744,  3930,  5656,  1012,   102,  1000,  1996,  9440,  2121,  7903,
          2063, 11345,  2449,  2515,  2025,  4906,  2256,  2146,  1011,  2744,
          3930,  5656,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [23]:
outputs = bert(input_id, segment_ids, input_mask, output_all_encoded_layers=False)

In [25]:
outputs[0].size()

torch.Size([1, 256, 768])

In [26]:
outputs[1].size()

torch.Size([1, 768])

In [27]:
sequence_output = outputs[0]
sequence_output.size()

torch.Size([1, 256, 768])

In [29]:
input_mask_expanded = input_mask.unsqueeze(-1).expand(sequence_output.size()).float()

In [31]:
input_mask_expanded

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]])

In [32]:
sequence_output

tensor([[[-8.8142e-01,  1.8816e-01, -4.3147e-01,  ..., -7.9133e-01,
           9.3272e-01,  4.5988e-02],
         [ 3.5908e-01, -1.1404e-01, -8.3745e-01,  ...,  3.6647e-01,
           5.5146e-01, -5.4043e-01],
         [-2.1388e-01, -1.3653e-01, -2.3459e-01,  ...,  2.8952e-01,
          -1.3266e-01, -2.1964e-02],
         ...,
         [ 1.5847e-01,  1.4766e-01,  2.0529e-01,  ..., -4.3482e-03,
           2.7301e-01, -7.3293e-02],
         [-5.5554e-02, -2.3640e-01,  2.7843e-01,  ...,  4.0018e-01,
           2.9027e-01, -1.1342e-01],
         [-1.8808e-01, -3.5333e-01,  7.2856e-02,  ...,  3.7532e-01,
           2.2211e-01, -3.4611e-01]]], grad_fn=<ThAddBackward>)

In [33]:
sequence_output * input_mask_expanded

tensor([[[-0.8814,  0.1882, -0.4315,  ..., -0.7913,  0.9327,  0.0460],
         [ 0.3591, -0.1140, -0.8374,  ...,  0.3665,  0.5515, -0.5404],
         [-0.2139, -0.1365, -0.2346,  ...,  0.2895, -0.1327, -0.0220],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000],
         [-0.0000, -0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.0000],
         [-0.0000, -0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.0000]]],
       grad_fn=<ThMulBackward>)

In [36]:
sum_embedding = torch.sum(sequence_output* input_mask_expanded, 1)
sum_embedding

tensor([[ -11.0636,   -1.2235,    9.7954,    4.0469,   16.4276,    2.8415,
            8.0583,   30.3949,    5.9854,   -0.6195,    8.9612,  -15.4292,
            1.9297,   14.9294,  -10.7767,   11.7282,    8.9775,    7.8893,
            2.0201,    9.1194,   -8.1628,  -12.7329,    6.7113,   19.1018,
           19.1903,   -1.1186,    0.1073,    4.8895,   -5.6436,   -7.7154,
           12.7934,   18.9433,  -25.5996,  -20.2311,   15.5622,   -1.7229,
          -14.8629,   -6.9054,  -28.3170,   -6.0899,  -30.1646,   -7.5330,
          -10.3559,    9.4625,   -6.4273,  -15.9965,   -4.8906,    1.1210,
           -3.6906,   -0.6172,   -4.7344,    5.3236,   -3.8340,  -16.4559,
           -3.4027,   32.8624,   -3.6920,  -16.6033,  -18.7660,  -11.5474,
            5.8602,    1.1615,   -0.6110,  -27.9097,   -1.7852,   -1.7783,
           -8.5259,   29.5530,  -46.9679,   -4.1414,  -20.3478,  -16.2187,
            5.5075,    4.7283,  -11.6058,    5.7296,  -10.4476,    9.2362,
            2.1492,   -7.

In [37]:
sum_mask = input_mask_expanded.sum(1)
sum_mask

tensor([[44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44.,
         44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44.,
         44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44.,
         44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44.,
         44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44.,
         44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44.,
         44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44.,
         44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44.,
         44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44.,
         44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44.,
         44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44.,
         44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44., 44.,
         44., 44., 44., 44., 44., 44., 44., 44., 44.

In [38]:
sum_mask.size()

torch.Size([1, 768])

In [40]:
mean_out = sum_embedding / sum_mask
mean_out.size()

torch.Size([1, 768])

In [41]:
mean_out.size()

torch.Size([1, 768])

In [44]:
y = torch.cat((mean_out, mean_out), 1)
y.size()

torch.Size([1, 1536])

In [1]:
from util import TrecProcessor
processor = TrecProcessor()
examples = processor.get_train_examples(data_dir='./trec_data/')
len(examples)

04/01/2020 15:05:25 - INFO - util -   LOOKING AT ./trec_data/train.tsv


5914

In [6]:
examples[0].text_b

'the IRON LADY ; A Biography of Margaret Thatcher by Hugo Young -LRB- Farrar , Straus & Giroux -RRB-'