In [0]:
# # Run this only once on a machine

# !pip install pytorch-pretrained-bert
# !pip install livelossplot
# !wget "https://competitions.codalab.org/my/datasets/download/69a3e8d0-b836-48b8-8795-36a6865a1c04"
# !unzip 69a3e8d0-b836-48b8-8795-36a6865a1c04
# !rm 69a3e8d0-b836-48b8-8795-36a6865a1c04
# !ls -lh

In [0]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
file1 = drive.CreateFile({'title': 'best_val.bin'})

In [0]:
import numpy as np
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam
from torch.utils.data import TensorDataset, DataLoader
from torch.nn import CrossEntropyLoss
import torch.nn as nn
from livelossplot import PlotLosses
import random
import csv
import string

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


Using TensorFlow backend.


In [0]:
# class BertSequence(nn.Module):
  
#   def __init__(self, config, num_labels):
#     super(BertSequence, self).__init__()
#     self.config = config
#     self.num_labels = num_labels
#     self.bert = BertModel(config)
#     self.dropout = nn.Dropout(config.hidden_dropout_prob)
#     self.classifier = nn.Sequential(nn.Linear(config.hidden_size, config.hidden_size),
#                                     nn.ReLU(),
#                                     nn.Linear(config.hidden_size, self.num_labels))
#     for module in self.modules():
#       if isinstance(module, (nn.Linear, nn.Embedding)):
#               module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
#       elif isinstance(module, BertLayerNorm):
#           module.bias.data.normal_(mean=0.0, std=self.config.initializer_range)
#           module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
#       if isinstance(module, nn.Linear) and module.bias is not None:
#           module.bias.data.zero_()


In [0]:
class BertSequence(nn.Module):
  
  def __init__(self, config, num_labels, layers):
    
    # layers:
    # number of linear layers : 1 or 2
    
    # configs :
    # bert-base-uncased: 12-layer, 768-hidden, 12-heads, 110M parameters
    # bert-large-uncased: 24-layer, 1024-hidden, 16-heads, 340M parameters
    # bert-base-cased: 12-layer, 768-hidden, 12-heads , 110M parameters
    # bert-large-cased: 24-layer, 1024-hidden, 16-heads, 340M parameters
    
    super(BertSequence, self).__init__()
    
    assert layers in [1,2]
    
    self.layers = layers
    
    self.hidden = 768
    if self.layers == 1 :
      self.hidden = num_labels
    
    self.num_labels = num_labels
    self.config = config
    self.bert = BertForSequenceClassification.from_pretrained(config, num_labels = self.hidden)
    
    if self.layers == 2:
      self.relu = nn.ReLU()
      self.layer = nn.Linear(self.hidden, self.num_labels)
      self.layer.weight.data.normal_(mean=0.0, std=0.02)
      self.layer.bias.data.zero_()
    
  def forward(self, input_ids, segment_ids, input_mask):
    
    out = self.bert(input_ids, segment_ids, input_mask)
    
    if self.layers == 2:
      out = self.relu(out)
      out = self.layer(out)
    
    return out

In [0]:
batch_size = 16
gradient_accumulation_steps = 4
num_train_epochs = 2000
lr = 2e-2
max_seq_length = 256
warmup_proportion = 0.1 
# Proportion of training to perform linear learning rate warmup for. ""E.g., 0.1 = 10% of training.")
lr_chng_iter = 3500
eval_iter = 1000
val_split = 0.1
best_val = 0
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)

In [0]:
lines = []

num_examples = 64000

with open('data.tsv', "r", encoding='utf-8') as f:
  reader = csv.reader(f, delimiter="\t")
  for line in reader:
    
      num_examples-=1
      if num_examples < 0:
        break
        
      lines.append(line)
      

# Punctuation etc. stuff is handled by tokenizer

In [0]:
random.shuffle(lines)
train_lines = lines[:int((1-val_split)*len(lines))]
val_lines = lines[int((1-val_split)*len(lines)):]
print('Train examples = ', len(train_lines))
print('Val examples = ', len(val_lines))
del lines

Train examples =  57600
Val examples =  6400


In [0]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


In [0]:
train_examples = [InputExample(guid = line[0], text_a = line[1], text_b = line[2], label = line[3]) for line in train_lines]
val_examples = [InputExample(guid = line[0], text_a = line[1], text_b = line[2], label = line[3]) for line in val_lines]
del train_lines
del val_lines

In [0]:
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id

In [0]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [0]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

In [0]:
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""

    label_map = {label : i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        
        if ex_index % 1000 == 0:
          print('{} examples done out of {}'.format(ex_index, len(examples)))
        
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:7
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambigiously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        label_id = label_map[int(example.label)]

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=label_id))
    return features

In [0]:
def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    
#     correct_one = np.sum(np.where(outputs == 1)[0] == np.where(labels == 1)) #count of number of times the model predicted 1 and the label is also 1
    correct_one = np.sum((outputs == labels) * outputs)
    precision =  correct_one/np.sum(outputs == 1)
    recall = correct_one/np.sum(labels==1)
    f1 = 2*precision*recall/(precision+recall)
#     print("f1: ", f1, " precision:", precision, " recall:", recall)
#     print("labels:", labels)
#     print("out:", out)
    
    return [np.sum(outputs == labels), f1, precision, recall, correct_one]

In [0]:
def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x/warmup
    return 1.0 - x

In [0]:
model = BertSequence('bert-base-uncased', num_labels = 2, layers=1).cuda()
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
num_train_steps = int(len(train_examples)/batch_size) # steps in 1 epoch
t_total = num_train_steps*num_train_epochs # total number of steps in training
optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=warmup_proportion, t_total=t_total)

In [0]:
torch.backends.cudnn.benchmark = True    # would speed up runtime hopefully
label_list = [0,1]         # label map

In [0]:
train_features = convert_examples_to_features(train_examples, label_list, max_seq_length, tokenizer)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

del train_features
_ = model.train()

0 examples done out of 57600
1000 examples done out of 57600
2000 examples done out of 57600
3000 examples done out of 57600
4000 examples done out of 57600
5000 examples done out of 57600
6000 examples done out of 57600
7000 examples done out of 57600
8000 examples done out of 57600
9000 examples done out of 57600
10000 examples done out of 57600
11000 examples done out of 57600
12000 examples done out of 57600
13000 examples done out of 57600
14000 examples done out of 57600
15000 examples done out of 57600
16000 examples done out of 57600
17000 examples done out of 57600
18000 examples done out of 57600
19000 examples done out of 57600
20000 examples done out of 57600
21000 examples done out of 57600
22000 examples done out of 57600
23000 examples done out of 57600
24000 examples done out of 57600
25000 examples done out of 57600
26000 examples done out of 57600
27000 examples done out of 57600
28000 examples done out of 57600
29000 examples done out of 57600
30000 examples done out

In [0]:
val_features = convert_examples_to_features(val_examples, label_list, max_seq_length, tokenizer)
all_input_ids = torch.tensor([f.input_ids for f in val_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in val_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in val_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in val_features], dtype=torch.long)
val_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=True)

0 examples done out of 6400
1000 examples done out of 6400
2000 examples done out of 6400
3000 examples done out of 6400
4000 examples done out of 6400
5000 examples done out of 6400
6000 examples done out of 6400


In [0]:
def eval_model(model, epoch, iteration, loss, label_list, max_seq_length, loss_fct):
  
  global best_val
  global gradient_accumulation_steps
  global file1 
  
  _ = model.eval()
  
  eval_loss, eval_accuracy, f1, precision, recall = 0, 0, 0, 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0
  outputs_all = np.array([])
  label_ids_all = np.array([])
  
  for input_ids, input_mask, segment_ids, label_ids in val_dataloader:
    input_ids = input_ids.cuda()
    input_mask = input_mask.cuda()
    segment_ids = segment_ids.cuda()
    label_ids = label_ids.cuda()
    with torch.no_grad():
        logits = model(input_ids, segment_ids, input_mask)
    
    outputs = np.argmax(logits.detach().cpu().numpy(), axis=1)
    outputs_all = np.append(outputs_all, outputs)
    label_ids_all = np.append(label_ids_all, label_ids.cpu().numpy())
    
    tmp_eval_loss = loss_fct(logits.view(-1, 2), label_ids.view(-1))
    tmp_eval_loss /= gradient_accumulation_steps
    eval_loss += tmp_eval_loss.mean().item()
    
    nb_eval_examples += input_ids.size(0)
    nb_eval_steps += 1
    
  eval_loss = eval_loss / nb_eval_steps
  
  correct_one = np.sum((outputs_all == label_ids_all) * outputs_all)
  precision =  correct_one/np.sum(outputs_all)
  recall = correct_one/np.sum(label_ids_all)
  f1 = 2*precision*recall/(precision+recall)
  
  print('total 1s in output = ', np.sum(outputs_all))
  print('total 1s in labels = ', np.sum(label_ids_all))
  print('f1 = ', f1)
  
  if np.isnan(f1):
    f1 = 0
  
  if f1 >= best_val:
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
    output_model_file = "best_val.bin"
    torch.save({'stat_dict': model_to_save.state_dict(), 
                'eval_loss' : eval_loss,
                'eval_accuracy' : eval_accuracy,
                'f1' : f1,
                'precision' : precision,
                'recall' : recall}
                , output_model_file)
    best_val = f1

    file1.SetContentFile('best_val.bin')
    file1.Upload()
    print('Best f1 = ', f1, ' precision = ', precision, ' recall = ', recall, ' loss = ', eval_loss)
      
 

In [0]:
weights = [0.1, 0.9]
class_weights = torch.FloatTensor(weights).cuda()
loss_fct = CrossEntropyLoss(weight = class_weights)

total_step = 0

for epoch in range(num_train_epochs):
  print('epoch = ', epoch)
  for iteration, batch in enumerate(train_dataloader):
    input_ids, input_mask, segment_ids, label_ids = batch
    input_ids = input_ids.cuda()
    input_mask = input_mask.cuda()
    segment_ids = segment_ids.cuda()
    label_ids = label_ids.cuda()
    
    logits = model(input_ids, segment_ids, input_mask)
    
    train_accuracy_params = accuracy(logits.detach().cpu().numpy(), label_ids.cpu().numpy())
    train_accuracy = train_accuracy_params[0]
    f1 = train_accuracy_params[1]
    precision = train_accuracy_params[2]
    recall = train_accuracy_params[3]
    correct_one = train_accuracy_params[4]
    
    loss = loss_fct(logits.view(-1, 2), label_ids.view(-1))
    loss = loss / gradient_accumulation_steps
    
    loss.backward()
    
    if (iteration + 1) % gradient_accumulation_steps == 0:
          optimizer.step()
          optimizer.zero_grad()

    if iteration % lr_chng_iter == 0:
      new_lr = lr * warmup_linear(total_step/t_total, warmup_proportion)
      for param_group in optimizer.param_groups:
          param_group['lr'] = new_lr
    
    total_step += 1
    
    if iteration % 10 == 0:
      print("iteration:", iteration, " loss:", loss.item(), "train_accuracy:", train_accuracy, " f1:", f1, " precision:", precision, " recall:", recall, " correct_one:", correct_one)
    
    if (iteration + 1) % eval_iter == 0:
        eval_model(model, epoch, iteration, loss, label_list, max_seq_length, loss_fct)
        _ = model.train()
    
    

epoch =  0


  


iteration: 0  loss: 0.17802616953849792 train_accuracy: 13  f1: nan  precision: 0.0  recall: 0.0  correct_one: 0


  
  import sys


iteration: 10  loss: 0.1949089616537094 train_accuracy: 12  f1: nan  precision: 0.0  recall: 0.0  correct_one: 0
iteration: 20  loss: 0.16716812551021576 train_accuracy: 13  f1: nan  precision: 0.0  recall: 0.0  correct_one: 0
iteration: 30  loss: 0.14277902245521545 train_accuracy: 16  f1: 1.0  precision: 1.0  recall: 1.0  correct_one: 1
iteration: 40  loss: 0.14504100382328033 train_accuracy: 14  f1: nan  precision: 0.0  recall: nan  correct_one: 0
iteration: 50  loss: 0.13729235529899597 train_accuracy: 14  f1: nan  precision: 0.0  recall: nan  correct_one: 0
iteration: 60  loss: 0.18230295181274414 train_accuracy: 14  f1: 0.5  precision: 1.0  recall: 0.3333333333333333  correct_one: 1
iteration: 70  loss: 0.18294058740139008 train_accuracy: 11  f1: nan  precision: 0.0  recall: 0.0  correct_one: 0
iteration: 80  loss: 0.1831817924976349 train_accuracy: 12  f1: nan  precision: 0.0  recall: 0.0  correct_one: 0
iteration: 90  loss: 0.20220918953418732 train_accuracy: 11  f1: nan  preci

# **Make Prediction on Test Set**

In [0]:
# # Run this only once on a machine

# !pip install pytorch-pretrained-bert
# !pip install livelossplot
# !wget "https://competitions.codalab.org/my/datasets/download/69a3e8d0-b836-48b8-8795-36a6865a1c04"
# !unzip 69a3e8d0-b836-48b8-8795-36a6865a1c04
# !rm 69a3e8d0-b836-48b8-8795-36a6865a1c04
# !ls -lh

In [0]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
import numpy as np
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam
from torch.utils.data import TensorDataset, DataLoader
from torch.nn import CrossEntropyLoss
import torch.nn as nn
from livelossplot import PlotLosses
import random
import csv
import string

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


Using TensorFlow backend.


In [0]:
file = drive.CreateFile({'id': '1ksD6TjCMGPShqMuQa3hMaKReURYZdr_V'})
file.GetContentFile('best_val.bin')

In [0]:
class BertSequence(nn.Module):
  
  def __init__(self, config, num_labels, layers):
    
    # layers:
    # number of linear layers : 1 or 2
    
    # configs :
    # bert-base-uncased: 12-layer, 768-hidden, 12-heads, 110M parameters
    # bert-large-uncased: 24-layer, 1024-hidden, 16-heads, 340M parameters
    # bert-base-cased: 12-layer, 768-hidden, 12-heads , 110M parameters
    # bert-large-cased: 24-layer, 1024-hidden, 16-heads, 340M parameters
    
    super(BertSequence, self).__init__()
    
    assert layers in [1,2]
    
    self.layers = layers
    
    self.hidden = 768
    if self.layers == 1 :
      self.hidden = num_labels
    
    self.num_labels = num_labels
    self.config = config
    self.bert = BertForSequenceClassification.from_pretrained(config, num_labels = self.hidden)
    
    if self.layers == 2:
      self.relu = nn.ReLU()
      self.layer = nn.Linear(self.hidden, self.num_labels)
      self.layer.weight.data.normal_(mean=0.0, std=0.02)
      self.layer.bias.data.zero_()
    
  def forward(self, input_ids, segment_ids, input_mask):
    
    out = self.bert(input_ids, segment_ids, input_mask)
    
    if self.layers == 2:
      out = self.relu(out)
      out = self.layer(out)
    
    return out

In [0]:
model = BertSequence('bert-base-uncased', num_labels = 2, layers=2).cuda()
modelCheckpoint = torch.load("best_val.bin", map_location=lambda storage, loc: storage)
model.load_state_dict(modelCheckpoint['stat_dict'])

In [0]:
lines = []
num_examples = 10500

with open('eval1_unlabelled.tsv', "r", encoding='utf-8') as f:
  reader = csv.reader(f, delimiter="\t")
  for line in reader:
    num_examples-=1
    if num_examples < 0:
      break

    lines.append(line)

In [0]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


In [0]:
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, id_):
        self.id_ = id_
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids

In [0]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [0]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

In [0]:
def convert_examples_to_features(examples, max_seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""

    features = []
    for (ex_index, example) in enumerate(examples):
        
        if ex_index % 1000 == 0:
          print('{} examples done out of {}'.format(ex_index, len(examples)))
        
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:7
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambigiously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                             id_ = example.guid))
    return features

In [0]:
val_examples = [InputExample(guid = line[0], text_a = line[1], text_b = line[2]) for line in lines]

In [0]:
label_list = [0,1]
batch_size = 10
max_seq_length = 256

In [0]:
val_features = convert_examples_to_features(val_examples, max_seq_length, tokenizer)
all_input_ids = torch.tensor([f.input_ids for f in val_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in val_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in val_features], dtype=torch.long)

0 examples done out of 10500
1000 examples done out of 10500
2000 examples done out of 10500
3000 examples done out of 10500
4000 examples done out of 10500
5000 examples done out of 10500
6000 examples done out of 10500
7000 examples done out of 10500
8000 examples done out of 10500
9000 examples done out of 10500
10000 examples done out of 10500


In [0]:
all_ids = torch.tensor([int(f.id_) for f in val_features], dtype=torch.long)
val_data = TensorDataset(all_ids, all_input_ids, all_input_mask, all_segment_ids)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

In [0]:
_ = model.eval()

In [0]:
file = open('answers.tsv', 'w')
i = 0
for all_ids, input_ids, input_mask, segment_ids in val_dataloader:
  i+=1
  if i% 100 == 0:
    print('Done ', i)
  input_ids = input_ids.cuda()
  input_mask = input_mask.cuda()
  segment_ids = segment_ids.cuda()

  logits = model(input_ids, segment_ids, input_mask)
  score = logits[:,1].detach().cpu().numpy()  
  file.write('{}\t'.format(all_ids[0].item()))
  for x in score[:-1]:
    file.write('{}\t'.format(x))
  file.write('{}\n'.format(score[-1]))  
  
file.close()

Done  100
Done  200
Done  300
Done  400
Done  500
Done  600
Done  700
Done  800
Done  900
Done  1000


In [0]:
input_ids.shape

torch.Size([10, 256])

In [0]:
file1 = drive.CreateFile({'title': 'answers.tsv'})

In [0]:
file1.SetContentFile('answers.tsv')
file1.Upload()

In [0]:
!ls

adc.json     best_val.bin  eval1_unlabelled.tsv
answers.tsv  data.tsv	   sample_data


In [0]:
# linse = []
# with open('answers.tsv', 'r') as f:
#   reader = csv.reader(f, delimiter="\t")
#   for line in reader:
#       linse.append(line)
# linse

[['1135787',
  '-0.1890496015548706',
  '-0.20414724946022034',
  '-0.20256561040878296',
  '-0.19391247630119324',
  '-0.1895381510257721',
  '-0.19488048553466797',
  '-0.1877790093421936',
  '-0.20241910219192505',
  '-0.20091071724891663',
  '-0.20497986674308777']]