In [None]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
!pip install transformers



In [None]:
#Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
df = pd.read_csv("./Stage_1_Results.csv", usecols=['pairID1','pairID2','premise','hypothesis','propositional_logic_rule'])

In [None]:
from transformers import RobertaForSequenceClassification, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli")

In [None]:

max_len = 0

# For every sentence...
for sent in df["hypothesis"]:
    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  82


In [None]:

max_len = 0

# For every sentence...
for sent in df["premise"]:
    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  186


In [None]:
model = RobertaForSequenceClassification.from_pretrained("ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli", num_labels=3)
model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [None]:
from transformers import AdamW
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )



In [None]:
input_ids_test_1 = []
attention_masks_test_1 = []

# For every sentence...
for sent in df["premise"]:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 200,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids_test_1.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks_test_1.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids_test_1 = torch.cat(input_ids_test_1, dim=0)
attention_masks_test_1 = torch.cat(attention_masks_test_1, dim=0)

# # Print sentence 0, now as a list of IDs.
print('Original: ', df["premise"][0])
print('Token IDs:', input_ids_test_1[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  If this church choir sings to the masses as they sing joyous songs from the book at a church, then the church is filled with song. The church is not filled with song.
Token IDs: tensor([    0,  1106,    42,  2352, 18558, 22707,     7,     5, 15444,    25,
           51,  7884,  5823,  1827,  3686,    31,     5,  1040,    23,    10,
         2352,     6,   172,     5,  2352,    16,  3820,    19,  2214,     4,
           20,  2352,    16,    45,  3820,    19,  2214,     4,     2,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,   

In [None]:
input_ids_test_2 = []
attention_masks_test_2 = []

# For every sentence...
for sent in df["hypothesis"]:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 90,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids_test_2.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks_test_2.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids_test_2 = torch.cat(input_ids_test_2, dim=0)
attention_masks_test_2 = torch.cat(attention_masks_test_2, dim=0)

# # Print sentence 0, now as a list of IDs.
print('Original: ', df["hypothesis"][0])
print('Token IDs:', input_ids_test_2[0])



Original:  This church choir does not sing to the masses as they sing joyous songs from the book at a church.
Token IDs: tensor([    0,   713,  2352, 18558,   473,    45,  7884,     7,     5, 15444,
           25,    51,  7884,  5823,  1827,  3686,    31,     5,  1040,    23,
           10,  2352,     4,     2,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1])


In [None]:
input_ids_test = torch.cat((input_ids_test_1, input_ids_test_2), dim=1)
attention_mask_test = torch.cat((attention_masks_test_1, attention_masks_test_2), dim=1)

In [None]:
import numpy as np

labels1 = np.zeros(30312)
labels1 = labels1.astype(int)
labels1 = torch.tensor(labels1)

In [None]:
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32
prediction_data = TensorDataset(input_ids_test, attention_mask_test, labels1)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [None]:
# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(input_ids_test)))

# Put model in evaluation mode
model.eval()

# Tracking variables
predictions , true_labels = [], []
i=0
# Predict
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)

  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch

  # Telling the model not to compute or store gradients, saving memory and
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions.
      result = model(b_input_ids,
                     token_type_ids=None,
                     attention_mask=b_input_mask,
                     return_dict=True)

  logits = result.logits

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  pred_labels = np.argmax(logits, axis=1)
  # Store predictions and true labels
  predictions.extend(pred_labels.tolist())
  i+=32
  print(i)
print('DONE.')

Predicting labels for 30,312 test sentences...
32
64
96
128
160
192
224
256
288
320
352
384
416
448
480
512
544
576
608
640
672
704
736
768
800
832
864
896
928
960
992
1024
1056
1088
1120
1152
1184
1216
1248
1280
1312
1344
1376
1408
1440
1472
1504
1536
1568
1600
1632
1664
1696
1728
1760
1792
1824
1856
1888
1920
1952
1984
2016
2048
2080
2112
2144
2176
2208
2240
2272
2304
2336
2368
2400
2432
2464
2496
2528
2560
2592
2624
2656
2688
2720
2752
2784
2816
2848
2880
2912
2944
2976
3008
3040
3072
3104
3136
3168
3200
3232
3264
3296
3328
3360
3392
3424
3456
3488
3520
3552
3584
3616
3648
3680
3712
3744
3776
3808
3840
3872
3904
3936
3968
4000
4032
4064
4096
4128
4160
4192
4224
4256
4288
4320
4352
4384
4416
4448
4480
4512
4544
4576
4608
4640
4672
4704
4736
4768
4800
4832
4864
4896
4928
4960
4992
5024
5056
5088
5120
5152
5184
5216
5248
5280
5312
5344
5376
5408
5440
5472
5504
5536
5568
5600
5632
5664
5696
5728
5760
5792
5824
5856
5888
5920
5952
5984
6016
6048
6080
6112
6144
6176
6208
6240
6272
6304
63

In [None]:
label = []
for i in predictions:
  if i==0:
    label.append("entailment")
  elif i==1:
    label.append("contradiction")
  else:
    label.append("neutral")

In [None]:
import pandas as pd
new_series = pd.Series(label, name='Predicted Labels')
df_test = pd.concat([df, new_series], axis=1)

In [None]:
df_test

Unnamed: 0,pairID1,pairID2,premise,hypothesis,propositional_logic_rule,Predicted Labels
0,2677109430.jpg#1r1e,,If this church choir sings to the masses as th...,This church choir does not sing to the masses ...,Modus Tollens,neutral
1,2677109430.jpg#1r1e,,If this church choir sings to the masses as th...,"If the church is not filled with song, then th...",Transportation 1,contradiction
2,2677109430.jpg#1r1e,,"If the church is not filled with song, then th...",If this church choir sings to the masses as th...,Transportation 2,entailment
3,2677109430.jpg#1r1e,,If this church choir sings to the masses as th...,Either this church choir does not sing to the ...,Material Implication 1,neutral
4,2677109430.jpg#1r1e,,Either this church choir does not sing to the ...,If this church choir sings to the masses as th...,Material Implication 2,neutral
...,...,...,...,...,...,...
30307,4378810163.jpg#4r1e,152881593.jpg#1r1e,"If two women are observing something together,...",Either no two women are observing something to...,Destructive Dilemma,entailment
30308,4378810163.jpg#4r1e,152881593.jpg#1r1e,"If two women are observing something together,...",Either two girls are looking at something or n...,Bidirectional Dilemma,entailment
30309,2677109430.jpg#1r1e,152881593.jpg#1r1e,If this church choir sings to the masses as th...,Either the church is filled with song or a man...,Constructive Dilemma,entailment
30310,2677109430.jpg#1r1e,152881593.jpg#1r1e,If this church choir sings to the masses as th...,Either this church choir does not sing to the ...,Destructive Dilemma,entailment


In [None]:
df_test.to_csv('Stage_2_RoBERTA_Multi_Task_Results.csv')