In [3]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [4]:
import pandas as pd

df = pd.read_csv('./cola_public/raw/in_domain_train.tsv', delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])

df 

Unnamed: 0,sentence_source,label,label_notes,sentence
0,gj04,1,,"Our friends won't buy this analysis, let alone..."
1,gj04,1,,One more pseudo generalization and I'm giving up.
2,gj04,1,,One more pseudo generalization or I'm giving up.
3,gj04,1,,"The more we study verbs, the crazier they get."
4,gj04,1,,Day by day the facts are getting murkier.
...,...,...,...,...
8546,ad03,0,*,Poseidon appears to own a dragon
8547,ad03,0,*,Digitize is my happiest memory
8548,ad03,1,,It is easy to slay the Gorgon.
8549,ad03,1,,I had the strangest feeling that I knew you.


In [8]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

labels = df.label.values
sentences = df.sentence.values

In [9]:
print(' Original: ', sentences[0])

print('Tokenized: ', tokenizer.tokenize(df.sentence[0]))

print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(df.sentence[0])))

 Original:  Our friends won't buy this analysis, let alone the next one we propose.
Tokenized:  ['our', 'friends', 'won', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.']
Token IDs:  [2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012]


In [10]:
# [SEP] token is a special token that is used to separate two sentences.
# [CLS] token is a special token that is used in classification tasks.

input_ids = []
for sent in sentences:
    encoded_sent = tokenizer.encode(sent, add_special_tokens=True)
    input_ids.append(encoded_sent)

print('Original: ', sentences[0])
print('Token IDs: ', input_ids[0])

Original:  Our friends won't buy this analysis, let alone the next one we propose.
Token IDs:  [101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102]


In [11]:
print('Max sentence length: ', max([len(sen) for sen in input_ids]))

Max sentence length:  47


In [19]:
from torch.nn.utils.rnn import pad_sequence

MAX_LEN = 64

print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)

print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

input_ids = [torch.tensor(sentence).clone().detach() for sentence in input_ids]

padded_input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)

if padded_input_ids.size(1) > MAX_LEN:
    padded_input_ids = padded_input_ids[:, :MAX_LEN]
    
print('padded_input_ids shape: ', padded_input_ids.shape)

print('Padding !', padded_input_ids[0])


Padding/truncating all sentences to 64 values...

Padding token: "[PAD]", ID: 0
padded_input_ids shape:  torch.Size([8551, 47])
Padding ! tensor([  101,  2256,  2814,  2180,  1005,  1056,  4965,  2023,  4106,  1010,
         2292,  2894,  1996,  2279,  2028,  2057, 16599,  1012,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0])


  input_ids = [torch.tensor(sentence).clone().detach() for sentence in input_ids]


In [20]:
attention_masks = []

for sent in padded_input_ids:
    att_mask = [int(token_id > 0) for token_id in sent]
    attention_masks.append(att_mask)

In [21]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset, TensorDataset

train_idx, val_idx = train_test_split(list(range(len(labels))), test_size=0.2, random_state=42)
dataset = TensorDataset(padded_input_ids, torch.tensor(labels), torch.tensor(attention_masks))

train_dataset = Subset(dataset, train_idx)
val_dataset = Subset(dataset, val_idx)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32)

7,695 training samples
  856 validation samples


In [24]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, output_attentions=False, output_hidden_states=False)


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)