In [23]:
import torch

if torch.cuda.is_available():
  device = torch.device('cuda')
  print('There are %d GPU(s) available' % torch.cuda.device_count())
  print(torch.cuda.get_device_name(0))
else:
  device = torch.device('cpu')
  print('No GPU available, CPU instaed')

device

There are 1 GPU(s) available
Tesla T4


device(type='cuda')

In [24]:
!pip install transformers
!pip install wget



In [25]:
import wget
import os

url = 'https://nyu-mll.github.io/CoLA/cola_public_1.1.zip'
if not os.path.exists('./cola_public_1.1.zip'):
  wget.download(url, './cola_public_1.1.zip')
if not os.path.exists('./cola_public/'):
  !unzip cola_public_1.1.zip

In [26]:
import pandas as pd

df = pd.read_csv('cola_public/raw/in_domain_train.tsv', delimiter='\t', header=None, names= ['sentence_source', 'label', 'label_notes', 'sentence'])

df.sample(10)



Unnamed: 0,sentence_source,label,label_notes,sentence
2389,l-93,1,,Angela characterized Shelly as a lifesaver.
5048,ks08,1,,They're not finding it a stress being in the s...
3133,l-93,0,*,Paul exhaled on Mary.
5955,c_13,0,*,I ordered if John drink his beer.
625,bc01,1,,Press the stamp against the pad completely.
3542,ks08,0,*,They can very.
6915,m_02,1,,This arch is supporting the weight of the tower.
2908,l-93,1,,That new handle detaches easily.
5857,c_13,1,,The Brazilians pumped the oil across the river.
4191,ks08,1,,It is a wooden desk.


In [27]:
df.loc[df.label==0].sample(10)[['sentence', 'label']]

Unnamed: 0,sentence,label
6770,We realised that Dr Jones died because he ate ...,0
1652,Here's a pole for you to kiss the girl who tie...,0
3258,Jennifer baked at the potatoes.,0
4651,Kim is resembled by the model in nearly every ...,0
2672,The book sent to Peter.,0
3748,Have in our class the kids arrived safely?,0
6377,"Often, any lion is majestic.",0
1356,The boy Bill and who I watched was vain.,0
1279,The tall nurse who Tony has a Fiat and yearns ...,0
2970,The president declared Smith as press secretary.,0


In [28]:
sentences= df.sentence.values
labels = df.label.values

# DATA prepare

In [29]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


In [30]:
print('original', sentences[0])
print('tokenised', tokenizer.tokenize(sentences[0]))
print('toden IDs', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))

original Our friends won't buy this analysis, let alone the next one we propose.
tokenised ['our', 'friends', 'won', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.']
toden IDs [2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012]


In [31]:
[tokenizer.encode(sentences[0], add_special_tokens=True)]

[[101,
  2256,
  2814,
  2180,
  1005,
  1056,
  4965,
  2023,
  4106,
  1010,
  2292,
  2894,
  1996,
  2279,
  2028,
  2057,
  16599,
  1012,
  102]]

In [32]:
max_len = 0

for sent in sentences:
  input_ids = tokenizer.encode(sent, add_special_tokens=True)
  max_len = max(max_len, len(input_ids))

max_len

47

In [33]:
len(sentences)

8551

In [34]:
input_ids = []
attention_masks = []

for sent in sentences:
  encoded_dict = tokenizer.encode_plus(
      sent,
      add_special_tokens=True,
      max_length=64,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
  )
  input_ids.append(encoded_dict['input_ids'])
  attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

print(sentences[0])
print(input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Our friends won't buy this analysis, let alone the next one we propose.
tensor([  101,  2256,  2814,  2180,  1005,  1056,  4965,  2023,  4106,  1010,
         2292,  2894,  1996,  2279,  2028,  2057, 16599,  1012,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])


In [35]:
print(len(input_ids), len(attention_masks), len(labels))

8551 8551 8551


In [36]:
from torch.utils.data import TensorDataset, random_split

dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_data, val_data = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))



7,695 training samples
  856 validation samples


In [37]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

train_dataloader = DataLoader(
    train_data,
    sampler = RandomSampler(train_data),
    batch_size=batch_size
)

validation_dataloader = DataLoader(
    val_data,
    sampler = SequentialSampler(val_data),
    batch_size=batch_size
)

# BERT prepare

In [38]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [39]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                  )




In [40]:
from transformers import get_linear_schedule_with_warmup

epochs = 2

total_steps = len(train_dataloader) * epochs
scheduler =  get_linear_schedule_with_warmup(optimizer,
                                             num_warmup_steps = 0,
                                             num_training_steps = total_steps)

# Training loop

**Training:**
* Распаковать обучающие данные и лейблы
* Загрузить данные на GPU для ускорения
* Занулить градиенты с предыдущего шага
* Forward pass (скормить данные в нейросеть и пробросить их вперед)
* Backward pass (back propagation - посчитать градиенты по всем параметрам с помощью обратного распространения ошибки)
* Обновить параметры с помощью optimizer.step()
* Посчитать статистики, чтобы следить за обучением
⛅

**Evaluation:**
* Распаковать валидационные данные и лейблы
* Загрузить данные на GPU для ускорения
* Forward pass (скормить данные в нейросеть и пробросить вперед)
* Посчиатть loss и статистики для валиданционных данных, чтобы следить за обучением



In [41]:
import numpy as np
import datetime

def flat_accuracy(preds, labels):
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat) / len(labels_flat)


def  format_time(elapsed):
  elapsed_rounded = int(round(elapsed))
  return str(datetime.timedelta(seconds=elapsed_rounded))


In [42]:
import random
import numpy as np
import time

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []

total_t0 = time.time()

for epoch_i in range(0, epochs):
  print('')
  print('======= EPOCH {:} / {:} ======'.format(epoch_i + 1, epochs))
  print('Training...')

  t0 = time.time()
  total_train_loss = 0
  model.train()

  for step, batch in enumerate(train_dataloader):
    if step % 40 == 0 and not step == 0:
      elapsed = format_time(time.time() - t0)
      #Report progress
      print('   Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_data), elapsed))

    #batch  contains 3 pytorch tensors: [0] input ids, [1] aattention masks, [2] labels
    #print(batch[0])
    #print(batch[1])
    #print(batch[2])
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    model.zero_grad()
    #Forward pass
    res = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask,labels=b_labels)
    loss = res['loss']
    logits = res['logits'] #вероятности классов для батча

    total_train_loss += loss.item()
    loss.backward()
    #нормы градиентов обрезаем до 1.0, чтобы предотвратить проблему взрывающихся градиентов
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    optimizer.step() #обновление весов
    scheduler.step()

    avg_train_loss = total_train_loss/len(train_dataloader)
    train_time = format_time(time.time() - t0)

  print(' Average training loss: {0:.2f}'.format(avg_train_loss))
  print(' Training epoch took: {:}'.format(train_time))


  #Validation
  print("Running Validation...")

  t0 = time.time()
  model.eval()
  total_eval_accuracy = 0
  total_eval_loss = 0
  nb_eval_steps = 0

  for batch in validation_dataloader:
      b_input_ids = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_labels = batch[2].to(device)

      with torch.no_grad():
          # Forward pass, calculate logit predictions.
          result = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels,
                        return_dict=True)
      loss = result.loss
      logits = result.logits
      total_eval_loss += loss.item()
      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()
      total_eval_accuracy += flat_accuracy(logits, label_ids)

  avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
  print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

  avg_val_loss = total_eval_loss / len(validation_dataloader)
  validation_time = format_time(time.time() - t0)

  print("  Validation Loss: {0:.2f}".format(avg_val_loss))
  print("  Validation took: {:}".format(validation_time))

  training_stats.append(
      {
          'epoch': epoch_i + 1,
          'Training Loss': avg_train_loss,
          'Valid. Loss': avg_val_loss,
          'Valid. Accur.': avg_val_accuracy,
          'Training Time': train_time,
          'Validation Time': validation_time
      }
  )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))





Training...
   Batch    40 of 7,695. Elapsed: 0:00:12.
   Batch    80 of 7,695. Elapsed: 0:00:24.
   Batch   120 of 7,695. Elapsed: 0:00:36.
   Batch   160 of 7,695. Elapsed: 0:00:48.
   Batch   200 of 7,695. Elapsed: 0:01:00.
   Batch   240 of 7,695. Elapsed: 0:01:13.
 Average training loss: 0.49
 Training epoch took: 0:01:13
Running Validation...
  Accuracy: 0.80
  Validation Loss: 0.49
  Validation took: 0:00:03

Training...
   Batch    40 of 7,695. Elapsed: 0:00:12.
   Batch    80 of 7,695. Elapsed: 0:00:25.
   Batch   120 of 7,695. Elapsed: 0:00:37.
   Batch   160 of 7,695. Elapsed: 0:00:50.
   Batch   200 of 7,695. Elapsed: 0:01:02.
   Batch   240 of 7,695. Elapsed: 0:01:15.
 Average training loss: 0.31
 Training epoch took: 0:01:15
Running Validation...
  Accuracy: 0.82
  Validation Loss: 0.47
  Validation took: 0:00:03

Training complete!
Total training took 0:02:33 (h:mm:ss)


In [43]:
import pandas as pd

# Display floats with two decimal places.
pd.set_option('display.precision', 2)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# A hack to force the column headers to wrap (doesn't seem to work in Colab).
df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table.
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.49,0.49,0.8,0:01:13,0:00:03
2,0.31,0.47,0.82,0:01:15,0:00:03


# Test

In [44]:
import pandas as pd

df = pd.read_csv('./cola_public/raw/out_of_domain_dev.tsv', delimiter='\t', header=None, names= ['sentence_source', 'label', 'label_notes', 'sentence'])

sentences = df.sentence.values
labels = df.label.values

input_ids = []
attention_masks = []

for sent in sentences:
  encoded_dict = tokenizer.encode_plus(
      sent,
      add_special_tokens = True,
      max_length = 64,
      pad_to_max_length = True,
      return_attention_mask = True,
      return_tensors = 'pt',
  )
  input_ids.append(encoded_dict['input_ids'])
  attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

test_data = TensorDataset(input_ids, attention_masks, labels)




In [45]:
prediction_sampler = SequentialSampler(test_data)
prediction_dataloader = DataLoader(test_data, sampler=prediction_sampler, batch_size=batch_size)

print('Predicting labels for {:,} test sentences...'.format(len(test_data)))

# Put model in evaluation mode
model.eval()

# Tracking variables
predictions , true_labels = [], []

# Predict
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)

  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch

  # Telling the model not to compute or store gradients, saving memory and
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions.
      result = model(b_input_ids,
                     token_type_ids=None,
                     attention_mask=b_input_mask,
                     return_dict=True)

  logits = result.logits

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

print('    DONE.')

Predicting labels for 516 test sentences...
    DONE.


In [46]:
# Combine the results across all batches.
flat_predictions = np.concatenate(predictions, axis=0)

# For each sample, pick the label (0 or 1) with the higher score.
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

# Combine the correct labels for each batch into a single list.
flat_true_labels = np.concatenate(true_labels, axis=0)

In [47]:
from sklearn.metrics import f1_score

# Calculate the F1
f1 = f1_score(flat_true_labels, flat_predictions)

print('F1 Score: %.3f' % f1)

F1 Score: 0.871
