In [1]:
pip install transformers tensorflow torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [95]:
#import necessary packages
from transformers import BertTokenizer, BertForMaskedLM
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [96]:
#create a tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [97]:
#read data from ast folder
train_data = open("Dataset/ast/train.jsonl").read()
test_data = open("Dataset/ast/test.jsonl").read()
valid_data = open("Dataset/ast/valid.jsonl").read()

In [98]:
#encode data
train_tokens = tokenizer.encode(train_data, add_special_tokens=True)
test_tokens = tokenizer.encode(test_data, add_special_tokens=True)
valid_tokens = tokenizer.encode(valid_data, add_special_tokens=True)

Token indices sequence length is longer than the specified maximum sequence length for this model (5118569 > 512). Running this sequence through the model will result in indexing errors


In [99]:
#create attention masks
train_masks = [1] * len(train_tokens)
test_masks = [1] * len(test_tokens)
valid_masks = [1] * len(valid_tokens)

In [100]:
#convert data into torch tensors
train_tokens_tensor = torch.tensor(train_tokens)
test_tokens_tensor = torch.tensor(test_tokens)
valid_tokens_tensor = torch.tensor(valid_tokens)

train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)
valid_masks_tensor = torch.tensor(valid_masks)

In [101]:
#create datasets
train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor)
test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor)
valid_dataset = TensorDataset(valid_tokens_tensor, valid_masks_tensor)

In [102]:
#create dataloaders
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=32)
test_dataloader = DataLoader(test_dataset, sampler=RandomSampler(test_dataset), batch_size=32)
valid_dataloader = DataLoader(valid_dataset, sampler=RandomSampler(valid_dataset), batch_size=32)

In [103]:
#create transformer model
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [104]:
#train the model
model.train()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [105]:
#fit the model
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
total_steps = len(train_dataloader) * 10

In [113]:
for epoch in range(10):
  #set the model in training mode
  model.train()
  
  #loop over all batches
  for step, batch in enumerate(train_dataloader):
    #unpack the batch
    b_input_ids = b_input_ids.view(-1, b_input_ids.size(-1))
    b_input_mask = b_input_mask.view(-1, b_input_mask.size(-1))
    
    #zero the gradients
    optimizer.zero_grad()

    #forward pass
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
  
    #calculate the loss
    loss = torch.mean(outputs[0])
    #backward pass
    loss.backward()
  
    #update the weights
    optimizer.step()
    
    # print loss
    if step % 100 == 0:
        print ("Epoch {}/{}.. ".format(epoch+1, 10),
               "Step: {}/{}.. ".format(step, total_steps),
               "Loss: {:.4f}".format(loss.item()))

Epoch 1/10..  Step: 0/1599560..  Loss: -26.2834
Epoch 1/10..  Step: 100/1599560..  Loss: -33.9249
Epoch 1/10..  Step: 200/1599560..  Loss: -37.1984
Epoch 1/10..  Step: 300/1599560..  Loss: -40.5627
Epoch 1/10..  Step: 400/1599560..  Loss: -44.1324
Epoch 1/10..  Step: 500/1599560..  Loss: -47.8848
Epoch 1/10..  Step: 600/1599560..  Loss: -51.8034
Epoch 1/10..  Step: 700/1599560..  Loss: -55.8103
Epoch 1/10..  Step: 800/1599560..  Loss: -59.9160
Epoch 1/10..  Step: 900/1599560..  Loss: -64.0738
Epoch 1/10..  Step: 1000/1599560..  Loss: -68.2545


KeyboardInterrupt: ignored

In [None]:
#evaluate the model
#set the model in evaluation mode
model.eval()

In [None]:
#initialize accuracy
accuracy = 0

In [None]:
#loop over all batches
for batch in valid_dataloader:
  #unpack the batch
  b_input_ids = b_input_ids.view(-1, b_input_ids.size(-1))
  b_input_mask = b_input_mask.view(-1, b_input_mask.size(-1))

  #forward pass
  with torch.no_grad():
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

  #calculate the accuracy
  predictions = torch.argmax(outputs[0], dim=1).flatten()
  accuracy += (predictions == b_input_ids).float().mean()

In [None]:
#print the accuracy
print ("Accuracy: {:.2f}%".format(accuracy*100))