In [5]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer


In [2]:

# Load the pre-trained CodeBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

# Load the pre-trained CodeBERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=2)

# Load the training dataset
train_df = pd.read_csv("train_data.csv")

# Encode the training dataset using the CodeBERT tokenizer
train_encodings = tokenizer(train_df['text'].tolist(), truncation=True, padding=True)

# Convert the training labels to tensors
train_labels = torch.tensor(train_df['label'].tolist())

# Initialize the training dataset
train_dataset = [{"input_ids": train_encodings['input_ids'][i],
                  "attention_mask": train_encodings['attention_mask'][i],
                  "label": train_labels[i]} for i in range(len(train_df))]

# Split the training dataset into training and validation datasets
train_dataset, val_dataset = train_test_split(train_dataset, test_size=0.2, random_state=42)


Some weights of the model checkpoint at microsoft/codebert-base were not used when initializing RobertaForSequenceClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be 

In [3]:

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

In [4]:
# Fine-tune the model on the training dataset
trainer.train()


***** Running training *****
  Num examples = 13352
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2505
  Number of trainable parameters = 124647170
  0%|          | 0/2505 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 201326592 bytes.

In [None]:

# Load the test dataset
test_df = pd.read_csv("test_data.csv")

# Encode the test dataset using the CodeBERT tokenizer
test_encodings = tokenizer(test_df['text'].tolist(), truncation=True, padding=True)

# Convert the test labels to tensors
test_labels = torch.tensor(test_df['label'].tolist())

# Initialize the test dataset
test_dataset = [{"input_ids": test_encodings['input_ids'][i],
                  "attention_mask": test_encodings['attention_mask'][i],
                  "label": test_labels[i]} for i in range(len(test_df))]

# Evaluate the model on the test dataset
trainer.evaluate(test_dataset)
