# Code to check the BLEU score of Code T5 - base model on our dataset WITHOUT training on 40k+ records

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Importing necessary libraries
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
from transformers import AutoModelForCausalLM, AutoTokenizer, RobertaTokenizer, T5ForConditionalGeneration
import pandas as pd
import json
from sklearn.model_selection import train_test_split

In [None]:
# Loading the pre-trained model Code T5 - base from Salesforce
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Salesforce/codet5-base"

# Initializing the Code T5 - base model's tokenizer
tokenizer = RobertaTokenizer.from_pretrained(model_name)

# Initializing the pre-trained Code T5 - base model
model = T5ForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
# Checking if a CUDA-enabled GPU is available, if not, using the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Moving the model to the specified device (CUDA GPU or CPU)
model.to(device)

# We can see the architecture of Code T5 - base in the output

T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [None]:
import json
import re
from sklearn.model_selection import train_test_split

def load_and_clean_jsonl(file_name):
    """This function loads and cleans a JSON Lines (JSONL) file,
    extracting 'text' and 'code' fields from each line's JSON object."""
    data = []
    with open(file_name, 'r') as file:
        for line in file:
            json_object = json.loads(line)
            data.append({'text': json_object['text'], 'code': json_object['code']})
    return data

# Function to split the data into train, validation, and test in the ratio 70-10-20
def split_data(data, test_size=0.2, val_size=0.1):
    """Function to split the data into train, validation, and test"""
    train_data, test_data = train_test_split(data, test_size=test_size + val_size)
    val_size_adjusted = val_size / (test_size + val_size)
    val_data, test_data = train_test_split(test_data, test_size=val_size_adjusted)
    return train_data, test_data, val_data

# Load and clean the data
data = load_and_clean_jsonl('drive/MyDrive/combined_file40kplus_withoutcodesearch.jsonl')

# Split the data
train_data, test_data, val_data = split_data(data)

In [None]:
len(train_data)

32608

In [None]:
len(test_data)

4659

In [None]:
len(val_data)

9317

In [None]:
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import torch

# Defining a custom dataset class for text and code pairs
class TextCodeDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=256):
        """Initializing the dataset with input data, tokenizer, and max length"""
        self.data = data
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Returning the length of the dataset"""
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        # Tokenize and pad text and code
        text_encodings = self.tokenizer(item['text'], padding='max_length', max_length=self.max_length, truncation=True, return_tensors='pt')
        code_encodings = self.tokenizer(item['code'], padding='max_length', max_length=self.max_length, truncation=True, return_tensors='pt')

        # return text_encodings, code_encodings
        return text_encodings['input_ids'].squeeze(), code_encodings['input_ids'].squeeze()


# Creating dataset instances for training, testing, and validation
train_dataset = TextCodeDataset(train_data, tokenizer)
test_dataset = TextCodeDataset(test_data, tokenizer)
val_dataset = TextCodeDataset(val_data, tokenizer)

# Creating DataLoader instances using batch size 8 due to GPU constraints
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [None]:
def generate_code(model, tokenizer, input_ids, max_length=256):
    """Function to generate code from given inout text"""
    model.eval()
    with torch.no_grad():
        # Generate output sequences
        outputs = model.generate(input_ids=input_ids, max_length=max_length)
    return outputs

In [None]:
l1 = list()
l2 = list()

In [None]:
# Importing necessary modules for BLEU score calculation and evaluation
from nltk.translate.bleu_score import sentence_bleu
import torch

# Function to evaluate the model and display examples
def evaluate_and_display_examples(model, tokenizer, test_loader, num_examples=3):
    # Set the model to evaluation mode
    model.eval()

    # Initialize counters for examples shown and total BLEU score
    examples_shown = 0
    total_bleu_score = 0

    # Turn off gradient computation during evaluation
    with torch.no_grad():
        # Iterate through batches in the test loader
        for batch in test_loader:
            input_ids, labels = [b.to(device) for b in batch]

            # Generate code using the model
            generated_ids = generate_code(model, tokenizer, input_ids)

            # Convert generated and label IDs to text
            generated_texts = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            label_texts = [tokenizer.decode(l, skip_special_tokens=True, clean_up_tokenization_spaces=True) for l in labels]

            # Calculate BLEU score and display examples
            for gen_text, label_text in zip(generated_texts, label_texts):
                # Calculate BLEU score for each example
                bleu_score = sentence_bleu([label_text.split()], gen_text.split())
                total_bleu_score += bleu_score

    # Calculate the average BLEU score over the entire test dataset
    avg_bleu_score = total_bleu_score / len(test_loader.dataset)

    # Print the average BLEU score
    print(f"Average BLEU Score on Test Data: {avg_bleu_score}")

# Evaluate the model and display examples
evaluate_and_display_examples(model, tokenizer, test_loader)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU Score on Test Data: 0.0002593699584921753


## The model Code T5 - base without any fine tuning on a particular data set, got a sentence BLEU score of approximately 0.03%