#Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Import Libraries

In [2]:
from glob import glob
import numpy as np
import tensorflow as tf
from transformers import AdamWeightDecay

In [30]:
ROOT_PATH = "/content/drive/MyDrive/Deep Learning/TestCaseGeneration/"
DATASET_PATH = "/content/drive/MyDrive/Deep Learning/TestCaseGeneration/Dataset/"
MODEL_NAME = "google/flan-t5-small"
MAX_LEN = 128
BATCH_SIZE = 32
EPOCHS = 100

In [4]:
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer

model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, max_length=MAX_LEN, padding="max_length")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [5]:
inputs = tokenizer("Generate test cases for:", return_tensors="tf")
outputs = tokenizer.batch_decode(model.generate(**inputs, max_new_tokens=128), skip_special_tokens=True)
print(outputs)

['a syringe']


#Preprocess Dataset

In [6]:
files = glob(DATASET_PATH+"*.txt")

In [7]:
files

['/content/drive/MyDrive/Deep Learning/TestCaseGeneration/Dataset/SWIFT ISO 20022.txt',
 '/content/drive/MyDrive/Deep Learning/TestCaseGeneration/Dataset/KYC validation.txt',
 '/content/drive/MyDrive/Deep Learning/TestCaseGeneration/Dataset/money laundering.txt',
 '/content/drive/MyDrive/Deep Learning/TestCaseGeneration/Dataset/fraud detection and risk scoring.txt',
 '/content/drive/MyDrive/Deep Learning/TestCaseGeneration/Dataset/loan and credit risk assessment.txt']

In [8]:
tokenizer.encode_plus("hello world")

{'input_ids': [21820, 296, 1], 'attention_mask': [1, 1, 1]}

In [9]:
def preprocess_data(file_path, content):
  labels = list(map(lambda x: x.rstrip('\n'), content.split('/')))
  filename = file_path.split('/')[-1]
  context = filename.split('.')[0]
  input = f"Generate test case for: {context}"
  inputs = [input] * len(labels)

  inputs = tokenizer.batch_encode_plus(inputs, max_length=MAX_LEN, padding="max_length")
  labels = tokenizer.batch_encode_plus(labels, max_length=MAX_LEN, padding="max_length")

  return inputs["input_ids"], inputs["attention_mask"], labels["input_ids"], labels["attention_mask"]

In [10]:
input_ids = []
attention_mask = []
labels = []
labels_attention_mask = []


for file_path in files:
  with open(file_path, "r") as file:
    data = file.read()

  x_input_ids, x_attention_mask, y_labels, y_attention_mask = preprocess_data(file_path, data)
  input_ids.extend(x_input_ids)
  attention_mask.extend(x_attention_mask)
  labels.extend(y_labels)
  labels_attention_mask.extend(y_attention_mask)

In [11]:
len(input_ids)

103

In [12]:
input_ids = np.array(input_ids)
attention_mask = np.array(attention_mask)
labels = np.array(labels)
labels_attention_mask = np.array(labels_attention_mask)

In [13]:
attention_mask.shape

(103, 128)

In [14]:
input_ids = tf.data.Dataset.from_tensor_slices(input_ids)
attention_mask = tf.data.Dataset.from_tensor_slices(attention_mask)
labels = tf.data.Dataset.from_tensor_slices(labels)
labels_attention_mask = tf.data.Dataset.from_tensor_slices(labels_attention_mask)

In [15]:
dataset = tf.data.Dataset.zip((input_ids, attention_mask, labels, labels_attention_mask))
dataset = dataset.shuffle(408)
dataset = dataset.map(lambda x, y, z, l: {"input_ids": x, "attention_mask": y,
                                            "labels": z, "decoder_attention_mask": l})
train_dataset = dataset.batch(BATCH_SIZE)

In [198]:
batch = next(iter(dataset.batch(BATCH_SIZE)))

In [203]:
batch

{'input_ids': <tf.Tensor: shape=(32, 128), dtype=int64, numpy=
 array([[6939, 2206,  794, ...,    0,    0,    0],
        [6939, 2206,  794, ...,    0,    0,    0],
        [6939, 2206,  794, ...,    0,    0,    0],
        ...,
        [6939, 2206,  794, ...,    0,    0,    0],
        [6939, 2206,  794, ...,    0,    0,    0],
        [6939, 2206,  794, ...,    0,    0,    0]])>,
 'attention_mask': <tf.Tensor: shape=(32, 128), dtype=int64, numpy=
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]])>,
 'labels': <tf.Tensor: shape=(32, 128), dtype=int64, numpy=
 array([[    3, 16772,    10, ...,     0,     0,     0],
        [    3, 16772,    10, ...,     0,     0,     0],
        [    3, 16772,    10, ...,     0,     0,     0],
        ...,
        [    3, 16772,    10, ...,     0,     0,     0],
        [    3, 16772,    10, ..

#Train the Model

In [16]:
optimizer = AdamWeightDecay(learning_rate=3e-4)

In [17]:
model.compile(optimizer=optimizer, metrics=['accuracy'])

In [32]:
step = 0

print(f'Training started.')
print()

for epoch in range(EPOCHS):
    print(f'Epoch: {epoch+1}/{EPOCHS}')


    for b, batch in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            outputs = model(batch)
            logits = outputs.logits
            loss_value = outputs.loss

        grads = tape.gradient(loss_value, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))
        print(f'Training loss {tf.reduce_mean(loss_value)}')

        step += 1

Training started.

Epoch: 1/100
Training loss 0.44780224561691284
Training loss 0.4243687689304352
Training loss 0.3624263107776642
Training loss 0.351685494184494
Epoch: 2/100
Training loss 0.34089797735214233
Training loss 0.38312390446662903
Training loss 0.3600236177444458
Training loss 0.28842368721961975
Epoch: 3/100
Training loss 0.30119726061820984
Training loss 0.34661662578582764
Training loss 0.3068312704563141
Training loss 0.26471856236457825
Epoch: 4/100
Training loss 0.26739344000816345
Training loss 0.27982640266418457
Training loss 0.28184428811073303
Training loss 0.27138200402259827
Epoch: 5/100
Training loss 0.23771168291568756
Training loss 0.2450016885995865
Training loss 0.241456538438797
Training loss 0.22638212144374847
Epoch: 6/100
Training loss 0.21822574734687805
Training loss 0.19510115683078766
Training loss 0.2182987779378891
Training loss 0.2093658447265625
Epoch: 7/100
Training loss 0.1806706190109253
Training loss 0.1850634217262268
Training loss 0.190

KeyboardInterrupt: 

In [33]:
input_sent = "Generate test case for: money laundering"
outputs = model.generate(**inputs, max_new_tokens=128)

In [34]:
tokenizer.batch_decode(outputs)

['<pad> Feature: Processing of Documents Fraud &amp; Abuse Detection Scenario: Detect multiple documents with suspicious circumstances Given a customer submits an application for a government document with incorrect residency in mind When the system detects multiple documents with suspicious circumstances Then the system should trigger an alert for suspicious activity</s>']

In [35]:
model.save_pretrained(ROOT_PATH+"modelBest")