In [1]:
!pip install transformers



In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
import random
import time
from sklearn.model_selection import train_test_split
import tqdm

In [3]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [4]:
# Load the data
df = pd.read_csv('/content/sample_data/chac.csv')
print(f'Dataset size: {len(df)}')
df.head()

Dataset size: 1150


Unnamed: 0,input,output
0,Remind me to stop the bathroom light.,stop.bathroom_light
1,Don't open the news yet.,open.news
2,Could you please pause the dryer?,pause.dryer
3,Could you close my blinds?,close.blinds
4,Turn off the freezer now.,turn_off.freezer


In [5]:
# Show the column labels of the loaded DataFrame.
print(df.columns)

Index(['input', 'output'], dtype='object')


In [6]:
# Hold out 10% of the rows for validation; the fixed random_state keeps the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
)
print(f'Training set size: {train_df.shape[0]}')
print(f'Validation set size: {val_df.shape[0]}')

Training set size: 1035
Validation set size: 115


In [7]:
# Load the tokenizer that belongs to the 't5-small' checkpoint.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [8]:
class TextToApiDataset(Dataset):
    """Dataset of (input text, target text) pairs tokenized for seq2seq training.

    Each item is tokenized on access, padded/truncated to ``max_length`` and
    returned as a dict with the keys ``input_ids``, ``attention_mask`` and
    ``labels``.

    Args:
        df: Table with an ``'input'`` column (source text) and an ``'output'``
            column (target text).
        tokenizer: Object providing ``encode_plus(...)`` and ``pad_token_id``.
        max_length: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_length=128):
        self.inputs = df['input'].tolist()
        self.targets = df['output'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.inputs)

    def _encode(self, text):
        """Tokenize ``text`` into fixed-length PyTorch tensors."""
        return self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, index):
        """Return the tokenized example at ``index`` as a dict of 1-D tensors."""
        input_enc = self._encode(self.inputs[index])
        target_enc = self._encode(self.targets[index])

        labels = target_enc['input_ids'].flatten()
        # Use the tokenizer stored on the instance (not a notebook-level global)
        # so the dataset works with whichever tokenizer it was constructed with.
        # -100 marks padding positions to be ignored in the loss computation.
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': input_enc['input_ids'].flatten(),
            'attention_mask': input_enc['attention_mask'].flatten(),
            'labels': labels
        }

In [9]:
# Wrap both splits in datasets that tokenize each example on access.
train_dataset = TextToApiDataset(df=train_df, tokenizer=tokenizer)
val_dataset = TextToApiDataset(df=val_df, tokenizer=tokenizer)

# Batches of 16 examples; only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16)

In [10]:
# Load the pretrained 't5-small' checkpoint and move it to the selected device.
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

In [11]:
# Number of full passes over the training data.
EPOCHS = 10
# Use PyTorch's own AdamW: the AdamW class exported by transformers is
# deprecated in favour of torch.optim.AdamW.
# weight_decay=0.0 keeps the default of the transformers implementation
# (torch.optim.AdamW would otherwise default to 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)



In [12]:
def train_epoch(model, data_loader, optimizer, device, epoch):
    """Run one optimisation pass over ``data_loader`` and report the mean loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose result exposes a ``.loss`` tensor.
        data_loader: Iterable of batches; each batch is a dict with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        optimizer: Optimizer updating the parameters of ``model``.
        device: Device the batch tensors are moved to before the forward pass.
        epoch: Epoch number, used only in the progress bar and the log line.

    Returns:
        float: Mean of the per-batch losses, or ``nan`` when ``data_loader``
        yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in tqdm.tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1
    # Count the batches actually seen so an empty loader does not divide by zero.
    avg_loss = total_loss / num_batches if num_batches else float('nan')
    print(f"Epoch {epoch} Training Loss: {avg_loss}")
    # Return the value so callers can record a loss history or stop early.
    return avg_loss

def eval_epoch(model, data_loader, device, epoch):
    """Evaluate ``model`` on ``data_loader`` without gradients and report the mean loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose result exposes a ``.loss`` tensor.
        data_loader: Iterable of batches; each batch is a dict with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        device: Device the batch tensors are moved to before the forward pass.
        epoch: Epoch number, used only in the progress bar and the log line.

    Returns:
        float: Mean of the per-batch losses, or ``nan`` when ``data_loader``
        yields no batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for batch in tqdm.tqdm(data_loader, desc=f"Validation Epoch {epoch}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()
            num_batches += 1
    # Count the batches actually seen so an empty loader does not divide by zero.
    avg_loss = total_loss / num_batches if num_batches else float('nan')
    print(f"Epoch {epoch} Validation Loss: {avg_loss}")
    # Return the value so callers can track validation loss across epochs.
    return avg_loss


In [13]:
# Alternate one training pass and one validation pass per epoch, numbering epochs from 1.
for epoch in range(1, EPOCHS + 1):
    train_epoch(model, train_loader, optimizer, device, epoch)
    eval_epoch(model, val_loader, device, epoch)

Training Epoch 1: 100%|██████████| 65/65 [00:21<00:00,  3.08it/s]


Epoch 1 Training Loss: 3.9493478243167583


Validation Epoch 1: 100%|██████████| 8/8 [00:00<00:00, 11.40it/s]


Epoch 1 Validation Loss: 1.2845119535923004


Training Epoch 2: 100%|██████████| 65/65 [00:17<00:00,  3.70it/s]


Epoch 2 Training Loss: 1.3916164086415217


Validation Epoch 2: 100%|██████████| 8/8 [00:00<00:00, 11.76it/s]


Epoch 2 Validation Loss: 0.45793716609477997


Training Epoch 3: 100%|██████████| 65/65 [00:17<00:00,  3.74it/s]


Epoch 3 Training Loss: 0.7047973252259768


Validation Epoch 3: 100%|██████████| 8/8 [00:00<00:00, 12.34it/s]


Epoch 3 Validation Loss: 0.24825651571154594


Training Epoch 4: 100%|██████████| 65/65 [00:17<00:00,  3.77it/s]


Epoch 4 Training Loss: 0.4700128983992797


Validation Epoch 4: 100%|██████████| 8/8 [00:00<00:00, 11.72it/s]


Epoch 4 Validation Loss: 0.15637657511979342


Training Epoch 5: 100%|██████████| 65/65 [00:17<00:00,  3.76it/s]


Epoch 5 Training Loss: 0.3269698238143554


Validation Epoch 5: 100%|██████████| 8/8 [00:00<00:00, 12.28it/s]


Epoch 5 Validation Loss: 0.10656485706567764


Training Epoch 6: 100%|██████████| 65/65 [00:17<00:00,  3.74it/s]


Epoch 6 Training Loss: 0.24544006425600787


Validation Epoch 6: 100%|██████████| 8/8 [00:00<00:00, 12.33it/s]


Epoch 6 Validation Loss: 0.0764949235599488


Training Epoch 7: 100%|██████████| 65/65 [00:17<00:00,  3.75it/s]


Epoch 7 Training Loss: 0.21119136351805468


Validation Epoch 7: 100%|██████████| 8/8 [00:00<00:00, 12.07it/s]


Epoch 7 Validation Loss: 0.05708373733796179


Training Epoch 8: 100%|██████████| 65/65 [00:17<00:00,  3.70it/s]


Epoch 8 Training Loss: 0.17059378125346625


Validation Epoch 8: 100%|██████████| 8/8 [00:00<00:00, 11.87it/s]


Epoch 8 Validation Loss: 0.042419017758220434


Training Epoch 9: 100%|██████████| 65/65 [00:17<00:00,  3.77it/s]


Epoch 9 Training Loss: 0.13522000920314056


Validation Epoch 9: 100%|██████████| 8/8 [00:00<00:00, 12.16it/s]


Epoch 9 Validation Loss: 0.03347595757804811


Training Epoch 10: 100%|██████████| 65/65 [00:17<00:00,  3.76it/s]


Epoch 10 Training Loss: 0.12351462219197017


Validation Epoch 10: 100%|██████████| 8/8 [00:00<00:00, 12.26it/s]

Epoch 10 Validation Loss: 0.027320016466546804





In [14]:
def generate_api_command(model, tokenizer, text, device, max_length=50):
    """Generate the output string the model produces for ``text``.

    Args:
        model: Object providing ``eval()`` and ``generate(...)``.
        tokenizer: Object providing ``encode(...)`` and ``decode(...)``.
        text: Input sentence to translate.
        device: Device the encoded input is moved to.
        max_length: ``max_length`` value passed to ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    model.eval()

    encoded_prompt = tokenizer.encode(text, return_tensors='pt')
    encoded_prompt = encoded_prompt.to(device)

    # Beam search with 5 beams, stopping early.
    generation_kwargs = dict(
        input_ids=encoded_prompt,
        max_length=max_length,
        num_beams=5,
        early_stopping=True,
    )
    with torch.no_grad():
        candidates = model.generate(**generation_kwargs)

    return tokenizer.decode(candidates[0], skip_special_tokens=True)


In [15]:
# Hand-written sentences used to spot-check the trained model.
test_sentences = [
    "Please turn off the kitchen lights",
    "Set the thermostat to 72 degrees",
    "Lock all the doors",
    "Open the garage door",
    "Play some jazz music in the living room",
    "Please lock the front door",
]

# Print each sentence, the generated command and a separator line.
for sentence in test_sentences:
    api_command = generate_api_command(model, tokenizer, sentence, device)
    print(
        f"Input: {sentence}",
        f"API Command: {api_command}",
        "-" * 50,
        sep="\n",
    )


Input: Please turn off the kitchen lights
API Command: turn_off.kitchen_lights
--------------------------------------------------
Input: Set the thermostat to 72 degrees
API Command: set.thermostat.72
--------------------------------------------------
Input: Lock all the doors
API Command: lock.doors
--------------------------------------------------
Input: Open the garage door
API Command: open.garage_door
--------------------------------------------------
Input: Play some jazz music in the living room
API Command: play.music.jazz.in_the_living_room
--------------------------------------------------
Input: Please lock the front door
API Command: lock.front_door
--------------------------------------------------


In [16]:
# Save the model and the tokenizer into the same 'text_to_api_model' directory.
model.save_pretrained('text_to_api_model')
tokenizer.save_pretrained('text_to_api_model')

('text_to_api_model/tokenizer_config.json',
 'text_to_api_model/special_tokens_map.json',
 'text_to_api_model/spiece.model',
 'text_to_api_model/added_tokens.json')