<a href="https://colab.research.google.com/github/eliazulai29/tensor/blob/main/t5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install SentencePiece


Collecting SentencePiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SentencePiece
Successfully installed SentencePiece-0.1.99


In [3]:
# Step 1: Install the required libraries
!pip install transformers torch pandas

# Step 2: Define a Dataset Class
import pandas as pd
from torch.utils.data import Dataset
from transformers import T5Tokenizer

class NLPtoSQLDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=512):
        self.data = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        question = self.data.iloc[idx]['question']
        sql_query = self.data.iloc[idx]['sql']

        # Tokenizing the input and target texts
        input_encoding = self.tokenizer(
            question,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        target_encoding = self.tokenizer(
            sql_query,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        return {
            'input_ids': input_encoding.input_ids.flatten(),
            'attention_mask': input_encoding.attention_mask.flatten(),
            'labels': target_encoding.input_ids.flatten()
        }

# Step 3: Create a Function to Load Dataset
def load_dataset(csv_file, tokenizer, max_length=512):
    return NLPtoSQLDataset(csv_file, tokenizer, max_length)

# Example usage
tokenizer = T5Tokenizer.from_pretrained('t5-small')
train_dataset = load_dataset('/content/data/val.csv', tokenizer)
val_dataset = load_dataset('/content/data/val.csv', tokenizer)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
import torch
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, AdamW
from tqdm.auto import tqdm

# Step 1: Set Up the Training Environment
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 2: Load the Model
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Step 3: Prepare the Data Loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Step 4: Set Up the Training Loop
optimizer = AdamW(model.parameters(), lr=5e-5)

def train(epoch, model, loader, optimizer):
    model.train()
    total_loss = 0
    for batch in tqdm(loader, desc=f"Training Epoch {epoch}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"Average Training Loss: {total_loss / len(loader)}")

# Step 5: Train the Model
num_epochs = 1  # Set the number of epochs
for epoch in range(num_epochs):
    train(epoch, model, train_loader, optimizer)

# Step 6: Save the Model and Tokenizer
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")

Using device: cuda


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Training Epoch 0:   0%|          | 0/7045 [00:00<?, ?it/s]

Average Training Loss: 0.09672848823426769


('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/spiece.model',
 './trained_model/added_tokens.json')

In [5]:
import os

# Specify the directory
directory = "./trained_model"

# Check if the directory exists
if os.path.exists(directory):
    # List all files in the directory
    files = os.listdir(directory)
    print("Files in 'trained_model' directory:")
    for file in files:
        print(file)
else:
    print("Directory does not exist:", directory)


Files in 'trained_model' directory:
generation_config.json
model.safetensors
special_tokens_map.json
config.json
tokenizer_config.json
added_tokens.json
spiece.model


In [9]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the trained model
model = T5ForConditionalGeneration.from_pretrained("./trained_model", use_safetensors=True)
model = model.to(device)

# Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained("./trained_model")

# Function for inference
def generate_sql_query(question):
    model.eval()
    input_ids = tokenizer.encode(question, return_tensors="pt").to(device)
    with torch.no_grad():
        generated_ids = model.generate(input_ids)
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Example usage
question = "Tell me what the notes are for israel "
sql_query = generate_sql_query(question)
print("Generated SQL Query:", sql_query)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generated SQL Query: SELECT Notes FROM table WHERE Name = israel


In [13]:
question = "How many trc is up and ok"
sql_query = generate_sql_query(question)
print("Generated SQL Query:", sql_query)

Generated SQL Query: SELECT COUNT Trc FROM table WHERE Up = up AND OK


In [10]:
import os
from google.colab import files
import shutil

# Step 1: Compress the trained_model directory
shutil.make_archive('trained_model', 'zip', './trained_model')

# Step 2: Download the zip file
files.download('trained_model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install --upgrade transformers

In [None]:
# from transformers import T5ForConditionalGeneration, T5Tokenizer
# import torch

# # Load the trained model (with SafeTensors)
# model = T5ForConditionalGeneration.from_pretrained("/content/trained_model", use_safetensors=True)

# # Load the tokenizer
# tokenizer = T5Tokenizer.from_pretrained("./trained_model")

# # Set the model to evaluation mode
# model.eval()

# # Function for inference
# def generate_sql_query(question):
#     input_ids = tokenizer.encode(question, return_tensors="pt")
#     with torch.no_grad():
#         generated_ids = model.generate(input_ids)
#     return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# # Example usage
# question = "Your example question here"
# sql_query = generate_sql_query(question)
# print("Generated SQL Query:", sql_query)