In [None]:
# text to sql

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the text-to-SQL seq2seq model and its tokenizer from the HuggingFace Hub
TEXT_TO_SQL_CHECKPOINT = "tscholak/3vnuv1vf"
tokenizer = AutoTokenizer.from_pretrained(TEXT_TO_SQL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(TEXT_TO_SQL_CHECKPOINT)

# Tokenize the input text (format: question | db_id | table: column, column, ...)
input_text = "list names of film released in 2018 and rating more than 6? | IMDB | Movie: rating, year, title"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Generate the output with beam search
output = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)

# Decode the best beam back to text
output_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(output_text)
# Output: IMDB | select title from movie where rating > 6 and year = 2018

# Split the output into (db_id, sql). Only the FIRST "|" is a separator:
# splitting on every "|" would cut off SQL that itself contains "|"
# (e.g. the "||" concatenation operator).
output_text = output_text.split("|", 1)
# Taking the last element also covers an output without a "db_id |" prefix,
# where the whole decoded text is treated as the SQL instead of raising IndexError.
sql = output_text[-1].strip()

# print the sql
print(sql)


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import sqlite3
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Mean Pooling - Take attention mask into account for correct averaging


def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dim 1, ignoring masked-out positions.

    Args:
        model_output: Indexable whose first element is the tensor of token
            embeddings (presumably (batch, seq_len, hidden) -- confirm with caller).
        attention_mask: Tensor with one entry per token; it gets a trailing
            axis added and is broadcast to the embeddings' size, so positions
            with value 0 contribute nothing to the average.

    Returns:
        The mask-weighted sum of the embeddings over dim 1 divided by the sum
        of the mask over dim 1 (clamped to at least 1e-9 to avoid dividing by
        zero when every position is masked).
    """
    embeddings = model_output[0]
    # Broadcast the mask across the embedding axis so it can weight each value.
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    weighted_total = (embeddings * mask).sum(1)
    mask_total = mask.sum(1).clamp(min=1e-9)
    return weighted_total / mask_total


# Load the sentence-embedding tokenizer and encoder from the HuggingFace Hub.
# NOTE(review): this rebinds the module-level `tokenizer` and `model`, replacing
# the seq2seq text-to-SQL pair loaded in the first cell. `get_sql` further down
# calls `model.generate(...)` on whatever `model` currently is -- confirm which
# model it is meant to use when the cells are run top to bottom.
EMBEDDING_CHECKPOINT = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_CHECKPOINT)
model = AutoModel.from_pretrained(EMBEDDING_CHECKPOINT)

# Query sentence
query_sentence = input("Enter question: ")

# Connect to database and fetch table names and column names.
# The connection is deliberately left open: `conn` / `cursor` stay available
# to any later cell that wants to run SQL against the same database.
conn = sqlite3.connect('/content/Db-IMDB.db')
cursor = conn.cursor()


def _quote_identifier(identifier):
    """Return `identifier` as a double-quoted SQLite identifier.

    Embedded double quotes are doubled. Quoting lets table/column names that
    contain spaces or collide with SQL keywords (for example a column named
    "index") be used in the statements below without a syntax error.
    """
    return '"' + str(identifier).replace('"', '""') + '"'


# Get table names
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
table_names = [row[0] for row in cursor.fetchall()]

# generate string in this format: query_sentence | database_name | table_name1: column_name1(value1, value2, value3, ...), column_name2(value1, value2, value3, ...), ... | table_name2: column_name1(value1, value2, value3, ...), column_name2(value1, value2, value3, ...), ... | ...
# NOTE(review): every distinct value of every column is inlined, so the prompt
# grows with the size of the database.
table_sections = []
for table_name in table_names:
    quoted_table = _quote_identifier(table_name)
    cursor.execute("PRAGMA table_info(" + quoted_table + ")")
    # The column name is the second field of each PRAGMA table_info row.
    column_names = [row[1] for row in cursor.fetchall()]

    column_sections = []
    for column_name in column_names:
        cursor.execute("SELECT DISTINCT " + _quote_identifier(column_name) + " FROM " + quoted_table)
        values = [str(row[0]) for row in cursor.fetchall()]
        # Joining (instead of appending ", " and slicing the last two characters
        # off) keeps "column()" intact when the column has no rows at all.
        column_sections.append(column_name + "(" + ", ".join(values) + ")")

    table_sections.append(table_name + ": " + ", ".join(column_sections))

# Every table section is followed by " | ", matching the original layout.
input_text = query_sentence + " | IMDB | " + "".join(section + " | " for section in table_sections)

print(input_text)

In [None]:
# The model input string has this format: [question] | [db_id] | [table] : [column] ( [content] , [content] ) , [column] ( ... ) , [...] | [table] : ... | ...

def get_sql(query):
    """Generate a SQL string for `query` with the module-level model.

    Uses the module-level `tokenizer` and `model` (NOTE(review): these must be
    the seq2seq text-to-SQL pair at call time -- confirm they have not been
    rebound to another model by an earlier cell).

    Args:
        query: Text handed to `tokenizer.encode`, e.g.
            "question | db_id | table: column, column".

    Returns:
        The text after the first "|" of the decoded output, stripped of
        surrounding whitespace. If the decoded output contains no "|", the
        whole decoded text (stripped) is returned.

    Side effects:
        Prints the raw decoded output and the extracted SQL.
    """
    # Tokenize the input text
    input_ids = tokenizer.encode(query, return_tensors="pt")

    # Generate the output with beam search
    output = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)

    # Decode the best beam back to text
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    print(output_text)
    # Output: IMDB | select title from movie where rating > 6 and year = 2018

    # Split off the leading db_id on the FIRST "|" only, so SQL containing "|"
    # (e.g. the "||" concatenation operator) is not cut short. Taking the last
    # piece also handles output without any "|" instead of raising IndexError.
    sql = output_text.split("|", 1)[-1].strip()

    # print the sql
    print(sql)
    return sql