In [1]:
!pip install transformers



In [2]:
import boto3
import pandas as pd
import json
import matplotlib.pyplot as plt
import time
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import os
from huggingface_hub import login

## load questions and sql queries in spider train

In [3]:
# Define your S3 bucket and file key
bucket_name = 'sagemaker-studio-423623869859-3no3d9ie4hx'
file_key = 'train_spider.json'  # replace with your actual file path

# Initialize the S3 client
s3_client = boto3.client('s3')

# Load the file from S3
obj = s3_client.get_object(Bucket=bucket_name, Key=file_key)
json_train = json.load(obj['Body'])

# Load questions and answers into a pandas dataframe
pd.set_option('display.max_colwidth', None)

list_questions_and_sqls = []

for ele in json_train:
    list_questions_and_sqls.append([ele['question'], ele['query']])
    
df_questions_and_sqls = pd.DataFrame(list_questions_and_sqls, columns=['question', 'sql_query'])

## falcon-7b entity extraction

In [4]:
# Define model identifier from Hugging Face
model_id_falcon = "tiiuae/falcon-7b"

# Load the tokenizer and model
tokenizer_falcon = AutoTokenizer.from_pretrained(model_id_falcon)
model_falcon = AutoModelForCausalLM.from_pretrained(model_id_falcon, torch_dtype=torch.float16).to("cuda")

# Create a text generation pipeline
pipe_falcon = pipeline("text-generation", model=model_falcon, tokenizer=tokenizer_falcon, device=0)

def extract_entities_falcon(question):
    # Define a batch of input prompts
    beg_time = time.time()

    # Generate text for each input in the batch
    outputs = pipe_falcon(question, max_length=256, num_return_sequences=1, do_sample=True)

    # Print the generated outputs for each input
    #print(f"Input: {question}")
    #print(f"Output: {outputs[0]['generated_text']}\n")

    end_time = time.time()
    print('total time spent is {}'.format(end_time-beg_time))
    return outputs[0]['generated_text']

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## llama 3.2 3B

In [5]:
# # Enter your Hugging Face token here
# login("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")

In [6]:
# # Define model identifier and access token
# model_id = "meta-llama/Llama-3.2-3B"

# # Load the tokenizer and model using the access token
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

# def extract_entities_llama(question):
#     beg_time = time.time()

#     # Tokenize the input
#     inputs = tokenizer(question, return_tensors="pt").to("cuda")

#     # Generate text
#     with torch.no_grad():
#         outputs = model.generate(
#             **inputs,
#             max_length=512
#         )

#     # Decode and print the output
#     generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     end_time = time.time()
#     print('total time spent is {} seconds'.format(end_time - beg_time))
#     #print(question)
#     #print(generated_text)
    
#     return generated_text

In [7]:
def question_map(question):
    
    returned_question = f"""The task is to extract the entities from the question \
    enclosed in double quotes at the end of this paragraph. \
    Only extract entities that are helpful to identify \
    relevant table names and column names in a database. \
    The table names and column names will be used in a SQL query. \
    Return your answer in a python dictionary whose keys are \
    entities_for_tables and entities_for_columns. \
    The value for each key will be a list of extracted entities. \
    Return the python dictionary only without anything extra. \
    For example, if the question is "How many heads of the departments are older than 56?", \
    the returned python dictionary should be \
    "{{\"entities_for_tables\": [\"departments\"], \"entities_for_columns\": [\"heads\", \"age\"]}}". \
    Now here is the question that you should extract entities from: \"{question}\"\
    """

    return returned_question

df_questions_and_sqls['question_for_entity_extraction'] = df_questions_and_sqls['question'].apply(question_map)

In [8]:
df_questions_and_sqls.head(5)

Unnamed: 0,question,sql_query,question_for_entity_extraction
0,How many heads of the departments are older than 56 ?,SELECT count(*) FROM head WHERE age > 56,"The task is to extract the entities from the question enclosed in double quotes at the end of this paragraph. Only extract entities that are helpful to identify relevant table names and column names in a database. The table names and column names will be used in a SQL query. Return your answer in a python dictionary whose keys are entities_for_tables and entities_for_columns. The value for each key will be a list of extracted entities. Return the python dictionary only without anything extra. For example, if the question is ""How many heads of the departments are older than 56?"", the returned python dictionary should be ""{""entities_for_tables"": [""departments""], ""entities_for_columns"": [""heads"", ""age""]}"". Now here is the question that you should extract entities from: ""How many heads of the departments are older than 56 ?"""
1,"List the name, born state and age of the heads of departments ordered by age.","SELECT name , born_state , age FROM head ORDER BY age","The task is to extract the entities from the question enclosed in double quotes at the end of this paragraph. Only extract entities that are helpful to identify relevant table names and column names in a database. The table names and column names will be used in a SQL query. Return your answer in a python dictionary whose keys are entities_for_tables and entities_for_columns. The value for each key will be a list of extracted entities. Return the python dictionary only without anything extra. For example, if the question is ""How many heads of the departments are older than 56?"", the returned python dictionary should be ""{""entities_for_tables"": [""departments""], ""entities_for_columns"": [""heads"", ""age""]}"". Now here is the question that you should extract entities from: ""List the name, born state and age of the heads of departments ordered by age."""
2,"List the creation year, name and budget of each department.","SELECT creation , name , budget_in_billions FROM department","The task is to extract the entities from the question enclosed in double quotes at the end of this paragraph. Only extract entities that are helpful to identify relevant table names and column names in a database. The table names and column names will be used in a SQL query. Return your answer in a python dictionary whose keys are entities_for_tables and entities_for_columns. The value for each key will be a list of extracted entities. Return the python dictionary only without anything extra. For example, if the question is ""How many heads of the departments are older than 56?"", the returned python dictionary should be ""{""entities_for_tables"": [""departments""], ""entities_for_columns"": [""heads"", ""age""]}"". Now here is the question that you should extract entities from: ""List the creation year, name and budget of each department."""
3,What are the maximum and minimum budget of the departments?,"SELECT max(budget_in_billions) , min(budget_in_billions) FROM department","The task is to extract the entities from the question enclosed in double quotes at the end of this paragraph. Only extract entities that are helpful to identify relevant table names and column names in a database. The table names and column names will be used in a SQL query. Return your answer in a python dictionary whose keys are entities_for_tables and entities_for_columns. The value for each key will be a list of extracted entities. Return the python dictionary only without anything extra. For example, if the question is ""How many heads of the departments are older than 56?"", the returned python dictionary should be ""{""entities_for_tables"": [""departments""], ""entities_for_columns"": [""heads"", ""age""]}"". Now here is the question that you should extract entities from: ""What are the maximum and minimum budget of the departments?"""
4,What is the average number of employees of the departments whose rank is between 10 and 15?,SELECT avg(num_employees) FROM department WHERE ranking BETWEEN 10 AND 15,"The task is to extract the entities from the question enclosed in double quotes at the end of this paragraph. Only extract entities that are helpful to identify relevant table names and column names in a database. The table names and column names will be used in a SQL query. Return your answer in a python dictionary whose keys are entities_for_tables and entities_for_columns. The value for each key will be a list of extracted entities. Return the python dictionary only without anything extra. For example, if the question is ""How many heads of the departments are older than 56?"", the returned python dictionary should be ""{""entities_for_tables"": [""departments""], ""entities_for_columns"": [""heads"", ""age""]}"". Now here is the question that you should extract entities from: ""What is the average number of employees of the departments whose rank is between 10 and 15?"""


In [11]:
asdf = extract_entities_falcon(df_questions_and_sqls.loc[10, 'question_for_entity_extraction'])
asdf

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


total time spent is 3.1597280502319336


'The task is to extract the entities from the question     enclosed in double quotes at the end of this paragraph.     Only extract entities that are helpful to identify     relevant table names and column names in a database.     The table names and column names will be used in a SQL query.     Return your answer in a python dictionary whose keys are     entities_for_tables and entities_for_columns.     The value for each key will be a list of extracted entities.     Return the python dictionary only without anything extra.     For example, if the question is "How many heads of the departments are older than 56?",     the returned python dictionary should be     "{"entities_for_tables": ["departments"], "entities_for_columns": ["heads", "age"]}".     Now here is the question that you should extract entities from: "How many acting statuses are there?"     The answer from the question is {"entities_for_tables":[ "actings", "actresses", "actors" ], "entities_for_columns": ["status"]}".\n