In [1]:
import boto3
import pandas as pd
import json
import matplotlib.pyplot as plt
import time
import re

## Check the entities that Claude 3.5 Sonnet extracted from the questions

In [2]:
# S3 location of the entity-extraction results (one JSON object keyed by question text)
bucket_name = 'sagemaker-studio-423623869859-3no3d9ie4hx'
file_key = 'extracted_entities_in_questions_train.json'  # replace with your actual file path

# S3 client used by the cells below that read from the bucket
s3_client = boto3.client('s3')

# Download the object and parse its streaming body as JSON.
# `questions_entities` maps each question to its extracted entities.
obj = s3_client.get_object(Bucket=bucket_name, Key=file_key)
questions_entities = json.load(obj['Body'])

In [3]:
# Number of questions (top-level keys) in the entity-extraction JSON
len(questions_entities)

6962

In [4]:
# Question texts, in the order the keys appear in the loaded JSON object
questions = list(questions_entities.keys())

In [5]:
# Peek at the first question
questions[0]

'How many heads of the departments are older than 56 ?'

In [6]:
# Entities extracted for the first question
questions_entities[questions[0]]

{'entities_for_tables': ['departments'],
 'entities_for_columns': ['heads', 'age']}

In [7]:
# Container type holding the first question's extracted entities
type(questions_entities[questions[0]])

dict

In [8]:
# Sanity check: list every question whose entity extraction came back empty.
# Print the question itself (the key); the value is just `{}` and would not
# tell us which question needs to be re-processed.
for key, extracted in questions_entities.items():
    if extracted == {}:
        print(key)

## The original training data contains duplicated questions, and some of them have different SQL query answers for the same question

In [9]:
# Load the Spider training examples from the same bucket
obj = s3_client.get_object(Bucket=bucket_name, Key='train_spider.json')
spider_train = json.load(obj['Body'])

# Group every gold query under its question text (first-seen question order is kept)
question_query = {}
for dic in spider_train:
    question, query = dic['question'], dic['query']
    question_query.setdefault(question, []).append(query)

# Report each question that occurs more than once, with all of its queries
for question, list_of_queries in question_query.items():
    if len(list_of_queries) <= 1:
        continue
    print(question)
    print(len(list_of_queries))
    print(list_of_queries)

How many users are there?
2
['SELECT count(*) FROM user_profiles', 'SELECT count(*) FROM useracct']
How many employees do we have?
3
['SELECT count(*) FROM Employee', 'SELECT count(*) FROM Employees;', 'SELECT count(*) FROM Employees']
How many students are there?
4
['SELECT count(*) FROM Student', 'SELECT count(*) FROM list', 'SELECT count(*) FROM student', 'SELECT count(*) FROM student']
How many students does each advisor have?
2
['SELECT advisor ,  count(*) FROM Student GROUP BY advisor', 'SELECT Advisor ,  count(*) FROM STUDENT GROUP BY Advisor']
How many accounts do we have?
2
['SELECT count(*) FROM Accounts', 'SELECT count(*) FROM Accounts']
Count the number of accounts.
3
['SELECT count(*) FROM Accounts', 'SELECT count(*) FROM accounts', 'SELECT count(*) FROM Accounts']
How many customers do we have?
2
['SELECT count(*) FROM Customers', 'SELECT count(*) FROM CUSTOMERS']
Count the number of customers.
3
['SELECT count(*) FROM Customers', 'SELECT count(*) FROM Customers', 'SELECT

In [10]:
# Number of distinct question texts in the Spider training file
len(question_query)

6962

In [11]:
# Number of questions with extracted entities, for comparison with the distinct-question count
len(questions_entities)

6962

### Make sure the questions in the entity extraction task are the original questions

In [12]:
# True only when both dicts hold exactly the same questions in the same order
list(questions_entities.keys()) == list(question_query.keys())

True

## Collect everything that will be embedded by BERT

### questions and extracted entities from questions

In [13]:
# Dicts with None values are used as insertion-ordered sets of unique strings.
questions = dict.fromkeys(questions_entities)

entities = {}
for extracted in questions_entities.values():
    # `extracted` groups entity lists by kind (e.g. tables / columns)
    for entity_list in extracted.values():
        entities.update(dict.fromkeys(entity_list))

print(len(questions))
print(len(entities))

6962
3572


### Get database names, table names, and schemas for each table. For each schema file, identify the "create table table_name (...)" structure.

In [14]:
import boto3
import json
import re

# Initialize S3 client
# NOTE(review): this creates a second client named `s3`; the data-loading cell
# near the top already created `s3_client`. The functions in this cell use `s3`.
s3 = boto3.client('s3')

# S3 bucket and folder
# `bucket_name` repeats the value assigned in the data-loading cell.
bucket_name = 'sagemaker-studio-423623869859-3no3d9ie4hx'
# Key prefix under which the per-database subfolders live
base_folder = 'database/'

# Regular expression that captures (table_name, body) for each CREATE TABLE statement.
# It does not balance parentheses: the body is the shortest text that ends at a
# closing ")" followed by optional whitespace and a ";".
# Accepted variations: any whitespace between keywords, an optional
# IF NOT EXISTS clause, and table names that are bare or wrapped in
# double quotes, backticks, single quotes or [square brackets].
table_regex = re.compile(
    r'''CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?["`'\[]?(\w+)["`'\]]?\s*\((.*?)\)\s*;''',
    re.DOTALL | re.IGNORECASE,
)


def extract_table_definitions(sql_content):
    """Extract every CREATE TABLE statement from a SQL schema script.

    Args:
        sql_content: Full text of a schema file.

    Returns:
        dict mapping table name -> normalized definition string of the form
        "CREATE TABLE <name> (<body>);". The body is copied verbatim from the
        source. If a table name occurs more than once, the last definition wins.
        Empty input yields an empty dict.
    """
    if not sql_content:
        return {}

    tables = {}
    for match in table_regex.finditer(sql_content):
        table_name, body = match.group(1), match.group(2)
        # Re-emit in one canonical shape regardless of the original quoting/casing
        tables[table_name] = f"CREATE TABLE {table_name} ({body});"
    return tables

# S3 interaction: Fetch all SQL content from schema files, parse, and save the results
def get_schema_files_and_parse(bucket, folder):
    """Parse every ``schema.sql`` under an S3 prefix and upload the combined result.

    Args:
        bucket: Name of the S3 bucket to read from and write to.
        folder: Key prefix to scan (e.g. ``'database/'``).

    Side effects:
        Writes ``database_parsed_schemas.json`` to the root of ``bucket``. The
        JSON maps each schema file's parent folder name to a dict of
        ``{table_name: definition}`` as returned by ``extract_table_definitions``.
        Prints the destination URI. Returns None.
    """
    all_schemas = {}

    # A single list_objects_v2 call returns at most 1,000 keys, so walk every
    # page; otherwise schema files beyond the first page are silently skipped.
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=folder):
        for item in page.get('Contents', []):
            key = item['Key']
            if not key.endswith('schema.sql'):
                continue

            # Download the SQL content from S3
            sql_content = s3.get_object(Bucket=bucket, Key=key)['Body'].read().decode('utf-8')

            # Parse tables and their definitions
            schema = extract_table_definitions(sql_content)

            # The parent folder of the schema file is used as the database name
            subfolder_name = key.split('/')[-2]
            all_schemas[subfolder_name] = schema or {}

    # Save the result as a JSON file and upload to S3
    output_key = 'database_parsed_schemas.json'
    output_json = json.dumps(all_schemas, indent=4)
    s3.put_object(Bucket=bucket, Key=output_key, Body=output_json)
    print(f"Parsed schemas saved to s3://{bucket}/{output_key}")

# Run the function: parses the schema.sql files under `base_folder` and
# uploads the combined JSON back to the same bucket.
get_schema_files_and_parse(bucket_name, base_folder)

Parsed schemas saved to s3://sagemaker-studio-423623869859-3no3d9ie4hx/database_parsed_schemas.json


In [15]:
# folder_index = 100
# table_index = 0
# column_index = 3

# folders = list(all_schemas.keys())
# folder = folders[folder_index]
# print(folders)
# print(folder)
# print('\n')

# tables = list(all_schemas[folder].keys())
# table = tables[table_index]
# print(tables)
# print(table)
# print('\n')

# columns = all_schemas[folder][table]
# column = columns[column_index]
# print(columns)
# print(column)

### There are 166 subfolders but only 148 of them have schema.sql files

In [16]:
folder_prefix = 'database/'  # Specify the folder within the bucket

def count_subfolders(bucket, prefix):
    """Count the immediate sub-folders under ``prefix`` in ``bucket``.

    Listing with ``Delimiter='/'`` makes S3 roll keys up into
    ``CommonPrefixes``, one entry per direct sub-folder.

    Args:
        bucket: Name of the S3 bucket.
        prefix: Key prefix to look under (should end with '/').

    Returns:
        int: number of direct sub-folders; 0 if there are none.
    """
    # Each list_objects_v2 response is capped at 1,000 entries, so add up the
    # common prefixes across all pages instead of trusting a single response.
    paginator = s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/')
    return sum(len(page.get('CommonPrefixes', [])) for page in pages)

# Count and print the number of subfolders
total_subfolders = count_subfolders(bucket_name, folder_prefix)
print(f"Total number of sub-folders in '{bucket_name}' under '{folder_prefix}': {total_subfolders}")

Total number of sub-folders in 'sagemaker-studio-423623869859-3no3d9ie4hx' under 'database/': 166


### Inspect parsed schemas

In [17]:
file_key = 'database_parsed_schemas.json'  # replace with your actual file path

# Read the parsed-schema JSON back from S3
obj = s3_client.get_object(Bucket=bucket_name, Key=file_key)
all_schemas = json.load(obj['Body'])

# How many databases (subfolders) to show
num_databases_to_print = 1

# Print the first `num_databases_to_print` databases with each table's definition
first_databases = list(all_schemas.items())[:num_databases_to_print]
for subfolder, schemas in first_databases:
    print(f"Subfolder: {subfolder}")
    for table_name, table_definition in schemas.items():
        print(f"  Table: {table_name}")
        print(f"    Definition: {table_definition}")


Subfolder: academic
  Table: author
    Definition: CREATE TABLE author (
"aid" int,
"homepage" text,
"name" text,
"oid" int,
primary key("aid")
);
  Table: conference
    Definition: CREATE TABLE conference (
"cid" int,
"homepage" text,
"name" text,
primary key ("cid")
);
  Table: domain
    Definition: CREATE TABLE domain (
"did" int,
"name" text,
primary key ("did")
);
  Table: domain_author
    Definition: CREATE TABLE domain_author (
"aid" int, 
"did" int,
primary key ("did", "aid"),
foreign key("aid") references `author`("aid"),
foreign key("did") references `domain`("did")
);
  Table: domain_conference
    Definition: CREATE TABLE domain_conference (
"cid" int,
"did" int,
primary key ("did", "cid"),
foreign key("cid") references `conference`("cid"),
foreign key("did") references `domain`("did")
);
  Table: journal
    Definition: CREATE TABLE journal (
"homepage" text,
"jid" int,
"name" text,
primary key("jid")
);
  Table: domain_journal
    Definition: CREATE TABLE domain_journ

In [18]:
# Flatten the parsed schemas into plain lists (table names may repeat across databases)
list_of_columns = []  # initialized here but not populated in this cell
list_of_databases = list(all_schemas)
list_of_tables = [table for database in all_schemas for table in all_schemas[database]]

# Totals versus distinct names
print('total number of databases: {}'.format(len(list_of_databases)))
print('total number of unique databases: {}'.format(len(set(list_of_databases))))
print('total number of tables: {}'.format(len(list_of_tables)))
print('total number of unique tables: {}'.format(len(set(list_of_tables))))

total number of databases: 148
total number of unique databases: 148
total number of tables: 749
total number of unique tables: 575


### Save databases, tables, columns in 3 separate json files and upload to S3

In [19]:
# import boto3
# import json

# # Initialize the S3 client
# s3 = boto3.client('s3')

# # S3 bucket details
# bucket_name = 'sagemaker-studio-423623869859-3no3d9ie4hx'

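# # Assume all_schemas already exists. NOTE: as loaded from database_parsed_schemas.json it is structured as {database_name: {table_name: "CREATE TABLE ..." definition string}}, not {table_name: [column_names]}, so column names must be parsed out of each definition before building `columns` below.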
# # Initialize dictionaries for databases, tables, and columns
# databases = {database_name: None for database_name in all_schemas.keys()}
# tables = {f"{database_name}.{table_name}": None for database_name, tables in all_schemas.items() for table_name in tables.keys()}
# columns = {f"{database_name}.{table_name}.{column_name}": None for database_name, tables in all_schemas.items() for table_name, column_names in tables.items() for column_name in column_names}

# # Convert the dictionaries to JSON format
# databases_json = json.dumps(databases, indent=4)
# tables_json = json.dumps(tables, indent=4)
# columns_json = json.dumps(columns, indent=4)

# # Define S3 keys for each file
# s3_keys = {
#     "databases.json": databases_json,
#     "tables.json": tables_json,
#     "columns.json": columns_json
# }

# # Save each dictionary to the S3 bucket as a JSON file
# for filename, data in s3_keys.items():
#     s3.put_object(Bucket=bucket_name, Key=filename, Body=data)
#     print(f"{filename} saved to s3://{bucket_name}/{filename}")


In [20]:
# len(databases)

In [21]:
# len(tables)

In [22]:
# len(columns)

In [23]:
# len(questions)

In [24]:
# len(entities)