In [91]:
import boto3
import pandas as pd
import json
import matplotlib.pyplot as plt
import time
import re

## check extracted entities from questions by Claude Sonnet 3.5

### make sure no questions are missed and all questions non-empty extracted entities

In [42]:
# Define your S3 bucket and file key
bucket_name = 'sagemaker-studio-423623869859-3no3d9ie4hx'
file_key = 'extracted_entities_in_questions_train.json'  # replace with your actual file path

# Initialize the S3 client
s3_client = boto3.client('s3')

# Load the file from S3
obj = s3_client.get_object(Bucket=bucket_name, Key=file_key)
questions_entities = json.load(obj['Body'])

In [43]:
len(questions_entities)

6962

In [5]:
question = list(questions_entities.keys())

In [6]:
question[0]

'How many heads of the departments are older than 56 ?'

In [8]:
questions_entities[question[0]]

{'entities_for_tables': ['departments'],
 'entities_for_columns': ['heads', 'age']}

In [9]:
type(questions_entities[question[0]])

dict

In [50]:
for key in questions_entities:
    if questions_entities[key] == {}:
        print(questions_entities[key])

### make sure all questions are the same as the questions in the original dataset

In [20]:
obj = s3_client.get_object(Bucket=bucket_name, Key='train_spider.json')
spider_train = json.load(obj['Body'])

In [24]:
original_questions = set()
for ele in spider_train:
    original_questions.add(ele['question'])

In [47]:
for question in original_questions:
    if question not in questions_entities:
        print(question)

In [49]:
len(original_questions)

6962

## collect all things to be embedded by BERT

### questions and extracted entities from questions

In [184]:
questions, entities = {}, {}

In [185]:
for question in questions_entities:
    
    questions[question] = None
    
    for key in questions_entities[question]:
        for entity in questions_entities[question][key]:
            entities[entity] = None
        
print(len(questions))
print(len(entities))

6962
3572


### database names, table names, column names

In [161]:
import boto3
import json
import re

# Initialize S3 client
s3 = boto3.client('s3')

# S3 bucket and folder
bucket_name = 'sagemaker-studio-423623869859-3no3d9ie4hx'
base_folder = 'database/'

# Regular expression to identify CREATE TABLE structures with matching brackets
table_regex = re.compile(r'CREATE TABLE ["`]?(\w+)["`]?\s*\((.*?)\);', re.DOTALL | re.IGNORECASE)

# Helper function to parse column names from table content
def parse_columns(content):
    # Remove everything starting from "PRIMARY KEY" or similar constraints
    primary_key_start = content.upper().find("PRIMARY KEY")
    if primary_key_start != -1:
        content = content[:primary_key_start]
    
    # Split by commas and take the first word as the column name
    columns = []
    segments = content.split(',')
    for segment in segments:
        words = segment.strip().split()
        if words:
            column_name = words[0].strip('`"')  # Remove any quotes around column name
            columns.append(column_name)
    return columns

# Function to find and parse all CREATE TABLE definitions
def extract_table_definitions(sql_content):
    tables = {}
    # Find all matches for CREATE TABLE structures
    for match in table_regex.finditer(sql_content):
        table_name = match.group(1)
        content = match.group(2)

        # Parse columns from content and store in the dictionary
        columns = parse_columns(content)
        tables[table_name] = columns
    return tables

# S3 interaction: Fetch all SQL content from schema files, parse, and save the results
def get_schema_files_and_parse(bucket, folder):
    result = s3.list_objects_v2(Bucket=bucket, Prefix=folder)
    all_schemas = {}

    for item in result.get('Contents', []):
        if item['Key'].endswith('schema.sql'):
            # Download the SQL content from S3
            sql_content = s3.get_object(Bucket=bucket, Key=item['Key'])['Body'].read().decode('utf-8')
            
            # Parse tables and columns
            schema = extract_table_definitions(sql_content)
            
            # Extract the subfolder name and add schema with placeholder for subfolder name
            subfolder_name = item['Key'].split('/')[-2]
            all_schemas[subfolder_name] = schema or {}

            # Print the subfolder name after parsing is complete
            print(f"Parsing completed for subfolder: {subfolder_name}")

    # Save the result as a JSON file and upload to S3
    output_key = 'database_parsed_schemas.json'
    output_json = json.dumps(all_schemas, indent=4)
    s3.put_object(Bucket=bucket, Key=output_key, Body=output_json)
    print(f"Parsed schemas saved to s3://{bucket}/{output_key}")

# Run the function
get_schema_files_and_parse(bucket_name, base_folder)


Parsing completed for subfolder: academic
Parsing completed for subfolder: activity_1
Parsing completed for subfolder: aircraft
Parsing completed for subfolder: allergy_1
Parsing completed for subfolder: apartment_rentals
Parsing completed for subfolder: architecture
Parsing completed for subfolder: assets_maintenance
Parsing completed for subfolder: baseball_1
Parsing completed for subfolder: battle_death
Parsing completed for subfolder: behavior_monitoring
Parsing completed for subfolder: bike_1
Parsing completed for subfolder: body_builder
Parsing completed for subfolder: book_2
Parsing completed for subfolder: browser_web
Parsing completed for subfolder: candidate_poll
Parsing completed for subfolder: cinema
Parsing completed for subfolder: city_record
Parsing completed for subfolder: climbing
Parsing completed for subfolder: club_1
Parsing completed for subfolder: coffee_shop
Parsing completed for subfolder: college_3
Parsing completed for subfolder: company_employee
Parsing compl

In [94]:
# folder_index = 100
# table_index = 0
# column_index = 3

# folders = list(all_schemas.keys())
# folder = folders[folder_index]
# print(folders)
# print(folder)
# print('\n')

# tables = list(all_schemas[folder].keys())
# table = tables[table_index]
# print(tables)
# print(table)
# print('\n')

# columns = all_schemas[folder][table]
# column = columns[column_index]
# print(columns)
# print(column)

### inspect all columns

In [162]:
# Extract unique subfolder names
subfolders = set()
for item in response.get('Contents', []):
    # Get the key and extract the subfolder name
    key = item['Key']
    parts = key.split('/')
    if len(parts) > 2:  # Ensure there's a subfolder level
        subfolders.add(parts[1])  # Add the subfolder name to the set

# Count the number of unique subfolders
total_subfolders = len(subfolders)
print(f"Total number of unique subfolders: {total_subfolders}")

Total number of unique subfolders: 166


In [163]:
len(all_schemas)

166

In [164]:
for subfolder, tables in all_schemas.items():
    print(f"Subfolder: {subfolder}")
    for table, columns in tables.items():
        print(f"  Table: {table}")
        for column in columns:
            print(f"    Column: {column}")

Subfolder: academic
  Table: author
    Column: homepage
    Column: name
    Column: oid
  Table: conference
    Column: homepage
    Column: name
  Table: domain
    Column: name
  Table: domain_author
    Column: did
  Table: domain_conference
    Column: did
  Table: journal
    Column: jid
    Column: name
  Table: domain_journal
    Column: jid
  Table: keyword
    Column: kid
  Table: domain_keyword
    Column: kid
  Table: publication
    Column: cid
    Column: citation_num
    Column: jid
    Column: pid
    Column: reference_num
    Column: title
    Column: year
  Table: domain_publication
    Column: pid
  Table: organization
    Column: homepage
    Column: name
    Column: oid
  Table: publication_keyword
    Column: kid
  Table: writes
    Column: pid
  Table: cite
    Column: citing
Subfolder: activity_1
  Table: Activity
  Table: Participates_in
    Column: actid
  Table: Faculty_Participates_in
    Column: actid
  Table: Student
  Table: Faculty
Subfolder: aircraft
 

## upload results back to S3

In [186]:
import boto3
import json

# Initialize the S3 client
s3 = boto3.client('s3')

# S3 bucket details
bucket_name = 'sagemaker-studio-423623869859-3no3d9ie4hx'

# Assume all_schemas already exists, structured as {database_name: {table_name: [column_names]}}
# Initialize dictionaries for databases, tables, and columns
databases = {database_name: None for database_name in all_schemas.keys()}
tables = {f"{database_name}.{table_name}": None for database_name, tables in all_schemas.items() for table_name in tables.keys()}
columns = {f"{database_name}.{table_name}.{column_name}": None for database_name, tables in all_schemas.items() for table_name, column_names in tables.items() for column_name in column_names}

# Convert the dictionaries to JSON format
databases_json = json.dumps(databases, indent=4)
tables_json = json.dumps(tables, indent=4)
columns_json = json.dumps(columns, indent=4)

# Define S3 keys for each file
s3_keys = {
    "databases.json": databases_json,
    "tables.json": tables_json,
    "columns.json": columns_json
}

# Save each dictionary to the S3 bucket as a JSON file
for filename, data in s3_keys.items():
    s3.put_object(Bucket=bucket_name, Key=filename, Body=data)
    print(f"{filename} saved to s3://{bucket_name}/{filename}")


databases.json saved to s3://sagemaker-studio-423623869859-3no3d9ie4hx/databases.json
tables.json saved to s3://sagemaker-studio-423623869859-3no3d9ie4hx/tables.json
columns.json saved to s3://sagemaker-studio-423623869859-3no3d9ie4hx/columns.json


In [187]:
len(databases)

166

In [188]:
len(tables)

751

In [189]:
len(columns)

2143

In [190]:
len(questions)

6962

In [191]:
len(entities)

3572