In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m111.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.26.2-py3-none-any.whl (447 kB)
Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17

In [2]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-3.3.1


In [3]:
import boto3
import pandas as pd
import json
import matplotlib.pyplot as plt
import time
import torch
import torch.nn.functional as F
import ast
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, AutoModelForTokenClassification, AutoModel
import os
# from huggingface_hub import login
from sentence_transformers import SentenceTransformer


# Make pandas render frames in full: every column, untruncated cell text, no line wrapping
for option_name, option_value in (
    ('display.max_columns', None),         # show all columns
    ('display.max_colwidth', None),        # do not truncate column contents
    ('display.expand_frame_repr', False),  # avoid wrapping to the next line
):
    pd.set_option(option_name, option_value)

Matplotlib is building the font cache; this may take a moment.


## Load questions and SQL queries from the Spider train set

In [4]:
# S3 bucket holding the prepared CSV inputs, and the client used to read them
bucket_name = 'sagemaker-studio-423623869859-3no3d9ie4hx'
s3_client = boto3.client('s3')

# Questions table: fetch the CSV, then turn the list-valued columns
# (stored as Python-literal strings) back into real Python objects
obj = s3_client.get_object(Bucket=bucket_name, Key='df_question_entities_tables.csv')
df_question_entities_tables = pd.read_csv(obj['Body'])
for list_column in ('entities_for_tables', 'entities_for_columns', 'tables', 'entities'):
    df_question_entities_tables[list_column] = df_question_entities_tables[list_column].apply(ast.literal_eval)

# Schema table: plain string columns, no literal parsing needed
obj = s3_client.get_object(Bucket=bucket_name, Key='df_schema_table.csv')
df_schema_table = pd.read_csv(obj['Body'])

In [5]:
# Preview the first three rows of the question/entity table
df_question_entities_tables.head(3)

Unnamed: 0,question,entities_for_tables,entities_for_columns,query,tables,entities
0,How many heads of the departments are older than 56 ?,[departments],"[heads, age]",SELECT count(*) FROM head WHERE age > 56,[head],"[departments, heads, age]"
1,"List the name, born state and age of the heads of departments ordered by age.",[heads of departments],"[name, born state, age]","SELECT name , born_state , age FROM head ORDER BY age",[head],"[heads of departments, name, born state, age]"
2,"List the creation year, name and budget of each department.",[department],"[creation year, name, budget]","SELECT creation , name , budget_in_billions FROM department",[department],"[department, creation year, name, budget]"


In [6]:
# Preview the first three rows of the schema (database/table) table
df_schema_table.head(3)

Unnamed: 0,database,table,processed_database,processed_table,database_and_table
0,academic,author,academic,author,academic author
1,academic,conference,academic,conference,academic conference
2,academic,domain,academic,domain,academic domain


In [7]:
# Gather every string that needs an embedding: each question, each of its
# entities, and for every schema row both the "database table" string and the
# processed table name.
#
# Lower-case BEFORE de-duplicating: strings that differ only by case would
# otherwise survive the set(), be embedded twice, and then silently collapse
# into a single key when the embeddings are stored in a dict keyed by text.
# Sorting gives a stable order across kernel restarts (set iteration order
# for strings is not reproducible between Python processes).
things_to_be_embedded_list = sorted({
    text.lower()
    for text in (
        *df_question_entities_tables['question'],
        *(entity for entities in df_question_entities_tables['entities'] for entity in entities),
        *df_schema_table['database_and_table'],
        *df_schema_table['processed_table'],
    )
})

In [8]:
# Number of strings queued for embedding
len(things_to_be_embedded_list)

11685

## Embedding

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F
from tqdm import tqdm

# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dim 1, counting only unmasked positions.

    Args:
        model_output: Indexable model result whose first element holds the
            token embeddings.
        attention_mask: Tensor that is 1 for real tokens and 0 for padding;
            it is broadcast to the token-embedding shape before use.
            (assumes shape (batch, seq_len) — TODO confirm against the tokenizer)

    Returns:
        The masked sum of the token embeddings over dim 1 divided by the mask
        total, with the divisor clamped to at least 1e-9 so a fully masked
        row yields zeros instead of a division by zero.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_counts

# Function to process sentences in batches
def process_in_batches(sentences, batch_size, tokenizer, model):
    """Embed ``sentences`` in mini-batches and return one L2-normalised row per sentence.

    Every batch is tokenized (padded to the longest item in the batch and
    truncated to 384 tokens), passed through ``model`` without gradient
    tracking, mean-pooled with ``mean_pooling`` and L2-normalised.

    Args:
        sentences: Sliceable sequence of strings to embed.
        batch_size: Number of sentences per forward pass.
        tokenizer: Callable returning a mapping of tensors that includes an
            ``attention_mask`` entry when called with ``return_tensors='pt'``.
        model: ``torch.nn.Module`` accepting the tokenizer output as keyword
            arguments. It is used on whatever device its parameters live on.

    Returns:
        A CPU tensor with the batch embeddings concatenated along dim 0, in
        input order.

    Raises:
        Whatever ``torch.cat`` raises for an empty list when ``sentences`` is
        empty (no batches are produced in that case).
    """
    # Run each batch on the model's own device. Without this, a model moved
    # to a GPU fails with a device mismatch because tokenizer output is on CPU.
    device = next(model.parameters()).device
    all_embeddings = []
    for i in tqdm(range(0, len(sentences), batch_size), desc="Processing Batches"):
        batch_sentences = sentences[i:i + batch_size]
        encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, max_length=384, return_tensors='pt')
        encoded_input = {name: tensor.to(device) for name, tensor in encoded_input.items()}
        with torch.no_grad():
            model_output = model(**encoded_input)
            batch_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
            batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)
        # Accumulate on the CPU so accelerator memory is released after each batch.
        all_embeddings.append(batch_embeddings.cpu())
    return torch.cat(all_embeddings, dim=0)

# Load the tokenizer and encoder from the HuggingFace Hub (same checkpoint for both)
model_name = 'sentence-transformers/all-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Everything collected above gets embedded, 1024 strings per forward pass
sentences = things_to_be_embedded_list
batch_size = 1024

# Embed batch by batch and stack the results into one tensor
sentence_embeddings = process_in_batches(sentences, batch_size, tokenizer, model)

print("Sentence embeddings:", sentence_embeddings, sep="\n")

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Processing Batches: 100%|██████████| 12/12 [12:01<00:00, 60.16s/it]

Sentence embeddings:
tensor([[-0.0133,  0.0423, -0.0209,  ..., -0.0277, -0.0360,  0.0106],
        [-0.0254,  0.0018, -0.0047,  ...,  0.0306,  0.0238, -0.0254],
        [-0.0306,  0.0286, -0.0199,  ..., -0.0018, -0.0163, -0.0330],
        ...,
        [-0.0018,  0.0136, -0.0257,  ...,  0.0022, -0.0106,  0.0069],
        [ 0.0033, -0.0902, -0.0366,  ...,  0.0204,  0.0122, -0.0327],
        [ 0.0535, -0.0309, -0.0035,  ..., -0.0168, -0.0150, -0.0371]])





In [None]:
# Shape of the stacked embeddings tensor (one row per embedded string)
sentence_embeddings.shape

torch.Size([11685, 768])

In [None]:
# Number of embedded strings; compare with the first dimension of sentence_embeddings
len(things_to_be_embedded_list)

11685

In [None]:
# Map each embedded string to its vector, converted to a plain list so it can be written as JSON
embedded_dict = {}
for row_index, text in enumerate(things_to_be_embedded_list):
    embedded_dict[text] = sentence_embeddings[row_index].tolist()

# Write the mapping to a local JSON file
local_file = 'embeddings.json'
with open(local_file, 'w') as json_file:
    json.dump(embedded_dict, json_file)

# Upload the file to S3, using the local file name as the object key
s3_client.upload_file(local_file, bucket_name, local_file)

print(f"File {local_file} successfully uploaded to s3://{bucket_name}/{local_file}")

File embeddings.json successfully uploaded to s3://sagemaker-studio-423623869859-3no3d9ie4hx/embeddings.json
