In [1]:
!pip install transformers



In [2]:
!pip install sentence-transformers



In [3]:
import boto3
import pandas as pd
import json
import matplotlib.pyplot as plt
import time
import torch
import torch.nn.functional as F
import ast
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, AutoModelForTokenClassification, AutoModel
import os
# from huggingface_hub import login
from sentence_transformers import SentenceTransformer


# Configure pandas to display all columns and their full content without truncation
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_colwidth', None)  # Do not truncate column contents
pd.set_option('display.expand_frame_repr', False)  # Avoid wrapping to the next line

## load questions and sql queries in spider train

In [4]:
# Define your S3 bucket and file key
bucket_name = 'sagemaker-studio-423623869859-3no3d9ie4hx'

# Initialize the S3 client
s3_client = boto3.client('s3')

# Load the file from S3
obj = s3_client.get_object(Bucket=bucket_name, Key='df_question_entities_tables.csv')
df_question_entities_tables = pd.read_csv(obj['Body'])
df_question_entities_tables['entities_for_tables'] = df_question_entities_tables['entities_for_tables'].apply(ast.literal_eval)
df_question_entities_tables['entities_for_columns'] = df_question_entities_tables['entities_for_columns'].apply(ast.literal_eval)
df_question_entities_tables['tables'] = df_question_entities_tables['tables'].apply(ast.literal_eval)
df_question_entities_tables['entities'] = df_question_entities_tables['entities'].apply(ast.literal_eval)

obj = s3_client.get_object(Bucket=bucket_name, Key='df_schema_table.csv')
df_schema_table = pd.read_csv(obj['Body'])

In [5]:
df_question_entities_tables.head(3)

Unnamed: 0,question,entities_for_tables,entities_for_columns,query,tables,entities
0,How many heads of the departments are older than 56 ?,[departments],"[heads, age]",SELECT count(*) FROM head WHERE age > 56,[head],"[departments, heads, age]"
1,"List the name, born state and age of the heads of departments ordered by age.",[heads of departments],"[name, born state, age]","SELECT name , born_state , age FROM head ORDER BY age",[head],"[heads of departments, name, born state, age]"
2,"List the creation year, name and budget of each department.",[department],"[creation year, name, budget]","SELECT creation , name , budget_in_billions FROM department",[department],"[department, creation year, name, budget]"


In [6]:
df_schema_table.head(3)

Unnamed: 0,database,table,processed_database,processed_table,database_and_table
0,academic,author,academic,author,academic author
1,academic,conference,academic,conference,academic conference
2,academic,domain,academic,domain,academic domain


In [7]:
things_to_be_embedded_list = []

for ind, row in df_question_entities_tables.iterrows():
    things_to_be_embedded_list += row['entities']

for ind, row in df_schema_table.iterrows():
    things_to_be_embedded_list += [row['database_and_table']]
    
things_to_be_embedded_list = list(set(things_to_be_embedded_list))
things_to_be_embedded_list = [thing.lower() for thing in things_to_be_embedded_list]

In [8]:
len(things_to_be_embedded_list)

4321

## embedding

In [9]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Sentences we want sentence embeddings for
sentences = things_to_be_embedded_list

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings)


Sentence embeddings:
tensor([[ 0.0024,  0.0341, -0.0257,  ...,  0.0125, -0.0375, -0.0033],
        [ 0.0142, -0.0551,  0.0238,  ..., -0.0071, -0.0390, -0.0623],
        [-0.0161,  0.0240, -0.0345,  ..., -0.0205, -0.0257, -0.0360],
        ...,
        [ 0.0191, -0.0588, -0.0216,  ...,  0.0334, -0.0394, -0.0032],
        [ 0.0119,  0.0751,  0.0122,  ..., -0.0035, -0.0596,  0.0056],
        [ 0.0129,  0.0743, -0.0126,  ..., -0.0088, -0.0164, -0.0336]])


In [10]:
sentence_embeddings.shape

torch.Size([4321, 768])

In [11]:
len(things_to_be_embedded_list)

4321

In [12]:
# Create dictionary
embedded_dict = {
    item: sentence_embeddings[i].tolist()
    for i, item in enumerate(things_to_be_embedded_list)
}

# Save to JSON
local_file = 'embeddings.json'
with open(local_file, 'w') as f:
    json.dump(embedded_dict, f)

# Upload JSON to S3
s3_client.upload_file(local_file, bucket_name, local_file)

print(f"File {local_file} successfully uploaded to s3://{bucket_name}/{local_file}")

File embeddings.json successfully uploaded to s3://sagemaker-studio-423623869859-3no3d9ie4hx/embeddings.json
