In [None]:
import sqlite3

import torch
from transformers import BertTokenizer, BertModel

# Load the database and create a connection
conn = sqlite3.connect("/content/Db-IMDB.db")
cursor = conn.cursor()

# Get all tables in the database
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()


# Store the table names and column names in a dictionary, where the key is the table name and the value is a list of column names
table_dict = {}
for table in tables:
    # PRAGMA arguments cannot be bound as parameters, so the table name is
    # interpolated as a double-quoted identifier (embedded quotes doubled).
    # Without quoting, names containing spaces or SQL keywords break the statement.
    quoted_name = '"{}"'.format(table[0].replace('"', '""'))
    cursor.execute("PRAGMA table_info({})".format(quoted_name))
    # Index 1 of each row returned by PRAGMA table_info is used as the column name.
    table_dict[table[0]] = [column[1] for column in cursor.fetchall()]

# Print the table names and column names
for table in table_dict:
    print(table)
    print(table_dict[table])

# Get English Question from user
question = input("Enter your question: ")

# Tokenize the question
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_question = tokenizer.tokenize(question)

# Semantic search for the question to find the semantically matching table names and column names of the database
# The semantic search is done using the BERT model
def semantic_search(question, table_dict):
    """Score every table by how closely its column names match the question.

    Each text is encoded with the module-level ``tokenizer`` and a freshly
    loaded ``bert-base-uncased`` model, then mean-pooled over its tokens into a
    single vector so that texts of different lengths can be compared.

    Args:
        question: The natural-language question as a string.
        table_dict: Mapping of table name -> list of column-name strings.

    Returns:
        dict mapping each table name to the cosine similarity (a Python float)
        between the question vector and the vector of the table's
        space-joined column names.
    """
    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval()

    def embed(text):
        # truncation=True keeps very wide tables within the model's maximum
        # input length instead of failing inside the forward pass.
        encoded = tokenizer(text, return_tensors="pt", truncation=True)
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            token_embeddings = model(**encoded)[0]
        # A single unpadded sequence is encoded, so a plain mean over the
        # token axis is a valid pooling; the result has one row per text.
        return token_embeddings.mean(dim=1)

    question_embedding = embed(question)

    table_dict_scores = {}
    for table, columns in table_dict.items():
        table_embedding = embed(" ".join(columns))
        # BertModel has no cosine_similarity method; use the torch functional
        # and unwrap the single-element result into a float.
        similarity = torch.nn.functional.cosine_similarity(
            question_embedding, table_embedding, dim=1
        )
        table_dict_scores[table] = similarity.item()
    return table_dict_scores

# Get the table name and column name with the highest score
def get_best_match(table_dict_scores):
    """Return the highest-scoring key together with its score.

    Args:
        table_dict_scores: Mapping of name -> comparable score.

    Returns:
        A ``(name, score)`` tuple for the entry with the largest score; on
        ties the entry that appears first in the mapping wins.

    Raises:
        ValueError: If the mapping is empty (propagated from ``max``).
    """
    top_name, top_score = max(
        table_dict_scores.items(), key=lambda entry: entry[1]
    )
    return top_name, top_score

# Score every table against the question and print the best-matching table
# with its score. The connection is closed in `finally` so it is released
# even when the search raises.
try:
    table_dict_scores = semantic_search(question, table_dict)
    best_match, best_match_score = get_best_match(table_dict_scores)
    print("Best match: ", best_match)
    print("Best match score: ", best_match_score)
finally:
    # close the connection
    conn.close()

In [None]:
# Load the BERT model and tokenizer
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Preprocess the English question
question = "List all movies where ?"
input_ids = torch.tensor(tokenizer.encode(
    question)).unsqueeze(0)  # Batch size 1

# Generate a semantic representation of the question.
# This is inference only, so the forward pass runs without autograd tracking.
with torch.no_grad():
    output = model(input_ids)
print(output)

In [None]:
import sqlite3
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, ignoring masked-out positions.

    Args:
        model_output: Indexable model result whose first element holds the
            token embeddings.
        attention_mask: Mask tensor; it is expanded along a new last axis to
            the shape of the token embeddings and cast to float.

    Returns:
        The mask-weighted sum over axis 1 divided by the summed mask over
        axis 1; the divisor is clamped to at least 1e-9 so a fully masked
        row does not divide by zero.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / token_counts

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Query sentence
query_sentence = 'what are the names of movies?'

# Connect to database and fetch table names and column names.
# The connection is only needed for reading the schema, so it is closed in
# `finally` right after, even if one of the queries fails.
conn = sqlite3.connect('/content/Db-IMDB.db')
try:
    cursor = conn.cursor()

    table_names = [table_info[0] for table_info in cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()]
    column_names = []
    # column_tables[i] is the table that owns column_names[i]; this lets the
    # best column be picked from within the best table further down.
    column_tables = []
    for table_name in table_names:
        # PRAGMA arguments cannot be bound as parameters, so interpolate the
        # name as a double-quoted identifier (embedded quotes doubled).
        quoted_table_name = '"' + table_name.replace('"', '""') + '"'
        cursor.execute(f"PRAGMA table_info({quoted_table_name});")
        table_columns = [column_info[1] for column_info in cursor.fetchall()]
        column_names.extend(table_columns)
        column_tables.extend([table_name] * len(table_columns))
finally:
    # Close database connection
    conn.close()

# Fail with a clear message instead of an obscure tokenizer error on empty input
if not table_names or not column_names:
    raise ValueError("The database contains no tables or columns to compare against.")

# Tokenize query sentence, table names, and column names
query_sentence_encoded = tokenizer([query_sentence], padding=True, truncation=True, return_tensors='pt')
table_names_encoded = tokenizer(table_names, padding=True, truncation=True, return_tensors='pt')
column_names_encoded = tokenizer(column_names, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings for query sentence, table names, and column names
with torch.no_grad():
    query_sentence_output = model(**query_sentence_encoded)
    table_names_output = model(**table_names_encoded)
    column_names_output = model(**column_names_encoded)

# Perform pooling for query sentence, table names, and column names
query_sentence_embedding = mean_pooling(query_sentence_output, query_sentence_encoded['attention_mask'])
table_names_embeddings = mean_pooling(table_names_output, table_names_encoded['attention_mask'])
column_names_embeddings = mean_pooling(column_names_output, column_names_encoded['attention_mask'])

# Normalize embeddings for query sentence, table names, and column names
query_sentence_embedding = F.normalize(query_sentence_embedding, p=2, dim=1)
table_names_embeddings = F.normalize(table_names_embeddings, p=2, dim=1)
column_names_embeddings = F.normalize(column_names_embeddings, p=2, dim=1)

# Find the most similar table names and column names by computing the cosine similarity between the query sentence embedding and the table names and column names embeddings
cosine_similarities_tables = torch.nn.functional.cosine_similarity(query_sentence_embedding, table_names_embeddings, dim=1)
most_similar_table_names_indices = cosine_similarities_tables.argsort(descending=True)
most_similar_table_names = [table_names[i] for i in most_similar_table_names_indices]

cosine_similarities_columns = torch.nn.functional.cosine_similarity(query_sentence_embedding, column_names_embeddings, dim=1)
most_similar_column_names_indices = cosine_similarities_columns.argsort(descending=True)
most_similar_column_names = [column_names[i] for i in most_similar_column_names_indices]

print(f"Most similar table names to '{query_sentence}': {most_similar_table_names}")
print(f"Most similar column names to '{query_sentence}': {most_similar_column_names}")

# Print the most similar table names with their cosine similarity scores in descending order
for ranked_table_name, table_index in zip(most_similar_table_names, most_similar_table_names_indices):
    print(f"Table name: {ranked_table_name}, cosine similarity score: {cosine_similarities_tables[table_index]}")

# Print the highest cosine similarity score table name, and print the highest cosine similarity score column name of the highest cosine similarity score table name
best_table_name = most_similar_table_names[0]
# Walk the columns from most to least similar and take the first one that
# belongs to the best table; the globally best column may live in another table.
best_column_name = next(
    (
        column_names[position]
        for position in most_similar_column_names_indices.tolist()
        if column_tables[position] == best_table_name
    ),
    None,
)
print(f"Table name with highest cosine similarity score: {best_table_name}")
print(f"Column name with highest cosine similarity score: {best_column_name}")