In [4]:
import mysql.connector
from mysql.connector import Error
from dotenv import load_dotenv
import os
import json
import os
from openai import OpenAI

# Load environment variables from the .env file (path is relative to the
# working directory) BEFORE anything reads them. OpenAI() resolves its API key
# from the environment at construction time, so building the client first
# would miss a key that is only defined in .env.
load_dotenv(dotenv_path='../../.env')

# OpenAI client and the embedding model used by create_embedding().
client = OpenAI()
EMBEDDING_MODEL = "text-embedding-3-small"

# Retrieve database credentials from environment variables (None when unset).
db_host = os.getenv('DB_HOST')
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')

def create_embedding(data):
    """Return the embedding vector for a single piece of text.

    ``data`` is sent as a one-element batch to the embeddings endpoint of the
    module-level ``client`` using ``EMBEDDING_MODEL``; the ``embedding``
    attribute of the first item in the response is returned.
    """
    result = client.embeddings.create(model=EMBEDDING_MODEL, input=[data])
    first_item = result.data[0]
    return first_item.embedding

class DatabaseConnection:
    """Small wrapper around a MySQL server connection for schema inspection.

    The connection is opened on construction from the module-level
    ``db_host``, ``db_user`` and ``db_password`` values. Errors are reported
    with ``print`` instead of being raised; the query methods then return an
    empty result (``[]`` or ``None``).
    """

    def __init__(self):
        self.connection = None
        self.connect()

    def connect(self):
        """Open the connection; on a MySQL error print it and leave ``connection`` as None."""
        try:
            self.connection = mysql.connector.connect(
                host=db_host,
                user=db_user,
                password=db_password
            )
            if self.connection.is_connected():
                print("Connected to MySQL server")
        except Error as e:
            print(f"Error: {e}")
            self.connection = None

    def close(self):
        """Close the connection if one is currently open."""
        if self.connection and self.connection.is_connected():
            self.connection.close()
            print("MySQL connection is closed")

    def _fetch_all(self, statements):
        """Run ``statements`` in order on one cursor and return the rows of the last one.

        Returns None when there is no live connection or a MySQL error occurs;
        the reason is printed in both cases.
        """
        if not self.connection or not self.connection.is_connected():
            print("No active MySQL connection")
            return None

        # Bind the name before the try block: if cursor() itself raises, the
        # finally clause must not fail on an unbound local and mask the
        # handled error.
        cursor = None
        try:
            cursor = self.connection.cursor()
            for statement in statements:
                cursor.execute(statement)
            return cursor.fetchall()
        except Error as e:
            print(f"Error: {e}")
            return None
        finally:
            if cursor is not None:
                cursor.close()

    def get_databases(self):
        """Return the names of all databases on the server, or [] on failure."""
        rows = self._fetch_all(["SHOW DATABASES"])
        if rows is None:
            return []
        return [db[0] for db in rows]

    def get_tables(self, db_name):
        """Return the table names in ``db_name``, or [] on failure.

        ``db_name`` is interpolated verbatim into the statement, so pass
        trusted identifiers only.
        """
        rows = self._fetch_all([f"USE {db_name}", "SHOW TABLES"])
        if rows is None:
            return []
        return [table[0] for table in rows]

    def describe_table(self, db_name, table_name):
        """Return the ``DESCRIBE`` rows for ``table_name`` in ``db_name``, or None on failure.

        Both names are interpolated verbatim into the statements, so pass
        trusted identifiers only.
        """
        return self._fetch_all([f"USE {db_name}", f"DESCRIBE {table_name}"])

# One (table name, embedding vector) pair per described column.
embeddings = []

db_conn = DatabaseConnection()
tables = ['job', 'job_type_config', 'shedlock']
try:
    for table in tables:
        schema = db_conn.describe_table('hyperface_platform_dev', table)
        if schema is None:
            # describe_table() already printed the reason (no connection or a
            # MySQL error); skip this table instead of crashing on None.
            continue

        for row in schema:
            # convert row into text description
            # (row layout per MySQL DESCRIBE: Field, Type, Null, Key, Default, Extra)
            row_description = f"The table {table} has a column {row[0]} of type {row[1]}"
            if row[2] == 'YES':
                row_description += " that can be null"
            else:
                row_description += " that cannot be null"
            if row[3] == 'PRI':
                row_description += " and is a primary key"
            if row[4]:
                row_description += f" with a default value of {row[4]}"
            if row[5]:
                row_description += f" and has the extra attribute {row[5]}"

            print('create_embedding')
            embedding = create_embedding(row_description)
            embeddings.append((table, embedding))
finally:
    # Release the connection even when an embedding request fails part-way.
    db_conn.close()

Connected to MySQL server
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
create_embedding
MySQL connection is closed


In [10]:
# Cache the embeddings on disk so later sessions can skip the API calls.
# The file is opened with 'wb', not 'ab': appending would stack a second pickle
# behind the first, and pickle.load() reads only the first object in the file,
# so a re-run would silently keep serving the stale embeddings.
import os
import pickle

os.makedirs('pickles', exist_ok=True)  # the target directory may not exist yet
with open('pickles/per_field_chunking.pickle', 'wb') as pickle_file:
    pickle.dump(embeddings, pickle_file)


In [15]:
# Run this cell if the pickle cache file already exists: it restores the
# stored (table, embedding) pairs without calling the embeddings API again.
from openai import OpenAI

client = OpenAI()
EMBEDDING_MODEL = "text-embedding-3-small"

import pickle

# NOTE: pickle.load() can execute arbitrary code from the file; only load a
# cache that this notebook wrote itself.
# The context manager closes the file handle once the object is read.
with open('pickles/per_field_chunking.pickle', 'rb') as pickle_file:
    embeds = pickle.load(pickle_file)

def create_embedding(data):
    """Embed one text with ``EMBEDDING_MODEL`` and return its vector.

    The text is wrapped in a single-element list for the request made through
    the module-level ``client``; the first response item's ``embedding`` is
    returned.
    """
    api_response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=[data],
    )
    return api_response.data[0].embedding

In [12]:
# Natural-language question to match against the stored embeddings;
# query_embed holds the vector that create_embedding() returns for it.
query = 'What is the status of the most recently created job?'
query_embed = create_embedding(query)

In [13]:
from scipy import spatial

def relatedness_fn(x, y):
    """Cosine similarity of two vectors: 1 minus SciPy's cosine distance."""
    return 1 - spatial.distance.cosine(x, y)


# Score every stored embedding against the query, labelled with its table name.
score = [
    (table_name, relatedness_fn(query_embed, vector))
    for table_name, vector in embeds
]

# Most similar entries first.
score = sorted(score, key=lambda pair: pair[1], reverse=True)
score

[('job', 0.4398334302075029),
 ('job', 0.4033416455172433),
 ('job', 0.4011145512433245),
 ('job', 0.3970788530236249),
 ('job', 0.3903694832422804),
 ('job', 0.379970496892571),
 ('job_type_config', 0.37784049382201224),
 ('job_type_config', 0.37115011995085756),
 ('job_type_config', 0.3681487174713822),
 ('job', 0.36201979294882725),
 ('job', 0.35962956797435075),
 ('job', 0.3358276958250459),
 ('job', 0.33502117478335314),
 ('job_type_config', 0.3236556879342578),
 ('job_type_config', 0.31895634406910356),
 ('job', 0.3162342008158281),
 ('job_type_config', 0.30476063614253746),
 ('job', 0.3016656491570855),
 ('job', 0.296449378552444),
 ('job', 0.29586377237000927),
 ('job_type_config', 0.292151425682769),
 ('shedlock', 0.290097331982616),
 ('job', 0.28982830981307206),
 ('job', 0.28731027323675185),
 ('job_type_config', 0.28092386185181595),
 ('job', 0.2801905573535389),
 ('job_type_config', 0.2789525013094113),
 ('job_type_config', 0.2706383321666499),
 ('job_type_config', 0.27018

In [14]:
query = 'What is the batch size of the most recently created job?'
query_embed = create_embedding(query)

from scipy import spatial


def relatedness_fn(x, y):
    """Cosine similarity of two vectors: 1 minus SciPy's cosine distance."""
    return 1 - spatial.distance.cosine(x, y)


# Similarity of the query to every stored embedding, labelled with its table.
score = [
    (table_name, relatedness_fn(query_embed, vector))
    for table_name, vector in embeds
]

# group by table: keep only the highest similarity seen for each table
max_score = {}
for table_name, similarity in score:
    best_so_far = max_score.get(table_name, similarity)
    max_score[table_name] = max(best_so_far, similarity)

# One (table, best similarity) pair per table, best table first.
score = sorted(max_score.items(), key=lambda pair: pair[1], reverse=True)
score

[('job_type_config', 0.5779927323198352),
 ('job', 0.3325697380943399),
 ('job_type_config', 0.32120782236582635),
 ('job', 0.29660478050594374),
 ('job_type_config', 0.2917379845298529),
 ('job', 0.2792354286856674),
 ('job', 0.27488679664031634),
 ('job_type_config', 0.27164758401965905),
 ('job_type_config', 0.2674477065255738),
 ('job', 0.26348026219522325),
 ('job', 0.2630251384021842),
 ('job', 0.2620354851172739),
 ('job_type_config', 0.26052673693044304),
 ('job', 0.25931627836842264),
 ('job', 0.25883002498458885),
 ('job_type_config', 0.2567758426566171),
 ('job_type_config', 0.2559751500034678),
 ('job_type_config', 0.2550739857512214),
 ('job', 0.25440320143264306),
 ('job', 0.24817023599288068),
 ('job_type_config', 0.23887930979092997),
 ('job', 0.23758586386455827),
 ('job_type_config', 0.23493785810644308),
 ('job', 0.23406117062399756),
 ('job', 0.23184754685694497),
 ('job_type_config', 0.2315998324430717),
 ('job', 0.23124288453104735),
 ('job_type_config', 0.2297428

In [16]:
query = 'Is the most recently modified job encrypted?'
query_embed = create_embedding(query)

from scipy import spatial


def relatedness_fn(x, y):
    """Cosine similarity of two vectors: 1 minus SciPy's cosine distance."""
    return 1 - spatial.distance.cosine(x, y)


# Pair each stored embedding's table name with its similarity to the query,
# ordered from most to least similar.
score = sorted(
    [(table_name, relatedness_fn(query_embed, vector)) for table_name, vector in embeds],
    key=lambda pair: pair[1],
    reverse=True,
)
score

[('job_type_config', 0.5175739728939045),
 ('job_type_config', 0.4815507332280401),
 ('job_type_config', 0.42334487728636727),
 ('job', 0.4127968242304896),
 ('job', 0.36021329202667074),
 ('job', 0.352656118078436),
 ('shedlock', 0.3518127326604361),
 ('shedlock', 0.3440607382322173),
 ('shedlock', 0.3344497513215412),
 ('job_type_config', 0.32922027910573093),
 ('job', 0.3213312502707235),
 ('job', 0.317928655247269),
 ('job_type_config', 0.3168386764192914),
 ('job_type_config', 0.3144720221196563),
 ('job', 0.3124554018163377),
 ('job', 0.31132618951989577),
 ('job', 0.31112269556451144),
 ('job_type_config', 0.3033364030065959),
 ('job_type_config', 0.30185680514904534),
 ('job', 0.29892971600355256),
 ('job_type_config', 0.2967510534282838),
 ('job_type_config', 0.2961033022525734),
 ('job', 0.29537855647416955),
 ('job', 0.2942562214061928),
 ('job_type_config', 0.29366950540480075),
 ('job', 0.29295276587979835),
 ('job_type_config', 0.2912813682923403),
 ('job_type_config', 0.