<a href="https://colab.research.google.com/github/cooolbabu/GoogleGemini101/blob/main/OpenAI/SimpleEmbeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Simple embeddings

# Installation


In [1]:
!pip install openai -q
!pip install supabase -q
!pip install pyodbc -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.9/266.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.2/130.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m334.7/334.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h

## Get API Keys and setup

In [2]:
from openai import OpenAI

from supabase import create_client, Client

from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker

import pandas as pd
import pyodbc

import requests
import json
from google.colab import userdata

# setup the gemini pro
# gemini_llm = ChatGoogleGenerativeAI(model="gemini-pro",temperature=0.3, google_api_key=userdata.get('Gemini_API_Key'))

# Supabase: read the project URL and key via google.colab `userdata`, then build the client
supabase_url = userdata.get('SUPABASE_URL')
supabase_key = userdata.get('SUPABASE_KEY')
supabase_client: Client = create_client(supabase_url, supabase_key)

# OpenAI: read the API key via google.colab `userdata`, then build the client
openai_api_key = userdata.get('OPENAI_API_KEY')
openai_client: OpenAI = OpenAI(api_key=openai_api_key)


## Helper functions to run code


In [5]:
from openai import OpenAI
import torch

# Function to get embedding from text using OpenAI's embedding model
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` produced by an OpenAI embedding model.

    The text is sent as a single-item batch through the module-level
    ``openai_client``; the ``embedding`` attribute of the first entry in the
    response's ``data`` is returned.

    Args:
        text: The text to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` value of the first (and only requested) result.
    """
    response = openai_client.embeddings.create(input=[text], model=model)
    first_result = response.data[0]
    return first_result.embedding



In [6]:
# Baseline: both SQL statements embedded exactly as written - score is 0.8646

sql_query_1 = 'SELECT "Genre"."Name" AS "Genre", EXTRACT(QUARTER FROM "Invoice"."InvoiceDate") AS "Quarter", SUM("InvoiceLine"."UnitPrice" * "InvoiceLine"."Quantity") AS "Total Sales" FROM "Genre" INNER JOIN "Track" ON "Genre"."GenreId" = "Track"."GenreId" INNER JOIN "InvoiceLine" ON "Track"."TrackId" = "InvoiceLine"."TrackId" INNER JOIN "Invoice" ON "InvoiceLine"."InvoiceId" = "Invoice"."InvoiceId" GROUP BY "Genre", "Quarter"'

sql_query_2 = 'SELECT "Genre"."Name", SUM("InvoiceLine"."UnitPrice" * "InvoiceLine"."Quantity") AS "Total Sales" FROM "Genre" INNER JOIN "Track" ON "Genre"."GenreId" = "Track"."GenreId" INNER JOIN "InvoiceLine" ON "Track"."TrackId" = "InvoiceLine"."TrackId" GROUP BY "Genre"."Name" ORDER BY "Total Sales" DESC LIMIT 5;'

# Embed each statement and hold the returned vector as a float32 tensor
sql_query_1_embedding = torch.tensor(get_embedding(text=sql_query_1), dtype=torch.float32)
sql_query_2_embedding = torch.tensor(get_embedding(text=sql_query_2), dtype=torch.float32)

# Similarity score: dot product of the two embedding vectors (shown as the cell output)
sql_query_1_embedding.dot(sql_query_2_embedding)


tensor(0.8646)

In [7]:
# Removed the first word SELECT from both statements
#  - New Score is 0.8691
#  - Expectation - No change to score
#  - Is a 0.0045 change (0.8646 -> 0.8691) significant? Maybe not. At what point do we say that a change is significant?

sql_query_1 = '"Genre"."Name" AS "Genre", EXTRACT(QUARTER FROM "Invoice"."InvoiceDate") AS "Quarter", SUM("InvoiceLine"."UnitPrice" * "InvoiceLine"."Quantity") AS "Total Sales" FROM "Genre" INNER JOIN "Track" ON "Genre"."GenreId" = "Track"."GenreId" INNER JOIN "InvoiceLine" ON "Track"."TrackId" = "InvoiceLine"."TrackId" INNER JOIN "Invoice" ON "InvoiceLine"."InvoiceId" = "Invoice"."InvoiceId" GROUP BY "Genre", "Quarter"'

sql_query_2 = '"Genre"."Name", SUM("InvoiceLine"."UnitPrice" * "InvoiceLine"."Quantity") AS "Total Sales" FROM "Genre" INNER JOIN "Track" ON "Genre"."GenreId" = "Track"."GenreId" INNER JOIN "InvoiceLine" ON "Track"."TrackId" = "InvoiceLine"."TrackId" GROUP BY "Genre"."Name" ORDER BY "Total Sales" DESC LIMIT 5;'

# Embed each statement and hold the returned vector as a float32 tensor
sql_query_1_embedding = torch.tensor(get_embedding(text=sql_query_1), dtype=torch.float32)
sql_query_2_embedding = torch.tensor(get_embedding(text=sql_query_2), dtype=torch.float32)

# Similarity score: dot product of the two embedding vectors (shown as the cell output)
sql_query_1_embedding.dot(sql_query_2_embedding)

tensor(0.8691)

In [8]:
# Here I removed the double quotes from both statements.
# Score went even further up.
# Need to test this again: the score in the saved output (0.8727) was produced while
# sql_query_1 still contained one stray double quote after `FROM Genre`. That quote is
# removed below so the statement matches the experiment described here; re-run this
# cell to refresh the score.

sql_query_1 = 'SELECT Genre.Name AS Genre, EXTRACT(QUARTER FROM Invoice.InvoiceDate) AS Quarter, SUM(InvoiceLine.UnitPrice * InvoiceLine.Quantity) AS Total Sales FROM Genre INNER JOIN Track ON Genre.GenreId = Track.GenreId INNER JOIN InvoiceLine ON Track.TrackId = InvoiceLine.TrackId INNER JOIN Invoice ON InvoiceLine.InvoiceId = Invoice.InvoiceId GROUP BY Genre, Quarter'

sql_query_2 = 'SELECT Genre.Name, SUM(InvoiceLine.UnitPrice * InvoiceLine.Quantity) AS Total Sales FROM Genre INNER JOIN Track ON Genre.GenreId = Track.GenreId INNER JOIN InvoiceLine ON Track.TrackId = InvoiceLine.TrackId GROUP BY Genre.Name ORDER BY Total Sales DESC LIMIT 5;'

# Embed each statement and hold the returned vector as a float32 tensor
# (torch.tensor with an explicit dtype replaces the legacy torch.FloatTensor constructor)
sql_query_1_embedding = torch.tensor(get_embedding(text=sql_query_1), dtype=torch.float32)
sql_query_2_embedding = torch.tensor(get_embedding(text=sql_query_2), dtype=torch.float32)

# Similarity score: dot product of the two embedding vectors (shown as the cell output)
torch.dot(sql_query_1_embedding, sql_query_2_embedding)

tensor(0.8727)