In [1]:
import sys
!{sys.executable} -m pip install openai-whisper
import whisper
import torch
import gradio as gr
import pandas as pd
import sqlite3
import subprocess
import os

from transformers import AutoTokenizer, AutoModelForCausalLM

Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m358.4/803.2 kB[0m [31m11.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting triton>=2 (from openai-whisper)
  Downloading triton-3.5.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.7 kB)
Downloading triton-3.5.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (170.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m170.5/170.5 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?2

In [2]:
from google.colab import files
files.upload()


Saving categories.csv to categories.csv
Saving customers.csv to customers.csv
Saving employees.csv to employees.csv
Saving order_details.csv to order_details.csv
Saving orders.csv to orders.csv
Saving products.csv to products.csv
Saving shippers.csv to shippers.csv


{'categories.csv': b'categoryID,categoryName,description\r\n1,Beverages,"Soft drinks, coffees, teas, beers, and ales"\r\n2,Condiments,"Sweet and savory sauces, relishes, spreads, and seasonings"\r\n3,Confections,"Desserts, candies, and sweet breads"\r\n4,Dairy Products,Cheeses\r\n5,Grains & Cereals,"Breads, crackers, pasta, and cereal"\r\n6,Meat & Poultry,Prepared meats\r\n7,Produce,Dried fruit and bean curd\r\n8,Seafood,Seaweed and fish\r\n',
 'customers.csv': b"customerID,companyName,contactName,contactTitle,city,country\r\nALFKI,Alfreds Futterkiste,Maria Anders,Sales Representative,Berlin,Germany\r\nANATR,Ana Trujillo Emparedados y helados,Ana Trujillo,Owner,Mexico City,Mexico\r\nANTON,Antonio Moreno Taquer\xeda,Antonio Moreno,Owner,Mexico City,Mexico\r\nAROUT,Around the Horn,Thomas Hardy,Sales Representative,London,UK\r\nBERGS,Berglunds snabbk\xf6p,Christina Berglund,Order Administrator,Lule\xe5,Sweden\r\nBLAUS,Blauer See Delikatessen,Hanna Moos,Sales Representative,Mannheim,German

In [3]:
import os
import gradio as gr
import pandas as pd
import sqlite3
import whisper
from openai import OpenAI
import re


os.environ["OPENAI_API_KEY"] = ""
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
print("✅ OpenAI client initialized.")


DATA_FOLDER = "/content/"
DB_PATH = "northwind.db"

TABLES = {
    "Employees": "employees.csv",
    "Customers": "customers.csv",
    "Orders": "orders.csv",
    "OrderDetails": "order_details.csv",
    "Products": "products.csv",
    "Categories": "categories.csv",
    "Shippers": "shippers.csv"
}

for f in TABLES.values():
    if not os.path.exists(os.path.join(DATA_FOLDER, f)):
        raise FileNotFoundError(f"❌ Missing {f}")


def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.select_dtypes(include="object").columns:
        df[col] = df[col].astype(str).str.strip()
        df[col] = df[col].str.replace(r'[\x00-\x1f]', '', regex=True)
    return df

conn = sqlite3.connect(DB_PATH)
for table, file in TABLES.items():
    df = pd.read_csv(os.path.join(DATA_FOLDER, file), encoding="latin1")
    df.columns = df.columns.str.replace(" ", "")
    df = clean_dataframe(df)
    df.to_sql(table, conn, if_exists="replace", index=False)

    print(f"✅ Loaded & cleaned {table}")
    print(f"\n--- {table} Table Preview ---")
    print(pd.read_sql(f"SELECT * FROM {table} LIMIT 10", conn))
    print("-------------------------------\n")

conn.close()


whisper_model = whisper.load_model("base")
print("✅ Whisper model loaded")

SCHEMA = """
Employees(EmployeeID, LastName, FirstName, Title)
Customers(CustomerID, CompanyName, Country)
Orders(OrderID, CustomerID, EmployeeID, OrderDate)
OrderDetails(OrderID, ProductID, UnitPrice, Quantity, Discount)
Products(ProductID, ProductName, SupplierID, CategoryID, UnitPrice)
Categories(CategoryID, CategoryName)
Shippers(ShipperID, CompanyName)
"""

# -----------------------------
# 9️⃣ Voice → SQL Logic (FIXED)
# -----------------------------
def voice_to_sql(audio_path):
    if audio_path is None:
        return "❌ No audio provided.", None

    text = whisper_model.transcribe(audio_path)["text"].strip()

    prompt = f"""
You are an expert SQLite SQL generator.

STRICT RULES:
- Use ONLY information stated in the question
- NEVER invent values or column names
- If the question mentions "sold", "sales", or "ordered":
  YOU MUST JOIN OrderDetails
- A product is SOLD only if it exists in OrderDetails
- Products table alone does NOT indicate sales
- Use DISTINCT when duplicates are possible
- Generate ONE valid SQLite SQL statement
- NO explanations
- NO markdown

Schema:
{SCHEMA}

Question:
{text}

SQL:
"""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=300
    )

    sql = response.choices[0].message.content.strip()
    sql = re.sub(r"```.*?\n|```", "", sql, flags=re.DOTALL).strip()

    try:
        conn = sqlite3.connect(DB_PATH)

        if sql.upper().startswith("SELECT"):
            df = pd.read_sql(sql, conn)
            conn.close()
            return f"""
🎤 Question:
{text}

🧾 SQL:
{sql}

📊 Result (first 20 rows):
{df.head(20)}
""", None

        cursor = conn.cursor()
        cursor.execute(sql)
        conn.commit()
        affected = cursor.rowcount
        conn.close()

        return f"""
🎤 Question:
{text}

🧾 SQL:
{sql}

✅ Executed successfully
Rows affected: {affected}
""", None

    except Exception as e:
        return f"""
❌ SQL ERROR

Question:
{text}

SQL:
{sql}

Error:
{e}
""", None

# -----------------------------
# 🔟 Gradio UI
# -----------------------------
iface = gr.Interface(
    fn=voice_to_sql,
    inputs=gr.Audio(type="filepath", label="🎤 Speak your database question"),
    outputs=[
        gr.Textbox(label="Result"),
        gr.File(label="⬇️ Download Updated CSV")
    ],
    title="🎓 Voice-to-SQL Assistant (Northwind)",
    description="Whisper + GPT | Correct SOLD logic using OrderDetails"
)

# -----------------------------
# 1️⃣1️⃣ Launch
# -----------------------------
iface.launch(share=True, debug=False)


✅ OpenAI client initialized.
✅ Loaded & cleaned Employees

--- Employees Table Preview ---
   employeeID      employeeName                 title      city country  \
0           1     Nancy Davolio  Sales Representative  New York     USA   
1           2     Andrew Fuller  Vice President Sales  New York     USA   
2           3   Janet Leverling  Sales Representative  New York     USA   
3           4  Margaret Peacock  Sales Representative  New York     USA   
4           5   Steven Buchanan         Sales Manager    London      UK   
5           6    Michael Suyama  Sales Representative    London      UK   
6           7       Robert King  Sales Representative    London      UK   
7           8    Laura Callahan         Sales Manager  New York     USA   
8           9    Anne Dodsworth  Sales Representative    London      UK   

   reportsTo  
0        8.0  
1        NaN  
2        8.0  
3        8.0  
4        2.0  
5        5.0  
6        5.0  
7        2.0  
8        5.0  
--------

100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 102MiB/s]


✅ Whisper model loaded
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://88480595eae01d4ad7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


