In [1]:
print("hello world")  # Smoke test: see whether your kernel crashes or not

hello world


In [1]:
import psutil
import os

def check_memory():
    """Print the resident set size (RSS) of the current process, scaled by 1024 * 1024."""
    bytes_per_mb = 1024 * 1024
    current_pid = os.getpid()
    rss_bytes = psutil.Process(current_pid).memory_info().rss
    print(f"Memory Usage: {rss_bytes / bytes_per_mb} MB")


check_memory()

Memory Usage: 60.19921875 MB


In [3]:
from transformers import pipeline

# Smoke-test the transformers installation with a small text-generation model.
# NOTE(review): device=0 presumably selects the first GPU — confirm a CUDA device is available, otherwise drop the argument.
test_pipeline = pipeline("text-generation", model="distilgpt2", device=0)
print(test_pipeline("Hello, how are you?", max_length=20, num_return_sequences=1))


RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
'NoneType' object has no attribute 'exists'

In [None]:
# Import required libraries
from transformers import pipeline
from datasketch import MinHashLSHForest, MinHash
from sklearn.metrics.pairwise import cosine_similarity
from Levenshtein import distance as levenshtein_distance
import numpy as np

# Step 1: Keyword Extraction
def extract_keywords(question):
    """
    Extract keywords from a natural language question using a smaller LLM.

    Args:
        question: The natural language question to analyse.

    Returns:
        list[str]: Whitespace-separated tokens of the text the model generated
        after the prompt. The instruction prompt itself is not included.
    """
    # Load a smaller LLM for keyword extraction
    keyword_extractor = pipeline("text-generation", model="distilgpt2")

    # Generate keywords
    prompt = f"Extract keywords from the following question: {question}"
    outputs = keyword_extractor(prompt, max_length=50, num_return_sequences=1)
    generated_text = outputs[0]['generated_text']

    # The generated text starts with the prompt itself; drop that prefix so the
    # instruction words ("Extract", "keywords", ...) are not returned as keywords.
    if generated_text.startswith(prompt):
        generated_text = generated_text[len(prompt):]

    return generated_text.split()  # Simple split for demo purposes

# Step 2: Locality-Sensitive Hashing (LSH)
def create_lsh_forest(database_words):
    """
    Build an indexed MinHash LSH Forest over the given database words.

    Each word is keyed by its position in ``database_words`` and hashed
    character by character into a 128-permutation MinHash.
    """
    lsh_forest = MinHashLSHForest(num_perm=128)

    position = 0
    for entry in database_words:
        signature = MinHash(num_perm=128)
        for encoded_char in map(lambda ch: ch.encode('utf-8'), entry):
            signature.update(encoded_char)
        lsh_forest.add(position, signature)
        position += 1

    # The forest must be indexed before it is returned for querying
    lsh_forest.index()
    return lsh_forest

def find_similar_words(query_word, forest, database_words, top_k=5, num_perm=128):
    """
    Find similar words in the database using LSH.

    Args:
        query_word: The word to look up.
        forest: An indexed MinHashLSHForest whose keys are positions in
            ``database_words``.
        database_words: The words the forest was built from.
        top_k: Maximum number of neighbours to request from the forest.
        num_perm: Number of MinHash permutations for the query signature.
            It must match the value used to build the forest
            (``create_lsh_forest`` uses 128); a smaller signature is rejected
            by the forest when querying.

    Returns:
        list[str]: The database words returned by the forest for ``query_word``.
    """
    # Hash the query the same way the database words were hashed: one update per character.
    m = MinHash(num_perm=num_perm)
    for char in query_word:
        m.update(char.encode('utf-8'))
    results = forest.query(m, top_k)
    return [database_words[i] for i in results]

# Step 3: Re-ranking
def re_rank_words(query_word, similar_words, word_embeddings):
    """
    Re-rank similar words based on embedding similarity and edit distance.

    Args:
        query_word: The word the candidates are compared against.
        similar_words: Candidate words to rank.
        word_embeddings: Mapping from word to embedding vector.

    Returns:
        list[str]: ``similar_words`` ordered from best to worst combined score.
        The score is ``cosine_similarity - edit_distance / 10``. When either
        the query or a candidate has no embedding, the cosine term is treated
        as 0 so the ranking falls back to edit distance instead of raising
        ``KeyError``.
    """
    def _embedding_for(word):
        # Free-text keywords are frequently absent from the embedding table.
        return word_embeddings[word] if word in word_embeddings else None

    query_embedding = _embedding_for(query_word)
    scores = []

    for word in similar_words:
        candidate_embedding = _embedding_for(word)
        if query_embedding is None or candidate_embedding is None:
            cosine_sim = 0.0
        else:
            # Cosine similarity
            cosine_sim = cosine_similarity([query_embedding], [candidate_embedding])[0][0]
        # Edit distance
        edit_dist = levenshtein_distance(query_word, word)
        # Combined score (higher is better)
        combined_score = cosine_sim - (edit_dist / 10)  # Adjust weights as needed
        scores.append((word, combined_score))

    # Sort by combined score, best first
    scores.sort(key=lambda x: x[1], reverse=True)
    return [word for word, _ in scores]

# Step 4: SQL Query Generation
def generate_sql(keywords, re_ranked_words, table="customers",
                 columns=("customer_id", "product", "year")):
    """
    Generate an SQL query using extracted keywords and re-ranked words.

    Args:
        keywords: Extracted keywords; only those that are column names produce
            a condition.
        re_ranked_words: Values paired position-by-position with ``keywords``.
        table: Table to select from (replace with your table name).
        columns: Column names that may appear in the WHERE clause (replace
            with your columns).

    Returns:
        str: ``SELECT * FROM <table>`` followed by a ``WHERE`` clause joining
        one ``column = 'value'`` condition per matching keyword with ``AND``.
        The ``WHERE`` clause is omitted when no keyword matches a column, so
        the statement is never left with a dangling ``WHERE``.
    """
    conditions = []

    for keyword, word in zip(keywords, re_ranked_words):
        if keyword in columns:
            # Double embedded single quotes so the value cannot terminate the
            # string literal. NOTE: if this query is ever executed, prefer
            # parameterized queries over inlined values.
            escaped = str(word).replace("'", "''")
            conditions.append(f"{keyword} = '{escaped}'")

    sql = f"SELECT * FROM {table}"
    if conditions:
        sql += " WHERE " + " AND ".join(conditions)
    return sql

# Main Pipeline
def text_to_sql_pipeline(question, database_words, word_embeddings):
    """
    Full pipeline to convert a natural language question into an SQL query.

    Args:
        question: Natural language question.
        database_words: Words to index for the similarity lookup.
        word_embeddings: Mapping from word to embedding vector, used for
            re-ranking.

    Returns:
        The value returned by ``generate_sql`` for the keywords that had at
        least one similar database word, each paired with its best match.
    """
    # Step 1: Extract keywords
    keywords = extract_keywords(question)
    print("Extracted Keywords:", keywords)

    # Step 2: Create LSH Forest and find similar words, kept per keyword so the
    # keyword -> candidates association is not lost.
    forest = create_lsh_forest(database_words)
    candidates_per_keyword = []
    similar_words = []
    for keyword in keywords:
        candidates = find_similar_words(keyword, forest, database_words)
        candidates_per_keyword.append((keyword, candidates))
        similar_words.extend(candidates)
    print("Similar Words:", similar_words)

    # Step 3: Re-rank each keyword's own candidates and keep only the best one.
    # generate_sql zips keywords with words position-by-position, so the two
    # lists passed to it must stay aligned one-to-one.
    matched_keywords = []
    re_ranked_words = []
    for keyword, candidates in candidates_per_keyword:
        if not candidates:
            continue
        # Re-ranking needs an embedding for the keyword and every candidate;
        # otherwise keep the order returned by the LSH lookup.
        can_re_rank = keyword in word_embeddings and all(
            candidate in word_embeddings for candidate in candidates
        )
        if can_re_rank:
            candidates = re_rank_words(keyword, candidates, word_embeddings)
        matched_keywords.append(keyword)
        re_ranked_words.append(candidates[0])
    print("Re-ranked Words:", re_ranked_words)

    # Step 4: Generate SQL query
    sql_query = generate_sql(matched_keywords, re_ranked_words)
    return sql_query

# Example Usage
if __name__ == "__main__":
    # Example database words (replace with your actual database words)
    database_words = ["customer", "product", "2023", "purchase", "order", "year"]

    # Example word embeddings: one random 32-dimensional vector per database
    # word (replace with actual embeddings from your LLM)
    word_embeddings = {entry: np.random.rand(32) for entry in database_words}

    # Example question
    question = "Find all customers who bought products in 2023."

    # Run the pipeline and show the resulting query
    sql_query = text_to_sql_pipeline(question, database_words, word_embeddings)
    print("Generated SQL Query:", sql_query)

In [None]:
#make sure ollama for python is installed

In [2]:
!pip install ollama
#having an older torch audio like 2.5.1 should not affect much. Even thhough ollama recommend 2.6.0

Collecting ollama
  Downloading ollama-0.4.7-py3-none-any.whl.metadata (4.7 kB)
Collecting pydantic<3.0.0,>=2.9.0 (from ollama)
  Downloading pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)
Collecting annotated-types>=0.6.0 (from pydantic<3.0.0,>=2.9.0->ollama)
  Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.27.2 (from pydantic<3.0.0,>=2.9.0->ollama)
  Downloading pydantic_core-2.27.2-cp310-cp310-win_amd64.whl.metadata (6.7 kB)
Collecting typing-extensions>=4.12.2 (from pydantic<3.0.0,>=2.9.0->ollama)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Downloading ollama-0.4.7-py3-none-any.whl (13 kB)
Downloading pydantic-2.10.6-py3-none-any.whl (431 kB)
Downloading pydantic_core-2.27.2-cp310-cp310-win_amd64.whl (2.0 MB)
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
   ---------------------------------------- 2.0/2.0 MB 15.8 MB/s eta 0:00:00
Downloading annotated_types-0.7.0-py3-none-any.wh

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.6.0+cu118 requires torch==2.6.0+cu118, but you have torch 2.5.1 which is incompatible.


In [1]:
import ollama

In [2]:
# Using Ollama to run Llama3.2 for keyword extraction as first try for the few shot examples
def extract_keywords(text):
    """Ask the 'llama3.2' model through ``ollama.chat`` to extract keywords from ``text``.

    Returns the raw text content of the model's reply.
    """
    user_message = {"role": "user", "content": f"Extract keywords from this text: '{text}'"}
    reply = ollama.chat(model='llama3.2', messages=[user_message])
    return reply['message']['content']

# Example input
question = "What are the top-performing stocks in the technology sector for 2024?"
# The reply is free-form text (see the printed output), not a parsed list of keywords.
keywords = extract_keywords(question)
print("Extracted Keywords:", keywords)

Extracted Keywords: Here are the extracted keywords:

1. Technology
2. Stocks
3. Top-performing
4. Sector
5. 2024


In [1]:
from datasets import load_dataset

# Download the SPIDER dataset ("CM/spider") from Hugging Face
dataset = load_dataset("CM/spider")

# Preview the available splits and features
print(dataset)

AttributeError: module 'pyarrow' has no attribute '__version__'

In [1]:
import pyarrow
from datasets import load_dataset
# I first had to downgrade pyarrow to 12.01, as version 19.0 raised an error that prevented this cell from running. Even so, the prompt warned about datasets needing at least the pyarrow 1.5 version.
# Downgrading did not work; I then noticed that my datasets installation was old, and proceeded to update both pyarrow and datasets.
ds = load_dataset("CM/spider")

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

In [5]:
# This output indicates that the dataset was successfully downloaded
print(ds)

DatasetDict({
    train: Dataset({
        features: ['db_id', 'query', 'question', 'schema', 'query_res'],
        num_rows: 7672
    })
    test: Dataset({
        features: ['db_id', 'query', 'question', 'schema', 'query_res'],
        num_rows: 665
    })
})


In [2]:
import pandas as pd

# Convert the training set to a pandas DataFrame
df_train = ds['train'].to_pandas()

# Display the first 10 rows
print(df_train.head(10))

# As visualized in the Hugging Face dataset viewer, the dataset has 6 features (or columns).
# NOTE(review): print(ds) lists only 5 features (db_id, query, question, schema, query_res) — confirm the count.

                   db_id                                              query  \
0  department_management         SELECT count(*) FROM head WHERE age  >  56   
1  department_management  SELECT name ,  born_state ,  age FROM head ORD...   
2  department_management  SELECT creation ,  name ,  budget_in_billions ...   
3  department_management  SELECT max(budget_in_billions) ,  min(budget_i...   
4  department_management  SELECT avg(num_employees) FROM department WHER...   
5  department_management  SELECT name FROM head WHERE born_state != 'Cal...   
6  department_management  SELECT DISTINCT T1.creation FROM department AS...   
7  department_management  SELECT born_state FROM head GROUP BY born_stat...   
8  department_management  SELECT creation FROM department GROUP BY creat...   
9  department_management  SELECT T1.name ,  T1.num_employees FROM depart...   

                                            question  \
0  How many heads of the departments are older th...   
1  List the name, 

In [10]:
import ollama
def extract_keywords(text):
    """Extract keywords from ``text`` with a few-shot prompt sent to 'llama3.2' via ``ollama.chat``.

    The prompt shows three worked question/keyword examples followed by the
    new question. Returns the raw text content of the model's reply.
    """
    # The prompt is spelled out line by line; the whitespace-only lines are
    # part of the prompt text and are kept as-is.
    template_lines = [
        'Extract keywords from the following questions. Examples:',
        '    ',
        '    Question: "How many heads of the departments are older than 56?"',
        '    Keywords: ["heads", "departments", "older", "56"]',
        '    ',
        '    Question: "List the name, born state, and age of the heads of departments ordered by age."',
        '    Keywords: ["name", "born state", "age", "heads", "departments", "ordered", "age"]',
        '    ',
        '    Question: "What is the average number of employees of the departments where budget is over 1 billion?"',
        '    Keywords: ["average", "number of employees", "departments", "budget", "over", "1 billion"]',
        '    ',
        '    Now extract keywords from this new question:',
        '    Question: "{text}"',
        '    Keywords:',
        '    ',
    ]
    prompt = "\n".join(template_lines).format(text=text)

    reply = ollama.chat(model='llama3.2', messages=[{"role": "user", "content": prompt}])
    return reply['message']['content']

# Test with an example from the dataset
question = "What are the distinct creation years of the departments?"
# NOTE: the reply still carries a preamble line before the list (see the output), so it is not yet a clean list literal.
keywords = extract_keywords(question)
print("Extracted Keywords:", keywords)


Extracted Keywords: Here are the extracted keywords:

["creation", "years", "departments"]


In [5]:
unique_db_ids = ds["train"].unique("db_id")
print(unique_db_ids)
# This lists all unique entries in the db_id column; remember that a split (either train or test) must be selected first.

['department_management', 'farm', 'student_assessment', 'bike_1', 'book_2', 'musical', 'product_catalog', 'flight_1', 'allergy_1', 'store_1', 'journal_committee', 'customers_card_transactions', 'race_track', 'coffee_shop', 'insurance_fnol', 'medicine_enzyme_interaction', 'university_basketball', 'phone_1', 'match_season', 'climbing', 'body_builder', 'election_representative', 'apartment_rentals', 'game_injury', 'soccer_1', 'performance_attendance', 'debate', 'insurance_and_eClaims', 'customers_and_invoices', 'wedding', 'theme_gallery', 'riding_club', 'gymnast', 'browser_web', 'wrestler', 'school_finance', 'protein_institute', 'cinema', 'products_for_hire', 'phone_market', 'gas_company', 'party_people', 'pilot_record', 'cre_Doc_Control_Systems', 'local_govt_in_alabama', 'machine_repair', 'entrepreneur', 'perpetrator', 'csu_1', 'candidate_poll', 'movie_1', 'county_public_safety', 'local_govt_mdm', 'party_host', 'storm_record', 'election', 'news_report', 'restaurant_1', 'customer_deliveri

In [14]:
import pandas as pd
import ollama

# Function to extract keywords using an LLM
def extract_keywords(text):
    """Send a few-shot keyword-extraction prompt for ``text`` to 'llama3.2' via ``ollama.chat``.

    Returns the raw text content of the model's reply.
    """
    # Adjacent literals build the prompt; the whitespace-only lines belong to
    # the prompt text and are kept verbatim.
    prompt_template = (
        'Extract keywords from the following questions. Examples:\n'
        '    \n'
        '    Question: "How many heads of the departments are older than 56?"\n'
        '    Keywords: ["heads", "departments", "older", "56"]\n'
        '    \n'
        '    Question: "List the name, born state, and age of the heads of departments ordered by age."\n'
        '    Keywords: ["name", "born state", "age", "heads", "departments", "ordered", "age"]\n'
        '    \n'
        '    Question: "What is the average number of employees of the departments where budget is over 1 billion?"\n'
        '    Keywords: ["average", "number of employees", "departments", "budget", "over", "1 billion"]\n'
        '    \n'
        '    Now extract keywords from this new question:\n'
        '    Question: "{text}"\n'
        '    Keywords:\n'
        '    '
    )
    chat_messages = [{"role": "user", "content": prompt_template.format(text=text)}]

    llm_reply = ollama.chat(model='llama3.2', messages=chat_messages)
    message = llm_reply['message']
    return message['content']

# Convert dataset to Pandas DataFrame
df_train = ds['train'].to_pandas()

# Group dataset by db_id
grouped = df_train.groupby("db_id")

few_shot_examples = []

# Loop through each topic (one group per db_id)
for db_id, group in grouped:
    # Sample up to 4 questions per group; the fixed random_state keeps the sample reproducible
    sampled_questions = group.sample(n=min(4, len(group)), random_state=42)

    for _, row in sampled_questions.iterrows():
        # Extract keywords using LLM (one model call per sampled question)
        extracted_keywords = extract_keywords(row["question"])

        # Store formatted example
        few_shot_examples.append(
            f'Question: "{row["question"]}"\nKeywords: {extracted_keywords}\n'
        )

# Save the examples. The encoding is explicit so that questions or model
# replies containing non-ASCII characters do not depend on the platform's
# default encoding.
with open("few_shot_prompt.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(few_shot_examples))

print("Few-shot examples saved. Here’s a preview:\n")
print("\n".join(few_shot_examples[:8]))  # Show first 8 examples


Few-shot examples saved. Here’s a preview:

Question: "return me the authors who have papers in VLDB conference before 2002 ."
Keywords: Keywords:

["authors", "papers", "VLDB", "conference", "before", "2002"]

Question: "return me all the papers, which contain the keyword " Natural Language " ."
Keywords: Here are the extracted keywords:

["papers", "keyword", "Natural Language"]

Question: "return me the author in the " University of Michigan " whose papers have the most total citations ."
Keywords: Here are the extracted keywords:

Keywords: ["author", "University of Michigan", "papers", "total citations"]

Question: "return me the number of papers on VLDB conference ."
Keywords: Here are the extracted keywords:

["papers", "VLDB conference", "number", "return"]

Question: "What is the first and last name of the student participating in the most activities?"
Keywords: Here are the extracted keywords:

Keywords: ["student", "participating in", "most", "activities"]

Question: "How ma

In [12]:
# Number of questions available per db_id group
print(grouped["question"].count())

db_id
academic             181
activity_1            88
aircraft              46
allergy_1             98
apartment_rentals     80
                    ... 
voter_2               72
wedding               20
workshop_paper        30
wrestler              40
yelp                 111
Name: question, Length: 133, dtype: int64


In [17]:
import pandas as pd
import numpy as np
import ollama
import random
import Levenshtein
from datasketch import MinHash, MinHashLSH
from sentence_transformers import SentenceTransformer

def _parse_keyword_list(raw_reply):
    """Turn the model's reply text into a list of keyword strings.

    The reply may carry text before the bracketed list (for example a
    "Here are the extracted keywords:" line), so only the text between the
    first '[' and the last ']' is parsed. If no bracket pair is found, the
    whole reply is split on commas.
    """
    start = raw_reply.find("[")
    end = raw_reply.rfind("]")
    inner = raw_reply[start + 1:end] if 0 <= start < end else raw_reply
    cleaned = (part.strip().strip('"').strip() for part in inner.split(","))
    return [keyword for keyword in cleaned if keyword]


# List to store the keywords
keyword_store = []

# Start from a fresh list: the earlier cell filled few_shot_examples with
# formatted strings, while this cell stores (question, keyword_list) tuples
# that are unpacked as pairs further down.
few_shot_examples = []

# Loop through each topic
for db_id, group in grouped:
    sampled_questions = group.sample(n=min(4, len(group)), random_state=42)

    for _, row in sampled_questions.iterrows():
        extracted_keywords = extract_keywords(row["question"])  # Extract keywords from LLM
        keyword_list = _parse_keyword_list(extracted_keywords)

        # Store formatted example
        few_shot_examples.append((row["question"], keyword_list))

        # Collect all extracted keywords for LSH indexing
        keyword_store.extend(keyword_list)

# Initialize LSH index
lsh = MinHashLSH(threshold=0.5, num_perm=128)
index = {}

# Add all extracted keywords to LSH (unique keywords only; sorted so the
# id -> keyword mapping is the same on every run).
# NOTE(review): each keyword is hashed with a single whole-string update, so
# the LSH lookup presumably only matches identical strings — consider hashing
# character n-grams here AND in retrieve_similar_keywords if fuzzy matches are wanted.
for idx, word in enumerate(sorted(set(keyword_store))):
    minhash = MinHash(num_perm=128)
    minhash.update(word.encode("utf8"))
    lsh.insert(str(idx), minhash)
    index[str(idx)] = word

# Function to retrieve similar keywords
def _get_embedding_model():
    """Return the shared sentence-embedding model, creating it on first use."""
    global embedding_model
    try:
        return embedding_model
    except NameError:
        # NOTE(review): no cell assigns `embedding_model`; the model name below
        # is an assumption — replace it with the intended one.
        embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        return embedding_model


def retrieve_similar_keywords(query, top_k=5):
    """Retrieve indexed keywords similar to ``query`` and rank them.

    Candidates come from the module-level ``lsh`` index and are mapped back to
    words through ``index``. They are ranked by descending cosine similarity
    of their sentence embeddings, with ties broken by ascending Levenshtein
    distance.

    Args:
        query: The keyword to look up.
        top_k: Maximum number of results to return.

    Returns:
        list[tuple]: Up to ``top_k`` tuples of
        ``(candidate_word, semantic_similarity, edit_distance)``; empty when
        the LSH lookup yields no candidates.
    """
    query_minhash = MinHash(num_perm=128)
    query_minhash.update(query.encode("utf8"))

    # Retrieve candidates from LSH
    candidate_ids = lsh.query(query_minhash)
    candidate_words = [index[idx] for idx in candidate_ids]
    if not candidate_words:
        return []

    # Encode the query and all candidates in one batch instead of one model
    # call per candidate.
    model = _get_embedding_model()
    embeddings = model.encode([query] + candidate_words, convert_to_numpy=True)
    query_embedding = embeddings[0]
    query_norm = np.linalg.norm(query_embedding)

    # Compute similarity
    ranked_results = []
    for candidate_word, candidate_embedding in zip(candidate_words, embeddings[1:]):
        # Semantic similarity (cosine); a zero-length vector scores 0 rather than NaN
        denominator = query_norm * np.linalg.norm(candidate_embedding)
        if denominator:
            semantic_sim = np.dot(query_embedding, candidate_embedding) / denominator
        else:
            semantic_sim = 0.0

        # Typo distance
        edit_dist = Levenshtein.distance(query, candidate_word)

        ranked_results.append((candidate_word, semantic_sim, edit_dist))

    # Sort: higher similarity + lower typo distance
    ranked_results.sort(key=lambda x: (-x[1], x[2]))

    return ranked_results[:top_k]

# Save the few-shot examples. The encoding is explicit so that questions or
# keywords containing non-ASCII characters do not depend on the platform's
# default encoding.
with open("few_shot_prompt.txt", "w", encoding="utf-8") as f:
    for question, keywords in few_shot_examples:
        f.write(f'Question: "{question}"\nKeywords: {keywords}\n\n')

print("Few-shot examples saved. Here’s a preview:\n")
for question, keywords in few_shot_examples[:8]:
    print(f'Question: "{question}"\nKeywords: {keywords}\n')

# Example retrieval: each result is a (word, semantic similarity, edit distance) tuple
test_word = "faculty"
similar_words = retrieve_similar_keywords(test_word, top_k=5)
print(f"Top similar words to '{test_word}': {similar_words}")


RuntimeError: Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_utils because of the following error (look up to see its traceback):
'NoneType' object has no attribute 'exists'

In [16]:
!pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-3.4.1
