In [37]:
# shows tables, common columns

In [38]:
# import os
# import pandas as pd
# from itertools import combinations

# def process_csv_files(directory):
#     # Dictionary to store dataframes and metadata
#     tables = {}
#     columns_metadata = {}
#     common_columns_by_table_pair = {}
    
#     # Loop through all files in the directory
#     for filename in os.listdir(directory):
#         if filename.endswith(".csv"):
#             # Read CSV file
#             filepath = os.path.join(directory, filename)
#             table_name = filename[:-4]  # Remove '.csv' extension
#             df = pd.read_csv(filepath)
            
#             # Standardize column names
#             #df.columns = df.columns.str.strip().str.lower()  
            
#             # Save dataframe and columns
#             tables[table_name] = df
#             columns_metadata[table_name] = list(df.columns)
    
#     # Compare every pair of tables
#     for table1, table2 in combinations(tables.keys(), 2):
#         # Find common columns between table1 and table2
#         common_cols = set(columns_metadata[table1]).intersection(columns_metadata[table2])
#         if common_cols:
#             common_columns_by_table_pair[(table1, table2)] = list(common_cols)
    
#     return tables, columns_metadata, common_columns_by_table_pair

# # Example Usage
# directory = "."  # Current working directory
# tables, columns_metadata, common_columns_by_table_pair = process_csv_files(directory)

# # Output Results
# print("Tables:", tables.keys())
# print("Columns Metadata:", columns_metadata)
# print("Common Columns by Table Pair:", common_columns_by_table_pair)


In [39]:
import pandas as pd
import glob
import re
from collections import defaultdict


def build_column_table_mapping(csv_folder):
    """
    Scans all CSV files in the given folder to build a mapping of columns to tables.
    :param csv_folder: Path to the folder containing CSV files (each file is a table).
    :return: Dictionary mapping column names to table names and a dictionary of dataframes for each table.
    """
    mapping = defaultdict(list)
    tables = {}
    for file in glob.glob(f"{csv_folder}/*.csv"):
        table_name = file.split("/")[-1].replace(".csv", "")
        df = pd.read_csv(file)
        tables[table_name] = df
        for column in df.columns:
            mapping[column].append(table_name)
    return mapping, tables



# Example Usage
csv_folder = "."  # Folder with CSV files
column_table_mapping, tables = build_column_table_mapping(csv_folder)

In [40]:
sample_queries = [
    "find Grade, CourseName where grade > 90 and courseName like '%Math%'",
    "list Grade, CourseName where grade between 50 and 90",
    "list Grade, CourseName in context where grade between 50 and 90",
    "show Grade, FirstName if firstname like 'John%'",
    "get Grade, Age where age is not null",
    "what are the Grade, CourseName with grade = 100",
    "fetch CourseName, Credits satisfying credits between 3 and 6",
    "retrieve StudentName, Grade where grade is null",
    "give me Age, Grade that meet age < 21",
    "display StudentID, Grade where grade >= 75",
    "determine Grade, CourseName if courseName like '%Math%'",
]

In [41]:
def auto_generate_query(raw_query, column_table_mapping, tables):
    """
    Automatically resolves and rewrites the query by determining joins and conditions based on column-table mapping.
    :param raw_query: User-specified natural language query.
    :param column_table_mapping: Dictionary mapping columns to tables.
    :param tables: Dictionary containing table dataframes for column checks.
    :return: Rewritten SQL query with compact formatting.
    """
    # Define a single regex pattern using reusable keywords
    KEYWORDS_PATTERN = r"(?:find|list|determine|show|get|retrieve|give me|provide|display|fetch|what are|show me)?\s*"
    CONTEXT_PATTERN = r"(?:from|in|on|of)?\s*(?:context)?"
    QUERY_PATTERN = re.compile(
        rf"{KEYWORDS_PATTERN}(.+?)\s*{CONTEXT_PATTERN}\s*(?:where|if|with|satisfying|that (?:meet|fulfill))\s+(.+)",
        re.IGNORECASE,
    )

    # SQL keywords to normalize
    SQL_KEYWORDS = ["between", "like", "is not null", "is null", "and", "or", "not", "in", "exists"]

    # Stop words to remove
    STOP_WORDS = {"a", "an", "the"}

    # Preprocess raw query to remove stop words
    def preprocess_query(query):
        words = query.split()
        return " ".join(word for word in words if word.lower() not in STOP_WORDS)

    # Preprocess the query
    raw_query = preprocess_query(raw_query)

    # Match the query
    match = QUERY_PATTERN.match(raw_query)
    if not match:
        raise ValueError("Query format is invalid. Please use a supported structure.")

    selected_columns = match.group(1).split(", ")
    condition = match.group(2).strip()

    # Resolve tables for selected columns
    columns_with_tables = []
    required_tables = set()
    for column in selected_columns:
        if column in column_table_mapping:
            table_name = column_table_mapping[column][0]  # Take the first match
            columns_with_tables.append(f"{table_name}.{column}")
            required_tables.add(table_name)
        else:
            raise ValueError(f"Column '{column}' not found in any table.")

    # Normalize SQL keywords in the condition
    for keyword in SQL_KEYWORDS:
        condition = re.sub(rf"\b{keyword}\b", keyword.upper(), condition, flags=re.IGNORECASE)

    # Determine join conditions
    join_conditions = []
    required_tables = list(required_tables)
    for i in range(len(required_tables) - 1):
        table_a, table_b = required_tables[i], required_tables[i + 1]
        common_columns = set(tables[table_a].columns).intersection(tables[table_b].columns)

        # Use only one common column for a simple join
        if len(common_columns) < 1:
            raise ValueError(f"No common column found to join '{table_a}' and '{table_b}'.")
        
        common_column = next(iter(common_columns))  # Take the first common column
        join_conditions.append(f"{table_a}.{common_column} = {table_b}.{common_column}")

    # Construct the SQL query
    formatted_query = f"SELECT {', '.join(columns_with_tables)}\nFROM {required_tables[0]}"
    for i in range(1, len(required_tables)):
        formatted_query += f"\nJOIN {required_tables[i]} ON {join_conditions[i - 1]}"
    formatted_query += f"\nWHERE {condition};"

    return formatted_query.strip()

In [42]:
sample_queries = [
    "find Grade, CourseName where grade > 90 and courseName like '%Math%'",
    "list Grade, CourseName where grade between 50 and 90",
    "list Grade, CourseName in context where grade between 50 and 90",
    "show Grade, FirstName if firstname like 'John%'", # redo it without "like"
    "get Grade, Major where Grade is not null",
    "what are the Grade, CourseName with the the the grade = 100",
    "fetch CourseName, CreditHours satisfying CreditHours between 3 and 6",
    "retrieve FirstName, Grade where grade is null",
    "give me StudentID, Grade that meet StudentID < 21",
    "display StudentID, Grade where grade >= 75",
    "determine Grade, CourseName if courseName like '%Math%'",
    "display first_name, title  where title >= 75"
]

In [43]:
# Process sample queries
for query in sample_queries:
    try:
        resolved_query = auto_generate_query(query, column_table_mapping, tables)
        print(f"Input: {query}\nResolved Query:\n{resolved_query}\n")
    except ValueError as e:
        print(f"Error processing query '{query}': {e}\n")

Input: find Grade, CourseName where grade > 90 and courseName like '%Math%'
Resolved Query:
SELECT enrollments.Grade, courses.CourseName
FROM courses
JOIN enrollments ON courses.CourseID = enrollments.CourseID
WHERE grade > 90 AND courseName LIKE '%Math%';

Input: list Grade, CourseName where grade between 50 and 90
Resolved Query:
SELECT enrollments.Grade, courses.CourseName
FROM courses
JOIN enrollments ON courses.CourseID = enrollments.CourseID
WHERE grade BETWEEN 50 AND 90;

Input: list Grade, CourseName in context where grade between 50 and 90
Resolved Query:
SELECT enrollments.Grade, courses.CourseName
FROM courses
JOIN enrollments ON courses.CourseID = enrollments.CourseID
WHERE grade BETWEEN 50 AND 90;

Input: show Grade, FirstName if firstname like 'John%'
Resolved Query:
SELECT enrollments.Grade, students.FirstName
FROM enrollments
JOIN students ON enrollments.StudentID = students.StudentID
WHERE firstname LIKE 'John%';

Input: get Grade, Major where Grade is not null
Resolve