In [1]:
import os
import pickle
import time
from typing import Optional

import pandas as pd
import tqdm
from dotenv import load_dotenv
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableMap, RunnableLambda
from langchain_openai import ChatOpenAI
from sqlalchemy import create_engine, inspect

# Load key=value pairs from a local .env file into the process environment
# before any client below is constructed.
# NOTE(review): presumably this is where the OpenAI API key comes from — verify
# against the local .env file.
load_dotenv()


True

In [2]:
# Hand-written business description for each table to annotate.
# Keys are table names: they are lower-cased and checked against the tables
# present in the database before use. Values are passed verbatim into the
# `{description}` slot of the annotation prompt, so any edit to this text
# changes what the LLM sees.
table_description = {

    'tbl_product_master': '''Defines all product-level metadata including pricing (MRP, PTR, PTD), packaging, units, categories, and custom attributes.
Used for converting quantities between units, identifying promotions, and structuring product taxonomy.
Supports UI display logic and batch rules via flags and conversion factors.
Essential for enriching sales, stock, and shipment data with standardized product info.''',

    'tbl_primary': '''This table captures primary sales transactions between sellers(Superstockist) and buyers (Distributors),it tells the order quantity against every product & invoiced quantity that was billed against those orders.
    Invoiced quantities are the actual sales made by SuperStockist to Distributors.''',

    'tbl_superstockist_master': '''A basic mapping table that links Super Stockist names with their corresponding ERP IDs.
Used to identify and reference super stockists across the distribution and supply chain datasets.
Acts as a master reference for joining shipment, stock, and sales data.
Essential for hierarchy-level reporting and regional inventory analysis.''',

    'tbl_distributor_master': '''Captures the mapping between distributors and their assigned super stockists, specifically for the Delhi region.
Includes multi-level sales hierarchy data (Level 2–6), distributor segmentation, channels, geotag, and ERP identifiers.
Useful for understanding the sales org structure, tax jurisdictions, and distributor classification.
Enables location-wise planning, supply chain alignment, and geo-segmented performance reporting.''',



}


In [3]:

# Shared SQLAlchemy engine used by every query in this notebook.
# The connection URL is read from the HALDIRAM_PRIMARY_DB_URL environment
# variable (which can be set in the .env file loaded by load_dotenv()), so the
# password does not have to live in the notebook.
engine = create_engine(
    os.getenv(
        "HALDIRAM_PRIMARY_DB_URL",
        # FIXME(security): hardcoded credentials are kept only as a fallback so
        # existing local setups keep working. Set HALDIRAM_PRIMARY_DB_URL and
        # delete this default before sharing or committing the notebook.
        "postgresql+psycopg2://postgres:12345678@localhost:5432/LLM_Haldiram_primary",
    )
)

def read_sql(table, limit=5):
    """Return a random sample of rows from ``table`` as a DataFrame.

    Parameters
    ----------
    table : str
        Table name. It is emitted as a double-quoted identifier, so it is
        matched case-sensitively; embedded double quotes are doubled so the
        name cannot terminate the identifier early.
    limit : int, default 5
        Maximum number of rows to return. Coerced with ``int()`` before being
        placed in the query.

    Returns
    -------
    pandas.DataFrame
        Up to ``limit`` rows selected with ``ORDER BY RANDOM()``.

    Raises
    ------
    TypeError, ValueError
        If ``limit`` cannot be converted to an integer.
    Exception
        Whatever ``pd.read_sql`` raises for a failed query is propagated.
    """
    # Double embedded quotes so the whole name stays one quoted identifier.
    quoted_table = '"' + str(table).replace('"', '""') + '"'

    # PostgreSQL uses RANDOM() instead of RAND()
    query = f'SELECT * FROM {quoted_table} ORDER BY RANDOM() LIMIT {int(limit)};'

    return pd.read_sql(query, con=engine)


In [4]:


# Chat model used for every annotation request.
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0,    # zero sampling temperature
    max_retries=2,    # passed through to the client as its retry count
    timeout=None,     # no explicit request timeout is set here
    max_tokens=None,  # no explicit cap on completion length is set here
)


# Two-message prompt (system + human) for the annotation request.
# The only placeholders are `{description}` (the hand-written table description)
# and `{data_sample}` (sample rows rendered as text). The human message asks for
# a nested list: a table description followed by one entry per column.
# NOTE(review): the "Context regarding the tables" sentence talks about retailers
# buying from distributors, while table_description describes super stockist ->
# distributor sales — confirm the wording is what the model should be told.
template = ChatPromptTemplate.from_messages([
    ("system", """
You are an intelligent data annotator. Please annotate data as mentioned by human and give output without any verbose and without any additional explanation.
You will be given sql table description and sample columns from the sql table. The description that you generate will be given as input to text to sql automated system.
Output of project depends on how you generate description. Make sure your description has all possible nuances.
"""),
    ("human", '''
- Based on the column data, please generate description of entire table along with description for each column and sample values(1 or 2) for each column.
- While generating column descriptions, please look at sql table description given to you and try to include them in column description. 
- DONT write generic description like "It provides a comprehensive view of the order lifecycle from purchase to delivery". Just write description based on what you see in columns.
      
Context regarding the tables:
These tables are provided by a supply chain food-based company. Retailers buy the different products (SKUs) from the distributors if the distributor has the available stock.

Output should look like below in form of list of strings and lists properly. MAKE SURE YOU CLOSE THE QUOTES in list of strings properly always.
["<table description based on all column values>" , [["<column 1> : Detail description of column along with datatype, <sample values:v1,v2 etc(indicate there are more values)>"],
["<column 2> : Detail description of column 2 along with datatype, <sample values:v1,v2 etc(indicate there are more values)>"]]  
]

SQL table description:
{description}

Sample rows from the table:
{data_sample}     
''')
])




# First stage: keep only the two keys the prompt consumes, dropping anything
# else present in the input dict.
_prompt_inputs = RunnableMap({
    "description": lambda payload: payload["description"],
    "data_sample": lambda payload: payload["data_sample"],
})

# Full pipeline: select inputs -> fill the prompt -> call the model -> plain string.
chain = _prompt_inputs | template | llm | StrOutputParser()


In [5]:
def get_annotated_description(table_name_raw: str, description: str) -> Optional[str]:
    """Generate the LLM annotation for one table.

    A random sample of rows is read from the table, rendered as a markdown
    table, and sent through ``chain`` together with ``description``.

    Parameters
    ----------
    table_name_raw : str
        Table name as written in ``table_description``; it is lower-cased
        before querying.
    description : str
        Hand-written table description passed to the prompt.

    Returns
    -------
    Optional[str]
        The text returned by the chain, or ``None`` when sampling or the LLM
        call fails (the error is printed, not raised).
    """
    try:
        table_name = table_name_raw.lower()
        # Use the shared sampling helper so the table name is quoted the same
        # way as everywhere else instead of being spliced in unquoted.
        df = read_sql(table_name)
        # The helper already limits the sample to five rows, so no .head() is needed.
        sample_text = df.to_markdown(index=False)

        return chain.invoke({
            "description": description,
            "data_sample": sample_text
        })
    except Exception as e:
        # Best-effort: a failure on one table is reported and skipped by the caller.
        print(f"❌ Error generating annotation for {table_name_raw}: {e}")
        return None


In [6]:

# Names of the tables that exist in the connected database.
inspector = inspect(engine)
existing_tables = set(inspector.get_table_names())


all_outputs = []      # markdown blocks, in table_description order
dict_knowledge = {}   # table name -> markdown block

for table_name_raw, desc in table_description.items():
    table_name = table_name_raw.lower()

    if table_name not in existing_tables:
        print(f"⚠️ Skipping: {table_name} not found in DB.")
        continue

    try:
        # Sample for logging only. This is a separate random draw from the one
        # made inside get_annotated_description, so the rows printed here are
        # not necessarily the rows sent to the LLM.
        df_sample = read_sql(table_name)
        print(f"✅ Sample from {table_name}:\n", df_sample)

        # Generate annotated description (None means the helper already reported an error).
        annotated_text = get_annotated_description(table_name_raw, desc)
        if annotated_text:
            markdown_block = f"### **{table_name_raw}**\n```json\n{annotated_text}\n```\n"
            dict_knowledge[table_name_raw] = markdown_block
            all_outputs.append(markdown_block)

    except Exception as e:
        print(f"❌ Error for table {table_name_raw}: {e}")


if all_outputs:
    with open("annotated_schema_haldiram_primary.md", "w", encoding="utf-8") as f:
        f.write("\n\n".join(all_outputs))

    with open('kb_haldiram_primary.pkl', 'wb') as f:
        pickle.dump(dict_knowledge, f)

    print("✅ Annotated markdown saved to 'annotated_schema_haldiram_primary.md'")
else:
    # Do not replace earlier results with empty files when every table was
    # skipped or failed.
    print("⚠️ No annotations were generated; existing output files were left untouched.")

✅ Sample from tbl_product_master:
   industy_segment_name pack_size_name             base_pack_design_name  \
0                  CAR         200 GM               Navratan Mix 200 Gm   
1                  CAR         MRP 20                Nut Cracker MRP 20   
2                  CAR     LARGE PACK                  Til Laddu 200 Gm   
3                  CAR         400 GM  Soan Cake Elaichi Flavour 400 Gm   
4                  CAR         MRP 15         Panga Tangy Tomato MRP 15   

  base_pack_design_id                industy_segment_id  \
0   ADM5B720617B41063                   Namkeen_In-Home   
1   BEI9B730619A51081  Nuts, Seeds, Berries_Out of Home   
2   DHI4B265852E54507                    Sweets_Ambient   
3   DHL4C510833B21402                    Sweets_Ambient   
4   BIA7B810634A24859        Western Snacks_Out of Home   

                              pack_size_id  \
0                   200 GM_Namkeen_In-Home   
1  MRP 20_Nuts, Seeds, Berries_Out of Home   
2                LARG

KeyboardInterrupt: 

In [7]:
# pickle is already imported in the first cell; repeating it here means this
# cell needs nothing from the other cells and can be run by itself.
import pickle

# Load the existing markdown file
with open("annotated_schema_haldiram_primary.md", "r", encoding="utf-8") as f:
    content = f.read()

# Each table block starts with a "### **<table name>**" header, so splitting on
# the header prefix yields one chunk per table (plus whatever precedes the first header).
sections = content.split("### **")
dict_knowledge = {}

for section in sections[1:]:  # skip the text before the first header
    try:
        table_name, rest = section.split("**", 1)
    except ValueError:
        print(f"⚠️ Skipping malformed section:\n{section[:100]}...")
        continue

    table_name = table_name.strip()
    # The generation cell stores each block with a trailing newline; keep it so
    # a pickle rebuilt from markdown matches one written during generation.
    dict_knowledge[table_name] = f"### **{table_name}**\n{rest.strip()}\n"

if dict_knowledge:
    # Save the dictionary as a pickle file
    with open('kb_haldiram_primary.pkl', 'wb') as f:
        pickle.dump(dict_knowledge, f)

    print("✅ Pickle file 'kb_haldiram_primary.pkl' created successfully from markdown.")
else:
    # Nothing parsed: leave any existing pickle alone instead of replacing it
    # with an empty dictionary and reporting success.
    print("⚠️ No table sections found in 'annotated_schema_haldiram_primary.md'; pickle file was not rewritten.")


✅ Pickle file 'kb_haldiram_primary.pkl' created successfully from markdown.
