In [18]:
import pandas as pd
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableMap, RunnableLambda
from sqlalchemy import create_engine
import tqdm
import time
import pickle
import os
from sqlalchemy import create_engine
import pandas as pd
from sqlalchemy import create_engine, inspect


from dotenv import load_dotenv

load_dotenv()



True

In [19]:
# Hand-written descriptions of each table, keyed by table name. Each value is
# passed to the LLM as the `{description}` prompt variable, so wording matters:
# example values are always introduced with "e.g." so the model does not treat
# them as the only possible value. Keys are lower-cased before the database
# lookup, so mixed-case keys such as 'tbl_Primary' are fine.
table_description = {

'tbl_distributor_master': '''
Stores hierarchical mapping between superstockists and their associated distributors, 
along with details of sales management personnel, distributor categorization, and location. 
Captures multi-level position holders (Level 6 to Level 2) in the sales hierarchy, 
providing traceability from top-level superstockist managers down to distributor operations.

Essential for understanding the sales network structure, identifying distributor coverage, 
and filtering by operational attributes such as channel, segmentation, and location.

Columns:
- superstockist_name: Name of the superstockist entity supplying goods to distributors.
- level6_position_user: Level 6 manager responsible for the superstockist’s operations.
- level5_position_user: Level 5 manager in the hierarchy.
- level4_position_user: Level 4 manager in the hierarchy.
- level3_position_user: Level 3 manager in the hierarchy.
- level2_position_user: Level 2 manager in the hierarchy.
- distributor_name: Distributor’s registered name with ERP code and location details.
- distributor_erp_id: Unique ERP identifier for the distributor.
- distributor_type: Classification of distributor (e.g., Sub Stockist).
- state: State of operation for the distributor.
- distributor_segmentation: Market segmentation type (e.g., GT = General Trade).
- distributor_channel: Sales channel assigned to distributor (e.g., GT, MT).
- city_of_warehouse_address: City where the distributor’s warehouse is located.
- temp_created_date: Date when the record was created in the system.
'''
,

'tbl_Primary': '''
Captures transactional sales data between superstockists and their associated distributors, 
including product-level details, order quantities, billing, and invoicing information. 
Provides full linkage from the superstockist hierarchy to distributor geography and sales channels, 
enabling analysis of sales performance, product demand, and distribution coverage.

Essential for tracking order flow from superstockists to distributors, 
analyzing fulfillment gaps (short closes), and reconciling sales orders with invoiced quantities.

Columns:
- super_stockist_id: Unique identifier for the superstockist entity.
- super_stockist_name: Name of the superstockist supplying products to distributors.
- super_stockist_zone: Zone assigned to the superstockist (e.g., NORTH).
- super_stockist_region: Region assigned to the superstockist (e.g., DELHI).
- super_stockist_state: State where the superstockist operates.
- distributor_id: Unique identifier for the distributor.
- distributor_name: Registered name of the distributor with ERP code.
- distributor_zone: Zone assigned to the distributor (e.g., NORTH).
- distributor_region: Region assigned to the distributor (e.g., DELHI).
- distributor_state: State where the distributor operates.
- channel_type: Distribution channel type (e.g., GT = General Trade).
- product_id: Unique identifier for the product (SKU code).
- product_name: Name and description of the product.
- ordered_quantity: Total quantity of product ordered by the distributor.
- short_close_qty: Quantity not fulfilled (short closed) from the original order.
- sales_order_date: Date when the sales order was placed.
- bill_date: Date when the order was billed.
- invoiced_total_quantity: Quantity invoiced to the distributor.
'''
,

'tbl_Product_Master': '''
Stores detailed product-level metadata including pack size, base pack design, 
industry segment classification, pricing metrics, and category mapping. 
Provides multiple price references (PTR, PTD, display MRP, MRP) along with ERP product identifiers 
and promotional flags, enabling accurate pricing, product categorization, and sales reporting.

Essential for unifying product information across sales, distribution, and analytics systems, 
ensuring consistent mapping between product IDs, descriptions, and commercial attributes.

Columns:
- industry_segment_name: High-level industry segment classification (e.g., CAR, Beverages).
- pack_size_name: Standardized pack size grouping (e.g., LARGE PACK, 200 GM, 1 KG).
- base_pack_design_name: Descriptive name of the base pack design.
- base_pack_design_id: Unique identifier for the base pack design.
- industry_segment_id: Unique identifier for the industry segment.
- pack_size_id: Unique identifier for the pack size.
- product: Full product name including brand, variant, size, and packaging details.
- ptr: Price-to-retailer value.
- ptd: Price-to-distributor value.
- display_mrp: Displayed Maximum Retail Price.
- mrp: Standard Maximum Retail Price.
- alternate_category: Alternate product category classification.
- product_erp_id: ERP system identifier for the product SKU.
- is_promoted: Boolean flag indicating if the product is under promotion.
- product_weight_in_gm: Product weight in grams.
'''
,

'tbl_superstockist_master': '''
Maintains a master list of all superstockists in the distribution network, 
serving as the reference table for mapping superstockist IDs to their corresponding names. 
Used for joining transactional, mapping, and sales data to the correct superstockist entity.

Essential for ensuring consistency in identifying superstockists across multiple datasets 
(sales transactions, distributor mappings, and product flows).

Columns:
- superstockist_name: Official registered name of the superstockist entity.
- superstockist_id: Unique numeric identifier assigned to the superstockist.
'''

}


In [20]:
import urllib.parse

# Load the database password from the environment instead of hardcoding it in
# the notebook. Define POSTGRES_PASSWORD in the shell or in the `.env` file
# that `load_dotenv()` reads in the import cell.
_db_password = os.getenv("POSTGRES_PASSWORD", "")
if not _db_password:
    print("⚠️ POSTGRES_PASSWORD is not set; connecting with an empty password.")

# quote_plus escapes characters such as "@" that would otherwise be parsed as
# part of the connection URL's host section.
password = urllib.parse.quote_plus(_db_password)

In [21]:

# SQLAlchemy connection URL: psycopg2 driver, user "postgres", local server on
# port 5432, database "LLM_Haldiram_primary". `password` is already URL-escaped.
db_url = f"postgresql+psycopg2://postgres:{password}@localhost:5432/LLM_Haldiram_primary"

# Shared engine used by every query in this notebook.
engine = create_engine(db_url)

def read_sql(table):
    """Return five randomly chosen rows of ``table`` as a DataFrame.

    The table name is wrapped in double quotes inside the query, and rows are
    shuffled with ``RANDOM()`` (the PostgreSQL spelling; MySQL uses ``RAND()``)
    before the ``LIMIT 5`` is applied. The query runs on the notebook-level
    ``engine``.
    """
    return pd.read_sql(
        f'SELECT * FROM "{table}" ORDER BY RANDOM() LIMIT 5;',
        con=engine,
    )


In [22]:


# Chat model settings: temperature 0, no explicit token or timeout limit, and
# up to two retries.
llm_settings = {
    "model": "gpt-4o",
    "temperature": 0,
    "max_tokens": None,
    "timeout": None,
    "max_retries": 2,
}

llm = ChatOpenAI(**llm_settings)


# System message: frames the model as a terse data annotator whose output feeds
# a text-to-SQL system.
system_prompt = """
You are an intelligent data annotator. Please annotate data as mentioned by human and give output without any verbose and without any additional explanation.
You will be given sql table description and sample columns from the sql table. The description that you generate will be given as input to text to sql automated system.
Output of project depends on how you generate description. Make sure your description has all possible nuances.
"""

# Human message: task instructions, the expected list-of-lists output shape, and
# the two variables filled in per table ({description} and {data_sample}).
human_prompt = '''
- Based on the column data, please generate description of entire table along with description for each column and sample values(1 or 2) for each column.
- While generating column descriptions, please look at sql table description given to you and try to include them in column description. 
- DONT write generic description like "It provides a comprehensive view of the order lifecycle from purchase to delivery". Just write description based on what you see in columns.
      
Context regarding the tables:
These tables are provided by a supply chain food-based company. 

Output should look like below in form of list of strings and lists properly. MAKE SURE YOU CLOSE THE QUOTES in list of strings properly always.
["<table description based on all column values>" , [["<column 1> : Detail description of column along with datatype, <sample values:v1,v2 etc(indicate there are more values)>"],
["<column 2> : Detail description of column 2 along with datatype, <sample values:v1,v2 etc(indicate there are more values)>"]]  
]

SQL table description:
{description}

Sample rows from the table:
{data_sample}     
'''

template = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", human_prompt),
])




# First stage: pick out exactly the two keys the prompt template expects from
# the dict passed to `chain.invoke`.
select_prompt_inputs = RunnableMap({
    "description": lambda payload: payload["description"],
    "data_sample": lambda payload: payload["data_sample"],
})

# Full pipeline: select inputs -> fill the prompt -> call the model -> return
# the reply as a plain string.
chain = select_prompt_inputs | template | llm | StrOutputParser()


In [23]:
from typing import Optional


def get_annotated_description(table_name_raw: str, description: str) -> Optional[str]:
    """Ask the LLM chain for an annotated description of one table.

    Five random rows are sampled from the table (looked up by its lower-cased
    name), rendered as a markdown table, and passed to ``chain`` together with
    the hand-written ``description``.

    Args:
        table_name_raw: Table name as written in ``table_description``; it is
            lower-cased before querying the database.
        description: Hand-written table description given to the prompt.

    Returns:
        The string produced by ``chain``, or ``None`` if sampling the table or
        calling the chain raised. The error is printed rather than re-raised so
        that one failing table does not abort a batch run.
    """
    try:
        table_name = table_name_raw.lower()
        # Reuse the shared sampling helper instead of duplicating the query.
        # It quotes the identifier; for an all-lowercase name PostgreSQL treats
        # the quoted and unquoted forms the same.
        df = read_sql(table_name)
        # The query already limits the result to five rows, so no .head() is needed.
        sample_text = df.to_markdown(index=False)

        return chain.invoke({
            "description": description,
            "data_sample": sample_text
        })
    except Exception as e:
        print(f"❌ Error generating annotation for {table_name_raw}: {e}")
        return None


In [24]:

# Output locations, named once so the confirmation message cannot drift away
# from the files that are actually written.
MARKDOWN_PATH = "annotated_schema_haldiram_primary_azam.md"
PICKLE_PATH = "kb_haldiram_primary_azam.pkl"

# Table names that actually exist in the connected database.
inspector = inspect(engine)
existing_tables = set(inspector.get_table_names())


all_outputs = []      # markdown blocks in processing order
dict_knowledge = {}   # raw table name -> markdown block

for table_name_raw, desc in table_description.items():
    table_name = table_name_raw.lower()

    if table_name not in existing_tables:
        print(f"⚠️ Skipping: {table_name} not found in DB.")
        continue

    try:
        # get_annotated_description samples the table itself, so no separate
        # sample query is issued here (the old one was never used and forced an
        # extra ORDER BY RANDOM() query per table).
        annotated_text = get_annotated_description(table_name_raw, desc)
        if annotated_text:
            markdown_block = f"### **{table_name_raw}**\n```json\n{annotated_text}\n```\n"
            dict_knowledge[table_name_raw] = markdown_block
            all_outputs.append(markdown_block)

    except Exception as e:
        # Best-effort batch: report the failure and move on to the next table.
        print(f"❌ Error for table {table_name_raw}: {e}")


with open(MARKDOWN_PATH, "w", encoding="utf-8") as f:
    f.write("\n\n".join(all_outputs))

with open(PICKLE_PATH, 'wb') as f:
    pickle.dump(dict_knowledge, f)

print(f"✅ Annotated markdown saved to '{MARKDOWN_PATH}'")

✅ Annotated markdown saved to 'annotated_schema_haldiram_primary.md'


In [27]:
# Echo every generated markdown block, one after another, for a visual check.
for annotated_block in all_outputs:
    print(annotated_block)

### **tbl_distributor_master**
```json
["This table stores hierarchical mapping between superstockists and their associated distributors, along with details of sales management personnel, distributor categorization, and location. It captures multi-level position holders in the sales hierarchy, providing traceability from top-level superstockist managers down to distributor operations. It is essential for understanding the sales network structure, identifying distributor coverage, and filtering by operational attributes such as channel, segmentation, and location.", 
[
["superstockist_name : Name of the superstockist entity supplying goods to distributors. Datatype: String, <sample values: S B Markplus Private Limited-2>"],
["level6_position_user : Level 6 manager responsible for the superstockist’s operations. Datatype: String, <sample values: Vinayak Mathur>"],
["level5_position_user : Level 5 manager in the hierarchy. Datatype: String, <sample values: Manoj Kumar Gaur>"],
["level4_po

In [None]:
# import pickle

# # Load the existing markdown file
# with open("annotated_schema_haldiram_primary_azam.md", "r", encoding="utf-8") as f:
#     content = f.read()

# # Split content into blocks based on headers
# sections = content.split("### **")
# dict_knowledge = {}

# for section in sections[1:]:  # skip the first empty string before the first header
#     try:
#         table_name, rest = section.split("**", 1)
#         dict_knowledge[table_name.strip()] = f"### **{table_name.strip()}**\n{rest.strip()}"
#     except ValueError:
#         print(f"⚠️ Skipping malformed section:\n{section[:100]}...")

# # Save the dictionary as a pickle file
# with open('kb_haldiram_primary_azam.pkl', 'wb') as f:
#     pickle.dump(dict_knowledge, f)

# print("✅ Pickle file 'kb_haldiram_primary_azam.pkl' created successfully from markdown.")


✅ Pickle file 'kb_haldiram_primary.pkl' created successfully from markdown.
