<a href="https://colab.research.google.com/github/charoo-rumsan/DSPy_research/blob/main/dspy_no_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
import pandas as pd
import os

# Location of the survey CSV inside the runtime; edit this to point at your file.
file_path = "/content/first_100_rows (1) - first_100_rows (1).csv.csv"

# Holds the path when the file is present and stays None otherwise, so that
# later cells can tell whether there is anything to process.
uploaded_filename = file_path if os.path.exists(file_path) else None

if uploaded_filename is None:
    print(f"Error: File not found at '{file_path}'. Please ensure the file exists at this path.")
else:
    print(f"Local file '{uploaded_filename}' is ready to be used.")

Local file '/content/first_100_rows (1) - first_100_rows (1).csv.csv' is ready to be used.


In [46]:
import csv
import os
from pathlib import Path
import polars as pl

class HeaderExtractor:
    """Read the column names of a delimited text file.

    The delimiter is detected with ``csv.Sniffer`` from the first characters
    of the file; the header row itself is then parsed with polars.
    """

    def __init__(self, sniff_chars: int = 1024):
        """
        Args:
            sniff_chars: How many characters from the start of the file are
                handed to ``csv.Sniffer`` for delimiter detection.
        """
        self.supported_formats = ['.csv', '.tsv', '.txt']
        self.sniff_chars = sniff_chars

    def extract_headers_from_file(self, file_path: str):
        """Return the list of column names found in ``file_path``.

        Args:
            file_path: Path to a ``.csv``, ``.tsv`` or ``.txt`` file.

        Returns:
            The column names as returned by polars for the parsed header row.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file extension is not in ``supported_formats``.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        file_ext = Path(file_path).suffix.lower()
        if file_ext not in self.supported_formats:
            raise ValueError(f"Unsupported file format: {file_ext}")

        headers = self._extract_headers(file_path)
        print(f"✅ Extracted {len(headers)} headers from file.")

        return headers

    def _detect_delimiter(self, file_path: str) -> str:
        """Sniff the field delimiter, falling back to a default based on the extension."""
        # errors='ignore' keeps sniffing usable even if the file is not clean UTF-8.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            sample = f.read(self.sniff_chars)

        try:
            return csv.Sniffer().sniff(sample).delimiter
        except csv.Error:
            # The sniffer gives up on e.g. an empty sample; rather than abort,
            # assume a tab for .tsv files and a comma for everything else.
            return '\t' if Path(file_path).suffix.lower() == '.tsv' else ','

    def _extract_headers(self, file_path: str):
        """Parse only the header row (``n_rows=0``) and return its column names."""
        delimiter = self._detect_delimiter(file_path)
        df = pl.read_csv(file_path, separator=delimiter, n_rows=0)
        return df.columns

In [47]:
import json

# Extractor instance reused by the rest of the notebook.
extractor = HeaderExtractor()

# `uploaded_filename` is set by the file-check cell; it is missing if that cell
# never ran and None if the file was not found.
if not ('uploaded_filename' in locals() and uploaded_filename):
    print("No CSV file was uploaded. Please upload a file first.")
else:
    # Pull the column names out of the CSV.
    extracted_headers = extractor.extract_headers_from_file(uploaded_filename)

    print("\nHeaders extracted using HeaderExtractor:")
    print(extracted_headers)

    # Keep a copy of the raw header list on disk.
    output_filename = 'extracted_headers.json'
    with open(output_filename, 'w') as f:
        f.write(json.dumps(extracted_headers, indent=4))
    print(f"\nHeaders also saved to {output_filename}")

✅ Extracted 353 headers from file.

Headers extracted using HeaderExtractor:

Headers also saved to extracted_headers.json


In [None]:
!pip install dspy-ai
import dspy

In [56]:
# ------------------------------
# Few-shot examples for DSPy
# ------------------------------
# Each pair below is (original header, desired standardized header).
# The list is kept for future use or manual prompting; avoid handing it
# straight to Predict if that triggers serialization problems.
FEW_SHOT_EXAMPLES = [
    dspy.Example(
        original_header=raw_header,
        standardized_header=clean_header,
    ).with_inputs("original_header")
    for raw_header, clean_header in (
        (
            "General Questions/Municipality and Ward Details/Name of Municipality (नगरपालिकाको नाम)",
            "municipality_name",
        ),
        ("General Questions/_GPS Coordinates_latitude", "latitude"),
        ("General Questions/_GPS Coordinates_longitude", "longitude"),
        (
            "Household Survey/Section A - Demographics/A1. Respondent Age (वर्ष)",
            "respondent_age",
        ),
        ("Income Sources/Q12_3. Remittances last year (USD)", "remittances_usd"),
        ("id", "id"),
        ("Timestamp", "timestamp"),
        # --- Mixed Nepali + English headers ---
        ("घरबाट स्वास्थ्य संस्थाको दुरी in METER", "distance_from_home_to_health_facility"),
        ("घरबाट बजारको दुरी in METER", "distance_from_home_to_market"),
        ("नजिकैको शौचालय र घरको दुरी in METER", "distance_to_nearest_toilet_in_meter"),
        ("Distance to safe Shelter in METER", "distance_to_safe_shelter_meters"),
    )
]

In [57]:
import json
import re
import dspy

# ------------------------------
# DSPy Signature
# ------------------------------
class StandardizeHeader(dspy.Signature):
    """
    Automatically standardize a CSV header to a concise snake_case label.
    No predefined mapping is used.Uses both cleanup and few-shot examples for semantic mapping.
    """
    # NOTE: the docstring above is deliberately left byte-identical. DSPy uses a
    # Signature's docstring as the task instructions, so it is runtime text.

    original_header = dspy.InputField(
        desc="The original, potentially complex, CSV header string"
    )

    standardized_header = dspy.OutputField(
        desc="Automatically generated concise snake_case label."
    )

    @staticmethod
    def run(original_header: str) -> str:
        """Rule-based (no LLM) conversion of a raw header to snake_case.

        Args:
            original_header: Raw column header, possibly a '/'-separated path
                with bracketed and/or non-ASCII text.

        Returns:
            The lowercased, ASCII-only, underscore-joined label, e.g.
            'General Questions/House No. (घर नं)' -> 'house_number'.
        """
        header = original_header.strip()

        # 1. Take last part after slash (most specific part)
        if '/' in header:
            header = header.split('/')[-1]

        # 2. Drop the bracket characters themselves; any ASCII text that was
        #    inside them is kept (non-ASCII text is removed by the next step).
        header = re.sub(r"[\(\)\[\]]", "", header)

        # 3. Remove non-ASCII characters (strip Nepali or other Unicode)
        header = re.sub(r"[^\x00-\x7F]", "", header)

        # 4. Replace punctuation with space
        header = re.sub(r"[^\w\s]", " ", header)

        # 5. Lowercase
        header = header.lower()

        # 6. Replace multiple spaces with single space
        header = re.sub(r"\s+", " ", header).strip()

        # 7. Expand the abbreviation "no" -> "number" (House No -> house_number).
        #    Only a standalone token is expanded; a plain substring replace would
        #    also corrupt words such as "unknown", "north" or "not". Underscores
        #    count as token separators here, hence the explicit lookarounds
        #    instead of \b.
        header = re.sub(r"(?<![a-z0-9])no(?![a-z0-9])", "number", header)

        # 8. Convert spaces to underscores
        header = header.replace(" ", "_")

        return header

# ------------------------------
# Headers to standardize (from the extraction cell)
# ------------------------------
headers = extracted_headers

# ------------------------------
# Rule-based standardization: original header -> snake_case label
# ------------------------------
standardized_mapping = {raw: StandardizeHeader.run(raw) for raw in headers}

# ------------------------------
# Persist the mapping as UTF-8 JSON (non-ASCII keys stay readable)
# ------------------------------
output_file = "standardized_headers_auto3.json"

with open(output_file, "w", encoding="utf-8") as f:
    f.write(json.dumps(standardized_mapping, indent=4, ensure_ascii=False))

# Echo every original/standardized pair
print("Standardizing headers...")
for original, standardized in standardized_mapping.items():
    print(f"'{original}' -> '{standardized}'")


Standardizing headers...
'' -> ''
'start' -> 'start'
'end' -> 'end'
'today' -> 'today'
'username' -> 'username'
'simserial' -> 'simserial'
'subscriberid' -> 'subscriberid'
'deviceid' -> 'deviceid'
'phonenumber' -> 'phonenumber'
'General Questions/Municipality and Ward Details/Name of Municipality (नगरपालिकाको नाम)' -> 'name_of_municipality'
'General Questions/Municipality and Ward Details/Ward Number (वडा नं .)' -> 'ward_number'
'General Questions/Municipality and Ward Details/Ward Number (वडा नं )' -> 'ward_number'
'General Questions/Name of the Tole (सर्वेक्षण भैरहेको स्थानको नाम)' -> 'name_of_the_tole'
'General Questions/House No. (घर नं)' -> 'house_number'
'GPS Coordinates' -> 'gps_coordinates'
'General Questions/_GPS Coordinates_latitude' -> '_gps_coordinates_latitude'
'General Questions/_GPS Coordinates_longitude' -> '_gps_coordinates_longitude'
'General Questions/_GPS Coordinates_altitude' -> '_gps_coordinates_altitude'
'General Questions/_GPS Coordinates_precision' -> '_gps_coo

In [51]:
import dspy
from dspy.teleprompt import BootstrapFewShot


# Language model backing the DSPy predictors (Llama 3.1 served through Ollama).
# NOTE(review): base_url and api_key are empty strings here, presumably
# redacted; confirm the intended endpoint before running.
llm = dspy.LM(
    model="ollama/llama3.1:latest",
    base_url="",
    api_key="",
)

# Register it as the LM that DSPy uses.
dspy.configure(lm=llm)

print("DSPy configured with ollama.")

DSPy configured with ollama.


In [61]:
import dspy

# Define the signature for standardizing headers
class StandardizeHeader(dspy.Signature):
    """Standardize a given CSV header name to a simpler, more usable format."""

    # Input: the raw header exactly as it appears in the CSV.
    original_header = dspy.InputField(
        desc="The original, potentially complex, CSV header string",
    )

    # Output: the simplified name; the description embeds a few worked examples.
    standardized_header = dspy.OutputField(
        desc=(
            "A simplified and standardized version of the header. Examples: "
            "'General Questions/Municipality and Ward Details/Name of Municipality (नगरपालिकाको नाम)' -> 'municipality_name', "
            "'General Questions/_GPS Coordinates_latitude' -> 'latitude',"
            "'start'->'start','end'->'end'"
        ),
    )

# DSPy module that wraps the StandardizeHeader signature in a predictor
class HeaderStandardizer(dspy.Module):
    """Turn one raw CSV header into its standardized name via `dspy.Predict`."""

    def __init__(self):
        super().__init__()
        self.predictor = dspy.Predict(StandardizeHeader)

    def forward(self, original_header):
        """Run the predictor and return only the `standardized_header` field."""
        return self.predictor(original_header=original_header).standardized_header

print("DSPy Signature and Module for header standardization defined.")


DSPy Signature and Module for header standardization defined.


In [None]:
# LLM-backed standardizer module
standardizer = HeaderStandardizer()

# The header list comes from the extraction cell; bail out if it never ran.
if 'extracted_headers' not in locals():
    print("Error: 'extracted_headers' list not found. Please ensure headers were extracted successfully.")

else:
    print("\nStandardizing headers...")
    standardized_headers = []

    for header in extracted_headers:
        # Blank or whitespace-only headers are skipped entirely.
        if header.strip():
            standardized_name = standardizer(original_header=header)
            standardized_headers.append(standardized_name)
            print(f"'{header}' -> '{standardized_name}'")

    print("\nStandardized Headers List:")
    print(standardized_headers)

    # Write the resulting list to disk as JSON
    import json
    output_filename_standardized = 'standardized_headers.json'

    with open(output_filename_standardized, 'w') as f:
        f.write(json.dumps(standardized_headers, indent=4))

    print(f"\nStandardized headers saved to {output_filename_standardized}")


Standardizing headers...
'start' -> 'start'
'end' -> 'end'
'today' -> 'date'
'username' -> 'username'
'simserial' -> 'sim_serial'
'subscriberid' -> 'subscriber_id'
'deviceid' -> 'device_id'
'phonenumber' -> 'phone_number'
'General Questions/Municipality and Ward Details/Name of Municipality (नगरपालिकाको नाम)' -> 'municipality_name'
'General Questions/Municipality and Ward Details/Ward Number (वडा नं .)' -> 'ward_number'
'General Questions/Municipality and Ward Details/Ward Number (वडा नं )' -> 'ward_number'
'General Questions/Name of the Tole (सर्वेक्षण भैरहेको स्थानको नाम)' -> 'name_of_tole_survey_location'
'General Questions/House No. (घर नं)' -> 'house_no'
'GPS Coordinates' -> 'latitude'
'General Questions/_GPS Coordinates_latitude' -> 'latitude'
'General Questions/_GPS Coordinates_longitude' -> 'longitude'
'General Questions/_GPS Coordinates_altitude' -> 'altitude'
'General Questions/_GPS Coordinates_precision' -> '_gps_coordinates_precision'
'General Questions/Do I have your cons

In [52]:
import json
import re
import dspy

# ------------------------------
# DSPy Signature
# ------------------------------
class StandardizeHeader(dspy.Signature):
    """
    Automatically standardize a CSV header to a concise snake_case label.
    Uses both rule-based cleanup and few-shot examples for semantic mapping.
    """
    # NOTE: the docstring above is deliberately left byte-identical. DSPy uses a
    # Signature's docstring as the task instructions, so it is runtime text.

    original_header = dspy.InputField(
        desc="The original, potentially complex, CSV header string"
    )

    standardized_header = dspy.OutputField(
        desc="Automatically generated concise snake_case label."
    )


    @staticmethod
    def run(original_header: str) -> str:
        """Rule-based (no LLM) conversion of a raw header to snake_case.

        Args:
            original_header: Raw column header, possibly a '/'-separated path
                with bracketed and/or non-ASCII text.

        Returns:
            The lowercased, ASCII-only, underscore-joined label, e.g.
            'General Questions/House No. (घर नं)' -> 'house_number'.
        """
        header = original_header.strip()

        # 1. Take last part after slash (most specific part)
        if '/' in header:
            header = header.split('/')[-1]

        # 2. Drop the bracket characters themselves; any ASCII text that was
        #    inside them is kept (non-ASCII text is removed by the next step).
        header = re.sub(r"[\(\)\[\]]", "", header)

        # 3. Remove non-ASCII characters (strip Nepali or other Unicode)
        header = re.sub(r"[^\x00-\x7F]", "", header)

        # 4. Replace punctuation with space
        header = re.sub(r"[^\w\s]", " ", header)

        # 5. Lowercase
        header = header.lower()

        # 6. Replace multiple spaces with single space
        header = re.sub(r"\s+", " ", header).strip()

        # 7. Expand the abbreviation "no" -> "number" (House No -> house_number).
        #    Only a standalone token is expanded; a plain substring replace would
        #    also corrupt words such as "unknown", "north" or "not". Underscores
        #    count as token separators here, hence the explicit lookarounds
        #    instead of \b.
        header = re.sub(r"(?<![a-z0-9])no(?![a-z0-9])", "number", header)

        # 8. Convert spaces to underscores
        header = header.replace(" ", "_")

        return header


# ------------------------------
# Few-shot examples for DSPy
# ------------------------------
# Each pair below is (original header, desired standardized header).
# The list is kept for future use or manual prompting; avoid handing it
# straight to Predict if that triggers serialization problems.
FEW_SHOT_EXAMPLES = [
    dspy.Example(
        original_header=raw_header,
        standardized_header=clean_header,
    ).with_inputs("original_header")
    for raw_header, clean_header in (
        (
            "General Questions/Municipality and Ward Details/Name of Municipality (नगरपालिकाको नाम)",
            "municipality_name",
        ),
        ("General Questions/_GPS Coordinates_latitude", "latitude"),
        ("General Questions/_GPS Coordinates_longitude", "longitude"),
        (
            "Household Survey/Section A - Demographics/A1. Respondent Age (वर्ष)",
            "respondent_age",
        ),
        ("Income Sources/Q12_3. Remittances last year (USD)", "remittances_usd"),
        ("id", "id"),
        ("Timestamp", "timestamp"),
        # --- Mixed Nepali + English headers ---
        ("घरबाट स्वास्थ्य संस्थाको दुरी in METER", "distance_from_home_to_health_facility"),
        ("घरबाट बजारको दुरी in METER", "distance_from_home_to_market"),
        ("नजिकैको शौचालय र घरको दुरी in METER", "distance_to_nearest_toilet_in_meter"),
        ("Distance to safe Shelter in METER", "distance_to_safe_shelter_meters"),
    )
]


