<a href="https://colab.research.google.com/github/charoo-rumsan/DSPy_research/blob/main/embedding_sentence_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import os

# Specify the path to your CSV file here
file_path = "/content/first_100_rows (1) - first_100_rows (1).csv.csv"

uploaded_filename = None

# Check if the file exists locally
if os.path.exists(file_path):
    uploaded_filename = file_path
    print(f"Local file '{uploaded_filename}' is ready to be used.")
else:
    print(f"Error: File not found at '{file_path}'. Please ensure the file exists at this path.")

Local file '/content/first_100_rows (1) - first_100_rows (1).csv.csv' is ready to be used.


In [5]:
import csv
import os
from pathlib import Path
import polars as pl

class HeaderExtractor:
    def __init__(self):
        self.supported_formats = ['.csv', '.tsv', '.txt']

    def extract_headers_from_file(self, file_path: str):
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        file_ext = Path(file_path).suffix.lower()
        if file_ext not in self.supported_formats:
            raise ValueError(f"Unsupported file format: {file_ext}")

        headers = self._extract_headers(file_path)
        print(f"✅ Extracted {len(headers)} headers from file.")

        return headers

    def _extract_headers(self, file_path: str):
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            sample = f.read(1024)
            f.seek(0)
            delimiter = csv.Sniffer().sniff(sample).delimiter
        df = pl.read_csv(file_path, separator=delimiter, n_rows=0)
        return df.columns

In [6]:
import json

# Instantiate the HeaderExtractor
extractor = HeaderExtractor()

# Assuming 'uploaded_filename' holds the path to your CSV file from earlier steps
if 'uploaded_filename' in locals() and uploaded_filename:
    # Extract headers using the new class
    extracted_headers = extractor.extract_headers_from_file(uploaded_filename)

    print("\nHeaders extracted using HeaderExtractor:")
    print(extracted_headers)

    # Save to JSON file
    output_filename = 'extracted_headers_using_class.json'
    with open(output_filename, 'w') as f:
        json.dump(extracted_headers, f, indent=4)
    print(f"\nHeaders also saved to {output_filename}")

else:
    print("No CSV file was uploaded. Please upload a file first.")

✅ Extracted 353 headers from file.

Headers extracted using HeaderExtractor:

Headers also saved to extracted_headers_using_class.json


Approach 1: Embedding + Clustering + Auto-Normalization

This version:

Clusters similar headers (unsupervised)

Generates a representative label for each cluster (automatic)

Converts the label into clean snake_case

No predefined labels needed.

Uses:

Sentence Transformers for embeddings

KMeans for clustering

A normalization function to auto-generate labels

In [None]:
pip install sentence-transformers scikit-learn numpy


Step 1 — Import + Load Model

In [7]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np
import re

model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")


Step 2 — Convert Headers Into Embeddings

In [8]:
headers = extracted_headers   # from your HeaderExtractor class
embeddings = model.encode(headers)
embeddings = np.array(embeddings)


Step 3 — Determine Number of Clusters Automatically

In [9]:
import math

num_headers = len(headers)
k = max(2, int(math.sqrt(num_headers)))   # sqrt rule

print("Clustering into:", k, "clusters")


Clustering into: 18 clusters


Step 4 — Run KMeans Clustering

In [10]:
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(embeddings)


Step 5 — Auto-Generate a Standard Label for Each Cluster
Rule:

Pick the most common meaningful words in the cluster

Normalize to snake_case

Remove Nepali text, numbers, symbols, etc.

In [11]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def extract_keywords(text):
    # simple keyword extractor (can be upgraded)
    words = clean_text(text).split()
    words = [w for w in words if len(w) > 2]  # remove very short words
    return words

def to_snake_case(words):
    return "_".join(words)


In [12]:
from collections import Counter

cluster_labels = {}

for cluster_id in range(k):
    cluster_headers = [headers[i] for i in range(num_headers) if clusters[i] == cluster_id]

    # gather keywords across all headers in cluster
    all_words = []
    for h in cluster_headers:
        all_words.extend(extract_keywords(h))

    if not all_words:
        label = f"cluster_{cluster_id}"
    else:
        # pick top 2–3 frequent keywords
        common = [w for w, _ in Counter(all_words).most_common(3)]
        label = to_snake_case(common)

    cluster_labels[cluster_id] = label


Step 6 — Generate Final Mapping (Header → Auto Label)

In [13]:
auto_labeled = {}

for i, h in enumerate(headers):
    cluster_id = clusters[i]
    auto_labeled[h] = cluster_labels[cluster_id]

auto_labeled


{'': 'meter_घरब_katha',
 'start': 'meter_घरब_katha',
 'end': 'meter_घरब_katha',
 'today': 'meter_घरब_katha',
 'username': 'meter_घरब_katha',
 'simserial': 'meter_घरब_katha',
 'subscriberid': 'meter_घरब_katha',
 'deviceid': 'meter_घरब_katha',
 'phonenumber': 'meter_घरब_katha',
 'General Questions/Municipality and Ward Details/Name of Municipality (नगरपालिकाको नाम)': 'the_house_details',
 'General Questions/Municipality and Ward Details/Ward Number (वडा नं .)': 'the_house_details',
 'General Questions/Municipality and Ward Details/Ward Number (वडा नं )': 'the_house_details',
 'General Questions/Name of the Tole (सर्वेक्षण भैरहेको स्थानको नाम)': 'meter_घरब_katha',
 'General Questions/House No. (घर नं)': 'the_house_details',
 'GPS Coordinates': 'resources_distance_general',
 'General Questions/_GPS Coordinates_latitude': 'resources_distance_general',
 'General Questions/_GPS Coordinates_longitude': 'resources_distance_general',
 'General Questions/_GPS Coordinates_altitude': 'resources_dis

In [14]:
import json

output_file = "auto_standardized_headers.json"

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(auto_labeled, f, indent=4, ensure_ascii=False)

print(f"JSON file generated: {output_file}")


JSON file generated: auto_standardized_headers.json
