<a href="https://colab.research.google.com/github/charoo-rumsan/community_tool_R-D/blob/main/ML_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Method 1: Regex-Based Classification (Best for Real-Time: Simple, Fast, No ML Overhead)

In [None]:
!pip install polars sentence-transformers transformers torch



In [None]:
import polars as pl
import re
import difflib
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
import torch
from google.colab import files
import json

In [None]:
# Step 1: Upload CSV file
print("Please upload your CSV file (e.g., 'first_100_rows (1) - first_100_rows (1).csv.csv')")
uploaded = files.upload()

# The original indexed the first key unconditionally, which fails with a bare
# IndexError when nothing was uploaded; fail with an actionable message instead.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select a CSV file.")

# Only the first uploaded file is used; any additional uploads are ignored.
csv_file = next(iter(uploaded))

Please upload your CSV file (e.g., 'first_100_rows (1) - first_100_rows (1).csv.csv')


In [None]:
# Step 2: Read the uploaded CSV into a Polars DataFrame.
# Any failure is reported to the user and then re-raised so the notebook stops here.
try:
    df = pl.read_csv(csv_file, encoding="utf-8-sig")
except Exception as e:
    print(f"Error loading CSV: {e}")
    raise
else:
    print(f"Successfully loaded {csv_file} with {len(df)} rows and {len(df.columns)} columns")

In [None]:
# Method 1: Regex-Based Classification (Fastest)
def classify_field_regex(column_name, sample_values):
    """Classify a column using header keywords and regular expressions.

    Rules are tried in a fixed order (email, phone, address, name). A rule
    fires when its keyword occurs in the lower-cased column name, or when its
    value test passes for at least one truthy sample.

    Args:
        column_name: Column header text.
        sample_values: Iterable of sample cell values. Falsy entries
            (None, '', 0) are ignored; the rest are matched as ``str(val)``.

    Returns:
        A ``(field_type, confidence)`` tuple where ``field_type`` is one of
        'email_address', 'phone_number', 'address', 'name' or 'other', and
        ``confidence`` is the fixed score attached to the rule that fired
        (0.95, 0.90, or 0.50 for 'other').
    """
    column_lower = column_name.lower()
    # Materialise once: the samples are scanned by several rules below.
    values = [val for val in sample_values if val]

    # Email: Header or value matches email pattern
    email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    if 'email' in column_lower or any(re.match(email_pattern, str(val)) for val in values):
        return 'email_address', 0.95

    # Phone: Header or value is a phone number with 7-15 *digits*, optionally
    # prefixed by '+' and separated by spaces, parentheses or hyphens.
    # The previous pattern counted characters instead of digits, so ISO dates
    # such as '2023-05-12' and whitespace-only strings were reported as phone
    # numbers, while spaced international numbers (> 15 characters) were not.
    # The leading negative lookahead rejects YYYY-MM-DD dates explicitly.
    phone_pattern = r'^(?!\d{4}-\d{1,2}-\d{1,2}$)\+?(?:[\s()-]*\d){7,15}[\s()-]*$'
    if 'phone' in column_lower or any(re.match(phone_pattern, str(val)) for val in values):
        return 'phone_number', 0.95

    # Address: Header keywords or value looks like location
    # NOTE(review): this pattern is applied with an unanchored re.search, so any
    # capitalised word anywhere in a value counts as an address; that also
    # pre-empts the value-based name test below for most capitalised text.
    # Left unchanged here — confirm the intended matching rule before tightening.
    address_keywords = ['municipality', 'ward', 'tole', 'house', 'address', 'gps', 'location', 'coordinates']
    address_value_pattern = r'[A-Z][a-z]+(?: [A-Z][a-z]+)?'
    if any(kw in column_lower for kw in address_keywords) or any(re.search(address_value_pattern, str(val)) for val in values):
        return 'address', 0.90

    # Name: Header or value looks like a name (starts uppercase and contains a space)
    name_keywords = ['name', 'owner', 'house owner']
    if any(kw in column_lower for kw in name_keywords) or any(isinstance(val, str) and ' ' in val and val[0].isupper() for val in values):
        return 'name', 0.90

    return 'other', 0.50

In [None]:
# Method 2: String Similarity + Regex (Hybrid)
# Synonyms that the lower-cased column header is fuzzily compared against, per field type.
type_synonyms = {
    'email_address': ['email', 'mail', 'contact email'],
    'phone_number': ['phone', 'mobile', 'contact number', 'tel'],
    'address': ['address', 'location', 'municipality', 'ward', 'tole', 'house', 'gps', 'coordinates'],
    'name': ['name', 'owner', 'house owner', 'person']
}

def classify_field_hybrid(column_name, sample_values):
    """Classify a column by value regexes first, then fuzzy header matching.

    Value checks (email, phone, address — in that order) run on every truthy
    sample; the first kind that matches any sample wins. If none match, the
    lower-cased header is compared with every entry of ``type_synonyms`` using
    ``difflib.SequenceMatcher`` and the best ratio above 0.6 decides the type.

    Args:
        column_name: Column header text.
        sample_values: Iterable of sample cell values. Falsy entries are
            ignored; the rest are matched as ``str(val)``.

    Returns:
        A ``(field_type, confidence)`` tuple. Regex hits return a fixed 0.95 or
        0.90; fuzzy hits return the similarity ratio; no hit returns
        ``('other', 0.60)``.
    """
    column_lower = column_name.lower()
    # Materialise once: the samples are scanned by several rules below.
    values = [val for val in sample_values if val]

    # Regex checks
    email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    # 7-15 digits with optional '+' prefix and space/paren/hyphen separators.
    # The previous pattern counted characters rather than digits, so ISO dates
    # ('2023-05-12') and whitespace-only strings were classified as phone
    # numbers; the negative lookahead rejects YYYY-MM-DD dates explicitly.
    phone_pattern = r'^(?!\d{4}-\d{1,2}-\d{1,2}$)\+?(?:[\s()-]*\d){7,15}[\s()-]*$'
    # NOTE(review): applied with an unanchored re.search, so any capitalised
    # word anywhere in a value counts as an address — confirm intended rule.
    address_value_pattern = r'[A-Z][a-z]+(?: [A-Z][a-z]+)?'

    if any(re.match(email_pattern, str(val)) for val in values):
        return 'email_address', 0.95
    if any(re.match(phone_pattern, str(val)) for val in values):
        return 'phone_number', 0.95
    if any(re.search(address_value_pattern, str(val)) for val in values):
        return 'address', 0.90

    # Fuzzy matching: keep the best-scoring synonym whose ratio exceeds 0.6.
    best_match = 'other'
    best_score = 0
    for field_type, synonyms in type_synonyms.items():
        for synonym in synonyms:
            score = difflib.SequenceMatcher(None, column_lower, synonym).ratio()
            if score > best_score and score > 0.6:
                best_score = score
                best_match = field_type

    # best_score stays 0 only when nothing cleared the threshold.
    if best_score > 0:
        return best_match, best_score
    return best_match, 0.60


In [None]:
# Method 3: Lightweight ML with Zero-Shot Classification
# Built once at cell execution; uses GPU device 0 when CUDA is available, otherwise CPU (-1).
classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli', device=0 if torch.cuda.is_available() else -1)

def classify_field_ml(column_name, sample_values, max_samples=3, min_confidence=0.7):
    """Classify a column with the zero-shot ``classifier`` pipeline.

    The header and up to ``max_samples`` truthy sample values are joined into
    one text sequence and scored against a fixed list of candidate labels.

    Args:
        column_name: Column header text.
        sample_values: Iterable of sample cell values; falsy entries are skipped.
        max_samples: Maximum number of (truthy) samples included in the prompt.
        min_confidence: Score that must be exceeded for the model's score to be
            reported as-is.

    Returns:
        ``(label, confidence)`` where ``label`` is the first entry of the
        pipeline's ``labels`` list. ``confidence`` is the matching score when
        it exceeds ``min_confidence``; otherwise it is replaced by 0.50.
        NOTE(review): the low-confidence label itself is still returned rather
        than 'other' — confirm this is intended.
    """
    candidate_labels = ['email_address', 'phone_number', 'address', 'name', 'other']

    # Filter falsy values *before* truncating, so leading empty samples do not
    # crowd usable ones out of the prompt (the original sliced first).
    usable_samples = [str(val) for val in sample_values if val][:max_samples]
    sequence = f"Column: {column_name}. Sample data: {' '.join(usable_samples)}"

    result = classifier(sequence, candidate_labels, multi_label=False)
    best_label = result['labels'][0]
    score = result['scores'][0]
    if score > min_confidence:
        return best_label, score
    return best_label, 0.50

In [None]:
# Step 4: Combine methods for best results
def classify_field_combined(column_name, sample_values):
    """Run the classifiers from cheapest to most expensive and return the first usable answer.

    Order: regex rules, then the hybrid (regex + fuzzy header) classifier,
    then the zero-shot ML classifier as the final fallback.

    Returns:
        The ``(field_type, confidence)`` tuple of whichever stage answered.
    """
    # Stage 1: regex rules — accepted whenever they yield anything but 'other'.
    regex_type, regex_conf = classify_field_regex(column_name, sample_values)
    if regex_type != 'other':
        return regex_type, regex_conf

    # Stage 2: hybrid — accepted only for a non-'other' type scoring above 0.7.
    hybrid_type, hybrid_conf = classify_field_hybrid(column_name, sample_values)
    if hybrid_type == 'other' or not (hybrid_conf > 0.7):
        # Stage 3: ML fallback (most robust but slowest).
        return classify_field_ml(column_name, sample_values)
    return hybrid_type, hybrid_conf


In [None]:
# Step 5: Classify every column of the loaded DataFrame.
# field_mappings maps column name -> {'type': ..., 'confidence': ...}.
field_mappings = {}
for col in df.columns:
    # Only the first 5 non-null values are sampled, to keep classification cheap.
    non_null = df[col].drop_nulls()
    samples = non_null.head(5).to_list()
    field_type, score = classify_field_combined(col, samples)
    field_mappings[col] = dict(type=field_type, confidence=score)

In [None]:
# Step 6: Output results
# One line per column: detected type and confidence rounded to two decimals.
print("\nField Classification Results:")
for col in field_mappings:
    mapping = field_mappings[col]
    detected_type, confidence = mapping['type'], mapping['confidence']
    print(f"Column '{col}' → Type: {detected_type} (Confidence: {confidence:.2f})")



Field Classification Results:
Column '' → Type: other (Confidence: 0.50)
Column 'start' → Type: other (Confidence: 0.50)
Column 'end' → Type: other (Confidence: 0.50)
Column 'today' → Type: phone_number (Confidence: 0.95)
Column 'username' → Type: name (Confidence: 0.90)
Column 'simserial' → Type: name (Confidence: 0.50)
Column 'subscriberid' → Type: other (Confidence: 0.50)
Column 'deviceid' → Type: address (Confidence: 0.90)
Column 'phonenumber' → Type: phone_number (Confidence: 0.95)
Column 'General Questions/Municipality and Ward Details/Name of Municipality (नगरपालिकाको नाम)' → Type: address (Confidence: 0.90)
Column 'General Questions/Municipality and Ward Details/Ward Number (वडा नं .)' → Type: address (Confidence: 0.90)
Column 'General Questions/Municipality and Ward Details/Ward Number (वडा नं )' → Type: address (Confidence: 0.90)
Column 'General Questions/Name of the Tole (सर्वेक्षण भैरहेको स्थानको नाम)' → Type: address (Confidence: 0.90)
Column 'General Questions/House No. 

In [None]:
# Save mappings to CSV: one row per column with its detected type and confidence.
column_names = list(field_mappings)
mapping_entries = list(field_mappings.values())
mappings_df = pl.DataFrame({
    'column': column_names,
    'type': [entry['type'] for entry in mapping_entries],
    'confidence': [entry['confidence'] for entry in mapping_entries],
})
mappings_df.write_csv('/content/field_mappings.csv')
print("\nMappings saved to '/content/field_mappings.csv'")


In [None]:
# Save mappings to JSON for reusability (same structure as field_mappings, indented by 2).
serialized_mappings = json.dumps(field_mappings, indent=2)
with open('/content/field_mappings.json', 'w') as f:
    f.write(serialized_mappings)
print("Mappings saved to '/content/field_mappings.json'")

# Download output files: trigger browser downloads of both artefacts via google.colab.files.
for output_path in ('/content/field_mappings.csv', '/content/field_mappings.json'):
    files.download(output_path)