<a href="https://colab.research.google.com/github/charoo-rumsan/community_tool_research/blob/main/classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import time
import tracemalloc
import re

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [None]:
import xgboost as xgb

In [None]:
# Step 1: Simulate datasets
df1 = pd.DataFrame({
"Contact Info": ["john.doe@company.com", "jane.smith@mail.com", "michael.lee@hr.com", "emma.brown@xyz.org", "david.johnson@work.io"],
"Phone Number": ["(555) 123-4567", "555-987-6543", "212-345-7890", "+1 646-555-1212", "(310) 789-4321"],
"Location": ["123 Main St, New York, NY", "45 Wall Street, New York, NY", "67 Park Ave, Boston, MA", "89 Oak Road, Chicago, IL", "12 Sunset Blvd, Los Angeles"],
"Full Name": ["John Doe", "Jane Smith", "Michael Lee", "Emma Brown", "David Johnson"],
"Emp_ID": ["E001", "E002", "E003", "E004", "E005"]
})


df2 = pd.DataFrame({
"Email": ["sarah.connor@work.com", "tom.hardy@jobs.org", "alice.wong@mail.net", "robert.white@staff.io", "linda.green@corp.org"],
"Mobile": ["202-333-4567", "+44 7700 900123", "(415) 222-9876", "646-444-1212", "555-678-9999"],
"Address": ["100 King St, Washington, DC", "22 Queen Rd, London, UK", "78 Market St, San Francisco", "15 Pine Lane, Boston, MA", "200 Broadway, New York, NY"],
"Employee Name": ["Sarah Connor", "Tom Hardy", "Alice Wong", "Robert White", "Linda Green"],
"WorkerID": ["W101", "W102", "W103", "W104", "W105"]
})

In [None]:
# Standard labels
labels = {
"Contact Info": "email_address",
"Email": "email_address",
"Phone Number": "phone_number",
"Mobile": "phone_number",
"Location": "address",
"Address": "address",
"Full Name": "employee_name",
"Employee Name": "employee_name",
"Emp_ID": "employee_id",
"WorkerID": "employee_id"
}

In [None]:
# Step 2: Feature Extraction
def extract_features(col_values):
    sample = col_values.dropna().astype(str).values[:50] # sample max 50 rows
    joined = " ".join(sample)
    features = {
        "has_at": int("@" in joined),
        "digit_ratio": sum(c.isdigit() for c in joined) / max(1, len(joined)),
        "avg_len": np.mean([len(v) for v in sample]),
        "has_commas": int("," in joined),
        "has_plus": int("+" in joined),
        "has_spaces": int(" " in joined),
    }
    return features

In [None]:
# Build training data
X, y = [], []
for df in [df1, df2]:
    for col in df.columns:
        feats = extract_features(df[col])
        X.append(list(feats.values()))
        y.append(labels[col])

feature_names = list(extract_features(df1["Contact Info"]).keys())

In [None]:
# Encode labels
le = LabelEncoder()
y_enc = le.fit_transform(y)

In [None]:
# Step 3: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.3, random_state=42)

In [None]:
# Step 4: Benchmark Models

models = {
"Logistic Regression": LogisticRegression(max_iter=1000),
"Random Forest": RandomForestClassifier(random_state=42),
"XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
}


results = []


for name, model in models.items():
    tracemalloc.start()
    start_time = time.time()


    model.fit(X_train, y_train)


    end_time = time.time()
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()


    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    # Get the unique labels present in the test set
    unique_labels_test = np.unique(y_test)
    # Get the corresponding target names using the label encoder
    target_names_test = le.classes_[unique_labels_test]


    results.append({
    "Model": name,
    "Accuracy": acc,
    "Training Time (s)": round(end_time - start_time, 4),
    "Current Memory (MB)": round(current / 10**6, 2),
    "Peak Memory (MB)": round(peak / 10**6, 2)
    })


    print("\n===", name, "===")
    print(f"Accuracy: {acc:.4f}")
    print(f"Training Time: {end_time - start_time:.4f} seconds")
    print(f"Current Memory Usage: {current / 10**6:.2f} MB")
    print(f"Peak Memory Usage: {peak / 10**6:.2f} MB")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, labels=unique_labels_test, target_names=target_names_test))


=== Logistic Regression ===
Accuracy: 1.0000
Training Time: 0.1559 seconds
Current Memory Usage: 0.02 MB
Peak Memory Usage: 0.06 MB
Classification Report:
               precision    recall  f1-score   support

email_address       1.00      1.00      1.00         1
employee_name       1.00      1.00      1.00         1
 phone_number       1.00      1.00      1.00         1

     accuracy                           1.00         3
    macro avg       1.00      1.00      1.00         3
 weighted avg       1.00      1.00      1.00         3


=== Random Forest ===
Accuracy: 1.0000
Training Time: 1.9918 seconds
Current Memory Usage: 0.14 MB
Peak Memory Usage: 0.16 MB
Classification Report:
               precision    recall  f1-score   support

email_address       1.00      1.00      1.00         1
employee_name       1.00      1.00      1.00         1
 phone_number       1.00      1.00      1.00         1

     accuracy                           1.00         3
    macro avg       1.00     

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== XGBoost ===
Accuracy: 0.0000
Training Time: 1.7080 seconds
Current Memory Usage: 0.02 MB
Peak Memory Usage: 0.03 MB
Classification Report:
               precision    recall  f1-score   support

email_address       0.00      0.00      0.00       1.0
employee_name       0.00      0.00      0.00       1.0
 phone_number       0.00      0.00      0.00       1.0

    micro avg       0.00      0.00      0.00       3.0
    macro avg       0.00      0.00      0.00       3.0
 weighted avg       0.00      0.00      0.00       3.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Step 5: Summary Table

summary_df = pd.DataFrame(results)
print("\n Benchmark Results")
print(summary_df)


 Benchmark Results
                 Model  Accuracy  Training Time (s)  Current Memory (MB)  \
0  Logistic Regression       1.0             0.1559                 0.02   
1        Random Forest       1.0             1.9918                 0.14   
2              XGBoost       0.0             1.7080                 0.02   

   Peak Memory (MB)  
0              0.06  
1              0.16  
2              0.03  


Implement 5 approaches to classify HR dataset fields.
 Approaches:
 1. Classical ML (Logistic Regression, Random Forest, XGBoost)
 2. Regex + ML Hybrid
 3. Unsupervised Clustering (KMeans)
 4. AutoML (TPOT / Auto-sklearn)
 5. Deep Learning (DistilBERT

In [None]:
!pip install xgboost -q

In [None]:
!pip install tpot auto-sklearn transformers torch -q

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Installing build dep

In [None]:
import pandas as pd
import numpy as np
import re
import time
import tracemalloc


from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, adjusted_rand_score


import xgboost as xgb
from tpot import TPOTClassifier
import autosklearn.classification
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch

ModuleNotFoundError: No module named 'tpot'

In [None]:
# Step 1: Simulate datasets
df1 = pd.DataFrame({
"Contact Info": ["john.doe@company.com", "jane.smith@mail.com", "michael.lee@hr.com", "emma.brown@xyz.org", "david.johnson@work.io"],
"Phone Number": ["(555) 123-4567", "555-987-6543", "212-345-7890", "+1 646-555-1212", "(310) 789-4321"],
"Location": ["123 Main St, New York, NY", "45 Wall Street, New York, NY", "67 Park Ave, Boston, MA", "89 Oak Road, Chicago, IL", "12 Sunset Blvd, Los Angeles"],
"Full Name": ["John Doe", "Jane Smith", "Michael Lee", "Emma Brown", "David Johnson"],
"Emp_ID": ["E001", "E002", "E003", "E004", "E005"]
})


df2 = pd.DataFrame({
"Email": ["sarah.connor@work.com", "tom.hardy@jobs.org", "alice.wong@mail.net", "robert.white@staff.io", "linda.green@corp.org"],
"Mobile": ["202-333-4567", "+44 7700 900123", "(415) 222-9876", "646-444-1212", "555-678-9999"],
"Address": ["100 King St, Washington, DC", "22 Queen Rd, London, UK", "78 Market St, San Francisco", "15 Pine Lane, Boston, MA", "200 Broadway, New York, NY"],
"Employee Name": ["Sarah Connor", "Tom Hardy", "Alice Wong", "Robert White", "Linda Green"],
"WorkerID": ["W101", "W102", "W103", "W104", "W105"]
})

In [None]:
# Labels
labels = {
"Contact Info": "email_address",
"Email": "email_address",
"Phone Number": "phone_number",
"Mobile": "phone_number",
"Location": "address",
"Address": "address",
"Full Name": "employee_name",
"Employee Name": "employee_name",
"Emp_ID": "employee_id",
"WorkerID": "employee_id"
}

In [None]:
# Step 2: Feature Extraction
def extract_features(col_values):
    sample = col_values.dropna().astype(str).values[:50]
    joined = " ".join(sample)
    return {
        "has_at": int("@" in joined),
        "digit_ratio": sum(c.isdigit() for c in joined) / max(1, len(joined)),
        "avg_len": np.mean([len(v) for v in sample]),
        "has_commas": int("," in joined),
        "has_plus": int("+" in joined),
        "has_spaces": int(" " in joined),
    }


def extract_regex_features(col_values):
    sample = col_values.dropna().astype(str).values[:50]
    joined = " ".join(sample)
    return {
        "email_pattern": int(bool(re.search(r"[\w._%+-]+@[\w.-]+", joined))),
        "phone_pattern": int(bool(re.search(r"\d{3}[- )]\d{3}[- ]\d{4}", joined))),
        "address_pattern": int("," in joined and any(ch.isdigit() for ch in joined)),
        "name_pattern": int(all(w.istitle() for w in sample[0].split())),
        "id_pattern": int(bool(re.search(r"[A-Z]\d+", joined)))
    }

In [None]:
# Step 3: Build Data

X, y, X_regex = [], [], []
for df in [df1, df2]:
    for col in df.columns:
        feats = extract_features(df[col])
        regex_feats = extract_regex_features(df[col])
        X.append(list(feats.values()))
        X_regex.append(list(regex_feats.values()))
        y.append(labels[col])


# Encode labels
le = LabelEncoder()
y_enc = le.fit_transform(y)

In [None]:
# Step 4: Define Benchmark Function

def benchmark_model(name, model, X_train, X_test, y_train, y_test):
    tracemalloc.start()
    start = time.time()
    model.fit(X_train, y_train)
    end = time.time()
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()


    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    return {
    "Model": name,
    "Accuracy": acc,
    "Training Time (s)": round(end - start, 4),
    "Current Memory (MB)": round(current / 1e6, 2),
    "Peak Memory (MB)": round(peak / 1e6, 2)
    }

In [None]:
# Approach 1: Classical ML

X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.3, random_state=42)


results = []
results.append(benchmark_model("Logistic Regression", LogisticRegression(max_iter=1000), X_train, X_test, y_train, y_test))
results.append(benchmark_model("Random Forest", RandomForestClassifier(random_state=42), X_train, X_test, y_train, y_test))
results.append(benchmark_model("XGBoost", xgb.XGBClassifier(eval_metric='mlogloss'), X_train, X_test, y_train, y_test))

In [None]:
# Approach 2: Regex + ML Hybrid

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_regex, y_enc, test_size=0.3, random_state=42)
results.append(benchmark_model("Regex+LogReg", LogisticRegression(max_iter=1000), X_train_r, X_test_r, y_train_r, y_test_r))

In [None]:
# Approach 3: Unsupervised Clustering

kmeans = KMeans(n_clusters=len(set(y_enc)), random_state=42)
kmeans.fit(X)
y_pred_kmeans = kmeans.labels_
results.append({
"Model": "KMeans Clustering",
"Accuracy": adjusted_rand_score(y_enc, y_pred_kmeans),
"Training Time (s)": None,
"Current Memory (MB)": None,
"Peak Memory (MB)": None
})

In [None]:
# Approach 4: AutoML (TPOT demo, small generations)

tpot = TPOTClassifier(generations=2, population_size=5, verbosity=0)
results.append(benchmark_model("TPOT AutoML", tpot, X_train, X_test, y_train, y_test))

NameError: name 'TPOTClassifier' is not defined

In [None]:
# Approach 5: Deep Learning (DistilBERT)
# Prepare text data: column name + sample values

texts = []
for df in [df1, df2]:
    for col in df.columns:
        sample = " ".join(df[col].astype(str).values[:3])
        texts.append(col + " " + sample)


X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(texts, y_enc, test_size=0.3, random_state=42)

In [None]:
# Tokenizer
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
train_encodings = tokenizer(X_train_t, truncation=True, padding=True)
test_encodings = tokenizer(X_test_t, truncation=True, padding=True)


class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return {k: torch.tensor(v[idx]) for k, v in self.encodings.items()} | {"labels": torch.tensor(self.labels[idx])}


train_dataset = Dataset(train_encodings, y_train_t)
test_dataset = Dataset(test_encodings, y_test_t)


model_dl = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=len(le.classes_))


training_args = TrainingArguments(output_dir="./results", num_train_epochs=1, per_device_train_batch_size=2, logging_dir="./logs", logging_steps=5)
trainer = Trainer(model=model_dl, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset)


tracemalloc.start()
start = time.time()
trainer.train()
end = time.time()
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()


preds = trainer.predict(test_dataset)
y_pred_dl = np.argmax(preds.predictions, axis=1)
acc = accuracy_score(y_test_t, y_pred_dl)


results.append({
    "Model": "DistilBERT",
    "Accuracy": acc,
    "Training Time (s)": round(end - start, 4),
    "Current Memory (MB)": round(current / 1e6, 2),
    "Peak Memory (MB)": round(peak / 1e6, 2)
})

NameError: name 'DistilBertTokenizerFast' is not defined

In [None]:
# Step 6: Summary

summary_df = pd.DataFrame(results)
print("Benchmark Results")
print(summary_df)

Benchmark Results
                 Model  Accuracy  Training Time (s)  Current Memory (MB)  \
0  Logistic Regression  1.000000             0.1578                 0.01   
1        Random Forest  1.000000             1.4263                 0.14   
2              XGBoost  0.000000             0.0539                 0.01   
3         Regex+LogReg  0.666667             0.0100                 0.01   
4    KMeans Clustering  1.000000                NaN                  NaN   

   Peak Memory (MB)  
0              0.05  
1              0.17  
2              0.01  
3              0.05  
4               NaN  
