<a href="https://colab.research.google.com/github/charookc5/Text-Semantics-Classification/blob/main/Semantics_distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets pandas scikit-learn



DATASET

In [None]:
import pandas as pd

# Small hand-labelled sample: each entry pairs a "Column: ..., Sample: ..."
# description with the semantic type it should be classified as.
_labeled_examples = [
    ("Column: Contact Info, Sample: john.doe@company.com", "email_address"),
    ("Column: EmailID, Sample: alice@work.org", "email_address"),
    ("Column: Phone, Sample: (555) 123-4567", "phone_number"),
    ("Column: Cell, Sample: +1-202-555-0199", "phone_number"),
    ("Column: Addr, Sample: 123 Main St, NY", "address"),
    ("Column: Residence, Sample: 456 Pine Ave, CA", "address"),
]

# Keep the record-per-row layout (list of {"text", "label"} dicts).
data = [
    {"text": example_text, "label": example_label}
    for example_text, example_label in _labeled_examples
]

df = pd.DataFrame(data)
df


Unnamed: 0,text,label
0,"Column: Contact Info, Sample: john.doe@company...",email_address
1,"Column: EmailID, Sample: alice@work.org",email_address
2,"Column: Phone, Sample: (555) 123-4567",phone_number
3,"Column: Cell, Sample: +1-202-555-0199",phone_number
4,"Column: Addr, Sample: 123 Main St, NY",address
5,"Column: Residence, Sample: 456 Pine Ave, CA",address


ENCODING: converts the text labels into integer IDs so that a machine learning model can be trained on them.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the string labels and store the resulting integer ids
# alongside the original labels.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df["label"])
df["label_id"] = encoded_labels

# classes_ holds the distinct label names; its length is the number of
# output classes the model needs.
class_names = label_encoder.classes_
num_labels = len(class_names)
print("Classes:", class_names)

Classes: ['address' 'email_address' 'phone_number']


Tokenizer: converts raw text into a format that DistilBERT can understand.

In [None]:
from transformers import DistilBertTokenizer

# Load the tokenizer that matches the pretrained checkpoint used for the model.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize every training text in a single call, with truncation and padding
# enabled.
training_texts = df["text"].tolist()
encodings = tokenizer(training_texts, padding=True, truncation=True)

Create a PyTorch-compatible dataset that holds the tokenized text and the numeric labels.

In [None]:
import torch
from torch.utils.data import Dataset

class HRDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with per-sample labels.

    Args:
        encodings: Mapping-like object exposing ``.items()`` whose values are
            per-sample sequences (for example ``input_ids`` and
            ``attention_mask``), indexed by sample position.
        labels: Sequence of labels, one per sample, in the same order as the
            rows of ``encodings``.

    Raises:
        ValueError: If any field of ``encodings`` does not contain exactly
            ``len(labels)`` rows.
    """

    def __init__(self, encodings, labels):
        # Fail fast on misaligned inputs; otherwise samples would be paired
        # with the wrong label or indexing would fail later during training.
        expected_rows = len(labels)
        for key, values in encodings.items():
            if len(values) != expected_rows:
                raise ValueError(
                    f"Encoding field '{key}' has {len(values)} rows "
                    f"but {expected_rows} labels were provided"
                )
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a ``labels`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Wrap the tokenized texts and their integer label ids for use as training data.
dataset = HRDataset(encodings=encodings, labels=df["label_id"].tolist())


Load Model & Train

In [None]:
import os

from transformers import (
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Seed before the model is created: the classification head is newly
# initialised (not loaded from the checkpoint), so without a fixed seed every
# run starts from different weights.
SEED = 42
set_seed(SEED)

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    learning_rate=5e-5,
    logging_dir="./logs",
    logging_steps=10,
    seed=SEED,
    # Replaces the deprecated WANDB_DISABLED environment variable. Note that
    # "none" turns off every reporting integration, not only Weights & Biases.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
10,0.9


TrainOutput(global_step=10, training_loss=0.8999631881713868, metrics={'train_runtime': 40.5512, 'train_samples_per_second': 0.74, 'train_steps_per_second': 0.247, 'total_flos': 139714201080.0, 'train_loss': 0.8999631881713868, 'epoch': 5.0})

Prediction

In [None]:
from transformers import pipeline

# The model has just been through trainer.train(); put it in evaluation mode
# before inference. In training mode dropout stays active, so the same input
# can receive a different confidence score on every call.
model.eval()

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

samples = [
    "Column: Contact Info, Sample: mike.smith@company.com",
    "Column: Mobile, Sample: 987-654-3210",
    "Column: Addr, Sample: 789 Elm Rd, TX"
]

for s in samples:
    result = classifier(s)[0]
    # The pipeline label ends in "_<id>"; take that numeric suffix and map it
    # back to the original class name with the fitted label encoder.
    label_id = int(result["label"].split("_")[-1])
    label = label_encoder.inverse_transform([label_id])[0]
    print(f"{s} → {label} (confidence {result['score']:.2f})")


Device set to use cpu


Column: Contact Info, Sample: mike.smith@company.com → email_address (confidence 0.46)
Column: Mobile, Sample: 987-654-3210 → phone_number (confidence 0.51)
Column: Addr, Sample: 789 Elm Rd, TX → address (confidence 0.40)


In [None]:
# Extra inputs that do not appear in the training examples, used to spot-check
# the classifier.
additional_samples = [
    "Column: Work Email, Sample: david.jones@anothercompany.net",
    "Column: Office Phone, Sample: +44 20 7946 0958",
    "Column: Home Address, Sample: Apartment 2B, 567 Oak St, Chicago, IL",
    "Column: Mailing, Sample: P.O. Box 1234, Anytown, USA",
]

print("Testing with additional samples:")
for text in additional_samples:
    prediction = classifier(text)[0]
    # The numeric suffix of the pipeline label is the encoded class id.
    class_id = int(prediction["label"].rsplit("_", 1)[-1])
    predicted_name = label_encoder.inverse_transform([class_id])[0]
    confidence = prediction["score"]
    print(f"{text} → {predicted_name} (confidence {confidence:.2f})")

Testing with additional samples:
Column: Work Email, Sample: david.jones@anothercompany.net → email_address (confidence 0.49)
Column: Office Phone, Sample: +44 20 7946 0958 → phone_number (confidence 0.49)
Column: Home Address, Sample: Apartment 2B, 567 Oak St, Chicago, IL → address (confidence 0.37)
Column: Mailing, Sample: P.O. Box 1234, Anytown, USA → address (confidence 0.38)


Measure Execution Time for One Sample

In [None]:
import time  # Step 1: Import the time module

sample = "Column: Mobile, Sample: 987-654-3210"

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted.
start_time = time.perf_counter()  # Step 2: Record the start time

result = classifier(sample)[0]  # Step 3: Run the classification

end_time = time.perf_counter()  # Step 4: Record the end time

# Step 5: Calculate and print the duration
duration = end_time - start_time
label = label_encoder.inverse_transform([int(result["label"].split("_")[-1])])[0]
print(f"{sample} → {label} (confidence {result['score']:.2f}) in {duration:.4f} seconds")


Column: Mobile, Sample: 987-654-3210 → phone_number (confidence 0.48) in 0.0844 seconds


In [None]:
from sklearn.metrics import accuracy_score

def _predicted_label_id(result):
    """Return the integer class id from a pipeline result whose label ends in ``_<id>``."""
    return int(result["label"].split("_")[-1])


def _print_predictions(texts, results):
    """Print one ``text → label (confidence ...)`` line per text/result pair."""
    for text, result in zip(texts, results):
        label = label_encoder.inverse_transform([_predicted_label_id(result)])[0]
        print(f"{text} → {label} (confidence {result['score']:.2f})")


# Evaluate accuracy on the training dataset.
# Classify the original strings rather than text decoded back from token ids:
# decoding is lossy (lower-casing, re-spacing), so it would score inputs that
# differ from the ones the model is actually given in practice.
train_texts = df["text"].tolist()
true_labels = df["label_id"].tolist()

# One pass over the training texts; the results feed both the accuracy and
# the per-row confidence report below.
train_results = classifier(train_texts)
predictions = [_predicted_label_id(result) for result in train_results]

accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy on training data: {accuracy:.2f}")

print("\nConfidence scores for training data:")
_print_predictions(train_texts, train_results)

print("\nConfidence scores for additional samples:")
_print_predictions(additional_samples, classifier(additional_samples))

Accuracy on training data: 1.00

Confidence scores for training data:
Column: Contact Info, Sample: john.doe@company.com → email_address (confidence 0.44)
Column: EmailID, Sample: alice@work.org → email_address (confidence 0.50)
Column: Phone, Sample: (555) 123-4567 → phone_number (confidence 0.53)
Column: Cell, Sample: +1-202-555-0199 → phone_number (confidence 0.56)
Column: Addr, Sample: 123 Main St, NY → address (confidence 0.43)
Column: Residence, Sample: 456 Pine Ave, CA → address (confidence 0.44)

Confidence scores for additional samples:
Column: Work Email, Sample: david.jones@anothercompany.net → email_address (confidence 0.50)
Column: Office Phone, Sample: +44 20 7946 0958 → phone_number (confidence 0.46)
Column: Home Address, Sample: Apartment 2B, 567 Oak St, Chicago, IL → address (confidence 0.38)
Column: Mailing, Sample: P.O. Box 1234, Anytown, USA → address (confidence 0.38)
