In [1]:
import json
import torch
import transformers

In [3]:
data = "../data/train.jsonl"

# Load the training set: the file holds one JSON object per line.
train = []
with open(data, "r", encoding="utf-8") as f:
    for line in f:
        train.append(json.loads(line))

# Show the first example and its annotated entity spans.
first_example = train[0]
print(first_example)
print(first_example['entities'])

# Print the text slice for the first two entities of that example.
# NOTE(review): in the recorded output these slices run past / miss the
# entity text (e.g. the CREDIT_CARD slice ends in " and "), so the offsets in
# the file may not follow the `start-1: end` convention used here -- confirm.
for entity_idx in (0, 1):
    start = first_example['entities'][entity_idx]['start']
    end = first_example['entities'][entity_idx]['end']
    print(first_example['text'][start-1: end])

{'id': 'utt_0001', 'text': 'my credit card number is 4242 4242 4242 4242 and my email is ramesh dot sharma at gmail dot com', 'entities': [{'start': 26, 'end': 49, 'label': 'CREDIT_CARD'}, {'start': 66, 'end': 80, 'label': 'PERSON_NAME'}, {'start': 84, 'end': 104, 'label': 'EMAIL'}]}
[{'start': 26, 'end': 49, 'label': 'CREDIT_CARD'}, {'start': 66, 'end': 80, 'label': 'PERSON_NAME'}, {'start': 84, 'end': 104, 'label': 'EMAIL'}]
4242 4242 4242 4242 and 
sh dot sharma a


In [5]:
from model import create_model, create_tokenizer

In [6]:
# The model and the tokenizer are both built from the same checkpoint name.
BERT_CHECKPOINT = "google-bert/bert-base-uncased"
model = create_model(model_name=BERT_CHECKPOINT)
tokenizer = create_tokenizer(tokenizer_name=BERT_CHECKPOINT)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [22]:
from dataset import PIIDataset, collate_batch
from labels import *
from torch.utils.data import DataLoader

In [20]:
# LABELS is not defined in this notebook; presumably it arrives through the
# star import of `labels` -- verify.
label_list = LABELS

# Build the training dataset from the JSONL file, the tokenizer and the labels.
dataset = PIIDataset(
    path="../data/train.jsonl",
    tokenizer=tokenizer,
    label_list=label_list,
)

In [23]:
def create_dataloader(dataset, tokenizer, batch_size=1, shuffle=True, label_pad_id=-100):
    """Wrap ``dataset`` in a ``DataLoader`` that collates with ``collate_batch``.

    Args:
        dataset: Dataset object handed to ``DataLoader`` unchanged.
        tokenizer: Object exposing ``pad_token_id``; that value is forwarded
            to ``collate_batch`` as the input padding id.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the data.
        label_pad_id: Value forwarded to ``collate_batch`` as the label
            padding id. Defaults to ``-100``, the value that used to be
            hard-coded here (it is also the default ``ignore_index`` of
            ``torch.nn.CrossEntropyLoss``).

    Returns:
        The configured ``DataLoader``.
    """
    pad_token_id = tokenizer.pad_token_id

    def _collate(batch):
        # collate_batch receives both padding ids positionally after the batch.
        return collate_batch(batch, pad_token_id, label_pad_id)

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=_collate,
    )

In [25]:
# Training loader; batch_size and shuffle repeat create_dataloader's defaults.
train_loader = create_dataloader(dataset, tokenizer, batch_size=1, shuffle=True)

In [43]:
# Sentence templates for synthetic utterances. Placeholders are filled with
# str.format using the keys {name}, {card}, {email} and {phone}; a template
# may use any subset of them.
TEMPLATES = [
    # card + email
    "my credit card number is {card} and my email is {email}",
    # card + name
    "the card number is {card} and the name on it is {name}",
    # name + phone
    "my name is {name} and my phone number is {phone}",
    # name + email + phone
    "please update the account for {name}, email {email}, phone {phone}",
    # name + card
    "{name}'s credit card {card} was declined yesterday",
    # phone + email
    "my phone number is {phone}, and my email is {email}",
]

# Whole-word substitutions used by homophone_noise: a whitespace-separated
# word whose lowercase form is a key here may be replaced by the mapped value.
HOMOPHONE_MAP = {
    "four": "for",
    "two": "too",
    "to": "2",
    "zero": "0",
    "one": "1",
    "three": "3",
    "eight": "ate",
    "at": "@",
    "dot": ".",
    "and": "n",
}

In [45]:
import random
import names
import faker
import json

# Shared Faker instance (Indian-English locale) used by the generators below.
fake = faker.Faker(locale="en_IN")

def gen_card():
    """Return four random integers in 1000..9999 joined by single spaces."""
    groups = []
    for _ in range(4):
        groups.append(str(random.randint(1000, 9999)))
    return " ".join(groups)

def gen_email(name):
    """Build ``first.last@domain`` from a full name.

    The name is lowercased and split on whitespace; the first and last tokens
    become the local part and the domain comes from ``fake.free_email_domain()``.
    """
    tokens = name.lower().split()
    first, last = tokens[0], tokens[-1]
    domain = fake.free_email_domain()
    return "{}.{}@{}".format(first, last, domain)

def gen_phone():
    """Return the first ten characters of ``fake.msisdn()``."""
    msisdn = fake.msisdn()
    return msisdn[:10]

def homophone_noise(text):
    """Randomly replace whitespace-separated words using ``HOMOPHONE_MAP``.

    A word whose lowercase form is a key of ``HOMOPHONE_MAP`` is replaced by
    the mapped value with probability 0.3; every other word is kept as is.
    The result is re-joined with single spaces.
    """
    def _maybe_swap(token):
        key = token.lower()
        # Membership is tested first, so the RNG is only consumed for mapped words.
        if key in HOMOPHONE_MAP and random.random() < 0.3:
            return HOMOPHONE_MAP[key]
        return token

    return " ".join(_maybe_swap(token) for token in text.split())

def apply_asr_noise(text):
    """Run ``homophone_noise`` over ``text`` and lowercase the result."""
    return homophone_noise(text).lower()

def create_example(template):
    """Fill ``template`` with generated PII and label where each value landed.

    A name, card, email and phone are always generated; the template is
    formatted with them and passed through ``apply_asr_noise``. Each value is
    then searched for (lowercased) in the noisy text, and every value that is
    found yields one entity covering its first occurrence.

    Args:
        template: ``str.format`` template using any of the placeholders
            ``{name}``, ``{card}``, ``{email}``, ``{phone}``.

    Returns:
        Dict with keys ``"id"`` (from ``fake.uuid4()``), ``"text"`` (the noisy
        text) and ``"entities"`` (list of ``{"start", "end", "label"}`` dicts
        ordered by ``start``; ``end`` is exclusive).
    """
    name = names.get_full_name()
    card = gen_card()
    email = gen_email(name)
    phone = gen_phone()

    noisy_text = apply_asr_noise(
        template.format(name=name, card=card, email=email, phone=phone)
    )

    entities = []
    for label, value in (
        ("PERSON_NAME", name),
        ("CREDIT_CARD", card),
        ("EMAIL", email),
        ("PHONE", phone),
    ):
        needle = value.lower()
        start = noisy_text.find(needle)
        # A value is absent when the template has no placeholder for it, or
        # when the noise step rewrote it; in both cases no entity is emitted.
        # NOTE(review): the second case leaves PII in the text unlabeled.
        if start != -1:
            # Measure the lowered needle: it is the string actually matched.
            entities.append({"start": start, "end": start + len(needle), "label": label})

    # Emit spans in text order rather than in the fixed label order above.
    entities.sort(key=lambda entity: entity["start"])
    return {"id": fake.uuid4(), "text": noisy_text, "entities": entities}

def generate_dataset(n=500, out_path="synthetic.jsonl", seed=None):
    """Write ``n`` synthetic examples to ``out_path`` as JSON Lines.

    Each example is built by ``create_example`` from a template drawn at
    random from ``TEMPLATES`` and written as one JSON object per line.

    Args:
        n: Number of examples to write.
        out_path: Destination file; it is overwritten if it exists.
        seed: Optional seed. When given, both the stdlib ``random`` module and
            the shared ``fake`` instance are seeded before generation so the
            output is reproducible. ``None`` (the default) leaves the RNG
            state untouched.
    """
    if seed is not None:
        random.seed(seed)
        fake.seed_instance(seed)

    # Explicit utf-8 so the file matches how the notebook reads JSONL back,
    # independent of the platform's default encoding.
    with open(out_path, "w", encoding="utf-8") as f:
        for _ in range(n):
            tpl = random.choice(TEMPLATES)
            ex = create_example(tpl)
            f.write(json.dumps(ex) + "\n")

In [46]:
# Write the synthetic train and dev splits (paths are relative to the
# working directory).
# NOTE(review): the loading cells read "../data/train.jsonl", not this
# "train.jsonl" -- confirm which file is meant to be used for training.
for split_size, split_path in ((1000, "train.jsonl"), (200, "dev.jsonl")):
    generate_dataset(n=split_size, out_path=split_path)