In [25]:
import os
from datasets import DatasetDict, load_dataset
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split

def load_bio_data(file_path):
    """
    Load BIO-formatted data into a structured dataset.

    Args:
        file_path (str): Path to the BIO file.

    Returns:
        list of dict: Contains tokens and corresponding BIO labels.
    """
    sentences = []
    labels = []
    with open(file_path, 'r', encoding='utf-8') as f:
        current_tokens = []
        current_labels = []

        for line in f:
            if line.strip() == "":
                # End of a sentence
                if current_tokens:
                    sentences.append(current_tokens)
                    labels.append(current_labels)
                    current_tokens = []
                    current_labels = []
            else:
                token, tag = line.strip().split()
                current_tokens.append(token)
                current_labels.append(tag)

        # Add the last sentence
        if current_tokens:
            sentences.append(current_tokens)
            labels.append(current_labels)

    return sentences, labels

# Load BIO data
file_path = "./resources/all_event_data.bio"
tokens, labels = load_bio_data(file_path)

In [26]:
# Split into train and test sets
train_tokens, test_tokens, train_labels, test_labels = train_test_split(tokens, labels, test_size=0.2, random_state=42)


In [27]:
from transformers import BertTokenizerFast

# Load the fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Define label mapping
unique_labels = sorted(set(tag for label_list in labels for tag in label_list))
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

In [28]:
def encode_data(tokens, labels, tokenizer, max_length=128):
    """
    Encode tokens and labels into BERT-compatible format.

    Args:
        tokens (list of list): Tokenized sentences.
        labels (list of list): BIO labels.
        tokenizer: Tokenizer object.
        max_length (int): Maximum sequence length.

    Returns:
        dict: Encoded inputs and labels.
    """
    encodings = tokenizer(tokens, is_split_into_words=True, truncation=True, padding=True, max_length=max_length)

    encoded_labels = []
    for i, label_list in enumerate(labels):
        word_ids = encodings.word_ids(batch_index=i)
        label_ids = []
        previous_word_id = None

        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)  # Ignore padding tokens
            elif word_id != previous_word_id:
                label_ids.append(label2id[label_list[word_id]])
            else:
                label_ids.append(-100)  # Ignore subwords
            previous_word_id = word_id

        encoded_labels.append(label_ids)

    encodings["labels"] = encoded_labels
    return encodings

# Encode train and test data
train_encodings = encode_data(train_tokens, train_labels, tokenizer)
test_encodings = encode_data(test_tokens, test_labels, tokenizer)


In [29]:
import torch

class NERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

# Create datasets
train_dataset = NERDataset(train_encodings)
test_dataset = NERDataset(test_encodings)


In [30]:
from transformers import BertForTokenClassification

# Load pre-trained BERT model for token classification
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
from torch.utils.data import DataLoader
from transformers import AdamW

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)




In [None]:
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from tqdm import tqdm

# Use GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        # Move batch to device
        inputs = {key: val.to(device) for key, val in batch.items()}

        # Forward pass
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}: Loss = {total_loss / len(train_loader):.4f}")


100%|██████████| 4/4 [00:00<00:00,  4.05it/s]


Epoch 1: Loss = 1.1290


100%|██████████| 4/4 [00:00<00:00,  5.32it/s]


Epoch 2: Loss = 0.3063


100%|██████████| 4/4 [00:00<00:00,  5.57it/s]


Epoch 3: Loss = 0.2670


100%|██████████| 4/4 [00:00<00:00,  5.41it/s]


Epoch 4: Loss = 0.3171


100%|██████████| 4/4 [00:00<00:00,  5.72it/s]


Epoch 5: Loss = 0.2860


100%|██████████| 4/4 [00:00<00:00,  5.55it/s]


Epoch 6: Loss = 0.2771


100%|██████████| 4/4 [00:00<00:00,  5.69it/s]


Epoch 7: Loss = 0.2541


100%|██████████| 4/4 [00:00<00:00,  5.75it/s]


Epoch 8: Loss = 0.2394


100%|██████████| 4/4 [00:00<00:00,  5.60it/s]


Epoch 9: Loss = 0.2296


100%|██████████| 4/4 [00:00<00:00,  5.94it/s]

Epoch 10: Loss = 0.1913





In [33]:
from sklearn.metrics import classification_report

model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
        labels = batch["labels"].to(device)

        outputs = model(**inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=2)

        # Collect predictions and true labels
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Convert predictions and labels to flat lists for classification report
flat_preds = []
flat_labels = []

for pred, true_label in zip(predictions, true_labels):
    for p, l in zip(pred, true_label):
        if l != -100:  # Only include non-padding tokens
            flat_preds.append(id2label[p])  # Convert predicted IDs to labels
            flat_labels.append(id2label[l])  # Convert true IDs to labels

print(classification_report(flat_labels, flat_preds, digits=4))

              precision    recall  f1-score   support

      B-FOOD     0.0000    0.0000    0.0000         5
 B-FOOD-TIME     0.4444    0.4444    0.4444         9
      E-FOOD     0.0000    0.0000    0.0000         1
      I-FOOD     0.0000    0.0000    0.0000         5
 I-FOOD-TIME     0.0000    0.0000    0.0000         7
           O     0.9837    0.9983    0.9910      1211

    accuracy                         0.9798      1238
   macro avg     0.2380    0.2405    0.2392      1238
weighted avg     0.9655    0.9798    0.9726      1238



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [36]:
model.save_pretrained("./ner_model")
tokenizer.save_pretrained("./ner_model")

('./ner_model\\tokenizer_config.json',
 './ner_model\\special_tokens_map.json',
 './ner_model\\vocab.txt',
 './ner_model\\added_tokens.json',
 './ner_model\\tokenizer.json')

In [None]:
# Test the saved model
loaded_model = BertForTokenClassification.from_pretrained("./ner_model")
loaded_tokenizer = BertTokenizerFast.from_pretrained("./ner_model")

# Tokenize a sentence
sentence = """
Come and join us as we step into the new year with exciting activities! (  ≧ᗜ≦) ⋆⭒˚.⋆

Explore Middle Eastern culture through fun games and savor rich, flavorful, and exquisite food prepared just for YOU! (˶˃ ᵕ ˂˶)❤️


Save the date!🗓️🙀


📅Date : 24 January 2025

📍Venue : SMUC ALC 3.1-3.2

⏰Time: 4:00PM - 7:00PM



Food and refreshments will be provided!🍽️


Sign up HERE by clicking on the RSVP button below SMU Al Khaleej : Step Into the Middle East! Forms are open until 17 January 2025. Hurry as slots are limited! 😱

 

If you have any questions or concerns, do reach out to @hahahhaha via Telegram.

 

Follow us on Instagram and join our Telegram Group for our latest updates!

 

Should you wish to unsubscribe:

Internal recipients of SMU, please visit <link> to filter away this EDM.  
"""

tokens = loaded_tokenizer.tokenize(sentence)
inputs = loaded_tokenizer(sentence, return_tensors="pt")

# Make a prediction
outputs = loaded_model(**inputs)
logits = outputs.logits
preds = torch.argmax(logits, dim=2)

# Decode the prediction
decoded_preds = [id2label[p] for p in preds[0].numpy()]

# Print all label that starts with B-FOOD
food_labels = [label for label in decoded_preds if label.startswith("B-FOOD")]
print(food_labels)



['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
[]


In [48]:
# load csv
import pandas as pd
df = pd.read_csv('./resources/all_event_data.csv')

# for each row in df where df["is_event"] == "Yes" and df["food_mentioned"] == "Yes"
# get "body" and run the model on it
for index, row in df.iterrows():
    if row["is_event"] == "Yes" and row["food_mentioned"] == "Yes":
        tokens = loaded_tokenizer.tokenize(row["body"])
        inputs = loaded_tokenizer(row["body"], return_tensors="pt", truncation=True, padding=True, max_length=512)

        # Make a prediction
        outputs = loaded_model(**inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=2)

        # Decode the prediction
        decoded_preds = [id2label[p] for p in preds[0].numpy()]

        if len(set(decoded_preds)) > 1:
            # Print the word and label that is not O
            for token, label in zip(tokens, decoded_preds):
                if label != "O":
                    print(token, label)
            print(row["body"])
            print("\n\n")

18 I-FOOD-TIME
: B-FOOD-TIME
nov B-FOOD-TIME
- I-FOOD-TIME
##pm I-FOOD-TIME
##pm I-FOOD-TIME
SMU Classification: Restricted



Dear SCIS Students,

 

Gentle reminder about the Ascenda?s sharing happening this Friday, 22 Nov 2024, 4pm - 5.30pm.

If you are keen to attend, do sign up here (https://smu.sg/cs301-ascenda) by today, 18 Nov 2024.

 

Thanks. 

 

Regards,

Office of the Dean

School of Computing and Information Systems

 

From: School of Computing and Information Systems 
Sent: Friday, October 18, 2024 3:42 PM
Subject: Ascenda Loyalty Talk - Friday, 22 Nov 2024, 4pm - 5.30pm

 

Dear SCIS Students,

 

Ascenda (https://www.ascenda.com)  will be doing a sharing on Friday (22 Nov). 

 

Details as follow:

 

Venue: SMU SCIS1 SR B1-1.

Time: 22 Nov (Friday) Week 14, 4pm - 5.30pm

 

Agenda: 

1. Life as an Ascenda Engineer sharing

2. Configuration Management Automated

3. Q&A / mingling

 

4 reasons why you should attend the talk:

1. Learn about what is configuration Manag