In [1]:
import pandas as pd

# Load the labelled email export into a DataFrame
df = pd.read_csv("resources/all_event_data.csv")

# Show the leading rows as a quick sanity check of the columns
preview = df.head()
print(preview)

                                              sender     received_time  \
0  /O=EXCHANGELABS/OU=EXCHANGE ADMINISTRATIVE GRO...  25/12/2024 04:39   
1  /O=EXCHANGELABS/OU=EXCHANGE ADMINISTRATIVE GRO...  25/12/2024 00:28   
2  /O=EXCHANGELABS/OU=EXCHANGE ADMINISTRATIVE GRO...  24/12/2024 23:11   
3  /O=EXCHANGELABS/OU=EXCHANGE ADMINISTRATIVE GRO...  24/12/2024 20:23   
4  /O=EXCHANGELABS/OU=EXCHANGE ADMINISTRATIVE GRO...  24/12/2024 19:31   

                                             subject  \
0     [SMU VERTS] Events Director Co-opt Recruitment   
1  [BFI@SMU Connects] Season's Greetings: 2024 in...   
2  [SMU Athletics] Join our Open City Run at the ...   
3  [ICON ICS] Join the Band: Auditions Now Open! ...   
4  STAT401 Actuarial Science Work-Study Elective ...   

                                                body is_event  \
0  \r\n\r\n \r\n\r\n \r\n\r\n?? Join SMU Verts' 2...       No   
1  SMU Classification: Restricted\r\n\r\n\r\n\r\n...       No   
2  POV: You are thinkin

In [2]:
# Some rows carry the labelling service's error message instead of a label
# in is_event; keep only the rows whose is_event is something else.
is_service_error = df["is_event"] == "500 Unable to submit request because the service is temporarily unavailable."
df = df[~is_service_error]

In [3]:
import re
from sklearn.preprocessing import LabelEncoder

# Clean the text
def clean_text(text):
    """Normalise an email body for tokenization.

    Removes http(s) URLs and mailto links, collapses every run of whitespace
    (including newlines and carriage returns) into a single space, lowercases
    the result and strips leading/trailing whitespace.

    Args:
        text: The raw text. ``None`` and float NaN (what pandas yields for an
            empty CSV cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned, lowercased string ("" for missing input).
    """
    # Missing values would otherwise make re.sub raise TypeError.
    if text is None:
        return ''
    if not isinstance(text, str):
        if isinstance(text, float) and text != text:  # NaN is the only float unequal to itself
            return ''
        text = str(text)

    # Remove URLs and email links (http, https, mailto)
    text = re.sub(r'(https?://\S+|mailto:\S+)', '', text)
    # Collapse all whitespace runs into one space; \s already matches \n and \r,
    # so no separate newline pass is needed.
    text = re.sub(r'\s+', ' ', text)
    # Convert to lowercase and strip leading/trailing whitespace
    return text.lower().strip()


# Run every email body through clean_text, replacing the raw column
cleaned_bodies = df['body'].map(clean_text)
df['body'] = cleaned_bodies

# Preview the frame again to confirm the bodies were normalised
print(df.head())

                                              sender     received_time  \
0  /O=EXCHANGELABS/OU=EXCHANGE ADMINISTRATIVE GRO...  25/12/2024 04:39   
1  /O=EXCHANGELABS/OU=EXCHANGE ADMINISTRATIVE GRO...  25/12/2024 00:28   
2  /O=EXCHANGELABS/OU=EXCHANGE ADMINISTRATIVE GRO...  24/12/2024 23:11   
3  /O=EXCHANGELABS/OU=EXCHANGE ADMINISTRATIVE GRO...  24/12/2024 20:23   
4  /O=EXCHANGELABS/OU=EXCHANGE ADMINISTRATIVE GRO...  24/12/2024 19:31   

                                             subject  \
0     [SMU VERTS] Events Director Co-opt Recruitment   
1  [BFI@SMU Connects] Season's Greetings: 2024 in...   
2  [SMU Athletics] Join our Open City Run at the ...   
3  [ICON ICS] Join the Band: Auditions Now Open! ...   
4  STAT401 Actuarial Science Work-Study Elective ...   

                                                body is_event  \
0  ?? join smu verts' 24th executive committee: l...       No   
1  smu classification: restricted please click he...       No   
2  pov: you are thinkin

In [4]:
# Map 'Yes' -> 1 and 'No' -> 0 for both label columns.
# The 1/0 keys keep already-encoded values unchanged, so re-running this cell
# does not turn every label into NaN.
label_map = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
label_cols = ['is_event', 'food_mentioned']
for label_col in label_cols:
    df[label_col] = df[label_col].map(label_map.get)

# Anything that is neither Yes nor No (a missing value, leftover error text)
# has no entry in label_map and comes back as missing. Such rows cannot be
# turned into integer labels, so drop them and report how many were removed.
unlabelled = df[label_cols].isna().any(axis=1)
if unlabelled.any():
    print(f"Dropping {int(unlabelled.sum())} row(s) with unrecognised labels")
df = df.loc[~unlabelled].astype({col: int for col in label_cols})

In [5]:
def _row_to_label_pair(row):
    """Return [is_event, food_mentioned] for one row as plain Python ints."""
    return [int(row['is_event']), int(row['food_mentioned'])]


# One two-element target list per email, used for multi-label classification
df['labels'] = df.apply(_row_to_label_pair, axis=1)

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the emails for testing; the fixed seed keeps the split stable
all_texts = df['body']
all_labels = df['labels'].tolist()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)

In [7]:
from transformers import BertTokenizer

# Pretrained uncased BERT vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Shared tokenizer settings: cut inputs at 512 tokens and pad each split
# to the length of its longest sequence
tokenizer_kwargs = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(list(train_texts), **tokenizer_kwargs)
test_encodings = tokenizer(list(test_texts), **tokenizer_kwargs)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
import torch

class EmailDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with multi-label targets.

    Args:
        encodings: Mapping from field name to a per-example indexable
            collection of values.
        labels: Per-example label sequences; the dataset length is
            ``len(labels)``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, targets under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Targets are stored as floats for the multi-label loss
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# Wrap each split's encodings and targets in an EmailDataset
train_dataset = EmailDataset(encodings=train_encodings, labels=train_labels)
test_dataset = EmailDataset(encodings=test_encodings, labels=test_labels)


In [9]:
from transformers import BertForSequenceClassification

# Load BERT with a 2-unit classification head, one logit per label.
# problem_type and the label names are stored in the model config, so the
# checkpoint written by save_pretrained records that the outputs are
# independent (sigmoid) labels and which index belongs to which label,
# instead of looking like a 2-class softmax classifier with LABEL_0/LABEL_1.
label_names = ['is_event', 'food_mentioned']  # same order as the entries of df['labels']
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_names),
    problem_type='multi_label_classification',
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# transformers' own AdamW is deprecated and no longer exported by recent
# releases, so the PyTorch implementation is used under the same name.
from torch.optim import AdamW
from torch.nn import BCEWithLogitsLoss

# Define optimizer and loss function.
# eps and weight_decay are set explicitly to the defaults of the former
# transformers.AdamW (1e-6 and 0.0); torch.optim.AdamW would otherwise use
# 1e-8 and 0.01 and change the training behaviour.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
# Sigmoid + binary cross-entropy per output, i.e. each label is scored independently
criterion = BCEWithLogitsLoss()

# Move the model to GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)




BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [11]:
from torch.utils.data import DataLoader

# Batch both splits in groups of 16; only the training data is reshuffled each epoch
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16)


In [12]:
num_epochs = 5  # You can adjust this based on your dataset size

for epoch in range(num_epochs):
    # Enable training behaviour (e.g. dropout) for this epoch
    model.train()
    batch_losses = []

    for batch in train_loader:
        # Split the batch into targets and model inputs, both on the training device
        labels = batch['labels'].to(device)
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass and multi-label loss on the raw logits
        outputs = model(**inputs)
        logits = outputs.logits
        loss = criterion(logits, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Mean loss over all batches of this epoch
    total_loss = sum(batch_losses)
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")


Epoch 1/5, Loss: 0.3837
Epoch 2/5, Loss: 0.1590
Epoch 3/5, Loss: 0.1148
Epoch 4/5, Loss: 0.0921
Epoch 5/5, Loss: 0.0855


In [14]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Switch to inference behaviour and collect per-example predictions and targets
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        labels = batch['labels'].to(device)
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}

        # A label is predicted when its sigmoid probability exceeds 0.5
        outputs = model(**inputs)
        logits = outputs.logits
        preds = torch.sigmoid(logits) > 0.5

        true_labels.extend(labels.cpu().numpy())
        predictions.extend(preds.cpu().numpy())

# Stack into 2-D arrays: one row per email, one column per label
predictions = np.array(predictions)
true_labels = np.array(true_labels)

# Column 0 is is_event, column 1 is food_mentioned
event_true, event_pred = true_labels[:, 0], predictions[:, 0]
food_true, food_pred = true_labels[:, 1], predictions[:, 1]

# Accuracy and F1 are computed separately for each label
is_event_accuracy = accuracy_score(event_true, event_pred)
food_mentioned_accuracy = accuracy_score(food_true, food_pred)
is_event_f1 = f1_score(event_true, event_pred)
food_mentioned_f1 = f1_score(food_true, food_pred)

print(f"Is Event Accuracy: {is_event_accuracy:.2f}, F1 Score: {is_event_f1:.2f}")
print(f"Food Mentioned Accuracy: {food_mentioned_accuracy:.2f}, F1 Score: {food_mentioned_f1:.2f}")


Is Event Accuracy: 0.93, F1 Score: 0.95
Food Mentioned Accuracy: 0.97, F1 Score: 0.92


In [15]:
# Write the fine-tuned weights and the tokenizer files into the same directory
# so both can be reloaded together with from_pretrained
output_dir = "multi_label_event_model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('multi_label_event_model\\tokenizer_config.json',
 'multi_label_event_model\\special_tokens_map.json',
 'multi_label_event_model\\vocab.txt',
 'multi_label_event_model\\added_tokens.json')

In [17]:
from transformers import BertTokenizer, BertForSequenceClassification

# Reload the fine-tuned tokenizer and model from the saved directory
saved_model_dir = "multi_label_event_model"
tokenizer = BertTokenizer.from_pretrained(saved_model_dir)
model = BertForSequenceClassification.from_pretrained(saved_model_dir)
# Place the reloaded model on the same device used for training
model.to(device)

# Sample email text used to try out the reloaded model; it is passed through
# clean_text before tokenization.
new_email = """
Come and join us as we step into the new year with exciting activities! (  ≧ᗜ≦) ⋆⭒˚.⋆

Explore Middle Eastern culture through fun games and savor rich, flavorful, and exquisite food prepared just for YOU! (˶˃ ᵕ ˂˶)❤️


Save the date!🗓️🙀


📅Date : 24 January 2025

📍Venue : SMUC ALC 3.1-3.2

⏰Time: 4:00PM - 7:00PM



Food and refreshments will be provided!🍽️


Sign up HERE by clicking on the RSVP button below SMU Al Khaleej : Step Into the Middle East! Forms are open until 17 January 2025. Hurry as slots are limited! 😱

 

If you have any questions or concerns, do reach out to @hahahhaha via Telegram.

 

Follow us on Instagram and join our Telegram Group for our latest updates!

 

Should you wish to unsubscribe:

Internal recipients of SMU, please visit <link> to filter away this EDM.  
"""

# Apply the same normalisation that was used on the training bodies
new_email = clean_text(new_email)

# Tokenize into PyTorch tensors and move them to the model's device
encoded_email = tokenizer(new_email, return_tensors="pt", truncation=True, padding=True, max_length=512)
inputs = {key: val.to(device) for key, val in encoded_email.items()}

# Inference only: evaluation mode, no gradient tracking
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    # Independent probability for each label, thresholded at 0.5
    probs = torch.sigmoid(logits)
    prediction = probs > 0.5

# Columns follow the training label order: [is_event, food_mentioned]
print(f"Prediction: {prediction.cpu().numpy()}")


Prediction: [[ True  True]]
