# BertForSequenceClassification

In [2]:
import pandas as pd
import re
import emoji
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, AdamW
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer
from transformers import BertTokenizerFast

In [3]:
chunks = pd.read_csv("merged_emoji_data.csv", chunksize=1000)  # read in chunks of 1000 rows since there is a lot of data
final_merged_df = pd.concat(chunks, ignore_index=True)  # combine the chunks

In [4]:
# change the text, title, and ucode_short to type string
final_merged_df['text'] = final_merged_df['text'].astype('string')
final_merged_df['title'] = final_merged_df['title'].astype('string')
final_merged_df['ucode_short'] = final_merged_df['ucode_short'].astype('string')

# drop all the NA values 
final_merged_df = final_merged_df.dropna()

In [5]:
final_merged_df

Unnamed: 0,text,title,ucode_short,ucode,cleaned_text
0,Text,egg,1f95a,ü•ö,text
1,Happy EasterüòÄüêáü•ö,egg,1f95a,ü•ö,happy easter
2,@elonmusk @teslaownersSV #eggs Easter eggs tod...,egg,1f95a,ü•ö,eggs easter eggs today will surely be deliciou...
3,We hope everyone has a Hoppy Easter! üòâüê∞ü•öüê£üå∑ #H...,egg,1f95a,ü•ö,we hope everyone has a hoppy easter happyeaste...
4,üê∞ü•ö Get ready to hop into the most egg-citing d...,egg,1f95a,ü•ö,get ready to hop into the most eggciting digit...
...,...,...,...,...,...
848550,Happy Easter üê£ üíì from my brother ‚ù§Ô∏è Armen Grig...,hatching chick,1f423,üê£,happy easter from my brother armen grigoryan s...
848551,"Have a great finish to your weekend, Happy Eas...",hatching chick,1f423,üê£,have a great finish to your weekend happy east...
848552,"Beef ribs, smothered turkey wings, cheesy pota...",hatching chick,1f423,üê£,beef ribs smothered turkey wings cheesy potato...
848553,Happy Easter! Hope you all have an amazing day...,hatching chick,1f423,üê£,happy easter hope you all have an amazing day ...


In [6]:
label_encoder = LabelEncoder()
final_merged_df['emoji_label'] = label_encoder.fit_transform(final_merged_df['ucode'])

sample_size = 75000 

data_sampled, _ = train_test_split(
    final_merged_df,
    train_size=sample_size,
    stratify=final_merged_df['emoji_label'], 
    random_state=42
)
# # shuffle the reduced dataset
data_sampled = data_sampled.sample(frac=1, random_state=42).reset_index(drop=True)

# split data into training and testing sets
X = data_sampled['cleaned_text']
y = data_sampled['emoji_label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [7]:
X_train = X_train.astype(str).tolist()  # Convert to string type and then to list
X_test = X_test.astype(str).tolist()

# load the fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# tokenize data
X_train_tokens = tokenizer(X_train, padding=True, truncation=True, return_tensors="pt", max_length=512).to(device)
X_test_tokens = tokenizer(X_test, padding=True, truncation=True, return_tensors="pt", max_length=512).to(device)

# STOP

In [10]:
# BERT for classification with the number of emoji classes
num_classes = len(label_encoder.classes_)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes).to(device)

class CustomDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

train_dataset = CustomDataset(
    input_ids=X_train_tokens['input_ids'],
    attention_mask=X_train_tokens['attention_mask'],
    labels=torch.tensor(y_train.values)
)

eval_dataset = CustomDataset(
    input_ids=X_test_tokens['input_ids'],
    attention_mask=X_test_tokens['attention_mask'],
    labels=torch.tensor(y_test.values)
)

from sklearn.metrics import accuracy_score
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,  # Start with 8
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # Simulate larger batch size by accumulating gradients
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=False,  # Disable mixed precision if not using CUDA-enabled GPU
    evaluation_strategy="epoch",  # Evaluate after each epoch
    save_steps=500,
    eval_steps=500,
    max_grad_norm=1.0,  # Clip gradients
    report_to='none'
)


# Optimizer setup
optimizer = AdamW(model.parameters(), lr=1e-5)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),  # Use AdamW optimizer
)

# Training
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,3.1364,3.118223,0.170533
2,2.8733,3.050564,0.188667
3,2.9581,3.039346,0.193533


TrainOutput(global_step=11250, training_loss=3.0763958357069225, metrics={'train_runtime': 17774.496, 'train_samples_per_second': 10.127, 'train_steps_per_second': 0.633, 'total_flos': 9530919723600000.0, 'train_loss': 3.0763958357069225, 'epoch': 3.0})

In [8]:
import emoji
def clean_text(text):
    # convert to lowercase
    text = text.lower()
    # remove all mentions (e.g., @username)
    text = re.sub(r'@[\w]+', '', text)  # removes anything starting with @
    # remove emojis if not needed (optional, you can skip this part if emojis should be kept)
    text = emoji.replace_emoji(text, replace='')  # this removes all emojis

    # remove non-alphanumeric characters (keep letters and numbers)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [13]:
import emoji
# Define the input texts
new_texts = ["I'm anxious about class. Everything is stressing me out.", "Easter egg hunt this sunday!", "Good job on the promotion! Im'm very proud of you."]

# Clean and tokenize the texts
new_texts_cleaned = [clean_text(text) for text in new_texts]
new_tokens = tokenizer(
    new_texts_cleaned,
    padding=True,
    truncation=True,
    return_tensors="pt",
    max_length=512
)

# Move model and tokens to the appropriate device
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model = model.to(device)
new_tokens = {key: val.to(device) for key, val in new_tokens.items()}

# Get predictions from the model
model.eval()
with torch.no_grad():
    outputs = model(**new_tokens)
    logits = outputs.logits

# Decode the predicted labels
predicted_labels = torch.argmax(logits, dim=-1)
predicted_emoji = label_encoder.inverse_transform(predicted_labels.cpu().numpy())

# Display the predicted emojis for each input text
for text, emoji in zip(new_texts, predicted_emoji):
    print(f"Text: {text} -> Predicted Emoji: {emoji}")

Text: I'm anxious about class. Everything is stressing me out. -> Predicted Emoji: üò¢
Text: Easter egg hunt this sunday! -> Predicted Emoji: ü•ö
Text: Good job on the promotion! Im'm very proud of you. -> Predicted Emoji: üéâ


In [14]:
# Save the model (commented out so it doesn't save every time
# trainer.save_model("./BertForSequence_model")

In [9]:
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model = BertForSequenceClassification.from_pretrained("./BertForSequence_model").to(device)

In [10]:
import emoji
# Define the input texts
new_texts = ["Its a sunny day today!", 
             "I'm anxious about class. Everything is stressing me out", 
             "Easter egg hunt this sunday!", 
             "Good job on the promotion! Im'm very proud of you.",
            "Happy birthday amanda!", 
             "Nice work on the project!", 
             "Did you feed the bunny?",
             "I like cheese",
             "Woke brain virus",
            "Nick denies being gay",
            "I don't think ill be able to finish my project today"]

# Clean and tokenize the texts
new_texts_cleaned = [clean_text(text) for text in new_texts]
new_tokens = tokenizer(
    new_texts_cleaned,
    padding=True,
    truncation=True,
    return_tensors="pt",
    max_length=512
)

# Move model and tokens to the appropriate device
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model = model.to(device)
new_tokens = {key: val.to(device) for key, val in new_tokens.items()}

# Get predictions from the model
model.eval()
with torch.no_grad():
    outputs = model(**new_tokens)
    logits = outputs.logits

# Decode the predicted labels
predicted_labels = torch.argmax(logits, dim=-1)
predicted_emoji = label_encoder.inverse_transform(predicted_labels.cpu().numpy())

# Display the predicted emojis for each input text
for text, emoji in zip(new_texts, predicted_emoji):
    print(f"Text: {text} -> Predicted Emoji: {emoji}")

Text: Its a sunny day today! -> Predicted Emoji: ‚òÄ
Text: I'm anxious about class. Everything is stressing me out -> Predicted Emoji: üò¢
Text: Easter egg hunt this sunday! -> Predicted Emoji: ü•ö
Text: Good job on the promotion! Im'm very proud of you. -> Predicted Emoji: üéâ
Text: Happy birthday amanda! -> Predicted Emoji: üéâ
Text: Nice work on the project! -> Predicted Emoji: üëç
Text: Did you feed the bunny? -> Predicted Emoji: üê∞
Text: I like cheese -> Predicted Emoji: üòã
Text: Woke brain virus -> Predicted Emoji: üò®
Text: Nick denies being gay -> Predicted Emoji: üíÄ
Text: I don't think ill be able to finish my project today -> Predicted Emoji: üò¢
