In [1]:
from transformers import AutoTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import torch.nn as nn
import os
from dotenv import load_dotenv
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from huggingface_hub import HfApi
import re

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
# Load variables from a local .env file (python-dotenv); kept as the last
# expression so the cell still displays the call's return value.
load_dotenv()


  from .autonotebook import tqdm as notebook_tqdm


cuda


True

In [None]:
# Lower-case lookup table used by the text-cleaning step: each key is a
# contraction or chat abbreviation, each value is its spelled-out form.
# Every key appears exactly once, and no expansion contains a contraction
# itself, so a single substitution pass leaves the text fully expanded.
word_subs = {
    # --- negative contractions ---
    "can't": "cannot",
    "won't": "will not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "haven't": "have not",
    "he'll": "he will",
    "she'll": "she will",
    "they'll": "they will",
    "aren't": "are not",
    "isn't": "is not",
    "wasn't": "was not",
    "weren't": "were not",
    "doesn't": "does not",
    "don't": "do not",
    "didn't": "did not",
    "hasn't": "has not",
    "hadn't": "had not",
    "is'nt": "is not",  # common misspelling of "isn't"
    # --- "is" / "us" contractions ---
    "it's": "it is",
    "that's": "that is",
    "who's": "who is",
    "what's": "what is",
    "here's": "here is",
    "there's": "there is",
    "let's": "let us",
    # --- "will" / "have" / "would" / "am" / "are" contractions ---
    "i'll": "i will",
    "you'll": "you will",
    "we'll": "we will",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "we'd": "we would",
    "they'd": "they would",
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "we're": "we are",
    "they're": "they are",
    # --- chat abbreviations ---
    "wtf": "what the fuck",
    "brb": "be right back",
    "btw": "by the way",
    "idk": "I do not know",
    "lol": "laugh out loud",
    "smh": "shaking my head",
    "omg": "oh my god",
    "lmao": "laughing my ass off",
    "rofl": "rolling on the floor laughing",
    "tbh": "to be honest",
    "fyi": "for your information",
    "tmi": "too much information",
    "asap": "as soon as possible",
    "np": "no problem",
    "yolo": "you only live once",
    "bff": "best friends forever",
    "bbl": "be back later",
    "gtg": "got to go",
    "ttyl": "talk to you later",
    "omfg": "oh my fucking god",
    "fml": "fuck my life",
    "dm": "direct message",
    "tl;dr": "too long; did not read",
    "nvm": "never mind",
    "l8r": "later",
    "2moro": "tomorrow",
    "cya": "see you",
    "bday": "birthday",
}

def replace_words(text, word_subs):
    """Replace every whole-word occurrence of a key of ``word_subs`` in ``text``.

    Matching is case-insensitive, so a lower-case key such as ``"i'm"`` also
    rewrites ``"I'm"``; the replacement text is inserted exactly as stored in
    the mapping.

    Args:
        text: The string to clean.
        word_subs: Mapping from the word/abbreviation to look for to the
            string that should replace it.

    Returns:
        A new string with all matches substituted. ``text`` is returned
        unchanged when ``word_subs`` is empty.
    """
    if not word_subs:
        # An empty alternation would match the empty string at every word
        # boundary and then fail the dictionary lookup.
        return text

    # Case-folded view of the mapping so a match can be looked up regardless
    # of how it was capitalised in the text.
    lookup = {key.lower(): replacement for key, replacement in word_subs.items()}

    # Longest keys first so a short key can never shadow a longer one that
    # starts with it; re.escape keeps characters in keys from being read as
    # regex syntax.
    alternatives = sorted(lookup, key=len, reverse=True)
    pattern = re.compile(
        r'\b(' + '|'.join(re.escape(key) for key in alternatives) + r')\b',
        re.IGNORECASE,
    )
    return pattern.sub(lambda match: lookup[match.group(0).lower()], text)

In [3]:
# Load the train and test splits of the GoEmotions dataset from the Hugging Face hub.
train_dataset = load_dataset(path='go_emotions', split='train')
test_dataset = load_dataset(path='go_emotions', split='test')

# Peek at the first training row. Because this is not the last expression in
# the cell, the notebook does not display its value.
train_dataset[0]

class CustomDataset(Dataset):
    """Map-style dataset over three parallel, index-aligned collections.

    Each item is a dict with the keys ``input_ids``, ``attention_mask`` and
    ``labels``, taken from the same position of the collections passed to the
    constructor.
    """

    def __init__(self, input_ids, attention_masks, labels):
        """Store the three collections as given (no copy, no validation)."""
        self.input_ids, self.attention_masks, self.labels = (
            input_ids,
            attention_masks,
            labels,
        )

    def __len__(self):
        """Return the number of examples, measured on ``input_ids``."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of its three fields."""
        columns = (
            ('input_ids', self.input_ids),
            ('attention_mask', self.attention_masks),
            ('labels', self.labels),
        )
        return {field: values[idx] for field, values in columns}

In [4]:
# Tokenizer for the 'distilbert-base-uncased' checkpoint, the same checkpoint
# name the classification model in this notebook is loaded from. It is used as
# a module-level global by tokenize_and_encode_dataset.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def convert_to_multihot(label_list, num_labels):
    """Encode a collection of class indices as a multi-hot vector.

    Args:
        label_list: Iterable of integer class indices that are active.
        num_labels: Length of the returned vector.

    Returns:
        A 1-D float tensor of length ``num_labels`` holding 1 at every index
        listed in ``label_list`` and 0 everywhere else.
    """
    encoded = torch.zeros(num_labels)
    # Set all active positions in one indexed assignment.
    active_positions = torch.tensor(list(label_list), dtype=torch.long)
    encoded[active_positions] = 1
    return encoded

def tokenize_and_encode_dataset(dataset, num_labels=28, max_length=512):
    """Clean, tokenize and label-encode every row of ``dataset``.

    For each row, the ``'text'`` field is passed through ``replace_words``
    with the module-level ``word_subs`` table, tokenized with the module-level
    ``tokenizer`` (padded/truncated to ``max_length`` tokens), and the
    ``'labels'`` field is converted to a multi-hot vector.

    Args:
        dataset: Indexable collection with ``len()`` whose items are mappings
            containing ``'text'`` and ``'labels'``.
        num_labels: Size of the multi-hot label vector.
        max_length: Fixed sequence length every example is padded or
            truncated to. Defaults to 512; a smaller value shrinks the
            tensors proportionally.

    Returns:
        A ``CustomDataset`` wrapping the stacked input ids, attention masks
        and multi-hot labels. For an empty ``dataset`` the wrapped tensors
        have zero rows.
    """
    input_ids = []
    attention_masks = []
    labels = []

    for i in tqdm(range(len(dataset))):
        # Look the row up once and reuse it for both the text and the labels.
        sample = dataset[i]
        cleaned_text = replace_words(sample['text'], word_subs)

        tokenized_input = tokenizer(
            cleaned_text,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch dimension of 1; drop it
        # so the per-example tensors can be stacked below.
        input_ids.append(tokenized_input['input_ids'].squeeze(0))
        attention_masks.append(tokenized_input['attention_mask'].squeeze(0))
        labels.append(convert_to_multihot(sample['labels'], num_labels=num_labels))

    if not input_ids:
        # torch.stack raises on an empty list, so build zero-row tensors
        # with the same trailing dimensions instead.
        return CustomDataset(
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0, num_labels)),
        )

    return CustomDataset(
        torch.stack(input_ids),
        torch.stack(attention_masks),
        torch.stack(labels),
    )
        
# Encode the train split first, then the test split, into tensor-backed datasets.
training, testing = (
    tokenize_and_encode_dataset(split)
    for split in (train_dataset, test_dataset)
)

100%|██████████| 43410/43410 [00:23<00:00, 1815.91it/s]
100%|██████████| 5427/5427 [00:02<00:00, 1851.80it/s]


In [5]:
# Sanity check on the first training row: show the token ids and the text.
# Note this tokenizes the raw text, not the output of replace_words.
sample_text = train_dataset[0]['text']
print(tokenizer(sample_text))
print(sample_text)

{'input_ids': [101, 2026, 8837, 2833, 2003, 2505, 1045, 2134, 1005, 1056, 2031, 2000, 5660, 2870, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
My favourite food is anything I didn't have to cook myself.


In [6]:
# DistilBERT with a classification head sized for 28 labels, configured for
# multi-label classification.
base_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    problem_type='multi_label_classification',
    num_labels=28,
)

# Move the model to the device selected at the top of the notebook.
model = base_model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# torch.compile returns a new compiled wrapper instead of changing the module
# it is given, so a bare `torch.compile(model)` whose result is thrown away
# has no effect and is not called here. To train with compilation, pass
# `torch_compile=True` to TrainingArguments.

training_args = TrainingArguments(
    output_dir='./results',
    # Evaluation and checkpointing share one step schedule, as
    # load_best_model_at_end requires.
    eval_strategy='steps',
    eval_steps=500,
    save_strategy='steps',
    save_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=30,
    per_device_eval_batch_size=30,
    num_train_epochs=3,
    weight_decay=.01,
    logging_dir='./logs',
    # Training loss is logged once per epoch. A `logging_steps` value is
    # ignored under this strategy, so none is set.
    logging_strategy='epoch',
    load_best_model_at_end=True,
    # 30 samples x 4 accumulation steps = 120 samples per optimizer step
    # (per device).
    gradient_accumulation_steps=4,
    # Mixed-precision fp16 training needs a CUDA device; disable it when the
    # notebook falls back to CPU instead of failing at construction time.
    fp16=torch.cuda.is_available(),
)

# NOTE(review): `testing` is built from the 'test' split, and
# load_best_model_at_end picks the checkpoint by evaluation loss, so the test
# split influences model selection. Consider evaluating on a validation split
# here (if the dataset provides one) and keeping the test split for the final
# report.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training,
    eval_dataset=testing
)

In [8]:
# Run fine-tuning with the Trainer configured above. As the cell's last
# expression, the returned TrainOutput (global step, training loss, metrics)
# is displayed below the progress logs.
trainer.train()

 33%|███▎      | 361/1083 [05:24<09:38,  1.25it/s]

{'loss': 0.2099, 'grad_norm': 0.6607567667961121, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.0}


                                                  
 46%|████▌     | 500/1083 [07:25<07:47,  1.25it/s]

{'eval_loss': 0.10677764564752579, 'eval_runtime': 9.9292, 'eval_samples_per_second': 546.568, 'eval_steps_per_second': 18.229, 'epoch': 1.38}


 67%|██████▋   | 723/1083 [11:15<16:26,  2.74s/it]

{'loss': 0.1088, 'grad_norm': 0.6569858193397522, 'learning_rate': 6.6481994459833796e-06, 'epoch': 2.0}


                                                   
 92%|█████████▏| 1000/1083 [15:36<01:12,  1.14it/s]

{'eval_loss': 0.09499209374189377, 'eval_runtime': 10.6894, 'eval_samples_per_second': 507.699, 'eval_steps_per_second': 16.933, 'epoch': 2.77}


100%|██████████| 1083/1083 [16:49<00:00,  1.07it/s]

{'loss': 0.0972, 'grad_norm': 0.6781384348869324, 'learning_rate': 0.0, 'epoch': 3.0}
{'train_runtime': 1009.7368, 'train_samples_per_second': 128.974, 'train_steps_per_second': 1.073, 'train_loss': 0.13864698964803174, 'epoch': 3.0}





TrainOutput(global_step=1083, training_loss=0.13864698964803174, metrics={'train_runtime': 1009.7368, 'train_samples_per_second': 128.974, 'train_steps_per_second': 1.073, 'total_flos': 1.722344547631104e+16, 'train_loss': 0.13864698964803174, 'epoch': 2.995162404975812})

In [13]:
# Persist the fine-tuned model. The Trainer in this notebook was created
# without the tokenizer, so it is saved explicitly into the same directory;
# that way './saved_model' can be loaded for inference on its own.
trainer.save_model('./saved_model')
tokenizer.save_pretrained('./saved_model')