In [None]:
#pip install pandas numpy transformers torch scikit-learn streamlit

In [19]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForSequenceClassification
from sklearn.metrics import f1_score, hamming_loss, accuracy_score

In [20]:
# Load data
def _read_split(path):
    """Read one headerless GoEmotions TSV split (text, labels, id) into a DataFrame."""
    return pd.read_csv(
        path,
        sep='\t',
        header=None,
        names=['text', 'labels', 'id'],
        # Keep the label column as text even when every row of a split holds a
        # single id; otherwise pandas infers integers and `.split(',')` fails.
        dtype={'labels': str},
    )


train_data = _read_split('train.tsv')
dev_data = _read_split('dev.tsv')
test_data = _read_split('test.tsv')

# Combine data
data = pd.concat([train_data, dev_data, test_data], ignore_index=True)

# Convert labels from comma-separated strings to lists of integer label ids
data['labels'] = data['labels'].apply(lambda x: [int(i) for i in x.split(',')])

# Load emotion labels (one name per line). Blank lines are skipped so that a
# trailing empty line in the file cannot inflate num_labels.
with open('emotions.txt', 'r', encoding='utf-8') as f:
    emotions = [line.strip() for line in f if line.strip()]

num_labels = len(emotions)

In [21]:
class GoEmotionsDataset(Dataset):
    """Torch dataset pairing tokenized texts with multi-hot label targets.

    Args:
        texts: Sequence of raw texts; each item is passed through ``str``.
        labels: Sequence aligned with ``texts``; each item is a list of
            integer label ids that apply to that text.
        tokenizer: Object exposing ``encode_plus`` with the keyword arguments
            used in ``__getitem__``.
        max_len: Length every encoded sequence is truncated/padded to.
        num_labels: Size of the multi-hot target vector.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128, num_labels=28):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.num_labels = num_labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and multi-hot ``labels`` for one example.

        Raises:
            IndexError: If a label id falls outside ``[-num_labels, num_labels)``.
        """
        text = str(self.texts[idx])
        label_ids = self.labels[idx]

        # Build a fixed-size multi-hot target. Returning the raw id list as a
        # tensor would give every sample a different length (so samples could
        # not be stacked into a batch) and would not line up with one output
        # per label.
        target = torch.zeros(self.num_labels, dtype=torch.float)
        # An explicit long index tensor also covers an empty label list.
        target[torch.as_tensor(label_ids, dtype=torch.long)] = 1.0

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # The tokenizer output is flattened to 1-D so batching adds the batch axis.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': target
        }

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the combined examples for validation (fixed seed for repeatability)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['text'].tolist(), data['labels'].tolist(), test_size=0.1, random_state=42
)

# Tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Wrap each split in a dataset
train_dataset = GoEmotionsDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = GoEmotionsDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

# Batch both datasets; only the training batches are shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16)


In [23]:
# Pick the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Pretrained 'bert-base-uncased' with one output per label, configured for
# multi-label classification
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=num_labels, problem_type='multi_label_classification'
)

# Move the model to the chosen device (kept as the last expression so the
# cell still echoes the model)
model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [24]:
# AdamW now comes from PyTorch: the implementation in transformers is deprecated
# and not available in recent releases of that library.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# NOTE(review): 100 epochs is far more than is commonly used to fine-tune a
# pretrained BERT model — confirm this value is intended.
epochs = 100

# eps and weight_decay are pinned to the defaults of the former transformers
# AdamW so the update rule stays as close as possible to the original setup.
# The old `correct_bias=False` switch has no PyTorch equivalent; bias
# correction is always applied here.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# One scheduler step per optimizer step over the whole run
total_steps = len(train_loader) * epochs

# Linear decay of the learning rate to zero over total_steps, with no warmup
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)




In [25]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast

class GoEmotionsDataset(Dataset):
    """Dataset yielding tokenized text together with a multi-hot label vector.

    Args:
        texts: Indexable collection of texts (each converted with ``str``).
        labels: Indexable collection, aligned with ``texts``, of label-id lists.
        tokenizer: Object providing ``encode_plus``.
        max_len: Fixed length used for truncation and padding.
        num_labels: Length of the multi-hot target vector.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128, num_labels=28):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len, self.num_labels = max_len, num_labels

    def __len__(self):
        """Number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Build the model inputs and target for the example at ``idx``."""
        sample_text = str(self.texts[idx])
        active_ids = self.labels[idx]

        # Start from all zeros and switch on the positions named by the ids
        multi_hot = torch.zeros(self.num_labels, dtype=torch.float)
        multi_hot[active_ids] = 1.0

        encoded = self.tokenizer.encode_plus(
            sample_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            add_special_tokens=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Collapse the tokenizer tensors to 1-D before returning them
        item = {key: encoded[key].reshape(-1) for key in ('input_ids', 'attention_mask')}
        item['labels'] = multi_hot
        return item

# Tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# One target slot per emotion name; relies on `emotions` from an earlier cell
num_labels = len(emotions)  # expected to be 28 for GoEmotions

# Rebuild both datasets with the multi-hot dataset class defined above
train_dataset = GoEmotionsDataset(
    texts=train_texts, labels=train_labels, tokenizer=tokenizer, num_labels=num_labels
)
val_dataset = GoEmotionsDataset(
    texts=val_texts, labels=val_labels, tokenizer=tokenizer, num_labels=num_labels
)

# Batch both datasets; only the training batches are shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16)


In [26]:
# Persist the model and the tokenizer side by side in the 'emotion_model' directory.
# NOTE(review): no training loop is visible between model creation and this save
# in the part of the notebook shown here, so the saved classifier may still be
# untrained — confirm training has run before relying on this checkpoint.
model.save_pretrained('emotion_model')
tokenizer.save_pretrained('emotion_model')

('emotion_model/tokenizer_config.json',
 'emotion_model/special_tokens_map.json',
 'emotion_model/vocab.txt',
 'emotion_model/added_tokens.json',
 'emotion_model/tokenizer.json')

Next: continue from the "Deployment with Streamlit" step.