In [1]:
import torch
import tensorflow as tf
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import pandas as pd

dataset = pd.read_csv("./augmented_train_dataset.csv", delimiter=",")
df = pd.DataFrame(dataset, columns=['Utterance','Emotion'])

replacement_string = "None"
df['Emotion'].fillna(replacement_string, inplace=True)

df = df[df['Emotion'] != "None"]
df = df[df['Emotion'] != "neutral"]

class_counts = df['Emotion'].value_counts()
print(class_counts)

Emotion
sadness     5708
surprise    4596
anger       4562
joy         1232
fear        1138
disgust     1065
Name: count, dtype: int64


In [4]:
X = df['Utterance'].values
y = df[['Emotion']].values

In [5]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Convert the output labels to integers using label encoding
label_encoders = []
y_encoded = []
for i in range(y.shape[1]):
    label_encoder = LabelEncoder()
    y_encoded.append(label_encoder.fit_transform(y[:, i]))
    label_encoders.append(label_encoder)

y_categorical = [to_categorical(y_label) for y_label in y_encoded]
print(y_categorical)

[array([[0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.]], dtype=float32)]


In [6]:
X = X.tolist()
y_cat = y_categorical[0].tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(X, y_cat, test_size=0.2, random_state=42)

In [7]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 28.0kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.10MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 2.64MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 566kB/s]


In [8]:
import urllib3, socket
from urllib3.connection import HTTPConnection

HTTPConnection.default_socket_options = ( 
    HTTPConnection.default_socket_options + [
    (socket.SOL_SOCKET, socket.SO_SNDBUF, 8000000), 
    (socket.SOL_SOCKET, socket.SO_RCVBUF, 8000000)
    ])

In [9]:
import requests

try:
    response = requests.head("https://huggingface.co/bert-base-uncased/resolve/main/config.json", timeout=30)
    if response.status_code == 200:
        model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6, resume_download=True, return_dict=True)  # Change 'num_labels' for multi-class tasks
    else:
        print(f"HTTP request failed with status code: {response.status_code}")
except requests.exceptions.RequestException as e:
    print(f"An error occurred: {str(e)}")


Downloading model.safetensors: 100%|██████████| 440M/440M [01:15<00:00, 5.87MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")

In [11]:
# Convert labels to tensors
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

In [12]:
train_dataset = TensorDataset(train_encodings["input_ids"], train_encodings["attention_mask"], torch.tensor(train_labels))
val_dataset = TensorDataset(val_encodings["input_ids"], val_encodings["attention_mask"], torch.tensor(val_labels))

  train_dataset = TensorDataset(train_encodings["input_ids"], train_encodings["attention_mask"], torch.tensor(train_labels))
  val_dataset = TensorDataset(val_encodings["input_ids"], val_encodings["attention_mask"], torch.tensor(val_labels))


In [13]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [14]:
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()



In [15]:
# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
epochs = 1  # Adjust the number of epochs as needed

for epoch in range(epochs):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
    
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs} - Average training loss: {avg_train_loss:.4f}")
    
    # Validation
    model.eval()
    val_predictions = []
    val_true_labels = []
    
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            
            val_predictions.extend(predictions.cpu().numpy())
            val_true_labels.extend(labels.cpu().numpy())
    
    val_accuracy = sum(1 for p, t in zip(val_predictions, val_true_labels) if p == t) / len(val_true_labels)
    print(f"Validation accuracy: {val_accuracy:.4f}")


KeyboardInterrupt: 

In [100]:
print("Number of output units (classes):", model.config.num_labels)

Number of output units (classes): 6


In [99]:
from tqdm import tqdm

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
epochs = 1  # Adjust the number of epochs as needed

for epoch in range(epochs):
    model.train()
    total_loss = 0
    
    # Create a tqdm progress bar for training
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}")
    
    for batch in progress_bar:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        
        # Update the progress bar with the current loss
        progress_bar.set_postfix({'Avg Loss': total_loss / len(progress_bar)})
    
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs} - Average training loss: {avg_train_loss:.4f}")
    
    # Validation
    model.eval()
    val_predictions = []
    val_true_labels = []
    
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            
            val_predictions.extend(predictions.cpu().numpy())
            val_true_labels.extend(labels.cpu().numpy())
    
    val_accuracy = sum(1 for p, t in zip(val_predictions, val_true_labels) if p == t) / len(val_true_labels)
    print(f"Validation accuracy: {val_accuracy:.4f}")


Epoch 1/1:   0%|          | 0/236 [00:00<?, ?it/s]

Epoch 1/1:   0%|          | 0/236 [00:21<?, ?it/s]


RuntimeError: The size of tensor a (32) must match the size of tensor b (6) at non-singleton dimension 1