In [3]:
!pip install datasets



In [4]:

from datasets import load_dataset
import pandas as pd

dataset = load_dataset("go_emotions")

README.md:   0%|          | 0.00/9.40k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/350k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [6]:
train_df = pd.DataFrame(dataset["train"])
test_df = pd.DataFrame(dataset["test"])
val_df = pd.DataFrame(dataset["validation"])

In [7]:
# tokenizer the dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
tokenized_datasets = dataset.map(lambda examples: tokenizer(examples["text"], padding="max_length", truncation=True), batched=True)

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [9]:
train_df = pd.DataFrame(tokenized_datasets["train"])
test_df = pd.DataFrame(tokenized_datasets["test"])
val_df = pd.DataFrame(tokenized_datasets["validation"])

In [11]:
train_labels

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [10]:
#one hot encode the labels
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(train_df["labels"])
test_labels = mlb.transform(test_df["labels"])
val_labels = mlb.transform(val_df["labels"])

# train_df['labels'] = train_df['labels'].apply(lambda x: [int(i) for i in x])

In [None]:
# change the labels into encoding labels
train_df['labels'] = train_labels.tolist()
test_df['labels'] = test_labels.tolist()
val_df['labels'] = val_labels.tolist()

In [None]:
train_df

In [None]:
# remove the index and text column
train_df.drop(columns=['text', 'id'], inplace=True, axis=1)
test_df.drop(columns=['text', 'id'], inplace=True, axis=1)
val_df.drop(columns=['text', 'id'], inplace=True, axis=1)


In [None]:
train_df

In [None]:
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)
val_df.to_csv('val.csv', index=False)

In [None]:
# now load the dataset into torch data loader
from torch.utils.data import DataLoader, Dataset
import torch
import pandas as pd

class GoEmotionsDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        row = self.dataframe.iloc[index]  # Access row by index using iloc
        input_ids = torch.tensor(row['input_ids'])
        attention_mask = torch.tensor(row['attention_mask'])
        labels = torch.tensor(row['labels'])
        # Assuming 'input_ids', 'attention_mask', and 'labels' are the relevant columns
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


In [None]:

# Create dataset instances
train_dataset = GoEmotionsDataset(train_df)
test_dataset = GoEmotionsDataset(test_df)
val_dataset = GoEmotionsDataset(val_df)

In [None]:
len(train_dataset)

In [None]:

# Now, use these datasets with DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# 2. Load tokenizer and model
model_name = "bert-base-uncased"

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=28)


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = AdamW(model.parameters(), lr=5e-5)
model = model.to(device)

In [None]:
for i in range(5):
    print(f"Epoch {i+1}")
    model.train()
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        batch['labels'] = batch['labels'].type(torch.float32)
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        torch.cuda.empty_cache() # Add this line
    print(f'loss: {loss.item()}')

In [None]:
from sklearn.metrics import accuracy_score, f1_score, hamming_loss
from torch.nn import BCEWithLogitsLoss  # Assuming binary cross-entropy loss

# Set model to evaluation mode
model.eval()

# Initialize accumulators for metrics and loss
val_loss = 0.0
val_steps = 0
all_preds = []
all_labels = []

# Define loss function
loss_fn = BCEWithLogitsLoss()

with torch.no_grad():
    for batch in val_loader:
        # Move batch to the correct device
        batch = {k: v.to(device) for k, v in batch.items()}
        batch['labels'] = batch['labels'].type(torch.float32)

        # Forward pass
        outputs = model(**batch)
        logits = outputs.logits
        loss = loss_fn(logits, batch['labels'])  # Compute validation loss

        # Calculate predictions
        preds = (torch.sigmoid(logits) > 0.5).int()

        # Accumulate predictions and labels
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

        # Update loss and step count
        val_loss += loss.item()
        val_steps += 1

    # Compute metrics
    val_loss /= val_steps
    val_acc = accuracy_score(all_labels, all_preds)
    val_f1 = f1_score(all_labels, all_preds, average='micro')
    val_hamm = hamming_loss(all_labels, all_preds)

    print(f"""Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f},
        Validation F1 Score: {val_f1:.4f}, Validation Hamming Loss: {val_hamm:.4f}""")


In [None]:
tokenizer.save_pretrained("bert Gomotions tokenizer")
model.save_pretrained("bert Gomotions")

In [None]:
from huggingface_hub import notebook_login

notebook_login()


In [None]:
model.push_to_hub('codewithdark/bert-Gomotions')

In [None]:
tokenizer.push_to_hub('codewithdark/bert-Gomotions')

In [None]:
import torch
import datasets
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader


data_list = []
for batch in train_dataset:
    data_list.extend([{key: value.tolist() for key, value in batch.items()}])

# Convert to Hugging Face Dataset
hf_dataset = Dataset.from_list(data_list)

In [None]:
data_list = []
for batch in test_dataset:
    data_list.extend([{key: value.tolist() for key, value in batch.items()}])

# Convert to Hugging Face Dataset
hf_dataset_test = Dataset.from_list(data_list)

In [None]:
data_list = []
for batch in val_dataset:
    data_list.extend([{key: value.tolist() for key, value in batch.items()}])

# Convert to Hugging Face Dataset
hf_dataset_val = Dataset.from_list(data_list)

In [None]:
hf_dataset_val

In [None]:
from datasets import DatasetDict

# Create a DatasetDict with train, validation, and test sets
dataset_dict = DatasetDict({
    "train": hf_dataset,
    "validation": hf_dataset_val,
    "test": hf_dataset_test
})


In [None]:
dataset_dict

In [None]:
# Push to Hugging Face Hub
dataset_dict.push_to_hub("codewithdark/Gomotions-tokenizer")

In [None]:
from datasets import load_dataset

dataset = load_dataset("codewithdark/Gomotions-tokenizer")


In [None]:
dataset

In [18]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load model and tokenizer
model_name = "codewithdark/bert-Gomotions"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Emotion labels (adjust based on your dataset)
emotion_labels = [
    "admiration",
    "amusement",
    "anger", 
    "annoyance",
    "approval",
    "caring",
    "confusion",
    "curiosity",
    "desire",
    "disappointment",
    "disapproval",
    "disgust",
    "embarrassment",
    "excitement",
    "fear",
    "gratitude",
    "grief",
    "joy",
    "love",
    "nervousness",
    "optimism",
    "pride",
    "realization",
    "relief",
    "remorse",
    "sadness",
    "surprise",
    "neutral",
]

# Example text
text = "I'm so happy today!"
inputs = tokenizer(text, return_tensors="pt")

# Predict
with torch.no_grad():
    outputs = model(**inputs)
    probs = torch.sigmoid(outputs.logits).squeeze(0)  # Convert logits to probabilities

# Get top 5 predictions
top5_indices = torch.argsort(probs, descending=True)[:5]  # Get indices of top 5 labels
top5_labels = [emotion_labels[i] for i in top5_indices]
top5_probs = [probs[i].item() for i in top5_indices]


In [19]:
top5_labels

['joy', 'gratitude', 'excitement', 'admiration', 'relief']

In [20]:
top5_probs

[0.9458723664283752,
 0.028213057667016983,
 0.010968657210469246,
 0.00972858164459467,
 0.00642473204061389]

In [1]:
from transformers import pipeline

classifier = pipeline("text-classification", model="codewithdark/bert-Gomotions", top_k=None)
pred = classifier("I'm so excited about the trip!")

config.json:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cpu


In [2]:
pred

[[{'label': 'LABEL_13', 'score': 0.946674644947052},
  {'label': 'LABEL_17', 'score': 0.03159971162676811},
  {'label': 'LABEL_27', 'score': 0.014035654254257679},
  {'label': 'LABEL_7', 'score': 0.011113960295915604},
  {'label': 'LABEL_4', 'score': 0.009281047619879246},
  {'label': 'LABEL_26', 'score': 0.005930771119892597},
  {'label': 'LABEL_0', 'score': 0.0048994808457791805},
  {'label': 'LABEL_22', 'score': 0.003083831397816539},
  {'label': 'LABEL_8', 'score': 0.0030112252570688725},
  {'label': 'LABEL_20', 'score': 0.0027980185113847256},
  {'label': 'LABEL_15', 'score': 0.002765149110928178},
  {'label': 'LABEL_18', 'score': 0.0013140254886820912},
  {'label': 'LABEL_21', 'score': 0.001248006010428071},
  {'label': 'LABEL_23', 'score': 0.001191668095998466},
  {'label': 'LABEL_1', 'score': 0.001108821015805006},
  {'label': 'LABEL_9', 'score': 0.0009146715747192502},
  {'label': 'LABEL_14', 'score': 0.0008913648780435324},
  {'label': 'LABEL_10', 'score': 0.00084766512736678