In [None]:
# Install the third-party packages used by this notebook into the kernel's environment.
# NOTE(review): no versions are pinned, so a fresh environment may resolve different releases.
%pip install transformers
%pip install scikit-learn
# NOTE(review): the active-learning cell below calls `modal.uncertainty_sampling`; that looks
# like the modAL library (presumably published as `modAL-python`) rather than the `modal`
# package installed here — confirm which one is intended.
%pip install modal
%pip install datasets
# -U upgrades accelerate if an older copy is already installed.
%pip install accelerate -U

In [23]:
import transformers
from sklearn.model_selection import train_test_split
import modal
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer

In [9]:
# Pick the compute device: use the GPU when CUDA is reachable, otherwise stay on the CPU.

from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(device)

cuda


In [None]:
# Emotion name -> integer code. Codes are 1-based and follow the order of the
# names listed below ('admiration' is 1, 'neutral' is 28).
emotion_mapping = {
    name: number
    for number, name in enumerate(
        [
            'admiration', 'amusement', 'anger', 'annoyance',
            'approval', 'caring', 'confusion', 'curiosity',
            'desire', 'disappointment', 'disapproval', 'disgust',
            'embarrassment', 'excitement', 'fear', 'gratitude',
            'grief', 'joy', 'love', 'nervousness', 'optimism',
            'pride', 'realization', 'relief', 'remorse', 'sadness',
            'surprise', 'neutral',
        ],
        start=1,
    )
}

def process_csv(file_path, mapping=None):
    """Load a labelled CSV and encode its emotion names as integer codes.

    Args:
        file_path: Path of a comma-delimited CSV that has at least a ``text``
            and an ``emotion`` column.
        mapping: Optional dict of emotion name -> integer code. When omitted,
            the module-level ``emotion_mapping`` table is used.

    Returns:
        A new DataFrame with exactly two columns: ``text`` (cast to ``str``)
        and ``emotion`` (cast to ``int``). Emotion values that are missing or
        not present in the mapping are encoded as ``0``. The row index of the
        CSV frame is preserved.
    """
    # Resolved lazily so callers that pass their own table never touch the global.
    if mapping is None:
        mapping = emotion_mapping

    df = pd.read_csv(file_path, delimiter=',')

    # Vectorized lookup; names absent from the mapping come back as NaN and
    # are folded into the "unknown" code 0 before the integer cast.
    emotion_codes = df['emotion'].map(mapping).fillna(0).astype(int)

    # Build a fresh frame instead of writing into a column slice of `df`,
    # which would raise pandas' SettingWithCopyWarning.
    return pd.DataFrame({
        'text': df['text'].astype(str),
        'emotion': emotion_codes,
    })

# Load the seed set of labelled examples; the relative path is resolved against the
# notebook's current working directory.
initial_labeled_data = process_csv('active_learning_emotions.csv')

In [18]:
# NOTE(review): only a cell's final expression is auto-displayed, so this bare `.shape`
# is evaluated and discarded — print it or move it to its own cell to actually see it.
initial_labeled_data.shape
# Plain-text dump of the whole labelled frame.
print(initial_labeled_data)

                                                 text  emotion
0   Dear user, we've detected unauthorized access ...       15
1   We've noticed a suspicious login attempt from ...       15
2   Our systems have detected unusual activity on ...       15
3   Be advised that we've identified an attempt to...       15
4   Our security system has flagged your account f...       15
5   Discover the secrets of your account with hidd...        8
6   Ever wondered how secure your data really is? ...        8
7   We've added some exciting new features to your...        8
8   We've made a change to your account. Can you s...        8
9   We have a surprise waiting in your account. So...        8
10  This is your chance! An exclusive offer just f...       14
11  Congratulations! Your account has been selecte...       14
12  You've earned it! A special reward awaits you ...       14
13  Get ready for an exciting opportunity! You're ...       14
14  You're almost there! Confirm your details now ...  

In [31]:
# Fetch the 'SetFit/enron_spam' dataset through `datasets.load_dataset`.
enron_dataset = load_dataset('SetFit/enron_spam')

# Show the summary (feature names and row count) of the 'train' split.
print(enron_dataset['train'])

Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['message_id', 'text', 'label', 'label_text', 'subject', 'message', 'date'],
    num_rows: 31716
})


In [25]:
# Load the tokenizer for the pre-trained RoBERTa GoEmotions checkpoint
model_name = "SamLowe/roberta-base-go_emotions"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
# NOTE(review): the model load below is commented out, yet the training cell later in this
# notebook calls `model.parameters()` — re-enable it or make sure `model` is defined elsewhere.
# model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name)

In [24]:
# tokenizer = AutoTokenizer.from_pretrained(model_name)  # Use your model_name

# def tokenize_function(examples):
#     return tokenizer(examples, padding='max_length', truncation=True)

# tokenized_dataset = initial_labeled_data['text'].map(tokenize_function)

# Updated logic for accessing tokenized data (assuming PyTorch)
# X_train = [encoding['input_ids'] for encoding in tokenized_dataset]
# y_train = [encoding['input_ids'] for encoding in initial_labeled_data['emotion'].map(tokenize_function)]

class SentimentData(Dataset):
    """Map-style dataset that tokenizes one row of a labelled frame on demand.

    Args:
        dataframe: DataFrame exposing a ``text`` column and an ``emotion`` column.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that accepts
            the keyword arguments used in ``__getitem__`` and returns a mapping
            with ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: Length every encoded sequence is padded or truncated to.

    Rows are addressed by position, so the frame does not need a 0..n-1 index.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.emotion
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the text column."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the tensors for the row at position ``index``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (``torch.long``)
            and ``targets`` (``torch.float``).
        """
        # .iloc is positional: label-based `series[index]` raises KeyError (or
        # returns the wrong row) when the frame was filtered or shuffled
        # without resetting its index.
        text = str(self.text.iloc[index])
        # Collapse runs of whitespace/newlines into single spaces.
        text = " ".join(text.split())

        # Calling the tokenizer directly is the documented replacement for the
        # deprecated `encode_plus`.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        # NOTE(review): targets are emitted as float; a loss that expects class
        # indices (e.g. cross-entropy) would need them cast to long — confirm
        # against the loss actually used.
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }


In [None]:
# Renumber the rows 0..n-1 (dropping the old index) before handing the frame to the dataset.
train_data = initial_labeled_data.reset_index(drop=True)
# 256 is passed as max_len: the length each encoded text is padded or truncated to.
training_set = SentimentData(train_data, tokenizer, 256)

In [7]:
def get_new_labels_from_human(unlabeled_examples, valid_labels=('positive', 'negative', 'neutral')):
    """Interactively collect one label per example from a human annotator.

    Each example is printed, then the annotator is prompted until the typed
    answer (with surrounding whitespace removed) is one of ``valid_labels``.

    Args:
        unlabeled_examples: Iterable of examples to show to the annotator.
        valid_labels: Iterable of accepted label strings. Defaults to the
            three labels 'positive', 'negative' and 'neutral'.

    Returns:
        list of the chosen label strings, in the order the examples were given.

    Raises:
        ValueError: if ``valid_labels`` is empty (no answer could ever be accepted).
    """
    choices = list(valid_labels)
    if not choices:
        raise ValueError("valid_labels must contain at least one label")

    # Built once; with the default labels this reads
    # "Choose one: 'positive', 'negative', 'neutral': ".
    prompt = "Choose one: " + ", ".join(repr(choice) for choice in choices) + ": "

    new_labels = []
    for example in unlabeled_examples:
        print("Example:", example)
        label_choice = input(prompt).strip()
        while label_choice not in choices:
            print("Invalid input. Please enter one of the valid choices.")
            label_choice = input(prompt).strip()
        new_labels.append(label_choice)

    return new_labels


In [8]:
# Active-learning loop sketch: train, pick uncertain samples, ask a human to label
# them, then move them from the unlabeled pool into the training data.
# NOTE(review): `model`, `X_train`, `y_train`, `X_unlabeled`, `loss_function` and `np` are
# not defined or imported in any cell shown in this notebook — this cell cannot run on a
# fresh kernel until they are.
# from torch.utils.data import DataLoader
import torch.optim as optim

# ... Assuming you have X_train and y_train prepared as tensors
# Number of train -> query -> label rounds.
num_iterations = 10

optimizer = optim.Adam(model.parameters())  # Example optimizer
# NOTE(review): the loader is built once, before the loop, from the current X_train;
# rebinding X_train at the end of each round does not change what this loader iterates.
train_dataloader = DataLoader(X_train, batch_size=16, shuffle=True)

for _ in range(num_iterations):
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        # NOTE(review): assumes each batch is indexable as (input_ids, attention_mask) — confirm.
        outputs = model(input_ids=batch[0], attention_mask=batch[1])
        # NOTE(review): the loss is computed against the whole y_train, not the labels
        # belonging to this batch.
        loss = loss_function(outputs, y_train)  # Replace loss_function with an appropriate loss
        loss.backward()
        optimizer.step()

    # Query strategy: Uncertainty Sampling
    model.eval()  # Switch model to evaluation mode for uncertainty estimation
    # NOTE(review): presumably modAL's `uncertainty_sampling` is meant here; verify that the
    # imported `modal` module provides it and that it accepts a torch model.
    query_idx = modal.uncertainty_sampling(model, X_unlabeled)

    # Get labels from a human annotator for the selected instances
    # (get_new_labels_from_human returns a list of the typed label strings).
    new_labels = get_new_labels_from_human(X_unlabeled[query_idx])

    # Update datasets (assuming new_labels is a list or array)
    X_train = np.concatenate([X_train, X_unlabeled[query_idx]])
    # NOTE(review): new_labels holds strings, so this mixes them into y_train as-is.
    y_train = np.concatenate([y_train, new_labels])

    # Remove queried instances from the unlabeled pool
    X_unlabeled = np.delete(X_unlabeled, query_idx, axis=0)


ValueError: not enough values to unpack (expected 2, got 1)