In [1]:
# import all dependencies needed
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import os

In [19]:
# download the pretrained model and tokenizers for transfer learning
tokenizer = AutoTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")
model = AutoModelForSequenceClassification.from_pretrained("siebert/sentiment-roberta-large-english", num_labels=2)
no_pretrained_model = AutoModelForSequenceClassification.from_pretrained("roberta-large", num_labels=2)



config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Read in our data from our dataset
if not os.path.isdir('amazon_sentiment_data'):
    raise ValueError('amazon_sentiment_data is not a directory')

labels = []
texts = []

with open('amazon_sentiment_data/reviews.txt') as f:
    for line in f:
        label, text = line[9], line[11:]
        if label not in ['2', '1']:
            print(label)
            raise ValueError(f'Invalid label: {label}')
        if label == '2':
            labels.append(1)
        else:
            labels.append(0)
        texts.append(text.strip())

In [6]:
# Tokenize the data
tokenized_data = tokenizer(texts, max_length=256, padding='max_length', truncation=True, return_tensors='pt')


In [7]:
input_ids = tokenized_data['input_ids']
attention_masks = tokenized_data['attention_mask']
labels = labels 

# Split the data into training and testing sets
train_inputs, temp_test_inputs, train_masks, temp_test_masks, train_labels, temp_test_labels = train_test_split(
    input_ids, attention_masks, labels, test_size=0.2, random_state=42
)

# I also want a test set and validation set
test_inputs, val_inputs, test_masks, val_masks, test_labels, val_labels = train_test_split(
    temp_test_inputs, temp_test_masks, temp_test_labels, test_size=0.5, random_state=42
)

In [8]:
# Convert labels to tensors
if not torch.is_tensor(train_labels):
    train_labels = torch.tensor(train_labels)
if not torch.is_tensor(test_labels):
    test_labels = torch.tensor(test_labels)
if not torch.is_tensor(train_inputs):
    train_inputs = torch.tensor(train_inputs)
if not torch.is_tensor(test_inputs):
    test_inputs = torch.tensor(test_inputs)
if not torch.is_tensor(train_masks):
    train_masks = torch.tensor(train_masks)
if not torch.is_tensor(test_masks):
    test_masks = torch.tensor(test_masks)
if not torch.is_tensor(val_inputs):
    val_inputs = torch.tensor(val_inputs)
if not torch.is_tensor(val_masks):
    val_masks = torch.tensor(val_masks)
if not torch.is_tensor(val_labels):
    val_labels = torch.tensor(val_labels)

print(train_inputs.shape)
print(train_labels.shape)
print(train_masks.shape)
print(test_masks.shape)
print(test_labels.shape)
print(test_inputs.shape)
print(val_inputs.shape)
print(val_masks.shape)
print(val_labels.shape)

torch.Size([320000, 256])
torch.Size([320000])
torch.Size([320000, 256])
torch.Size([40000, 256])
torch.Size([40000])
torch.Size([40000, 256])
torch.Size([40000, 256])
torch.Size([40000, 256])
torch.Size([40000])


In [9]:
# define our dataset
from torch.utils.data import Dataset, DataLoader

class SentimentDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels
        self.encodings = {
            'input_ids': self.input_ids,
            'attention_mask': self.attention_masks
        }
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [10]:
# Create datasets
train_dataset = SentimentDataset(train_inputs, train_masks, train_labels)
val_dataset = SentimentDataset(val_inputs, val_masks, val_labels)
test_dataset = SentimentDataset(test_inputs, test_masks, test_labels)

In [11]:
# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [12]:
# Training setup
from transformers import AdamW, Trainer, TrainingArguments

optimizer = AdamW(model.parameters(), lr=2e-5)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
def test(model, dataloader):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    model.eval()
    total = 0
    correct = 0
    predictions = []

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        assert isinstance(input_ids, torch.Tensor)
        assert isinstance(attention_mask, torch.Tensor)
        assert isinstance(labels, torch.Tensor)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            pred_labels = logits.argmax(dim=1)
            predictions.extend(pred_labels.cpu().numpy())

        correct += (pred_labels == labels).sum().item()
        total += labels.size(0)

    accuracy = correct / total

    return accuracy, predictions

In [16]:
from tqdm import tqdm

def train(model, train_dataloader, val_dataloader, optimizer, training_args, early_stop=False):
    output_dir = training_args.output_dir
    epochs = training_args.num_train_epochs
    batch_size = training_args.per_device_train_batch_size
    warmup_steps = training_args.warmup_steps
    weight_decay = training_args.weight_decay
    logging_dir = training_args.logging_dir
    evaluation_strategy = training_args.evaluation_strategy
    save_strategy = training_args.save_strategy
    load_best_model_at_end = training_args.load_best_model_at_end

    if torch.cuda.is_available():
        device = torch.device("cuda:0")  
        model.to(device)
        print("Model is on GPU:", torch.cuda.get_device_name(0))
    else:
        device = torch.device("cpu")
        model.to(device)
        print("Model is on CPU")

    criterion = torch.nn.CrossEntropyLoss()

    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

    best_val_loss = float('inf')

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0

        train_progress_bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc=f"Epoch {epoch + 1}/{epochs} Training")
        for i, batch in train_progress_bar:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = criterion(outputs.logits, labels)
            total_train_loss += loss.item()

            loss.backward()
            optimizer.step()

            train_progress_bar.set_description(f"Epoch {epoch + 1} Training Loss: {loss.item():.4f}")

            if i % 500 == 0:
                # calculate accuracy
                accuracy = ((outputs.logits.argmax(1) == labels).sum().item() / batch_size) * 100
                print(f'Epoch {epoch + 1}, Checkpoint {i / 100}, Loss: {loss.item()}, accuracy: {accuracy}')

            if early_stop and i == 1000:
                break

        scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        print('Average Training Loss:', avg_train_loss)

        if evaluation_strategy == "epoch":
            model.eval()
            total_eval_loss = 0

            # Add tqdm progress bar for validation loop
            val_progress_bar = tqdm(val_dataloader, total=len(val_dataloader), desc=f"Epoch {epoch + 1} Validation")
            for batch in val_progress_bar:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                with torch.no_grad():
                    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                    loss = criterion(outputs.logits, labels)
                    total_eval_loss += loss.item()

                    val_progress_bar.set_description(f"Validation Loss: {loss.item():.4f}")

            avg_val_loss = total_eval_loss / len(val_dataloader)
            print('Validation Loss:', avg_val_loss)

            # val_accuracy = ((outputs.logits.argmax(1) == labels).sum().item() / len(outputs.logits)) * 100
            # print('Validation Accuracy:', val_accuracy)

            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                if save_strategy == "best":
                    torch.save(model.state_dict(), os.path.join(output_dir, 'best_model.pth'))

        if save_strategy == "epoch":
            torch.save(model.state_dict(), os.path.join(output_dir, f'model_epoch_{epoch + 1}.pth'))

    if load_best_model_at_end and save_strategy == "best":
        model.load_state_dict(torch.load(os.path.join(output_dir, 'best_model.pth')))

    return model


In [20]:
# test the model accuracy before training
accuracy_results, predictions = test(no_pretrained_model, val_dataloader)

print('Accuracy:', accuracy_results)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Accuracy: 0.4892


In [116]:
best_model = train(model, train_dataloader, val_dataloader, optimizer, training_args, early_stop=True)

Epoch 1/3 Training:   0%|          | 0/20000 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
Epoch 1 Training Loss: 0.0211:   0%|          | 1/20000 [00:00<4:19:59,  1.28it/s]

Epoch 1, Checkpoint 0.0, Loss: 0.021122876554727554, accuracy: 200.0


Epoch 1 Training Loss: 0.0112:   3%|▎         | 501/20000 [05:52<3:47:55,  1.43it/s]

Epoch 1, Checkpoint 5.0, Loss: 0.011210933327674866, accuracy: 200.0


Epoch 1 Training Loss: 0.0683:   5%|▌         | 1000/20000 [11:43<3:42:48,  1.42it/s]


Epoch 1, Checkpoint 10.0, Loss: 0.0683380737900734, accuracy: 200.0
Average Training Loss: 0.00650562212264631


Validation Loss: 0.1776: 100%|██████████| 2500/2500 [09:21<00:00,  4.45it/s]


Validation Loss: 0.1113830860665068
Validation Accuracy: 87.5


Epoch 2 Training Loss: 0.0373:   0%|          | 1/20000 [00:00<4:14:08,  1.31it/s]

Epoch 2, Checkpoint 0.0, Loss: 0.0372585766017437, accuracy: 200.0


Epoch 2 Training Loss: 0.0825:   3%|▎         | 501/20000 [05:51<3:48:29,  1.42it/s]

Epoch 2, Checkpoint 5.0, Loss: 0.08245743066072464, accuracy: 187.5


Epoch 2 Training Loss: 0.0272:   5%|▌         | 1000/20000 [11:42<3:42:34,  1.42it/s]


Epoch 2, Checkpoint 10.0, Loss: 0.027223806828260422, accuracy: 200.0
Average Training Loss: 0.005375306690786965


Validation Loss: 0.1172: 100%|██████████| 2500/2500 [09:20<00:00,  4.46it/s]


Validation Loss: 0.10252427590275183
Validation Accuracy: 93.75


Epoch 3 Training Loss: 0.0373:   0%|          | 1/20000 [00:00<4:13:37,  1.31it/s]

Epoch 3, Checkpoint 0.0, Loss: 0.037322890013456345, accuracy: 200.0


Epoch 3 Training Loss: 0.0785:   3%|▎         | 501/20000 [05:51<3:48:06,  1.42it/s]

Epoch 3, Checkpoint 5.0, Loss: 0.07849567383527756, accuracy: 187.5


Epoch 3 Training Loss: 0.0066:   5%|▌         | 1000/20000 [11:42<3:42:30,  1.42it/s]


Epoch 3, Checkpoint 10.0, Loss: 0.006567754782736301, accuracy: 200.0
Average Training Loss: 0.005196426911046728


Validation Loss: 0.1049: 100%|██████████| 2500/2500 [09:20<00:00,  4.46it/s]


Validation Loss: 0.10061930843470618
Validation Accuracy: 93.75


In [151]:
accuracy_results, predictions = test(best_model, val_dataloader)

print('Accuracy:', accuracy_results)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Accuracy: 0.96645


In [152]:
# save the model
torch.save(best_model.state_dict(), 'models/sentiment_model_dallas.pth')

In [4]:
model = AutoModelForSequenceClassification.from_pretrained("siebert/sentiment-roberta-large-english", num_labels=2)

model.load_state_dict(torch.load('models/sentiment_model_dallas.pth'))

if torch.cuda.is_available():
    device = torch.device("cuda:0")  
    model.to(device)
    print("Model is on GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    model.to(device)
    print("Model is on CPU")



Model is on GPU: NVIDIA TITAN RTX


In [7]:
CUSTOM_INPUT = "I love this product, but it needs serous work on its battery life"
tokenized_input = tokenizer(CUSTOM_INPUT, max_length=256, padding='max_length', truncation=True, return_tensors='pt')
input_ids = tokenized_input['input_ids']
attention_mask = tokenized_input['attention_mask']

with torch.no_grad():
    outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device))
    logits = outputs.logits
    print(logits)
    prediction = logits.argmax(dim=1).item()

print('Prediction:', prediction)
print('Sentiment:', 'Positive' if prediction == 1 else 'Negative')


tensor([[ 3.2639, -2.7294]], device='cuda:0')
Prediction: 0
Sentiment: Negative
