In [7]:
# !pip install transformers datasets
# !pip install torch

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW
from torch.utils.data import DataLoader
from transformers import BertTokenizer

In [16]:
# Fixed seed so the random 3% subsample (and every number derived from it) is
# the same after Restart Kernel -> Run All.
SEED = 42

sentiment = load_dataset("yelp_polarity")
tr = sentiment['train']
ts = sentiment['test']
# test_size=0.97 sends 97% of the rows to the discarded 'test' side of the
# split; only the remaining 3% ('train' side) is kept for this notebook.
tr = tr.train_test_split(test_size=0.97, seed=SEED)['train']
ts = ts.train_test_split(test_size=0.97, seed=SEED)['train']


# WordPiece tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(data):
    """Tokenize the ``'text'`` field of ``data`` with the module-level tokenizer.

    ``padding='max_length'`` together with ``truncation=True`` asks the
    tokenizer for fixed-length outputs: short texts are padded and long texts
    are cut off.

    Args:
        data: mapping with a ``'text'`` entry (a batch of rows when used with
            ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = data['text']
    encoded = tokenizer(texts, padding='max_length', truncation=True)
    return encoded

# Tokenized copy of the training subset.
# NOTE(review): `tokenized_dataset` is not referenced again in the visible
# cells; the same mapping is applied to `tr` again further down.
tokenized_dataset = tr.map(function=tokenize_function, batched=True)

# Vocabulary size of the tokenizer; later used as the number of rows of the
# baseline model's embedding table.
word_count = tokenizer.vocab_size
print("Vocabulary Size:", word_count)

Map: 100%|████████████████████████| 16800/16800 [00:32<00:00, 516.05 examples/s]

Vocabulary Size: 30522





In [26]:
# Mini-batch size shared by both loaders. It must exist before the loaders are
# built: the only other assignment in this notebook lives in a later cell, so
# on a fresh kernel this cell would otherwise fail with a NameError.
batch_size = 8

# Tokenize, then expose only the model inputs and the label as torch tensors.
tr = tr.map(tokenize_function, batched=True)
tr.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
# Shuffle the training batches every epoch.
tr_loader = DataLoader(tr, batch_size=batch_size, shuffle=True)

ts = ts.map(tokenize_function, batched=True)
ts.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
# Evaluation order does not matter, so keep it deterministic.
ts_loader = DataLoader(ts, batch_size=batch_size, shuffle=False)

Map: 100%|████████████████████████| 16800/16800 [00:29<00:00, 574.66 examples/s]
Map: 100%|██████████████████████████| 1140/1140 [00:02<00:00, 470.57 examples/s]


In [38]:
class LinearRegression(nn.Module):
    """Bag-of-embeddings linear classifier.

    Every token id is looked up in an embedding table, the token embeddings of
    a sequence are averaged, and a single linear unit maps that average to one
    raw score (logit) per sequence.

    Args:
        word_count: number of rows of the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
    """

    def __init__(self, word_count, embedding_dim):
        super().__init__()
        self.embed_words = nn.Embedding(word_count, embedding_dim)
        self.layer1 = nn.Linear(embedding_dim, 1)

    def forward(self, x, attention_mask=None):
        """Score a batch of token-id sequences.

        Args:
            x: integer tensor of token ids, indexed as (batch, position).
            attention_mask: optional tensor with the same shape as ``x``
                holding 1 for real tokens and 0 for padding. When given, only
                the positions marked 1 contribute to the average, so padding
                does not dilute the sequence representation. When omitted,
                all positions are averaged (the original behaviour).

        Returns:
            Tensor of shape (batch, 1) with one raw logit per sequence.
        """
        embedded = self.embed_words(x)
        if attention_mask is None:
            avg_embedding = embedded.mean(dim=1)
        else:
            # Broadcast the mask over the embedding dimension.
            weights = attention_mask.unsqueeze(-1).to(embedded.dtype)
            # clamp avoids 0/0 (NaN) for a row whose mask is entirely zero.
            token_counts = weights.sum(dim=1).clamp(min=1.0)
            avg_embedding = (embedded * weights).sum(dim=1) / token_counts
        return self.layer1(avg_embedding)

In [39]:
# Size of each token embedding; passed to the LinearRegression constructor below.
embedding_dim = 10 

In [40]:
# Baseline classifier together with its optimizer and loss.
model = LinearRegression(word_count, embedding_dim)
stepper = optim.Adam(params=model.parameters(), lr=1e-3)
# BCEWithLogitsLoss applies the sigmoid itself, so it is fed the model's raw scores.
loss_function = nn.BCEWithLogitsLoss()

In [41]:
def forward_pass(model, dataloader, stepper, loss_function, iterations=10):
    """Train ``model`` for ``iterations`` epochs and print the mean loss of each.

    Args:
        model: module called as ``model(input_ids)``; its output is expected
            to carry one score per example, with an optional trailing
            dimension of size 1.
        dataloader: sized iterable of dict batches providing ``'input_ids'``
            and ``'label'`` tensors.
        stepper: optimizer over ``model``'s parameters.
        loss_function: callable taking ``(scores, float_labels)`` and
            returning a scalar loss tensor.
        iterations: number of full passes over ``dataloader``.

    Returns:
        None. One line per epoch is printed with the average batch loss.
    """
    model.train()
    for epoch in range(iterations):
        total_loss = 0
        for sample in dataloader:
            inputs, labels = sample['input_ids'], sample['label']
            stepper.zero_grad()
            # Drop only the trailing score dimension: a bare squeeze() would
            # also remove the batch dimension of a size-1 batch and make the
            # output shape disagree with the labels.
            outputs = model(inputs).squeeze(-1)
            loss = loss_function(outputs, labels.float())
            loss.backward()
            stepper.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader)}")

In [42]:
# Train the baseline for 10 epochs. The epoch-count parameter of forward_pass
# is named `iterations`; any other keyword raises a TypeError.
forward_pass(model, tr_loader, stepper, loss_function, iterations=10)

Epoch 1, Loss: 0.669491346081098
Epoch 2, Loss: 0.6069448996299789
Epoch 3, Loss: 0.5227746870546114
Epoch 4, Loss: 0.4488695376047066
Epoch 5, Loss: 0.39134810373896645
Epoch 6, Loss: 0.34697813470803557
Epoch 7, Loss: 0.31305692597514106
Epoch 8, Loss: 0.2858913141063281
Epoch 9, Loss: 0.26447297050307195
Epoch 10, Loss: 0.24689224763019454


In [43]:
def test_model(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for sample in dataloader:
            inputs, attention_mask, labels = sample['input_ids'], sample['attention_mask'], sample['label']
            outputs = model(inputs).squeeze()
            predicted = (outputs > 0.5).float()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    
    accuracy = correct / total
    print(f"Test Accuracy: {accuracy * 100:.2f}%")

In [44]:
# Report the accuracy of the trained baseline on the test loader.
test_model(model, ts_loader)

Test Accuracy: 88.95%


In [2]:
# Fixed seed so the random 3% subsample is identical on every fresh run.
SEED = 42

# Reload the raw splits so this section starts from untokenized data.
sentiment = load_dataset("yelp_polarity")
tr = sentiment['train']
ts = sentiment['test']
# Keep the 3% 'train' side of each split; the 97% 'test' side is discarded.
tr = tr.train_test_split(test_size=0.97, seed=SEED)['train']
ts = ts.train_test_split(test_size=0.97, seed=SEED)['train']

In [3]:
# Tokenizer and classifier are loaded from the same checkpoint name.
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# num_labels=2 requests a two-class classification head on top of the encoder.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Number of examples per mini-batch for the loaders built below.
batch_size = 8
# Prefer the GPU when one is visible to torch, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
def tokenize_function(batch):
    """Tokenize the ``'text'`` field of ``batch`` to fixed-length tensors.

    Every sequence is padded or truncated to exactly 512 tokens. Padding to a
    fixed length (instead of ``padding=True``, which pads to the longest text
    of the current call) matters because ``Dataset.map(batched=True)`` calls
    this function once per chunk of rows: with per-call padding, rows from
    different chunks can end up with different lengths and then cannot be
    stacked into one batch by the DataLoader.

    Args:
        batch: mapping with a ``'text'`` entry holding the texts to encode.

    Returns:
        The tokenizer output for those texts, as PyTorch tensors.
    """
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=512, return_tensors="pt")

In [6]:
# Tokenize the training subset, then expose only the model inputs and the
# label as torch tensors.
tensor_columns = ['input_ids', 'attention_mask', 'label']
tr = tr.map(function=tokenize_function, batched=True)
tr.set_format(type='torch', columns=tensor_columns)

Map: 100%|███████████████████████| 16800/16800 [00:08<00:00, 1869.18 examples/s]


In [7]:
# Mini-batches over the tokenized training subset, reshuffled on every pass.
train_loader = DataLoader(dataset=tr, batch_size=batch_size, shuffle=True)

In [8]:
# Same preprocessing for the test subset; batches are served in a fixed order.
test_columns = ['input_ids', 'attention_mask', 'label']
ts = ts.map(function=tokenize_function, batched=True)
ts.set_format(type='torch', columns=test_columns)
ts_loader = DataLoader(dataset=ts, batch_size=batch_size, shuffle=False)

Map: 100%|█████████████████████████| 1140/1140 [00:00<00:00, 1930.57 examples/s]


In [10]:
# Use PyTorch's AdamW (the copy shipped with transformers is deprecated).
# lr=2e-5 is in the range normally used to fine-tune BERT; the previous 2e-2
# is roughly a thousand times larger and makes fine-tuning diverge.
# weight_decay=0.0 keeps the default of the optimizer this replaces
# (torch's own default is 0.01).
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)



In [None]:
# The batches are moved to `device` inside the loop, so the model's parameters
# must live there too; otherwise the forward pass fails with a device mismatch
# whenever `device` is not the CPU. Module.to() moves parameters in place, so
# an optimizer that was already built over them stays valid.
model.to(device)

# Fine-tune for a single epoch and report the mean batch loss.
for epoch in range(1):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Passing `labels` makes the model compute the loss itself.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")