In [1]:
import torch
from transformers import BertTokenizer
import pandas as pd

In [2]:


# Load the pretrained 'bert-base-uncased' tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the training split; the code below reads its 'review' and 'sentiment' columns
train_data = pd.read_csv('/content/drive/MyDrive/MovieClassifier/data/train.csv')

# Fixed sequence length every review is padded or truncated to
MAX_LEN = 256

# Encode every review in one batched tokenizer call instead of calling the
# deprecated `encode_plus` once per row and concatenating the results.
# With padding='max_length' + truncation=True each row has exactly MAX_LEN tokens.
encodings = tokenizer(
    train_data['review'].tolist(),  # Batch of sentences to encode.
    add_special_tokens=True,        # Add '[CLS]' and '[SEP]'
    max_length=MAX_LEN,             # Pad & truncate all sentences.
    padding='max_length',           # Pad all sentences to the same length.
    truncation=True,                # Truncate to max_length.
    return_attention_mask=True,     # Construct attention masks.
    return_tensors='pt',            # Return PyTorch tensors.
)

# One row per review; the attention mask differentiates padding from non-padding
input_ids = encodings['input_ids']
attention_masks = encodings['attention_mask']
labels = torch.tensor(train_data['sentiment'].values)

# Sanity check: print the first review next to its token IDs
# (.iloc is positional, so this works whatever the DataFrame index is)
print('Original: ', train_data['review'].iloc[0])
print('Token IDs:', input_ids[0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Original:  it s frequently said that movies can never equal the original book well in this case not only the movie is not as good as the book but is an insult to the book i d rather see milan kundera s novel turned on fire than into this something which the director probably calls adaptation all the beautiful philosophy that asks is it better to carry a heavy load on your shoulders or cope with the unbearable lightness of being is put aside and instead all the movie deals with is daniel day lewis i cannot say tomas sexual adventures with his dumb wife his mistress and his other mistresses fran ois truffaut already said it bad directors make bad movies don t waste your time and money read the book instead it s really worth it
Token IDs: tensor([  101,  2009,  1055,  4703,  2056,  2008,  5691,  2064,  2196,  5020,
         1996,  2434,  2338,  2092,  1999,  2023,  2553,  2025,  2069,  1996,
         3185,  2003,  2025,  2004,  2204,  2004,  1996,  2338,  2021,  2003,
         2019, 15301

In [3]:
from torch.utils.data import Dataset, DataLoader
from pathlib import Path

In [5]:
class IMDbDataset(Dataset):
    """IMDb Movie Reviews dataset.

    Wraps three parallel, index-aligned sequences so that item ``i`` is built
    from ``reviews[i]``, ``masks[i]`` and ``labels[i]``.

    Args:
        reviews: Indexable sequence of encoded reviews (exposed as 'input_ids').
        masks: Indexable sequence of attention masks (exposed as 'attention_mask').
        labels: Indexable sequence of targets (exposed as 'labels').

    Raises:
        ValueError: If the three sequences do not have the same length.
    """
    def __init__(self, reviews, masks, labels):
        # Fail fast on misaligned inputs; otherwise the mismatch only shows up
        # later as an index error (or a silently wrong length) during iteration.
        if not (len(reviews) == len(masks) == len(labels)):
            raise ValueError(
                "reviews, masks and labels must have the same length, got "
                f"{len(reviews)}, {len(masks)} and {len(labels)}"
            )
        self.reviews = reviews
        self.masks = masks
        self.labels = labels

    def __len__(self):
        """Return the number of examples (the length of ``labels``)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict keyed for a BERT forward call."""
        return {
            'input_ids': self.reviews[idx],
            'attention_mask': self.masks[idx],
            'labels': self.labels[idx]
        }

def tokenize_and_create_datasets(data_path, tokenizer, max_len=256):
    """Tokenize the train/val/test CSV files and wrap each one in an IMDbDataset.

    For every split, ``<data_path>/<split>.csv`` is read and its 'review'
    column is encoded with ``tokenizer``; the 'sentiment' column supplies the
    labels.

    Args:
        data_path: Directory containing ``train.csv``, ``val.csv`` and ``test.csv``.
        tokenizer: Callable Hugging Face tokenizer used to encode the reviews.
        max_len: Length every review is padded or truncated to.

    Returns:
        dict mapping 'train', 'val' and 'test' to an ``IMDbDataset``.
    """
    datasets = {}
    for split in ['train', 'val', 'test']:
        dataset_path = Path(data_path) / f"{split}.csv"
        data = pd.read_csv(dataset_path)

        # Encode the whole split in one batched call rather than calling the
        # deprecated `encode_plus` once per row and concatenating the results.
        encodings = tokenizer(
            data['review'].tolist(),       # Batch of sentences to encode.
            add_special_tokens=True,       # Add '[CLS]' and '[SEP]'
            max_length=max_len,            # Pad & truncate all sentences.
            padding='max_length',          # Pad all sentences to the same length.
            truncation=True,               # Truncate to max_length.
            return_attention_mask=True,    # Construct attention masks.
            return_tensors='pt',           # Return PyTorch tensors.
        )
        labels = torch.tensor(data['sentiment'].values)

        datasets[split] = IMDbDataset(
            encodings['input_ids'],
            encodings['attention_mask'],
            labels,
        )
    return datasets

def create_dataloaders(data_path, batch_size=16, max_len=256, tokenizer=None):
    """Build train/val/test DataLoaders from the CSV files under ``data_path``.

    Args:
        data_path: Directory passed on to ``tokenize_and_create_datasets``.
        batch_size: Batch size used for all three loaders.
        max_len: Token length each review is padded or truncated to.
        tokenizer: Tokenizer to encode with. When omitted, the
            'bert-base-uncased' ``BertTokenizer`` is loaded, so existing
            callers keep their behaviour.

    Returns:
        dict with keys 'train', 'val' and 'test'; only the 'train' loader shuffles.
    """
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    datasets = tokenize_and_create_datasets(data_path, tokenizer, max_len=max_len)
    dataloaders = {
        'train': DataLoader(datasets['train'], batch_size=batch_size, shuffle=True),
        'val': DataLoader(datasets['val'], batch_size=batch_size, shuffle=False),
        'test': DataLoader(datasets['test'], batch_size=batch_size, shuffle=False)
    }
    return dataloaders


if __name__ == "__main__":
    # Driver: build the three loaders from the Drive data folder.
    data_path = "/content/drive/MyDrive/MovieClassifier/data"
    batch_size = 16
    # Not passed to create_dataloaders below; it stays available as the
    # module-level `tokenizer` name.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    dataloaders = create_dataloaders(data_path, batch_size)
    print("Dataloaders created for train, val, and test.")



Dataloaders created for train, val, and test.


In [7]:
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score
import numpy as np


In [10]:
# Check whether PyTorch can use CUDA; the boolean is shown as this cell's output.
torch.cuda.is_available()

True

In [11]:
# Set device to GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',  # Use the 12-layer BERT model with an uncased vocab.
    num_labels=2,
).to(device)

# Load data loaders directly, assuming tokenize_and_create_datasets is done in create_dataloaders
data_path = "/content/drive/MyDrive/MovieClassifier/data"  # Adjust as needed
batch_size = 16


# Use create_dataloaders to prepare your data
dataloaders = create_dataloaders(data_path, batch_size=batch_size)

# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated in
# favour of `torch.optim.AdamW`. weight_decay is set to 0.0 explicitly because
# torch defaults to 0.01, and the previous optimizer applied no weight decay.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.0)

# Setup the scheduler
num_epochs = 4  # Define the number of epochs to train for
# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) * (epochs) steps.
total_steps = len(dataloaders['train']) * num_epochs

# Linear decay of the learning rate over total_steps, with no warmup phase
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

# Training function
def train(model, dataloaders, optimizer, scheduler, epochs=4):
    """Fine-tune ``model`` on ``dataloaders['train']`` and print accuracy per epoch.

    Args:
        model: Module whose forward call accepts ``input_ids``,
            ``token_type_ids``, ``attention_mask`` and ``labels`` and returns an
            object exposing ``.loss`` and ``.logits``.
        dataloaders: Mapping with a 'train' entry yielding batches that are
            dicts with 'input_ids', 'attention_mask' and 'labels' tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        epochs: Number of passes over the training loader.

    Raises:
        ValueError: If the training loader yields no samples in an epoch.
    """
    model.train()  # Set the model to training mode
    # Move batches to wherever the model's weights live instead of relying on
    # a module-level `device` variable being defined.
    model_device = next(model.parameters()).device
    for epoch in range(epochs):
        # Count correct predictions over the whole epoch. Averaging per-batch
        # accuracies would over-weight a smaller final batch.
        correct = 0
        seen = 0
        for batch in dataloaders['train']:
            b_input_ids = batch['input_ids'].to(model_device)
            b_attention_mask = batch['attention_mask'].to(model_device)
            b_labels = batch['labels'].to(model_device)

            model.zero_grad()
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask, labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits

            loss.backward()
            optimizer.step()
            scheduler.step()

            # Predicted class = index of the largest logit in each row
            predictions = logits.detach().argmax(dim=1)
            correct += (predictions == b_labels.reshape(-1)).sum().item()
            seen += b_labels.numel()

        if seen == 0:
            raise ValueError("dataloaders['train'] yielded no samples")
        avg_train_accuracy = correct / seen
        print(f"Epoch {epoch + 1}/{epochs} - Train Accuracy: {avg_train_accuracy:.4f}")

# Fraction of rows whose highest-scoring class matches the label
def flat_accuracy(preds, labels):
    """Return the share of correct predictions.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: Array of true class indices.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.ravel()
    hits = predicted_classes == true_classes
    return np.sum(hits) / len(true_classes)

# Run fine-tuning for the configured number of epochs
train(
    model=model,
    dataloaders=dataloaders,
    optimizer=optimizer,
    scheduler=scheduler,
    epochs=num_epochs,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/4 - Train Accuracy: 0.8858
Epoch 2/4 - Train Accuracy: 0.9536
Epoch 3/4 - Train Accuracy: 0.9857
Epoch 4/4 - Train Accuracy: 0.9963


**Evaluation**

In [12]:
def evaluate(model, dataloader, split_name="Validation"):
    """Print the classification accuracy of ``model`` over ``dataloader``.

    Args:
        model: Module whose forward call accepts ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` and returns an object
            exposing ``.logits``.
        dataloader: Iterable of batches; each batch is a dict with
            'input_ids', 'attention_mask' and 'labels' tensors.
        split_name: Label used in the printed line. Defaults to "Validation",
            which reproduces the previous output exactly.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    model.eval()  # Set model to evaluation mode
    # Move batches to wherever the model's weights live instead of relying on
    # a module-level `device` variable being defined.
    model_device = next(model.parameters()).device
    # Count correct predictions over the whole loader. Averaging per-batch
    # accuracies would over-weight a smaller final batch.
    correct = 0
    seen = 0
    with torch.no_grad():
        for batch in dataloader:
            b_input_ids = batch['input_ids'].to(model_device)
            b_attention_mask = batch['attention_mask'].to(model_device)
            b_labels = batch['labels'].to(model_device)

            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_mask)

            # Predicted class = index of the largest logit in each row
            predictions = outputs.logits.argmax(dim=1)
            correct += (predictions == b_labels.reshape(-1)).sum().item()
            seen += b_labels.numel()

    if seen == 0:
        raise ValueError("dataloader yielded no samples")
    avg_val_accuracy = correct / seen
    print(f"{split_name} Accuracy: {avg_val_accuracy:.4f}")


In [13]:
# Pick the validation split's loader and report the model's accuracy on it
validate_dataloader = dataloaders['val']
evaluate(model=model, dataloader=validate_dataloader)


Validation Accuracy: 0.9305


In [14]:
# Pick the test split's loader and report the model's accuracy on it.
# NOTE: as called here, evaluate() prints its result under the label
# "Validation Accuracy", so the number shown below is the test accuracy.
test_dataloader = dataloaders['test']
evaluate(model=model, dataloader=test_dataloader)


Validation Accuracy: 0.9281


**Saving the model**

In [15]:
# Save the model state dictionary to a file
model_save_name = 'bert_model_state_dict.bin'
path = F"/content/drive/MyDrive/MovieClassifier/model/{model_save_name}"  # Specify the path in your Google Drive
# torch.save does not create missing folders, so make sure the target
# directory exists before writing (no-op when it is already there).
Path(path).parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), path)

**Saving the tokenizer**

In [16]:
# Write the tokenizer files into the same Drive folder as the model weights
tokenizer_save_name = 'bert_tokenizer'
tokenizer_path = f"/content/drive/MyDrive/MovieClassifier/model/{tokenizer_save_name}"
# Kept as the cell's last expression so the notebook displays the written file paths
tokenizer.save_pretrained(save_directory=tokenizer_path)


('/content/drive/MyDrive/MovieClassifier/model/bert_tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/MovieClassifier/model/bert_tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/MovieClassifier/model/bert_tokenizer/vocab.txt',
 '/content/drive/MyDrive/MovieClassifier/model/bert_tokenizer/added_tokens.json')