In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments


In [2]:
from sklearn.model_selection import train_test_split

In [3]:
import pandas as pd
df = pd.read_csv("/notebooks/imdb_reduced_1.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
994,"On watching this film, I was amazed at how med...",positive
995,Nothing is sacred. Just ask Ernie Fosselius. T...,positive
996,I hated it. I hate self-aware pretentious inan...,negative
997,I usually try to be professional and construct...,negative


In [4]:
# Split the dataset into training and testing sets

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Convert sentiment labels to numerical format (e.g., 0 for negative, 1 for positive)
df["sentiment"] = df["sentiment"].map({"negative": 0, "positive": 1})

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_df


Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Unnamed: 0,review,sentiment
778,"I never watched the 'Next Action Hero' show, a...",1
286,There have been many documentaries that I have...,1
165,An American Werewolf in London had some funny ...,0
960,This was my first Gaspar Noe movie I've watche...,1
493,"An extremely down-to-earth, well made and acte...",1
...,...,...
106,The performance of every actor and actress (in...,1
270,"Clifton Webb is one of my favorites. However, ...",0
860,This production was quite a surprise for me. I...,1
435,You wear only the best Italian suits from Arma...,1


In [5]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


In [6]:

train_encodings = tokenizer(list(train_df["review"]), padding=True, truncation=True, return_tensors="pt")


In [7]:
test_encodings = tokenizer(list(test_df["review"]), padding=True, truncation=True, return_tensors="pt")


In [8]:
# Tokenizer

# Step 4: Define Dataset Class
# Define a custom dataset class to encapsulate the tokenized data and labels
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Step 5: Create Dataset Instances
# Create dataset instances for both training and testing data
train_dataset = SentimentDataset(train_encodings, list(train_df["sentiment"]))
test_dataset = SentimentDataset(test_encodings, list(test_df["sentiment"]))

In [9]:
from multiprocessing import cpu_count
n_cores = cpu_count()
print(f'Number of Logical CPU cores: {n_cores}')

Number of Logical CPU cores: 8


In [15]:
# Define model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# Set WANDB_MODE environment variable to 'dryrun' to disable Wandb
import os
os.environ["WANDB_MODE"] = "dryrun"
# Define Trainer
training_args = TrainingArguments(
     output_dir='./results',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=100,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)


loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin from cache 

In [17]:
import numpy as np
from sklearn.metrics import accuracy_score
def compute_metrics(eval_pred):
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}


In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)


# Train model
trainer.train()



***** Running training *****
  Num examples = 799
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 150
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.25482,0.91
2,0.389100,0.183224,0.92
3,0.389100,0.2609,0.935


***** Running Evaluation *****
  Num examples = 200
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-50
Configuration saved in ./results/checkpoint-50/config.json
Model weights saved in ./results/checkpoint-50/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 200
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-100
Configuration saved in ./results/checkpoint-100/config.json
Model weights saved in ./results/checkpoint-100/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 200
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-150
Configuration saved in ./results/checkpoint-150/config.json
Model weights saved in ./results/checkpoint-150/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results/checkpoint-150 (score: 0.935).


TrainOutput(global_step=150, training_loss=0.28155089219411217, metrics={'train_runtime': 242.3021, 'train_samples_per_second': 9.893, 'train_steps_per_second': 0.619, 'total_flos': 630677199697920.0, 'train_loss': 0.28155089219411217, 'epoch': 3.0})

In [19]:
# Evaluate model
eval_results = trainer.evaluate()
print("Test Accuracy:", eval_results['eval_accuracy'])

***** Running Evaluation *****
  Num examples = 200
  Batch size = 16


Test Accuracy: 0.935


In [20]:
# Check device availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move model to device
model = model.to(device)

text="Its amazing how this movie is made , but the only problem is the actor with bad acting skills , but its one of the award winning movie"

# Tokenization
inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

# Move input tensors to device
inputs = {key: val.to(device) for key, val in inputs.items()}

# Inference
output = model(**inputs)
output

SequenceClassifierOutput(loss=None, logits=tensor([[-1.9282,  1.7402]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [21]:
predicted_label = torch.argmax(output.logits).item()
if predicted_label == 1:
    print("Predicted Sentiment: Positive")
else:
    print("Predicted Sentiment: Negative")

Predicted Sentiment: Positive
