## This is a simple end-to-end example of fine-tuning BERT, a transformer model, with the Hugging Face libraries
## It also shows retraining, saving the model, and using it for inference

In [None]:
#1. Setup Environment
#	pip install torch torchvision torchaudio
#	pip install transformers datasets

In [None]:
#2. Import Libraries
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

In [None]:
#3. Load and Preprocess Dataset
# Load the IMDB dataset
dataset = load_dataset('imdb')

# Use only a subset for training to keep it quick: shuffle each split with a
# fixed seed so the subset is reproducible, then keep the first 1000 rows.
# select() accepts any iterable of indices, so the range is passed directly
# instead of first materializing a throwaway list.
train_dataset = dataset['train'].shuffle(seed=42).select(range(1000))
test_dataset = dataset['test'].shuffle(seed=42).select(range(1000))

In [None]:
#4. Tokenize the Data
# Initialize the tokenizer from the 'bert-base-uncased' checkpoint, the same
# checkpoint name the model is loaded from in step 6.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization helper applied to the datasets via map()
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Uses the module-level ``tokenizer`` with truncation enabled and padding
    set to 'max_length', and returns the tokenizer's output unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Tokenize each split in batches, then restrict it to the token ids, the
# attention mask and the label, returned in PyTorch format.
train_dataset = train_dataset.map(tokenize_function, batched=True)
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

test_dataset = test_dataset.map(tokenize_function, batched=True)
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
#5. Create Data Loaders
# Create PyTorch data loaders with a batch size of 16; only the training
# loader is created with shuffle=True.
# NOTE(review): the Trainer in step 7 is given the datasets directly rather
# than these loaders, and no other visible cell references them — confirm
# whether they are still needed (e.g. for a manual training loop).
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [None]:
#6. Define the Model
# Load the pre-trained 'bert-base-uncased' checkpoint as a sequence
# classifier configured for two labels.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Binary classification

In [None]:
#7. Define Training Arguments and Trainer
# The Trainer class simplifies the training process significantly.


def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: The prediction object the Trainer hands to the metric
            callback; ``predictions`` holds the per-class scores and
            ``label_ids`` the reference labels.

    Returns:
        A dict with a single ``accuracy`` entry: the fraction of examples
        whose highest-scoring class equals the reference label.
    """
    predicted_classes = eval_pred.predictions.argmax(axis=-1)
    return {'accuracy': float((predicted_classes == eval_pred.label_ids).mean())}


# Size the warmup from the actual schedule length. A fixed 500 warmup steps is
# longer than the whole run on the 1000-example subset (ceil(1000 / 16) * 3 =
# 189 optimizer steps), so the learning rate would never reach its peak or
# enter the decay phase. Use roughly the first 10% of the steps instead.
# (The step count is an estimate for a single device without gradient
# accumulation.)
train_batch_size = 16
num_epochs = 3
steps_per_epoch = -(-len(train_dataset) // train_batch_size)  # ceiling division
warmup_steps = max(1, (steps_per_epoch * num_epochs) // 10)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',                       # Output directory
    num_train_epochs=num_epochs,                  # Number of training epochs
    per_device_train_batch_size=train_batch_size, # Batch size for training
    per_device_eval_batch_size=16,                # Batch size for evaluation
    warmup_steps=warmup_steps,                    # Warmup steps for the learning rate scheduler
    weight_decay=0.01,                            # Strength of weight decay
    logging_dir='./logs',                         # Directory for storing logs
    logging_steps=10,                             # Log every 10 steps
)

# Create a trainer
trainer = Trainer(
    model=model,                         # The instantiated 🤗 Transformers model to be trained
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=test_dataset,           # Evaluation dataset
    compute_metrics=compute_metrics,     # Report accuracy in addition to the loss
)

In [None]:
#8. Train the Model
# Run fine-tuning using the model, arguments and training dataset that were
# given to the Trainer in step 7.
trainer.train()

In [None]:
#9. Evaluate the Model
# Evaluate with the Trainer (which was given the test subset as its eval
# dataset in step 7) and print whatever it returns.
results = trainer.evaluate()
print(f"Evaluation Results: {results}")

In [None]:
#10. Save the Model
# Save the fine-tuned model and the tokenizer to two separate directories;
# step 11 reloads them from these same paths.
model.save_pretrained('./sentiment_model')
tokenizer.save_pretrained('./sentiment_tokenizer')

In [None]:
#11. Load the Model for Inference
# For making predictions on new data, you can load the model and tokenizer.

# Load the model and tokenizer from the directories written in step 10.
# This rebinds the global `model` and `tokenizer` names used by
# predict_sentiment below.
model = BertForSequenceClassification.from_pretrained('./sentiment_model')
tokenizer = BertTokenizer.from_pretrained('./sentiment_tokenizer')

# Function to predict sentiment
def predict_sentiment(text):
    """Classify a single piece of text as "positive" or "negative".

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: The text to classify (a single string).

    Returns:
        "positive" when the highest-scoring class index is 1, otherwise
        "negative".
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Move the encoded tensors to wherever the model's weights live. Without
    # this the call fails with a device mismatch whenever the model sits on an
    # accelerator (e.g. when it is used right after Trainer.train()).
    inputs = inputs.to(model.device)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    # Index of the largest logit along the class dimension, as a Python int.
    prediction = torch.argmax(logits, dim=-1).item()
    return "positive" if prediction == 1 else "negative"

# Example predictions on two short sample reviews; each call prints the label
# returned by predict_sentiment.
print(predict_sentiment("This movie was amazing!"))
print(predict_sentiment("I did not like this movie at all."))

In [None]:
#12. outputs.logits
#	Logits: These are the raw, unnormalized scores the model produces for each class, before any activation function such as softmax is applied. A larger logit means the model favors that class more strongly (in this case, the sentiment classes).
	
#13. outputs = model(**inputs)
#	Model: The model is the fine-tuned transformer (BERT) classifier, loaded through the Hugging Face Transformers library.
#	Inputs: The **inputs syntax unpacks the dictionary of inputs returned by the tokenizer, which typically includes input_ids and attention_mask,
#	and passes its entries to the model as keyword arguments for inference.
#	Outputs: The model returns an object that includes various outputs, with logits being the raw, unnormalized scores
#	for each class.
	
#14. Predicting the Sentiment:
#	prediction = torch.argmax(logits, dim=-1).item()
#
#	torch.argmax: Finds the index of the highest value in logits, which corresponds to the predicted class.
#	dim=-1 specifies that the operation is performed across the last dimension, which is the class dimension.
#	item(): Converts the single-element PyTorch tensor to a standard Python integer.