In [None]:
"""
PEFT_Sentiment_Analysis - API Documentation Notebook

This notebook demonstrates the core APIs and tools used in PEFT (Parameter-Efficient Fine-Tuning):

1. HuggingFace Transformers APIs (tokenizer, model, dataset)
2. PEFT LoRA API (LoraConfig, get_peft_model)
3. How to use these APIs for any sentiment analysis or text classification task

This notebook is tool-focused and does NOT contain project-specific implementation.
For a complete project example, see PEFT_Sentiment_Analysis.example.ipynb
"""


'\nPEFT_Sentiment_Analysis_on_Movie_Reviews.API\n\nThis notebook demonstrates:\n1. The native HuggingFace APIs (tokenizer, model, dataset)\n2. The PEFT LoRA API (LoraConfig, get_peft_model)\n3. How our utils wrapper functions expose a simpler high-level API\n\nThis notebook *does not* train a model.\nIt is purely for understanding the API surface.\n'

In [None]:
# Import core HuggingFace and PEFT libraries
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset
from peft import LoraConfig, TaskType, get_peft_model

# These are the main APIs we'll demonstrate


  from .autonotebook import tqdm as notebook_tqdm


## 1. Tokenization API

The tokenizer converts raw text into token IDs that the model can process.

In [None]:
# Fetch the pre-trained RoBERTa tokenizer that matches the "roberta-base" checkpoint
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# A single review-style sentence to run through the tokenizer
sample_text = "This movie was excellent and very entertaining!"

# Pad (or truncate) every input to exactly 20 tokens
tokenize_kwargs = dict(padding="max_length", truncation=True, max_length=20)
encoded = tokenizer(sample_text, **tokenize_kwargs)

# Show only the first 10 entries of each field to keep the output short
for heading, field in (("Input IDs:", "input_ids"), ("Attention Mask:", "attention_mask")):
    print(heading, encoded[field][:10], "...")


{'input_ids': [0, 44820, 268, 146, 234, 21992, 203, 3013, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

## 2. Dataset API

HuggingFace's Dataset class provides efficient data handling for training.

In [None]:
# Toy corpus as (text, label) pairs; label 1=positive, 0=negative
samples = [
    ("I loved this movie", 1),
    ("This was terrible", 0),
    ("Great acting and plot", 1),
]

# Dataset.from_dict expects one list per column, so split the pairs into columns
texts, labels = zip(*samples)
dataset = Dataset.from_dict({"text": list(texts), "label": list(labels)})

# Inspect size, schema, and the first record
print(f"Dataset size: {len(dataset)}")
print(f"Features: {dataset.features}")
print(f"\nFirst example: {dataset[0]}")


Dataset({
    features: ['text', 'label'],
    num_rows: 2
})

## 3. Model API

Load a pre-trained RoBERTa model for sequence classification.

In [None]:
# Pre-trained RoBERTa encoder with a two-label (binary) classification head.
# The head is not part of the "roberta-base" checkpoint, so loading reports
# newly initialized classifier weights.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# Report the concrete model class and its total parameter count
print(f"Model type: {type(model).__name__}")
print(f"Number of parameters: {model.num_parameters():,}")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

## 4. PEFT LoRA API

LoRA (Low-Rank Adaptation) enables parameter-efficient fine-tuning: the pre-trained weights stay frozen, and only small low-rank adapter matrices added to selected layers are trained.

In [None]:
# LoRA hyperparameters, collected in one place before building the config
lora_settings = dict(
    task_type=TaskType.SEQ_CLS,         # sequence-classification task
    target_modules=["query", "value"],  # attach adapters to the Q and V attention projections
    r=8,                                # rank of the adapter matrices
    lora_alpha=16,                      # scaling factor
    lora_dropout=0.1,                   # dropout applied for regularization
    bias="none",                        # leave bias terms un-adapted
)
lora_config = LoraConfig(**lora_settings)

# Attach the LoRA adapters to the base model.
# NOTE(review): get_peft_model appears to modify `model` in place, so re-running
# this cell without reloading `model` would wrap an already-adapted model — confirm.
peft_model = get_peft_model(model, lora_config)

# Print trainable vs. total parameter counts for the adapted model
peft_model.print_trainable_parameters()


trainable params: 887,042 || all params: 125,534,212 || trainable%: 0.7066


## 5. Training API

The Trainer API simplifies the training loop with automatic batching, optimization, and evaluation.

In [None]:
# Example TrainingArguments configuration.
# This cell only builds the configuration object; no training is started here.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=10,
    # `eval_strategy` is the current name of this option. The older
    # `evaluation_strategy` spelling is deprecated and rejected by recent
    # transformers releases, which made this cell fail on a fresh kernel.
    eval_strategy="epoch",
    save_strategy="epoch",
)

# Echo a few of the configured values back from the arguments object
print("Training configuration:")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Learning rate: {training_args.learning_rate}")


Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


## 6. Tokenizing Datasets

Apply the tokenizer to your dataset using the dataset's `.map()` method.

In [None]:
# Batch-level tokenization helper for use with Dataset.map
def tokenize_function(examples):
    """Tokenize the "text" field of `examples` with the notebook-level `tokenizer`.

    Args:
        examples: Mapping with a "text" entry. Below it is called through
            `dataset.map(..., batched=True)`, so it receives a batch of rows.

    Returns:
        The tokenizer's output for the given texts, padded/truncated to
        128 tokens per example.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

# Run the helper over the whole dataset, one batch at a time
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)

# Check which columns exist after mapping and how long one encoded example is
print(f"Tokenized dataset columns: {tokenized_dataset.column_names}")
print(f"Example tokenized input shape: {len(tokenized_dataset[0]['input_ids'])}")


Unnamed: 0,text,text_final
0,Donald Trump just couldn t wish all Americans ...,donald trump wish american happy new year leav...
1,House Intelligence Committee Chairman Devin Nu...,house intelligence committee chairman devin nu...
2,"On Friday, it was revealed that former Milwauk...",friday revealed former milwaukee sheriff david...
3,"On Christmas day, Donald Trump announced that ...",christmas day donald trump announced would bac...
4,Pope Francis used his annual Christmas Day mes...,pope francis used annual christmas day message...


## Summary

This notebook demonstrated the key APIs for PEFT-based sentiment analysis:

- **Tokenizer**: Converts text to model inputs
- **Dataset**: Efficient data management
- **Model**: Pre-trained transformers for classification
- **LoRA/PEFT**: Parameter-efficient fine-tuning
- **Trainer**: Simplified training workflow

For a complete end-to-end project implementation, see `PEFT_Sentiment_Analysis.example.ipynb`.