<a href="https://colab.research.google.com/github/daisysong76/AI--Machine--learning/blob/main/whether_a_sentence_is_about_technology_or_not.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch transformers

This notebook classifies sentences into two categories, for simplicity.
For example: whether a sentence is about technology or not.

Dataset: We define a simple dataset class that wraps our data, using the BertTokenizer to tokenize our sentences.

DataLoader: We use PyTorch's DataLoader, which allows us to batch our data for training.

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch

# Toy corpus used for the demonstration, exposed as a list of
# (sentence, label) tuples.
# Label meaning: 0 = not about technology, 1 = about technology.
_sentences = (
    "I love programming with Python.",
    "The weather today is beautiful.",
    "Elon Musk launches another rocket.",
    "Cooking is one of my favorite hobbies.",
)
_labels = (1, 0, 1, 0)  # one label per sentence, in the same order
data = list(zip(_sentences, _labels))

# Custom dataset class
class SimpleDataset(Dataset):
    """Map-style dataset that tokenizes ``(sentence, label)`` pairs on access.

    Args:
        data: Indexable collection of ``(sentence, label)`` pairs, where
            ``label`` is an integer class id.
        tokenizer: Callable invoked as ``tokenizer(sentence, padding=...,
            truncation=..., max_length=..., return_tensors="pt")``. Its
            result must provide ``"input_ids"`` and ``"attention_mask"``
            tensors that carry a leading batch dimension of size 1.
        max_length: Length every sentence is padded or truncated to.
            Defaults to 512, the value that used to be hard-coded, so
            existing callers see no change. Short sentences can use a much
            smaller value to avoid processing mostly-padding sequences.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of ``(sentence, label)`` pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the sample at ``idx`` and return it as a dict of tensors.

        Returns:
            A dict with keys ``"input_ids"``, ``"attention_mask"`` and
            ``"labels"``.
        """
        sentence, label = self.data[idx]
        encoded = self.tokenizer(
            sentence,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # The tokenizer output has a batch dimension of size 1; drop it
            # so that batching code can add the real batch dimension.
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            # Pin the dtype instead of inferring it from the Python value:
            # class-index targets must be integer (long) tensors, and e.g. a
            # bool label would otherwise become a torch.bool tensor.
            "labels": torch.tensor(label, dtype=torch.long),
        }

# Tokenizer for the pretrained "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# BERT with a sequence-classification head on top.
# num_labels=2 tells the model that there are two classes.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

# Wrap the sample data in the dataset and serve it two samples at a time.
dataset = SimpleDataset(data=data, tokenizer=tokenizer)
dataloader = DataLoader(dataset=dataset, batch_size=2)

# Example: fine-tuning BERT (simplified for demonstration).
# A real-world run needs a significantly larger dataset and more epochs
# to reach good performance.

# Put the model in training mode first: layers such as dropout and batch
# normalization behave differently during training than during evaluation.
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

for epoch in range(1):  # a single pass over the dataset (for demonstration)
    for batch in dataloader:
        # Gradients accumulate across backward passes, so reset them to
        # zero before processing each batch.
        optimizer.zero_grad()

        # Forward pass. **batch unpacks the batch dictionary into keyword
        # arguments (input_ids, attention_mask, labels); the returned
        # output object exposes the loss for this batch.
        outputs = model(**batch)
        loss = outputs.loss

        # Backpropagation: compute the gradient of the loss with respect to
        # every model parameter, then let the optimizer use those gradients
        # to update the weights.
        loss.backward()
        optimizer.step()

        # Report the per-batch loss to monitor training.
        print(f"Loss: {loss.item()}")

# The code above is a standard PyTorch training loop: it sets the model to training mode, iterates over the dataset,
# runs a forward and a backward pass for each batch, and lets the optimizer update the weights from the computed gradients.
# The goal of this process is to fine-tune the model's weights to improve its performance on the classification task.
# Note: This is a very simplified example. Real-world applications require more data and training epochs.
