In [None]:
!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m76.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m81.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [None]:
!pip install torch


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Single checkpoint identifier shared by the tokenizer and the classifier so
# the two always come from the same pre-trained model.
model_name = 'bert-base-uncased'

# Tokenizer that matches the checkpoint above.
tokenizer = BertTokenizer.from_pretrained(model_name)

# Sequence classifier with three output classes built on that checkpoint.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)

In [None]:
def tokenize_text(text, max_length=512):
    """Encode one text with the module-level ``tokenizer``.

    Special tokens are added, the sequence is truncated to ``max_length`` and
    padded up to exactly ``max_length``, and an attention mask is requested
    alongside the token ids.

    Args:
        text: The text to encode; passed straight through to
            ``tokenizer.encode_plus``.
        max_length: Length to truncate/pad to. Defaults to 512, the value
            that was previously hard-coded, so existing callers are
            unaffected.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns for these options, with
        PyTorch tensors requested via ``return_tensors='pt'``.
    """
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        # Pad to max_length (not to the longest item) so every encoding has
        # the same width.
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

In [None]:
import pandas as pd


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer

# Columns joined (space-separated, in this order) to form the model input text.
TEXT_COLUMNS = ['Title', 'Body', 'Tags', 'CreationDate']
# Integer class id for each string value of the 'Y' column.
LABEL_TO_ID = {'HQ': 0, 'LQ_EDIT': 1, 'LQ_CLOSE': 2}
# Tokenizer truncation limit and DataLoader batch size.
MAX_LENGTH = 512
BATCH_SIZE = 16


def _combine_text_columns(frame):
    """Return a Series with TEXT_COLUMNS of ``frame`` joined by single spaces.

    Missing cells are replaced by empty strings first. Without that, one NaN
    in any column would turn the whole concatenated row into NaN, which the
    tokenizer cannot encode.
    """
    parts = [frame[column].fillna('').astype(str) for column in TEXT_COLUMNS]
    combined = parts[0]
    for part in parts[1:]:
        combined = combined + ' ' + part
    return combined


def _encode_labels(frame, split_name):
    """Map the 'Y' column of ``frame`` to integer class ids via LABEL_TO_ID.

    Args:
        frame: DataFrame holding a 'Y' column.
        split_name: Human-readable name of the split, used only in the error
            message.

    Returns:
        An int64 Series of class ids, one per row.

    Raises:
        ValueError: If 'Y' contains a value that is missing from LABEL_TO_ID
            (including NaN). Left unchecked, such rows would become NaN and
            the label tensor would silently turn into floats.
    """
    labels = frame['Y'].map(LABEL_TO_ID)
    unmapped = labels.isna()
    if unmapped.any():
        unknown = sorted(frame.loc[unmapped, 'Y'].astype(str).unique())
        raise ValueError(
            f"{split_name}: values in column 'Y' without a label id: {unknown}"
        )
    return labels.astype('int64')


def _tokenize(texts):
    """Tokenize a Series of strings into padded, truncated PyTorch tensors."""
    return tokenizer(
        texts.tolist(),
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors='pt',
    )


# Load train.csv and valid.csv using pandas or any other preferred method
train_data = pd.read_csv('train.csv')
valid_data = pd.read_csv('valid.csv')

# Extract the texts and labels from the CSV files:
# - the 'Title', 'Body', 'Tags' and 'CreationDate' columns are combined into a
#   single string per row for both the training and validation texts;
# - the 'Y' column values are mapped to numeric labels for both splits.
train_texts = _combine_text_columns(train_data)
train_labels = _encode_labels(train_data, 'train.csv')
valid_texts = _combine_text_columns(valid_data)
valid_labels = _encode_labels(valid_data, 'valid.csv')

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize training and validation sets
train_encodings = _tokenize(train_texts)
val_encodings = _tokenize(valid_texts)

# Create PyTorch DataLoader objects. Each dataset item is the triple
# (input_ids, attention_mask, label); labels are explicitly int64 (torch.long).
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    torch.tensor(train_labels.tolist(), dtype=torch.long),
)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = TensorDataset(
    val_encodings['input_ids'],
    val_encodings['attention_mask'],
    torch.tensor(valid_labels.tolist(), dtype=torch.long),
)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader
# transformers' own AdamW is deprecated and missing from newer releases;
# torch's implementation is the supported replacement.
from torch.optim import AdamW

# Directory the fine-tuned model and tokenizer are written to.
OUTPUT_DIR = 'fine_tuned_model'


def _train_one_epoch(model, loader, optimizer, device):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Each batch is expected to unpack into (input_ids, attention_mask, labels).
    The model is put in training mode and updated in place.
    """
    model.train()
    running_loss = 0.0
    for input_ids, attention_mask, labels in loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        running_loss += loss.item()

        loss.backward()
        optimizer.step()
    return running_loss / len(loader)


def _evaluate(model, loader, device):
    """Return (mean batch loss, accuracy) of ``model`` over ``loader``.

    The model is put in eval mode and gradients are disabled. Accuracy is the
    number of argmax predictions equal to the label divided by the number of
    items in ``loader.dataset``.
    """
    model.eval()
    running_loss = 0.0
    correct = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            running_loss += outputs.loss.item()
            predicted_labels = torch.argmax(outputs.logits, dim=1)
            correct += (predicted_labels == labels).sum().item()
    return running_loss / len(loader), correct / len(loader.dataset)


# Load the pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Set the device to use (GPU if available, otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set the optimizer and learning rate. eps and weight_decay are spelled out to
# match the defaults of the transformers AdamW this cell used before; torch's
# own defaults differ (eps=1e-8, weight_decay=0.01).
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Training loop
num_epochs = 5

# Lowest validation loss seen so far; the checkpoint on disk always belongs to
# that epoch, so an interrupted or overfitting run still leaves a usable model.
best_val_loss = float('inf')

for epoch in range(num_epochs):
    avg_loss = _train_one_epoch(model, train_loader, optimizer, device)
    print(f'Epoch {epoch+1}/{num_epochs} - Average Training Loss: {avg_loss:.4f}')

    # Evaluation on the validation set
    avg_val_loss, val_accuracy = _evaluate(model, val_loader, device)
    print(f'Epoch {epoch+1}/{num_epochs} - Average Validation Loss: {avg_val_loss:.4f}')
    print(f'Epoch {epoch+1}/{num_epochs} - Validation Accuracy: {val_accuracy:.4f}')

    # Save the fine-tuned model whenever validation loss improves
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        model.save_pretrained(OUTPUT_DIR)
        tokenizer.save_pretrained(OUTPUT_DIR)

# Fallback: if no epoch produced a finite improvement (e.g. num_epochs == 0 or
# NaN losses), still write the final weights so OUTPUT_DIR always exists.
if best_val_loss == float('inf'):
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/5 - Average Training Loss: 0.2131
Epoch 1/5 - Average Validation Loss: 0.1486
Epoch 1/5 - Validation Accuracy: 0.9423
Epoch 2/5 - Average Training Loss: 0.1198
Epoch 2/5 - Average Validation Loss: 0.1376
Epoch 2/5 - Validation Accuracy: 0.9463
Epoch 3/5 - Average Training Loss: 0.0784
Epoch 3/5 - Average Validation Loss: 0.1577
Epoch 3/5 - Validation Accuracy: 0.9402
Epoch 4/5 - Average Training Loss: 0.0461
Epoch 4/5 - Average Validation Loss: 0.1883
Epoch 4/5 - Validation Accuracy: 0.9453
