In [None]:
import torch
# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
import os

from google.colab import drive

# Mount Google Drive
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Check if drive is mounted.
# Testing whether the name `drive` exists is always true (it is the module
# imported above), so look at the filesystem instead: after a successful mount
# the user's files are exposed under <mount point>/MyDrive.
if os.path.isdir(os.path.join(DRIVE_MOUNT_POINT, 'MyDrive')):
    print("Drive successfully mounted!")
else:
    print("Drive not mounted.")


Mounted at /content/drive
Drive successfully mounted!


In [None]:
# Use the %pip magic rather than !pip: %pip installs into the environment of the
# running kernel, whereas !pip runs in a subshell that may resolve to a
# different Python. (The recorded run below resolved transformers to 4.32.1.)
%pip install transformers
%pip install torch

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m24.1 MB/s[0m eta [36m0:00:0

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import random
import re

# Seed Python's and PyTorch's random number generators with the same fixed
# value so that repeated runs are comparable.
for seed_fn in (random.seed, torch.manual_seed):
    seed_fn(42)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Load the dataset from the mounted Drive.
data = pd.read_csv(r"/content/drive/MyDrive/GujaratiTrainingData.csv")

# Drop rows that are missing either field up front: a NaN text makes the
# tokenizer raise much later with an unhelpful error, and a NaN label cannot be
# encoded together with the real labels.
data = data.dropna(subset=["text", "label"]).reset_index(drop=True)

# Preprocess the text and labels
labels = data["label"].tolist()
# The tokenizer only accepts str inputs, so coerce any non-string cells
# (e.g. a row whose text pandas parsed as a number).
texts = data["text"].astype(str).tolist()

# Encode labels as consecutive integer class ids using LabelEncoder
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Split the dataset into training (80%) and testing (20%) sets; the fixed
# random_state keeps the split identical between runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, encoded_labels, test_size=0.2, random_state=42)

# Load the multilingual BERT model and tokenizer.
# The training data is Gujarati (GujaratiTrainingData.csv), so the checkpoint
# must be the multilingual one. "bert-base-uncased" is the English checkpoint:
# its WordPiece vocabulary does not cover Gujarati script, so most of the input
# would be tokenized to [UNK] and the classifier would see almost no signal.
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The classification head is sized to the number of distinct labels and is
# freshly initialised; it is trained by the fine-tuning loop that follows.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_)).to(device)

# Both splits are tokenized with the same settings: pad to the longest sequence
# in the split, truncate at 128 tokens, and return PyTorch tensors.
tokenizer_kwargs = dict(padding=True, truncation=True, return_tensors="pt", max_length=128)
train_inputs = tokenizer(train_texts, **tokenizer_kwargs)
test_inputs = tokenizer(test_texts, **tokenizer_kwargs)

# Integer class ids as tensors, aligned row-for-row with the tokenized inputs.
train_labels_tensor = torch.tensor(train_labels)
test_labels_tensor = torch.tensor(test_labels)

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_dataset = TensorDataset(
    train_inputs["input_ids"],
    train_inputs["attention_mask"],
    train_labels_tensor,
)
test_dataset = TensorDataset(
    test_inputs["input_ids"],
    test_inputs["attention_mask"],
    test_labels_tensor,
)

# Training batches are reshuffled every epoch; test batches keep their order.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Optimizer and loss used for fine-tuning.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)  # Adjust the learning rate
criterion = torch.nn.CrossEntropyLoss()

# Fine-tuning loop: one full pass over the training batches per epoch,
# reporting the mean batch loss at the end of each epoch.
num_epochs = 25  # You can adjust the number of epochs
for epoch in range(num_epochs):
    model.train()
    epoch_losses = []

    for batch in train_dataloader:
        input_ids, attention_mask, batch_labels = (tensor.to(device) for tensor in batch)

        # Forward pass; the loss is computed here from the raw logits.
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, batch_labels)

        # Clear gradients left over from the previous step, then update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_losses.append(loss.item())

    total_loss = sum(epoch_losses)
    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {average_loss:.4f}")

# Evaluation after fine-tuning: run the held-out test batches through the model
# with gradients disabled and collect predicted vs. true class ids.
model.eval()
test_predictions = []
test_true_labels = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, batch_labels = [item.to(device) for item in batch]

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        # Predicted class = index of the highest-probability label per row.
        predictions = torch.argmax(probabilities, dim=1)

        test_predictions.extend(predictions.cpu().numpy())
        test_true_labels.extend(batch_labels.cpu().numpy())

# Support-weighted metrics. zero_division=0 scores an undefined per-class value
# (e.g. precision for a class the model never predicted) as 0. The previous
# value of 1 counted such a class as perfectly precise and inflated the
# weighted precision and F1.
accuracy = accuracy_score(test_true_labels, test_predictions)
precision = precision_score(test_true_labels, test_predictions, average="weighted", zero_division=0)
recall = recall_score(test_true_labels, test_predictions, average="weighted", zero_division=0)
f1 = f1_score(test_true_labels, test_predictions, average="weighted", zero_division=0)

print(f"Test Accuracy after Fine-tuning: {accuracy:.4f}")
print(f"Test Precision after Fine-tuning: {precision:.4f}")
print(f"Test Recall after Fine-tuning: {recall:.4f}")
print(f"Test F1-Score after Fine-tuning: {f1:.4f}")




Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/25 - Loss: 0.7020
Epoch 2/25 - Loss: 0.6809
Epoch 3/25 - Loss: 0.6455
Epoch 4/25 - Loss: 0.6049
Epoch 5/25 - Loss: 0.5430
Epoch 6/25 - Loss: 0.5185
Epoch 7/25 - Loss: 0.4813
Epoch 8/25 - Loss: 0.4512
Epoch 9/25 - Loss: 0.4236
Epoch 10/25 - Loss: 0.3792
Epoch 11/25 - Loss: 0.3586
Epoch 12/25 - Loss: 0.3326
Epoch 13/25 - Loss: 0.3751
Epoch 14/25 - Loss: 0.3086
Epoch 15/25 - Loss: 0.3175
Epoch 16/25 - Loss: 0.2138
Epoch 17/25 - Loss: 0.2318
Epoch 18/25 - Loss: 0.2258
Epoch 19/25 - Loss: 0.1994
Epoch 20/25 - Loss: 0.3487
Epoch 21/25 - Loss: 0.2535
Epoch 22/25 - Loss: 0.1925
Epoch 23/25 - Loss: 0.1841
Epoch 24/25 - Loss: 0.1768
Epoch 25/25 - Loss: 0.1976
Test Accuracy after Fine-tuning: 0.6000
Test Precision after Fine-tuning: 0.5969
Test Recall after Fine-tuning: 0.6000
Test F1-Score after Fine-tuning: 0.5969
