In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m103.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.4 MB/s[0m eta [36m0:00:

In [None]:
import os

import pandas as pd
import torch
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Load CSV data into pandas DataFrame.
# The location can be overridden with the GUJARATI_TRAIN_CSV environment
# variable; when it is unset the original hard-coded path is used.
DATA_PATH = os.environ.get(
    'GUJARATI_TRAIN_CSV', '/home/ssn/Downloads/GujaratiTrainingData.csv'
)
data1 = pd.read_csv(DATA_PATH)

# The rest of the script reads the 'text' and 'label' columns, so fail here
# with a readable message instead of a KeyError further down.
missing_columns = {'text', 'label'} - set(data1.columns)
if missing_columns:
    raise ValueError(
        f'{DATA_PATH} is missing required column(s): {sorted(missing_columns)}'
    )

# Preprocessing and tokenization using BERT tokenizer
# NOTE(review): the file name suggests Gujarati text, while 'bert-base-uncased'
# is an English checkpoint -- confirm the vocabulary actually covers the data
# (a multilingual checkpoint may be more appropriate).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def preprocess_and_tokenize(text, max_length=128):
    """Encode one text into BERT token ids using the module-level ``tokenizer``.

    Args:
        text: The raw text handed to ``tokenizer.encode``.
        max_length: Maximum number of token ids to keep; longer inputs are
            truncated. Defaults to 128, the value that used to be hard-coded.

    Returns:
        The result of ``tokenizer.encode`` with special tokens added, i.e. the
        sequence of token ids for ``text``.
    """
    return tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
    )

# Nothing can be padded or stacked from an empty frame; stop with a clear error.
if data1.empty:
    raise ValueError('Training data is empty; nothing to tokenize.')

# Tokenize every row; each cell holds the token ids of one text.
data1['tokenized_text'] = data1['text'].apply(preprocess_and_tokenize)

# Padding
# Right-pad every sequence to the length of the longest one. The pad id is
# taken from the tokenizer so padding and the attention mask below always
# agree on which id means "padding".
pad_token_id = tokenizer.pad_token_id
max_length = int(data1['tokenized_text'].map(len).max())
data1['padded_tokenized_text'] = data1['tokenized_text'].apply(
    lambda ids: ids + [pad_token_id] * (max_length - len(ids))
)

# Convert to tensors
input_ids = torch.tensor(data1['padded_tokenized_text'].tolist(), dtype=torch.long)

label_mapping = {'HOF': 1, 'NOT': 0}  # Replace with your label mapping
mapped_labels = data1['label'].map(label_mapping)

# Series.map yields NaN for values absent from the mapping; casting NaN to an
# integer tensor would silently produce a meaningless class id, so reject it.
unmapped_rows = mapped_labels.isna()
if unmapped_rows.any():
    unknown_values = sorted(data1.loc[unmapped_rows, 'label'].astype(str).unique())
    raise ValueError(
        f'Found labels not covered by label_mapping: {unknown_values}'
    )
data1['label'] = mapped_labels.astype('int64')

# Convert labels to tensor
labels = torch.tensor(data1['label'].tolist(), dtype=torch.long)

# Create attention masks: 1.0 for real tokens, 0.0 for padding positions.
attention_masks = (input_ids != pad_token_id).float()

# Train-Test Split
# Split ids, masks and labels in ONE call so the three stay row-aligned by
# construction, instead of relying on two separate calls that happen to share
# the same random_state.
(
    train_inputs, test_inputs,
    train_masks, test_masks,
    train_labels, test_labels,
) = train_test_split(
    input_ids, attention_masks, labels, test_size=0.2, random_state=42
)

# Create DataLoader
batch_size = 16
train_data = TensorDataset(train_inputs, train_masks, train_labels)
# Shuffle only the training batches; each epoch sees the rows in a new order.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Load pre-trained BERT model
# One output unit per entry of label_mapping. Counting the distinct values in
# the data instead would give 1 if only a single class were present, and the
# sequence-classification head treats num_labels == 1 as a regression task.
num_classes = len(label_mapping)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)

# Use the GPU when one is available; every batch is moved to the same device
# as the model in both the training and the evaluation loop below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Fine-tuning
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the deprecated optimizer's defaults so
# the optimisation hyper-parameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        # Batch-local names: unpacking into `labels` would overwrite the
        # module-level tensor that holds the labels of the full dataset.
        batch_inputs, batch_masks, batch_labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        outputs = model(batch_inputs, attention_mask=batch_masks, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # max(..., 1) guards against an empty loader.
    mean_loss = running_loss / max(len(train_loader), 1)
    print(f'Epoch {epoch + 1}/{num_epochs} - mean training loss: {mean_loss:.4f}')

# Evaluation
model.eval()
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

true_labels = []
predicted_labels = []

# No gradients are needed for inference.
with torch.no_grad():
    for batch in test_loader:
        batch_inputs, batch_masks, batch_labels = (t.to(device) for t in batch)
        outputs = model(batch_inputs, attention_mask=batch_masks)
        # Predicted class = index of the largest logit in each row.
        predicted = outputs.logits.argmax(dim=1)

        true_labels.extend(batch_labels.tolist())
        predicted_labels.extend(predicted.tolist())

# Calculate metrics (support-weighted average over the classes)
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-Score: {f1:.4f}')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
