In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset
from transformers import BertConfig, BertForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

# Load data
data = pd.read_csv("final_dataset_4000.csv")

# Map the class names in the `label` column to integer ids in `labels`.
# The encoder is fitted on the full CSV, i.e. before any rows are removed,
# so `le.classes_` lists every class that occurs anywhere in the file.
le = LabelEncoder()
data["labels"] = le.fit_transform(data["label"])

# Number of rows removed from the end of the CSV before training.
# NOTE(review): the reason for discarding these rows is not visible here
# (possibly a held-out set) -- confirm with the author.
N_TAIL_ROWS_DROPPED = 750
data = data.drop(index=data.tail(N_TAIL_ROWS_DROPPED).index)

# Keep only the text column and its integer label.
data_final = data[["konkani", "labels"]]

# preserve_index=False: after the drop above the index is not guaranteed to be
# a plain RangeIndex on every pandas version, and a non-range index would be
# stored as an extra `__index_level_0__` column in the Dataset.
dataset = Dataset.from_pandas(data_final, preserve_index=False)

# 80/20 train/test split, shuffled with a fixed seed for reproducibility.
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)

# Tokenizer (we still need some tokenizer -- use a basic one, or train your own separately)
# NOTE(review): "bert-base-uncased" ships an English WordPiece vocabulary and
# its uncased normalization strips accents / combining marks. If the `konkani`
# column is written in Devanagari this may discard vowel signs and yield many
# unknown tokens -- consider a multilingual or purpose-trained tokenizer.
# TODO confirm the script used in the CSV.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Preprocess
def preprocess_function(examples):
    """Tokenize the ``konkani`` texts of ``examples`` with the module-level tokenizer.

    The tokenizer is asked to truncate and to pad to ``max_length`` with a
    maximum length of 128, and its output is returned unchanged.
    """
    texts = examples["konkani"]
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(texts, **encoding_options)

# Tokenize both splits; batched=True hands preprocess_function batches of rows.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Initialize new model config
# A small BERT: 4 layers, hidden size 256 split over 4 attention heads
# (64 dimensions per head), feed-forward size 512.
config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=256,
    num_hidden_layers=4,
    num_attention_heads=4,
    intermediate_size=512,
    # Derive the number of output classes from the fitted label encoder so the
    # classification head always matches the ids produced for `labels`
    # (previously hard-coded to 3).
    num_labels=len(le.classes_),
)

# Create new model from scratch (randomly initialised, no pretrained weights)
model = BertForSequenceClassification(config)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # Per-epoch evaluation is disabled; metrics are only computed by an
    # explicit trainer.evaluate() call.
    # NOTE(review): recent transformers releases name this option
    # `eval_strategy` -- confirm against the installed version before enabling.
    #evaluation_strategy="epoch",
    # NOTE(review): 2e-5 is a typical fine-tuning rate; for a model trained
    # from scratch it may be too small to make progress -- TODO confirm/tune.
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none",  # No wandb
)

# Compute metrics
def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for one evaluation pass.

    ``eval_pred`` is unpacked into ``(logits, labels)``; the predicted class
    of each example is the argmax of its logits along the last axis.

    Returns a dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)

    metrics = {}
    metrics['accuracy'] = accuracy_score(references, predicted)
    metrics['f1'] = f1_score(references, predicted, average='weighted')
    return metrics

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

# Train
trainer.train()

# Evaluate on the held-out test split.
results = trainer.evaluate()

# The result keys carry an "eval_" prefix in front of the names returned by
# compute_metrics; "eval_loss" is the evaluation loss, not an F1 score.
print(f"Accuracy: {results['eval_accuracy']:.4f}")
print(f"F1 Score: {results['eval_f1']:.4f}")
print(f"Loss: {results['eval_loss']:.4f}")
print("-" * 30)

Map: 100%|██████████| 2600/2600 [00:00<00:00, 18156.19 examples/s]
Map: 100%|██████████| 650/650 [00:00<00:00, 21607.10 examples/s]


Step,Training Loss
500,1.1158
1000,1.1039
1500,1.1053
2000,1.1008
2500,1.101
3000,1.0961


Accuracy: 0.4015
F1 Score: 0.3276
F1 Score: 1.0834
------------------------------
