In [1]:
!pip install transformers datasets torch scikit-learn

import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer



In [2]:
df = pd.read_csv("query_dataset.csv")  # Replace with your dataset file path

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Display dataset
print(dataset)

Dataset({
    features: ['query', 'label'],
    num_rows: 1000
})


In [3]:
train_dataset, test_dataset = dataset.train_test_split(test_size=0.2).values()

# Verify splits
print(f"Train dataset: {len(train_dataset)} samples")
print(f"Test dataset: {len(test_dataset)} samples")

Train dataset: 800 samples
Test dataset: 200 samples


In [4]:
# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the dataset
def preprocess_function(examples):
    return tokenizer(examples["query"], truncation=True, padding="max_length", max_length=64)

train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)



Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [5]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [6]:
!pip install --upgrade accelerate


training_args = TrainingArguments(
    output_dir="./results",              # Save checkpoints here
    evaluation_strategy="epoch",        # Evaluate after every epoch
    learning_rate=2e-5,                 # Learning rate
    per_device_train_batch_size=16,     # Batch size for training
    per_device_eval_batch_size=16,      # Batch size for evaluation
    num_train_epochs=3,                 # Number of epochs
    weight_decay=0.01,                  # Weight decay for regularization
    logging_dir="./logs",               # Log directory
    save_total_limit=2                  # Save only 2 checkpoints
)



In [7]:
trainer = Trainer(
    model=model,                        # The pre-trained model
    args=training_args,                 # Training arguments
    train_dataset=train_dataset,        # Training dataset
    eval_dataset=test_dataset           # Testing dataset
)

# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.003856
2,No log,0.001323
3,No log,0.001083


TrainOutput(global_step=150, training_loss=0.06393456776936848, metrics={'train_runtime': 286.2579, 'train_samples_per_second': 8.384, 'train_steps_per_second': 0.524, 'total_flos': 78933316608000.0, 'train_loss': 0.06393456776936848, 'epoch': 3.0})

In [8]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.0010830023093149066, 'eval_runtime': 5.6847, 'eval_samples_per_second': 35.182, 'eval_steps_per_second': 2.287, 'epoch': 3.0}


In [9]:
# Save the model
model.save_pretrained("./chitchat_wiki_classifier")
tokenizer.save_pretrained("./chitchat_wiki_classifier")


('./chitchat_wiki_classifier/tokenizer_config.json',
 './chitchat_wiki_classifier/special_tokens_map.json',
 './chitchat_wiki_classifier/vocab.txt',
 './chitchat_wiki_classifier/added_tokens.json')