In [None]:
# installations
!pip install datasets
!pip install evaluate

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import get_scheduler
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from datasets import Dataset
import evaluate
import numpy as np
from google.colab import drive
from torch.optim import AdamW
from huggingface_hub import login

In [None]:
# Mount Google Drive at /content/drive; the data file and the training output
# directory used later both live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the tab-separated training file from Drive into a DataFrame
power_training = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/power-tr-train.tsv',
    sep='\t',
)

In [None]:
# Quick sanity check: total number of rows, then how many rows carry each label
n_rows = len(power_training)
label_counts = power_training['label'].value_counts()
print(n_rows)
print(label_counts)

17384
label
1    8932
0    8452
Name: count, dtype: int64


In [None]:
# Hold out 10% of the rows for validation. Stratifying on the label column
# keeps the class proportions the same in both parts; the fixed random_state
# makes the split repeatable.
train_data, val_data = train_test_split(
    power_training,
    test_size=0.1,
    stratify=power_training['label'],
    random_state=42,
)

In [None]:
# Convert to Hugging Face Dataset format.
# After train_test_split the frames carry a shuffled, non-contiguous pandas
# index. preserve_index=False stops that index from being stored as an extra
# "__index_level_0__" column in the resulting datasets.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
val_dataset = Dataset.from_pandas(val_data, preserve_index=False)

In [None]:
# Checkpoint shared by the tokenizer and the classifier
model_name = "bert-base-multilingual-cased"

# Tokenizer matching the checkpoint's vocabulary
tokenizer = BertTokenizer.from_pretrained(model_name)

# Pretrained encoder with a two-class sequence-classification head on top
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenization function
def preprocess_function(examples):
    """Tokenize the ``text_en`` column of a batch of examples.

    Args:
        examples: Batch passed in by ``Dataset.map(..., batched=True)``; must
            contain a ``"text_en"`` entry holding the texts to tokenize.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per example and left unpadded.
    """
    # No padding here: padding inside a batched map would pad every example to
    # the longest text in its map batch and store that padding in the dataset.
    # The Trainer is given the tokenizer, so its default collator pads each
    # training/eval batch to that batch's own longest sequence instead.
    return tokenizer(examples["text_en"], truncation=True, max_length=512)

In [None]:
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/15645 [00:00<?, ? examples/s]

Map:   0%|          | 0/1739 [00:00<?, ? examples/s]

In [None]:
# Evaluation metric and the callback that computes it

# Accuracy metric from the `evaluate` library
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into (predictions, labels). The
            predictions are reduced with an argmax over axis 1, so they are
            assumed to be per-class scores of shape (n_examples, n_classes)
            -- TODO confirm against the model output.

    Returns:
        dict with a single key ``"Accuracy"``, rounded to three decimals.
    """
    scores, references = eval_pred

    # Pick the highest-scoring class for every example
    predicted_classes = np.argmax(scores, axis=1)

    result = accuracy.compute(predictions=predicted_classes, references=references)
    rounded_accuracy = np.round(result['accuracy'], 3)

    return {"Accuracy": rounded_accuracy}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Custom optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.05)

# The scheduler advances once per optimizer update, so it has to be sized in
# optimizer updates rather than in batches of an assumed size. These three
# values mirror the TrainingArguments used for training
# (per_device_train_batch_size, gradient_accumulation_steps, num_train_epochs);
# keep them in sync. A single training device is assumed.
train_batch_size = 8
grad_accum_steps = 4
num_epochs = 3

# Batches per epoch (ceil division: the last partial batch still counts), then
# one optimizer update per `grad_accum_steps` batches.
batches_per_epoch = -(-len(train_dataset) // train_batch_size)
updates_per_epoch = max(batches_per_epoch // grad_accum_steps, 1)
num_training_steps = updates_per_epoch * num_epochs

# Linear schedule: warm up over the first 10% of updates, then decay to zero
# exactly at the last update.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=int(num_training_steps * 0.1),
    num_training_steps=num_training_steps
)

In [None]:
# Training configuration
training_args = TrainingArguments(
    # Where checkpoints are written (on the mounted Drive)
    output_dir="/content/drive/MyDrive/Colab Notebooks/part2_new_results",
    # Schedule: evaluate and checkpoint once per epoch, reload the best checkpoint at the end
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Batching: 8 examples per step, gradients accumulated over 4 steps
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,
    # Optimisation settings
    learning_rate=2e-5,
    optim="adamw_torch",
    fp16=True,  # mixed precision for faster training
)

# Define Trainer
# Wires together the model, the training configuration, the tokenized
# train/validation datasets, the accuracy callback and the hand-built
# optimizer/scheduler pair.
# NOTE(review): because an optimizer is passed in explicitly, the
# learning_rate / optim values in training_args are presumably not used to
# build one -- confirm against the Trainer documentation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler),
)

In [None]:
# Train the model
# Runs fine-tuning with the configuration held by `trainer`. As the last
# expression in the cell, the returned training summary is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.390618,0.823
2,1.273400,0.312141,0.876
3,1.034900,0.350015,0.875


TrainOutput(global_step=1467, training_loss=1.0235102054775371, metrics={'train_runtime': 375.4105, 'train_samples_per_second': 125.023, 'train_steps_per_second': 3.908, 'total_flos': 1.23491173833216e+16, 'train_loss': 1.0235102054775371, 'epoch': 3.0})

In [None]:
# Run the trained model over the validation set
predictions = trainer.predict(val_dataset)

# Turn the raw model outputs into class ids by taking the argmax over the last axis
raw_outputs = predictions.predictions
preds = np.argmax(raw_outputs, axis=-1)

In [None]:
# Compare the predicted classes with the validation labels, per class
labels = val_dataset['label']
report = classification_report(labels, preds)
print(report)

              precision    recall  f1-score   support

           0       0.83      0.88      0.85       845
           1       0.88      0.83      0.85       894

    accuracy                           0.85      1739
   macro avg       0.85      0.85      0.85      1739
weighted avg       0.86      0.85      0.85      1739

