In [2]:
#!pip install transformer --user
#!pip intall datasets --user
#!pip install evaluate

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [3]:
# Load the COLA dataset
cola = load_dataset("glue", "cola")

In [4]:
type(cola)

datasets.dataset_dict.DatasetDict

In [5]:
cola.keys()

dict_keys(['train', 'validation', 'test'])

In [6]:
cola

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [7]:
import pandas as pd

In [8]:
df = pd.DataFrame(cola["train"])

In [9]:
df.shape

(8551, 3)

In [10]:
df.head(6)

Unnamed: 0,sentence,label,idx
0,"Our friends won't buy this analysis, let alone...",1,0
1,One more pseudo generalization and I'm giving up.,1,1
2,One more pseudo generalization or I'm giving up.,1,2
3,"The more we study verbs, the crazier they get.",1,3
4,Day by day the facts are getting murkier.,1,4
5,I'll fix you a drink.,1,5


In [12]:
# Access the train split
train_data = cola["train"]

In [13]:
label_distribution = train_data.to_pandas()["label"].value_counts()
label_distribution

label
1    6023
0    2528
Name: count, dtype: int64

In [14]:
# Load the pre-trained tokenizer and model
model_name = "bert-base-uncased"

tokenizer  = AutoTokenizer.from_pretrained(model_name,
                                           cache_dir = r'D:\AI-DATASETS\07-Hugging-Face-Data')

In [15]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, 
                                                           num_labels=2,
                                                           cache_dir = r'D:\AI-DATASETS\07-Hugging-Face-Data')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Prepare the validation data
def preprocess_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True)

In [17]:
tokenized_val = cola["validation"].map(preprocess_function, batched=True)

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

In [18]:
tokenized_val

Dataset({
    features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1043
})

In [21]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [22]:
import evaluate  # New library for metrics

In [23]:
# Define the evaluation metric
metric = evaluate.load("matthews_correlation")

Downloading builder script:   0%|          | 0.00/6.60k [00:00<?, ?B/s]

In [24]:
# Evaluate the model
model.eval()
all_predictions = []
all_labels      = []

In [25]:
batch_size = 16

In [26]:
# Create a DataLoader for the validation dataset
from torch.utils.data import DataLoader

In [28]:
# Convert tokenized dataset into PyTorch tensors (necessary for model input)
val_dataset = tokenized_val.with_format("torch", columns=["input_ids", "attention_mask", "label"])

In [29]:
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [31]:
import torch

In [32]:
with torch.no_grad():
    for batch in val_dataloader:
        input_ids      = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        label          = batch["label"]

        # Get model predictions
        outputs     = model(input_ids, attention_mask=attention_mask)
        logits      = outputs.logits
        predictions = torch.argmax(logits, dim=-1).tolist()

        all_predictions.append(predictions)
        all_labels.append(label)
        
        break;
        

In [33]:
all_labels

[tensor([1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1])]

In [34]:
# Compute the Matthew's Correlation Coefficient (MCC)
mcc = metric.compute(predictions=all_predictions, references=all_labels)
print("Matthew's Correlation Coefficient:", mcc["matthews_correlation"])

ValueError: Predictions and/or references don't match the expected format.
Expected format: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)},
Input predictions: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
Input references: [tensor([1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1])]