In [1]:
#!pip install transformer --user
#!pip intall datasets --user
#!pip install evaluate

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import evaluate  # New library for metrics

In [3]:
# Load the COLA dataset
cola = load_dataset("glue", "cola")

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/251k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/37.6k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/37.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8551 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1063 [00:00<?, ? examples/s]

In [4]:
type(cola)

datasets.dataset_dict.DatasetDict

In [5]:
cola.keys()

dict_keys(['train', 'validation', 'test'])

In [6]:
len(cola['train'])

8551

In [7]:
len(cola['validation']), len(cola['test'])

(1043, 1063)

In [9]:
cola['train'][5]

{'sentence': "I'll fix you a drink.", 'label': 1, 'idx': 5}

In [10]:
# Access the train split
train_data = cola["train"]

In [11]:
# Count the distribution of labels
label_distribution = train_data.to_pandas()["label"].value_counts()
label_distribution

1    6023
0    2528
Name: label, dtype: int64

The label distribution in the CoLA training dataset is:

- Label 1 (Acceptable): 6,023 samples
- Label 0 (Unacceptable): 2,528 samples
  
This indicates that the dataset is imbalanced, with significantly more examples of acceptable sentences compared to unacceptable ones. The ratio is approximately 2.38:1 in favor of acceptable sentences.

In [12]:
# Load the pre-trained tokenizer and model
model_name = "bert-base-uncased"

tokenizer  = AutoTokenizer.from_pretrained(model_name,
                                           cache_dir = r'D:\AI-DATASETS\07-Hugging-Face-Data')

In [13]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, 
                                                           num_labels=2,
                                                           cache_dir = r'D:\AI-DATASETS\07-Hugging-Face-Data')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# Prepare the validation data
def preprocess_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True)

In [29]:
tokenized_val = cola["validation"].map(preprocess_function, batched=True)

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

In [33]:
# Convert tokenized dataset into PyTorch tensors (necessary for model input)
val_dataset = tokenized_val.with_format("torch", columns=["input_ids", "attention_mask", "label"])

In [34]:
# Define the evaluation metric
metric = evaluate.load("matthews_correlation")

In [43]:
# Evaluate the model
model.eval()
all_predictions = []
all_labels      = []

In [44]:
batch_size = 16

In [45]:
# Create a DataLoader for the validation dataset
from torch.utils.data import DataLoader

In [46]:
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [47]:
with torch.no_grad():
    for batch in val_dataloader:
        input_ids      = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        label          = batch["label"]

        # Get model predictions
        outputs     = model(input_ids, attention_mask=attention_mask)
        logits      = outputs.logits
        predictions = torch.argmax(logits, dim=-1).tolist()

        all_predictions.append(predictions)
        all_labels.append(label)
        
        break;
        

In [49]:
all_labels

[tensor([1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1])]

```python

Original Text (before tokenization): "The quick brown fox jumps over the lazy dog."

After Tokenization (tokenized output):
    
{
    'input_ids':      [101, 1996, 4248, 2829, 4419, 3598, 2058, 1996, 2552, 3899, 1997, 1996, 2821, 102],
    'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    'label': 1        # (or 0 if the sentence is grammatically incorrect)
}
```

**Inference Process:**
The inference process is done as follows:

You feed the input_ids and attention_mask to the model.
The model produces logits, which are unnormalized prediction scores for each class.
You then apply argmax on the logits to get the predicted class.


In [None]:
# Compute the Matthew's Correlation Coefficient (MCC)
mcc = metric.compute(predictions=all_predictions, references=all_labels)
print("Matthew's Correlation Coefficient:", mcc["matthews_correlation"])