In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
import torch
from torch.nn.functional import softmax
import datasets
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
import random
from torch.utils.data import DataLoader, Subset
import torch.nn.functional as F
from sklearn.metrics import cohen_kappa_score, mean_absolute_error
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
import numpy as np
pd.options.mode.copy_on_write = True
LOAD_MODEL = True
SAVE_MODEL = False

In [20]:
fpath = "../datasets_ready/Task_Achievement.csv"
df5 = pd.read_csv(fpath)
df5['score'] = df5['score'].round(1)

df_filtered5 = df5[(df5['score'] > 3.0) & (df5['score'] < 12.0)]

reverse_mapping_3 = {
    3.5: 0, 4.0: 0,
    4.5: 1, 5.0: 1,
    5.5: 2, 6.0: 2,
    6.5: 3, 7.0: 3,
    7.5: 4, 8.0: 4,
    8.5: 5, 9.0: 5
}

# Apply mapping
df_filtered5['score'] = df_filtered5['score'].map(reverse_mapping_3)

# Sample the maximum available size for each class
df_sampled5 = df_filtered5.groupby('score', group_keys=False).apply(
    lambda x: x.sample(len(x), random_state=42)
).reset_index(drop=True)

dataset5 = Dataset.from_pandas(df_sampled5)

  df_sampled5 = df_filtered5.groupby('score', group_keys=False).apply(


In [16]:
num_labels_5 = 6

# Load the tokenizer and model
tokenizer5 = AutoTokenizer.from_pretrained("mrm8488/deberta-v3-ft-financial-news-sentiment-analysis")
model5 = AutoModelForSequenceClassification.from_pretrained("mrm8488/deberta-v3-ft-financial-news-sentiment-analysis", num_labels=num_labels_5, ignore_mismatched_sizes=True)

# Tokenization function
def tokenize_function(examples):
    # Concatenate the input columns for each example in the batch
    combined_text = [
        p + " " + e + " " + t for p, e, t in zip(examples["prompt"], examples["essay"], examples["text"])
    ]
    # Tokenize the concatenated text
    return tokenizer5(combined_text, padding="max_length", truncation=True, max_length=1024)

# Tokenize the dataset
tokenized_datasets5 = dataset5.map(tokenize_function, batched=True)
tokenized_datasets5 = tokenized_datasets5.remove_columns(["prompt", "essay", "text"])
tokenized_datasets5 = tokenized_datasets5.rename_column("score", "labels")
tokenized_datasets5.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Set random seed for reproducibility

# Get the labels from the tokenized dataset
labels5 = tokenized_datasets5["labels"]

# Get the unique labels
unique_labels5 = np.unique(labels5)

# Store the indices for each label
label_to_indices5 = {label: np.where(labels5 == label)[0] for label in unique_labels5}

# Lists to hold the train and validation indices
train_indices5 = []
val_indices5 = []

# For each label, split the indices into train and validation
for label, indices in label_to_indices5.items():
    # Shuffle the indices within each label to ensure random splitting
    np.random.shuffle(indices)
    
    # Split 80% for training, 20% for validation
    split_idx = int(0.8 * len(indices))
    train_indices5.extend(indices[:split_idx])
    val_indices5.extend(indices[split_idx:])

# Convert indices to tensors
train_indices5 = torch.tensor(train_indices5)
val_indices5 = torch.tensor(val_indices5)

# Create Subsets for train and validation datasets
train_dataset5 = Subset(tokenized_datasets5, train_indices5)
eval_dataset5 = Subset(tokenized_datasets5, val_indices5)

# Dataloaders
train_dataloader5 = DataLoader(train_dataset5, shuffle=True, batch_size=12)
eval_dataloader5 = DataLoader(eval_dataset5, batch_size=12)

# Set up optimizer and scheduler
optimizer5 = AdamW(model5.parameters(), lr=5e-6)
num_epochs5 = 5
num_training_steps5 = num_epochs5 * len(train_dataloader5)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer5, num_warmup_steps=int(0.05*num_training_steps5), num_training_steps=num_training_steps5
)

# Move model to device (GPU if available)
device5 = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model5.to(device5)

# Initialize lists to track training/validation losses and accuracies
train_losses5 = []
val_losses5 = []
val_f1_scores5 = []


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at mrm8488/deberta-v3-ft-financial-news-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/9202 [00:00<?, ? examples/s]



## Testing Coherence

In [8]:
# Specify the file name from which to load the model
modelsavename = "../saved_models/COHERENCE_EPOCH4_F10.4253.pt"

# Initialize the same model architecture
model = AutoModelForSequenceClassification.from_pretrained("mrm8488/deberta-v3-ft-financial-news-sentiment-analysis",num_labels=6, ignore_mismatched_sizes=True)

# Load the saved state_dict into the model
if LOAD_MODEL:
    try:
        with open(modelsavename, "rb") as f:
            model.load_state_dict(torch.load(f))
            print(f"Model loaded from {modelsavename}")
    except Exception as e:
        print(e)
# Move model to the device (GPU if available)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

tokenizer = AutoTokenizer.from_pretrained("mrm8488/deberta-v3-ft-financial-news-sentiment-analysis")


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at mrm8488/deberta-v3-ft-financial-news-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(f))


Model loaded from ../saved_models/COHERENCE_EPOCH4_F10.4253.pt


In [None]:
all_eval_labels = []
all_eval_preds = []
model.eval()
# len(eval_dataloader5) = 140
for batch in eval_dataloader5:
    batch = {k: v.to(device5) for k, v in batch.items()}
    labels = batch["labels"].to(device)
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    all_eval_labels.append(labels)
    all_eval_preds.append(predictions)
all_eval_labels = torch.cat(all_eval_labels).cpu().numpy()
all_eval_preds = torch.cat(all_eval_preds).cpu().numpy()

# Compute QWK
qwk = cohen_kappa_score(all_eval_labels, all_eval_preds, weights="quadratic")
print(f"Coherence QWK for Validation Dataset: {qwk}")

Coherence QWK for Validation Dataset: 0.6033783627366514


In [21]:
all_train_labels = []
all_train_preds = []
model.eval()
for batch in train_dataloader5:
    batch = {k: v.to(device5) for k, v in batch.items()}
    labels = batch["labels"].to(device)
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    all_train_labels.append(labels)
    all_train_preds.append(predictions)
all_train_labels = torch.cat(all_train_labels).cpu().numpy()
all_train_preds = torch.cat(all_train_preds).cpu().numpy()

# Compute QWK
qwk = cohen_kappa_score(all_train_labels, all_train_preds, weights="quadratic")
print(f"Coherence QWK for Training Dataset: {qwk}")

Coherence QWK for Training Dataset: 0.4219602258368118


## Testing Lexical

In [12]:
lexical_saved_model_name = "../saved_models/Lexical_epoch5.pt"

# Initialize the same model architecture
lexical_model = AutoModelForSequenceClassification.from_pretrained("mrm8488/deberta-v3-ft-financial-news-sentiment-analysis",num_labels=6, ignore_mismatched_sizes=True)

# Load the saved state_dict into the model
if LOAD_MODEL:
    try:
        with open(lexical_saved_model_name, "rb") as f:
            lexical_model.load_state_dict(torch.load(f))
            print(f"Model loaded from {lexical_saved_model_name}")
    except Exception as e:
        print(e)
# Move model to the device (GPU if available)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
lexical_model.to(device)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at mrm8488/deberta-v3-ft-financial-news-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  lexical_model.load_state_dict(torch.load(f))


Model loaded from ../saved_models/Lexical_epoch5.pt


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-5): 6 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=T

In [14]:
all_eval_labels = []
all_eval_preds = []
lexical_model.eval()
# len(eval_dataloader5) = 140
for batch in eval_dataloader5:
    batch = {k: v.to(device5) for k, v in batch.items()}
    labels = batch["labels"].to(device)
    with torch.no_grad():
        outputs = lexical_model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    all_eval_labels.append(labels)
    all_eval_preds.append(predictions)
all_eval_labels = torch.cat(all_eval_labels).cpu().numpy()
all_eval_preds = torch.cat(all_eval_preds).cpu().numpy()

# Compute QWK
qwk = cohen_kappa_score(all_eval_labels, all_eval_preds, weights="quadratic")
print(f"Lexical QWK for Validation Dataset: {qwk}")

Lexical QWK for Validation Dataset: 0.4827286198835399


In [9]:
all_train_labels = []
all_train_preds = []
lexical_model.eval()
for batch in train_dataloader5:
    batch = {k: v.to(device5) for k, v in batch.items()}
    labels = batch["labels"].to(device)
    with torch.no_grad():
        outputs = lexical_model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    all_train_labels.append(labels)
    all_train_preds.append(predictions)
all_train_labels = torch.cat(all_train_labels).cpu().numpy()
all_train_preds = torch.cat(all_train_preds).cpu().numpy()

# Compute QWK
qwk = cohen_kappa_score(all_train_labels, all_train_preds, weights="quadratic")
print(f"Lexical QWK for Training Dataset: {qwk}")

Lexical QWK for Training Dataset: 0.4798498176496018


## Testing Gramatical

In [4]:
grammatical_saved_model_name = "../saved_models/grammatical_epoch4_F10.5099.pt"

# Initialize the same model architecture
grammatical_model = AutoModelForSequenceClassification.from_pretrained("mrm8488/deberta-v3-ft-financial-news-sentiment-analysis",num_labels=6, ignore_mismatched_sizes=True)

# Load the saved state_dict into the model
if LOAD_MODEL:
    try:
        with open(grammatical_saved_model_name, "rb") as f:
            grammatical_model.load_state_dict(torch.load(f))
            print(f"Model loaded from {grammatical_saved_model_name}")
    except Exception as e:
        print(e)
# Move model to the device (GPU if available)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
grammatical_model.to(device)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at mrm8488/deberta-v3-ft-financial-news-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  grammatical_model.load_state_dict(torch.load(f))


Model loaded from ../saved_models/grammatical_epoch4_F10.5099.pt


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-5): 6 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=T

In [5]:
all_eval_labels = []
all_eval_preds = []
grammatical_model.eval()
# len(eval_dataloader5) = 140
for batch in eval_dataloader5:
    batch = {k: v.to(device5) for k, v in batch.items()}
    labels = batch["labels"].to(device)
    with torch.no_grad():
        outputs = grammatical_model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    all_eval_labels.append(labels)
    all_eval_preds.append(predictions)
all_eval_labels = torch.cat(all_eval_labels).cpu().numpy()
all_eval_preds = torch.cat(all_eval_preds).cpu().numpy()

# Compute QWK
qwk = cohen_kappa_score(all_eval_labels, all_eval_preds, weights="quadratic")
print(f"Grammatical QWK for Validation Dataset: {qwk}")

Grammatical QWK for Validation Dataset: 0.6124279105810377


In [None]:
all_train_labels = []
all_train_preds = []
grammatical_model.eval()
for batch in train_dataloader5:
    batch = {k: v.to(device5) for k, v in batch.items()}
    labels = batch["labels"].to(device)
    with torch.no_grad():
        outputs = grammatical_model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    all_train_labels.append(labels)
    all_train_preds.append(predictions)
all_train_labels = torch.cat(all_train_labels).cpu().numpy()
all_train_preds = torch.cat(all_train_preds).cpu().numpy()

# Compute QWK
qwk = cohen_kappa_score(all_train_labels, all_train_preds, weights="quadratic")
print(f"Grammatical QWK for Training Dataset: {qwk}")

Lexical QWK for Training Dataset: 0.4798498176496018


## Task Achievement

In [17]:
TA_saved_model_name = "../saved_models/task_achievement_trained.pt"

# Initialize the same model architecture
TA_model = AutoModelForSequenceClassification.from_pretrained("mrm8488/deberta-v3-ft-financial-news-sentiment-analysis",num_labels=6, ignore_mismatched_sizes=True)

# Load the saved state_dict into the model
if LOAD_MODEL:
    try:
        with open(TA_saved_model_name, "rb") as f:
            TA_model.load_state_dict(torch.load(f))
            print(f"Model loaded from {TA_saved_model_name}")
    except Exception as e:
        print(e)
# Move model to the device (GPU if available)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
TA_model.to(device)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at mrm8488/deberta-v3-ft-financial-news-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  TA_model.load_state_dict(torch.load(f))


Model loaded from ../saved_models/task_achievement_trained.pt


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-5): 6 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=T

In [18]:
all_eval_labels = []
all_eval_preds = []
TA_model.eval()
# len(eval_dataloader5) = 140
for batch in eval_dataloader5:
    batch = {k: v.to(device5) for k, v in batch.items()}
    labels = batch["labels"].to(device)
    with torch.no_grad():
        outputs = TA_model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    all_eval_labels.append(labels)
    all_eval_preds.append(predictions)
all_eval_labels = torch.cat(all_eval_labels).cpu().numpy()
all_eval_preds = torch.cat(all_eval_preds).cpu().numpy()

# Compute QWK
qwk = cohen_kappa_score(all_eval_labels, all_eval_preds, weights="quadratic")
print(f"Task Achievement QWK for Validation Dataset: {qwk}")

Task Achievement QWK for Validation Dataset: 0.7160319367053913


In [15]:
all_train_labels = []
all_train_preds = []
TA_model.eval()
for batch in train_dataloader5:
    batch = {k: v.to(device5) for k, v in batch.items()}
    labels = batch["labels"].to(device)
    with torch.no_grad():
        outputs = TA_model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    all_train_labels.append(labels)
    all_train_preds.append(predictions)
all_train_labels = torch.cat(all_train_labels).cpu().numpy()
all_train_preds = torch.cat(all_train_preds).cpu().numpy()

# Compute QWK
qwk = cohen_kappa_score(all_train_labels, all_train_preds, weights="quadratic")
print(f"Task Achievement QWK for Training Dataset: {qwk}")

Task Achievement QWK for Training Dataset: 0.4798498176496018


## Llama3.2 1B Testing

In [2]:
import torch
import gc
# Clear cache

# For debugging purposes, check memory stats
print(f"Memory Allocated: {torch.cuda.memory_allocated() / 1e6} MB")
print(f"Memory Reserved: {torch.cuda.memory_reserved() / 1e6} MB")
torch.cuda.empty_cache()
gc.collect()
llama3_model_path = "meta-llama/Llama-3.2-1B"
file_path = "../datasets_ready/combined_dataset.csv"
checkpoint_dir = "../results/checkpoint-4410"
SAMPLE_SIZE = 9800
LEARNING_RATE = 1e-5
NUM_TRAIN_EPOCHS = 10
BATCH_SIZE = 16


def create_input_text(row):
    return (
        f"""You are a member of the IELTS essay evaluation committee.\nYour task is to evaluate the essay based on the given prompt and assign it a score
        between 4 and 9 (in 1 point increments). 4 and 9 are the lowest and highest scores possible.
        Prompt contain the topic of the essay. The essay is the text that you need to evaluate.
        Think step by step why this essay is good or bad. 
        "Prompt: {row['prompt']}\nEssay: {row['essay']}"""
    )

def map_band_to_class(band):
    if band ==  "<4":
        return band_to_class['<4']
    return band_to_class[str(band)]

def tokenize_function(example):
    return tokenizer(
        example["input_text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

band_classes = ['<4'] + ['4', '4.5', '5', '5.5', '6', '6.5', '7', '7.5', '8', '8.5', '9'] 
# class_to_band = {i: band for band, i in band_to_class.items()}  # Class → Band

band_to_class = {
    '<4': 0, '4': 0,
    '4.5': 1, '5': 1,
    '5.5': 2, '6': 2,
    '6.5': 3, '7': 3,
    '7.5': 4, '8': 4,
    '8.5': 5, '9': 5
}

# Map bands to classes
def map_band_to_class(band):
    if band ==  "<4":
        return band_to_class['<4']
    return band_to_class[str(band)]

def tokenize_function(example):
    return tokenizer(
        example["input_text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

data = pd.read_csv(file_path).sample(n=SAMPLE_SIZE, random_state=42)
data['input_text'] = data.apply(create_input_text, axis=1)
data['labels'] = data['band'].apply(map_band_to_class)
# Drop unnecessary columns. Might need them later
data = data.drop(columns=["evaluation", "band", "prompt", "essay"])

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)

# Prepare datasets for Hugging Face Trainer
from datasets import Dataset
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

print(f"Train dataset length: {len(train_dataset)}, Test dataset length: {len(test_dataset)}")

tokenizer = AutoTokenizer.from_pretrained(llama3_model_path)
tokenizer.pad_token = tokenizer.eos_token

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
print(f"train_dataset columns: {train_dataset.column_names}")
print(f"train_dataset[0]: {train_dataset[0]},\ntrain_dataset['labels'][0]: {train_dataset['labels'][0]}")

Memory Allocated: 0.0 MB
Memory Reserved: 0.0 MB
Train dataset length: 7840, Test dataset length: 1960


Map:   0%|          | 0/7840 [00:00<?, ? examples/s]

Map:   0%|          | 0/1960 [00:00<?, ? examples/s]

train_dataset columns: ['input_text', 'labels', '__index_level_0__', 'input_ids', 'attention_mask']
train_dataset[0]: {'input_text': 'You are a member of the IELTS essay evaluation committee.\nYour task is to evaluate the essay based on the given prompt and assign it a score\n        between 4 and 9 (in 1 point increments). 4 and 9 are the lowest and highest scores possible.\n        Prompt contain the topic of the essay. The essay is the text that you need to evaluate.\n        Think step by step why this essay is good or bad. \n        "Prompt: You can get up-to-date news from the radio, TV and the Internet. Which kind of media do you think is the best to get the news?\nEssay: Nowadays, there are several channels to get news, such as the radio, TV, newspapers and the Internet. I think the Internet is the best among these. .Since its invention, the Internet has been booming as a prospective industry. Not only because it is a combination of text, audio and video, but also due to its . 

In [3]:
num_labels = 6  # Total number of unique band scores

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_dir,
    num_labels=num_labels,
)
t = model.config.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Freeze the base model
for param in model.base_model.parameters():
    param.requires_grad = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(f"Device: {device}")

Device: cuda


In [4]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=BATCH_SIZE,
)
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    qwk = cohen_kappa_score(labels, preds, weights="quadratic")
    mae = mean_absolute_error(labels, preds)
    return {
        "accuracy": acc,
        "QWK": qwk,
        "MAE": mae,
    }
    
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
test_results = trainer.evaluate()
print(f"Test Results: {test_results}")


  attn_output = torch.nn.functional.scaled_dot_product_attention(


  0%|          | 0/123 [00:00<?, ?it/s]

Test Results: {'eval_loss': 1.6765220165252686, 'eval_model_preparation_time': 0.004, 'eval_accuracy': 0.29846938775510207, 'eval_QWK': 0.3236651161035259, 'eval_MAE': 1.1418367346938776, 'eval_runtime': 784.3186, 'eval_samples_per_second': 2.499, 'eval_steps_per_second': 0.157}
