In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
import torch
from torch.nn.functional import softmax
import datasets
from datasets import load_dataset, Dataset
import random
from torch.utils.data import DataLoader, Subset
import torch.nn.functional as F
from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt
from tqdm import tqdm
from torchmetrics import F1Score
import pandas as pd
import numpy as np
pd.options.mode.copy_on_write = True
LOAD_MODEL = True
SAVE_MODEL = False

In [2]:
fpath = "../datasets_ready/Grammatical.csv"
df5 = pd.read_csv(fpath)
df5['score'] = df5['score'].round(1)

df_filtered5 = df5[(df5['score'] > 3.0) & (df5['score'] < 12.0)]

reverse_mapping_3 = {
    3.5: 0, 4.0: 0,
    4.5: 1, 5.0: 1,
    5.5: 2, 6.0: 2,
    6.5: 3, 7.0: 3,
    7.5: 4, 8.0: 4,
    8.5: 5, 9.0: 5
}

# Apply mapping
df_filtered5['score'] = df_filtered5['score'].map(reverse_mapping_3)

# Sample the maximum available size for each class
df_sampled5 = df_filtered5.groupby('score', group_keys=False).apply(
    lambda x: x.sample(len(x), random_state=42)
).reset_index(drop=True)

dataset5 = Dataset.from_pandas(df_sampled5)

  df_sampled5 = df_filtered5.groupby('score', group_keys=False).apply(


In [3]:
num_labels_5 = 6

# Load the tokenizer and model
tokenizer5 = AutoTokenizer.from_pretrained("mrm8488/deberta-v3-ft-financial-news-sentiment-analysis")
model5 = AutoModelForSequenceClassification.from_pretrained("mrm8488/deberta-v3-ft-financial-news-sentiment-analysis", num_labels=num_labels_5, ignore_mismatched_sizes=True)

# Tokenization function
def tokenize_function(examples):
    # Concatenate the input columns for each example in the batch
    combined_text = [
        p + " " + e + " " + t for p, e, t in zip(examples["prompt"], examples["essay"], examples["text"])
    ]
    # Tokenize the concatenated text
    return tokenizer5(combined_text, padding="max_length", truncation=True, max_length=1024)

# Tokenize the dataset
tokenized_datasets5 = dataset5.map(tokenize_function, batched=True)
tokenized_datasets5 = tokenized_datasets5.remove_columns(["prompt", "essay", "text"])
tokenized_datasets5 = tokenized_datasets5.rename_column("score", "labels")
tokenized_datasets5.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Get the labels from the tokenized dataset
labels5 = tokenized_datasets5["labels"]

# Get the unique labels
unique_labels5 = np.unique(labels5)

# Store the indices for each label
label_to_indices5 = {label: np.where(labels5 == label)[0] for label in unique_labels5}

# Lists to hold the train and validation indices
train_indices5 = []
val_indices5 = []

# For each label, split the indices into train and validation
for label, indices in label_to_indices5.items():
    # Shuffle the indices within each label to ensure random splitting
    np.random.shuffle(indices)
    
    # Split 80% for training, 20% for validation
    split_idx = int(0.8 * len(indices))
    train_indices5.extend(indices[:split_idx])
    val_indices5.extend(indices[split_idx:])

# Convert indices to tensors
train_indices5 = torch.tensor(train_indices5)
val_indices5 = torch.tensor(val_indices5)

# Create Subsets for train and validation datasets
train_dataset5 = Subset(tokenized_datasets5, train_indices5)
eval_dataset5 = Subset(tokenized_datasets5, val_indices5)

# Dataloaders
train_dataloader5 = DataLoader(train_dataset5, shuffle=True, batch_size=12)
eval_dataloader5 = DataLoader(eval_dataset5, batch_size=12)

# Set up optimizer and scheduler
optimizer5 = AdamW(model5.parameters(), lr=5e-6)
num_epochs5 = 5
num_training_steps5 = num_epochs5 * len(train_dataloader5)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer5, num_warmup_steps=int(0.05*num_training_steps5), num_training_steps=num_training_steps5
)

# Move model to device (GPU if available)
device5 = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model5.to(device5)

# Initialize lists to track training/validation losses and accuracies
train_losses5 = []
val_losses5 = []
val_f1_scores5 = []


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at mrm8488/deberta-v3-ft-financial-news-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8352 [00:00<?, ? examples/s]



## Testing Coherence

In [None]:
# Specify the file name from which to load the model
modelsavename = "../saved_models/COHERENCE_EPOCH5_F10.4195.pt"

# Initialize the same model architecture
model = AutoModelForSequenceClassification.from_pretrained("mrm8488/deberta-v3-ft-financial-news-sentiment-analysis",num_labels=6, ignore_mismatched_sizes=True)

# Load the saved state_dict into the model
if LOAD_MODEL:
    try:
        with open(modelsavename, "rb") as f:
            model.load_state_dict(torch.load(f))
            print(f"Model loaded from {modelsavename}")
    except Exception as e:
        print(e)
# Move model to the device (GPU if available)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

tokenizer = AutoTokenizer.from_pretrained("mrm8488/deberta-v3-ft-financial-news-sentiment-analysis")


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at mrm8488/deberta-v3-ft-financial-news-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(f))


Model loaded from ../saved_models/COHERENCE_EPOCH4_F10.4253.pt


In [5]:
all_eval_labels = []
all_eval_preds = []
model5.eval()
# len(eval_dataloader5) = 140
for batch in eval_dataloader5:
    batch = {k: v.to(device5) for k, v in batch.items()}
    labels = batch["labels"].to(device)
    with torch.no_grad():
        outputs = model5(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    all_eval_labels.append(labels)
    all_eval_preds.append(predictions)
all_eval_labels = torch.cat(all_eval_labels).cpu().numpy()
all_eval_preds = torch.cat(all_eval_preds).cpu().numpy()

# Compute QWK
qwk = cohen_kappa_score(all_eval_labels, all_eval_preds, weights="quadratic")
print(f"Coherence QWK for Validation Dataset: {qwk}")

Coherence QWK for Validation Dataset: 0.01657020351825189


In [6]:
all_train_labels = []
all_train_preds = []
model5.eval()
for batch in train_dataloader5:
    batch = {k: v.to(device5) for k, v in batch.items()}
    labels = batch["labels"].to(device)
    with torch.no_grad():
        outputs = model5(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    all_train_labels.append(labels)
    all_train_preds.append(predictions)
all_train_labels = torch.cat(all_train_labels).cpu().numpy()
all_train_preds = torch.cat(all_train_preds).cpu().numpy()

# Compute QWK
qwk = cohen_kappa_score(all_train_labels, all_train_preds, weights="quadratic")
print(f"Coherence QWK for Training Dataset: {qwk}")

train QWK for Validation Dataset: 0.01081455554608668


## Testing Lexical

In [None]:
lexical_saved_model_name = "../saved_models/Lexical_epoch5.pt"

# Initialize the same model architecture
lexical_model = AutoModelForSequenceClassification.from_pretrained("mrm8488/deberta-v3-ft-financial-news-sentiment-analysis",num_labels=6, ignore_mismatched_sizes=True)

# Load the saved state_dict into the model
if LOAD_MODEL:
    try:
        with open(lexical_saved_model_name, "rb") as f:
            lexical_model.load_state_dict(torch.load(f))
            print(f"Model loaded from {lexical_slexical_saved_model_nameaved_model_name}")
    except Exception as e:
        print(e)
# Move model to the device (GPU if available)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
lexical_model.to(device)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at mrm8488/deberta-v3-ft-financial-news-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  lexical_model.load_state_dict(torch.load(f))


Model loaded from ../saved_models/Lexical_epoch5.pt


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-5): 6 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=T

In [8]:
all_eval_labels = []
all_eval_preds = []
lexical_model.eval()
# len(eval_dataloader5) = 140
for batch in eval_dataloader5:
    batch = {k: v.to(device5) for k, v in batch.items()}
    labels = batch["labels"].to(device)
    with torch.no_grad():
        outputs = lexical_model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    all_eval_labels.append(labels)
    all_eval_preds.append(predictions)
all_eval_labels = torch.cat(all_eval_labels).cpu().numpy()
all_eval_preds = torch.cat(all_eval_preds).cpu().numpy()

# Compute QWK
qwk = cohen_kappa_score(all_eval_labels, all_eval_preds, weights="quadratic")
print(f"Lexical QWK for Validation Dataset: {qwk}")

Lexical QWK for Validation Dataset: 0.5029868678667244


In [9]:
all_train_labels = []
all_train_preds = []
lexical_model.eval()
for batch in train_dataloader5:
    batch = {k: v.to(device5) for k, v in batch.items()}
    labels = batch["labels"].to(device)
    with torch.no_grad():
        outputs = lexical_model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    all_train_labels.append(labels)
    all_train_preds.append(predictions)
all_train_labels = torch.cat(all_train_labels).cpu().numpy()
all_train_preds = torch.cat(all_train_preds).cpu().numpy()

# Compute QWK
qwk = cohen_kappa_score(all_train_labels, all_train_preds, weights="quadratic")
print(f"Lexical QWK for Training Dataset: {qwk}")

Lexical QWK for Training Dataset: 0.4798498176496018


## Testing Gramatical

In [10]:
grammatical_saved_model_name = "../saved_models/grammatical_epoch4_F10.5099.pt"

# Initialize the same model architecture
grammatical_model = AutoModelForSequenceClassification.from_pretrained("mrm8488/deberta-v3-ft-financial-news-sentiment-analysis",num_labels=6, ignore_mismatched_sizes=True)

# Load the saved state_dict into the model
if LOAD_MODEL:
    try:
        with open(lexical_saved_model_name, "rb") as f:
            grammatical_model.load_state_dict(torch.load(f))
            print(f"Model loaded from {grammatical_saved_model_name}")
    except Exception as e:
        print(e)
# Move model to the device (GPU if available)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
grammatical_model.to(device)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at mrm8488/deberta-v3-ft-financial-news-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  grammatical_model.load_state_dict(torch.load(f))


Model loaded from ../saved_models/grammatical_epoch4_F10.5099.pt


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-5): 6 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=T

In [11]:
all_eval_labels = []
all_eval_preds = []
grammatical_model.eval()
# len(eval_dataloader5) = 140
for batch in eval_dataloader5:
    batch = {k: v.to(device5) for k, v in batch.items()}
    labels = batch["labels"].to(device)
    with torch.no_grad():
        outputs = grammatical_model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    all_eval_labels.append(labels)
    all_eval_preds.append(predictions)
all_eval_labels = torch.cat(all_eval_labels).cpu().numpy()
all_eval_preds = torch.cat(all_eval_preds).cpu().numpy()

# Compute QWK
qwk = cohen_kappa_score(all_eval_labels, all_eval_preds, weights="quadratic")
print(f"Grammatical QWK for Validation Dataset: {qwk}")

Grammatical QWK for Validation Dataset: 0.5029868678667244


In [12]:
all_train_labels = []
all_train_preds = []
grammatical_model.eval()
for batch in train_dataloader5:
    batch = {k: v.to(device5) for k, v in batch.items()}
    labels = batch["labels"].to(device)
    with torch.no_grad():
        outputs = grammatical_model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    all_train_labels.append(labels)
    all_train_preds.append(predictions)
all_train_labels = torch.cat(all_train_labels).cpu().numpy()
all_train_preds = torch.cat(all_train_preds).cpu().numpy()

# Compute QWK
qwk = cohen_kappa_score(all_train_labels, all_train_preds, weights="quadratic")
print(f"Lexical QWK for Training Dataset: {qwk}")

Lexical QWK for Training Dataset: 0.4798498176496018


## Task Achievement

In [13]:
TA_saved_model_name = "../saved_models/task_achievement_trained.pt"

# Initialize the same model architecture
TA_model = AutoModelForSequenceClassification.from_pretrained("mrm8488/deberta-v3-ft-financial-news-sentiment-analysis",num_labels=6, ignore_mismatched_sizes=True)

# Load the saved state_dict into the model
if LOAD_MODEL:
    try:
        with open(lexical_saved_model_name, "rb") as f:
            TA_model.load_state_dict(torch.load(f))
            print(f"Model loaded from {TA_saved_model_name}")
    except Exception as e:
        print(e)
# Move model to the device (GPU if available)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
TA_model.to(device)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at mrm8488/deberta-v3-ft-financial-news-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  TA_model.load_state_dict(torch.load(f))


Model loaded from ../saved_models/task_achievement_trained.pt


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-5): 6 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=T

In [14]:
all_eval_labels = []
all_eval_preds = []
TA_model.eval()
# len(eval_dataloader5) = 140
for batch in eval_dataloader5:
    batch = {k: v.to(device5) for k, v in batch.items()}
    labels = batch["labels"].to(device)
    with torch.no_grad():
        outputs = TA_model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    all_eval_labels.append(labels)
    all_eval_preds.append(predictions)
all_eval_labels = torch.cat(all_eval_labels).cpu().numpy()
all_eval_preds = torch.cat(all_eval_preds).cpu().numpy()

# Compute QWK
qwk = cohen_kappa_score(all_eval_labels, all_eval_preds, weights="quadratic")
print(f"Task Achievement QWK for Validation Dataset: {qwk}")

Task Achievement QWK for Validation Dataset: 0.5029868678667244


In [15]:
all_train_labels = []
all_train_preds = []
TA_model.eval()
for batch in train_dataloader5:
    batch = {k: v.to(device5) for k, v in batch.items()}
    labels = batch["labels"].to(device)
    with torch.no_grad():
        outputs = TA_model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    all_train_labels.append(labels)
    all_train_preds.append(predictions)
all_train_labels = torch.cat(all_train_labels).cpu().numpy()
all_train_preds = torch.cat(all_train_preds).cpu().numpy()

# Compute QWK
qwk = cohen_kappa_score(all_train_labels, all_train_preds, weights="quadratic")
print(f"Task Achievement QWK for Training Dataset: {qwk}")

Task Achievement QWK for Training Dataset: 0.4798498176496018
