In [1]:
from utils import compute_accuracy_metric, DataCollatorForSequences, MultiInputTextDs
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from models import WordSentRegressor
from tokens import WANDB_TOKEN
import pandas as pd
import os
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from model_utils import TrainerForEssayScoring
import wandb
from torch.utils.data import DataLoader
from tqdm import tqdm
# Enable tqdm's pandas integration (progress-bar variants of the pandas apply methods).
tqdm.pandas()
# Seed PyTorch's global random number generator.
# NOTE(review): only torch is seeded in this cell; numpy / random are not.
torch.manual_seed(8)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x7fa5d45bc8f0>

In [2]:
# Defining folders
# data_folder holds the competition CSV files that are read below.
data_folder = "./learning-agency-lab-automated-essay-scoring-2/"
# Output directories for this experiment (created here if missing).
model_out = './word_sent_regressor'
log_out = './word_sent_regressor_logs'

# exist_ok=True creates each directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test. Unlike the
# exists() check, it raises if the path exists but is not a directory.
for out_dir in (model_out, log_out):
    os.makedirs(out_dir, exist_ok=True)

# Last expression of the cell: display the contents of the data folder.
os.listdir(data_folder)

['sample_submission.csv:Zone.Identifier',
 'sample_submission.csv',
 'test.csv:Zone.Identifier',
 'train.csv',
 'test.csv',
 'train.csv:Zone.Identifier']

In [3]:
# Read the training CSV from the data folder into a DataFrame.
train_csv_path = os.path.join(data_folder, 'train.csv')
essays_data = pd.read_csv(train_csv_path)

Checking the distribution of essay lengths

In [4]:
# Character count of every essay, then summary statistics of those counts.
essay_char_counts = essays_data['full_text'].str.len()
essay_char_counts.describe()

count    17307.000000
mean      2071.617265
std        925.910701
min        712.000000
25%       1397.000000
50%       1924.000000
75%       2541.000000
max      20459.000000
Name: full_text, dtype: float64

In [5]:
# 70 / 15 / 15 split: hold out 30% of the essays first, then cut that
# hold-out in half to obtain the validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    essays_data['full_text'],
    essays_data['score'],
    test_size=0.3,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

- Approach
    - We want to build both sentence-level and word-level features.
    - For word-level features, we can use DistilBERT token embeddings; for sentence-level features, we can use a sentence-transformer model.

In [6]:
# Load the word-embedding model and its tokenizer from the same DistilBERT checkpoint.
word_model_checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(word_model_checkpoint)
model = AutoModel.from_pretrained(word_model_checkpoint)

In [7]:
# Loading sentence embedding model.
# The sentence embeddings it yields are checked against a width of 384 further down in this notebook.
sentence_bert = SentenceTransformer("thenlper/gte-small")

In [8]:
# Creating train, val & test datasets
def _build_split_ds(texts, scores):
    """Wrap one split's essays and scores in a MultiInputTextDs.

    Both arguments are converted with ``.values.tolist()`` and passed to the
    dataset together with the shared tokenizer, word model and sentence model.
    """
    return MultiInputTextDs(texts.values.tolist(), scores.values.tolist(), tokenizer, model, sentence_bert)


train_ds = _build_split_ds(X_train, y_train)
val_ds = _build_split_ds(X_val, y_val)
test_ds = _build_split_ds(X_test, y_test)

Extracting Sentences


  0%|          | 0/12114 [00:00<?, ?it/s]

In [9]:
# Sanity-check every training sample: the second dimension of the word and
# sentence embedding tensors must equal the widths the regressor is built with.
expected_word_dim = 768
expected_sent_dim = 384

for i in tqdm(range(len(train_ds))):
    try:
        # Fetch the sample inside the try block so that a failure while
        # building sample i is reported with its index by the except branch
        # instead of aborting the whole scan.
        train_sample = train_ds[i]
        if train_sample['word_embeddings'].shape[1] != expected_word_dim:
            print(f"Error in word embeddings at index : {i}")
        if train_sample['sentence_embeddings'].shape[1] != expected_sent_dim:
            print(f"Error in sentence embeddings at index : {i}")
    except Exception as e:
        # Deliberately best-effort: log the failing index and keep scanning.
        print(f"Error at index : {i}")
        print(e)

100%|██████████| 12114/12114 [05:59<00:00, 33.73it/s]


In [30]:
# Ad-hoc check: add a leading dimension of size 1 to sample 8096's sentence embeddings.
train_ds[8096]['sentence_embeddings'].unsqueeze(0)

torch.Size([1, 384])

In [8]:
import pickle

# Dataset objects to persist, keyed by the split name used in the output filename.
datasets = dict(train=train_ds, test=test_ds, val=val_ds)

# Write every split to its own "<split>_dataset.pkl" file in the working directory.
for split_name in datasets:
    with open(f'{split_name}_dataset.pkl', 'wb') as out_file:
        pickle.dump(datasets[split_name], out_file)

NameError: name 'train_ds' is not defined

In [8]:
import pickle

# Read each split back from its "<split>_dataset.pkl" file.
# NOTE: pickle.load executes code embedded in the file; only load files you created yourself.
loaded_datasets = {}
for split_name in ('train', 'test', 'val'):
    with open(f'{split_name}_dataset.pkl', 'rb') as in_file:
        loaded_datasets[split_name] = pickle.load(in_file)

# Expose the restored datasets under the names the rest of the notebook uses.
train_ds, test_ds, val_ds = (loaded_datasets[key] for key in ('train', 'test', 'val'))

  return torch.load(io.BytesIO(b))


In [9]:
# Build the regressor with the embedding widths used in this notebook:
# 768 for word embeddings and 384 for sentence embeddings.
regressor_model = WordSentRegressor(word_embed_dim=768, sent_embed_dim=384)

In [10]:
# Print the module tree to review the layer configuration.
print(regressor_model)

WordSentRegressor(
  (word_lstm_1): LSTM(768, 128, batch_first=True)
  (word_lstm_2): LSTM(128, 64, batch_first=True)
  (sent_lstm_1): LSTM(384, 128, batch_first=True)
  (sent_lstm_2): LSTM(128, 64, batch_first=True)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
)


In [11]:
# Restore previously saved weights from ./best_model.pth into the regressor.
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state_dict needs, so a tampered checkpoint file cannot execute
# arbitrary code while being loaded.
# The call stays the last expression so the key-matching result is displayed.
regressor_model.load_state_dict(torch.load('./best_model.pth', weights_only=True))

  regressor_model.load_state_dict(torch.load('./best_model.pth'))


<All keys matched successfully>

In [12]:
# Batch the train and validation datasets with the project's custom collator.
data_collator = DataCollatorForSequences()
loader_kwargs = dict(batch_size=16, collate_fn=data_collator)
# Only the training batches are shuffled; validation order stays fixed.
train_dataloader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(val_ds, shuffle=False, **loader_kwargs)

In [15]:
# Trainer wired to the regressor and the train / validation dataloaders.
trainer = TrainerForEssayScoring(regressor_model, train_dataloader, val_dataloader)

In [16]:
# Train for 20 epochs (the recorded log counts "Epoch n/20").
# The recorded run was stopped with a KeyboardInterrupt during epoch 3.
trainer.train(20)

Epoch 1/20, Batch 1/758, Loss: 10.6840
Epoch 1/20, Batch 101/758, Loss: 3.7652
Epoch 1/20, Batch 201/758, Loss: 0.3931
Epoch 1/20, Batch 301/758, Loss: 0.8645
Epoch 1/20, Batch 401/758, Loss: 0.3738
Epoch 1/20, Batch 501/758, Loss: 0.6025
Epoch 1/20, Batch 601/758, Loss: 1.1160
Epoch 1/20, Batch 701/758, Loss: 0.6162
Epoch 1/20, Training Loss: 1.6113
Epoch 1/20, Validation Loss: 0.4780
Epoch 2/20, Batch 1/758, Loss: 0.3101
Epoch 2/20, Batch 101/758, Loss: 0.4671
Epoch 2/20, Batch 201/758, Loss: 0.3209
Epoch 2/20, Batch 301/758, Loss: 0.4745
Epoch 2/20, Batch 401/758, Loss: 0.3150
Epoch 2/20, Batch 501/758, Loss: 0.2865
Epoch 2/20, Batch 601/758, Loss: 0.3575
Epoch 2/20, Batch 701/758, Loss: 0.2878
Epoch 2/20, Training Loss: 0.4375
Epoch 2/20, Validation Loss: 0.4088
Epoch 3/20, Batch 1/758, Loss: 0.4811
Epoch 3/20, Batch 101/758, Loss: 0.2135
Epoch 3/20, Batch 201/758, Loss: 0.2115
Epoch 3/20, Batch 301/758, Loss: 0.5741
Epoch 3/20, Batch 401/758, Loss: 0.4833
Epoch 3/20, Batch 501/758

KeyboardInterrupt: 

In [13]:
# Peek at the first training sample only and print the shape of both embedding tensors.
for first_sample in train_ds:
    for key in ('word_embeddings', 'sentence_embeddings'):
        print(first_sample[key].shape)
    break

torch.Size([178, 768])
torch.Size([6, 384])


In [22]:
# Inspect what the dataset stores under `sentences` for sample 20.
train_ds.sentences[20]

['The United States has always had people complaining about the Electoral College, and that\'s exactly what I\'m about to do.\xa0 I think that the Electoral College was probably a good idea at first, but it\'s not very helpful for the elections.\xa0 The Electoral College, as you know, has people from every state, some states having more on it than others based on the population.\xa0 It\'s not right to have a few people have just as big of a say as hundreds of thousands of people.\n\nI read from "The Indefensible Electoral College: Why even the best-laid defenses of the system are wrong" and learned that in 2000, Al Gore won the popular vote from the people, but then lost the election because the Electoral College voted against him.\xa0 That means that the president was not picked based on who the people of this country wanted, but who only a small amount of people wanted.\xa0 How does that make any sense at all? There\'s no way to defend that.\xa0 If it\'s OUR country, WE should be the

In [13]:
from model_utils import Evaluator

<module 'model_utils' from '/home/giridhar/github/essay_scoring/model_utils.py'>

In [14]:
# Evaluate the regressor on the held-out test split (fixed order, same batching as training).
test_dataloader = DataLoader(test_ds, batch_size=16, collate_fn=data_collator, shuffle=False)
evaluator = Evaluator(regressor_model, test_dataloader)
# The recorded run prints the quadratic weighted kappa; the return value is kept in `qwk`.
qwk = evaluator.evaluate()

100%|██████████| 163/163 [01:10<00:00,  2.30it/s]

Quadratic weighted kappa is : 0.7950





In [17]:
# Scratch check of torch.round followed by an integer cast.
# torch.round rounds halves to the nearest even integer, so 0.5 becomes 0 (not 1),
# as the recorded output array([0, 1, 1, 1, 1]) shows.
torch.round(torch.tensor([0.5, 0.6, 0.7, 0.8, 0.9])).long().numpy()

array([0, 1, 1, 1, 1])

In [19]:
# Inspect the label of the first training sample
# (the recorded output shows a float tensor on cuda:0).
train_ds[0]['label']

tensor(2., device='cuda:0')