In [1]:
from utils import compute_accuracy_metric, DataCollatorForSequences, MultiInputTextDs
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from models import WordSentRegressorWithStats
from tokens import WANDB_TOKEN
import pandas as pd
import os
import spacy
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from model_utils import TrainerForEssayScoring
import wandb
from torch.utils.data import DataLoader
from transformers import DebertaTokenizerFast
from tqdm import tqdm
# Enable tqdm progress bars for pandas operations.
tqdm.pandas()
# Fix the PyTorch RNG seed so repeated runs are comparable.
torch.manual_seed(8)

  from tqdm.autonotebook import tqdm, trange
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


<torch._C.Generator at 0x7f2aa3daa9b0>

In [2]:
# Defining folders: input data location plus output locations for the
# trained model and its logs.
data_folder = "./learning-agency-lab-automated-essay-scoring-2/"
model_out = './word_sent_regressor'
log_out = './word_sent_regressor_logs'
# exist_ok=True creates each output directory only when it is missing, with
# no separate existence check (so no window between the check and the create).
for out_dir in (model_out, log_out):
    os.makedirs(out_dir, exist_ok=True)
# Show what the data folder contains.
os.listdir(data_folder)

['sample_submission.csv:Zone.Identifier',
 'sample_submission.csv',
 'test.csv:Zone.Identifier',
 'train.csv',
 'test.csv',
 'train.csv:Zone.Identifier']

In [3]:
# Read the training CSV from the data folder into a DataFrame
train_csv_path = os.path.join(data_folder, 'train.csv')
essays_data = pd.read_csv(train_csv_path)

Checking the distribution of essay lengths

In [4]:
# Character count of every essay, summarised with describe()
essay_char_lengths = essays_data['full_text'].str.len()
essay_char_lengths.describe()

count    17307.000000
mean      2071.617265
std        925.910701
min        712.000000
25%       1397.000000
50%       1924.000000
75%       2541.000000
max      20459.000000
Name: full_text, dtype: float64

In [5]:
# 70/15/15 split: hold out 30% of the essays, then cut that hold-out in half
# to obtain the validation and test sets. random_state is fixed for both cuts.
X_train, X_temp, y_train, y_temp = train_test_split(
    essays_data['full_text'],
    essays_data['score'],
    test_size=0.3,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

- Approach
    - We want to build both sentence-level and word-level features.
    - For word-level features, we use DistilBERT token embeddings; for sentence-level features, we use a sentence-transformer model.

In [6]:
# Word-level encoder: DistilBERT (base, uncased) together with its tokenizer
word_model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(word_model_name)
model = AutoModel.from_pretrained(word_model_name)



In [7]:
# Loading sentence embedding model: gte-small, loaded through sentence-transformers.
# It is handed to MultiInputTextDs when the datasets are built.
sentence_bert = SentenceTransformer("thenlper/gte-small")

  _torch_pytree._register_pytree_node(


In [8]:
# Load spaCy's large English pipeline; it is passed to MultiInputTextDs
# together with the word and sentence embedding models.
spacy_model = spacy.load('en_core_web_lg')

In [9]:
# import importlib
# import utils
# import models
# importlib.reload(models)
# from models import WordSentRegressorWithStats
# from utils import compute_accuracy_metric, DataCollatorForSequences, MultiInputTextDs

In [10]:
# Build the train / validation / test datasets from their respective splits
def _make_dataset(texts, scores):
    """Wrap one split in a MultiInputTextDs that shares the tokenizer and models."""
    return MultiInputTextDs(
        texts.values.tolist(),
        scores.values.tolist(),
        tokenizer,
        model,
        sentence_bert,
        spacy_model,
        True,
    )


train_ds = _make_dataset(X_train, y_train)
val_ds = _make_dataset(X_val, y_val)
test_ds = _make_dataset(X_test, y_test)

Extracting Sentences


100%|██████████| 12114/12114 [02:05<00:00, 96.30it/s] 


Extracting number_of_sentences


100%|██████████| 12114/12114 [00:00<00:00, 13935.29it/s]


Extracting number_of_unique_pos_tags


100%|██████████| 12114/12114 [00:00<00:00, 15786.59it/s]


Extracting number_of_unique_dep_tags


100%|██████████| 12114/12114 [00:00<00:00, 15550.67it/s]


Extracting number_of_unique_ents


100%|██████████| 12114/12114 [00:00<00:00, 59633.95it/s]


Extracting average_words_per_sentence


100%|██████████| 12114/12114 [00:00<00:00, 69552.06it/s]


Extracting percent_passive_sentences


100%|██████████| 12114/12114 [00:01<00:00, 9274.03it/s]


Extracting percent_simple_sentences


100%|██████████| 12114/12114 [00:04<00:00, 2718.69it/s]


Extracting percent_compound_sentences


100%|██████████| 12114/12114 [00:08<00:00, 1478.27it/s]


Extracting percent_complex_sentences


100%|██████████| 12114/12114 [00:04<00:00, 2820.20it/s]


Extracting Sentences


100%|██████████| 2596/2596 [00:31<00:00, 83.54it/s] 


Extracting number_of_sentences


100%|██████████| 2596/2596 [00:00<00:00, 61519.59it/s]


Extracting number_of_unique_pos_tags


100%|██████████| 2596/2596 [00:00<00:00, 15231.32it/s]


Extracting number_of_unique_dep_tags


100%|██████████| 2596/2596 [00:00<00:00, 15932.94it/s]


Extracting number_of_unique_ents


100%|██████████| 2596/2596 [00:00<00:00, 64890.78it/s]


Extracting average_words_per_sentence


100%|██████████| 2596/2596 [00:00<00:00, 73860.31it/s]


Extracting percent_passive_sentences


100%|██████████| 2596/2596 [00:00<00:00, 8331.45it/s]


Extracting percent_simple_sentences


100%|██████████| 2596/2596 [00:00<00:00, 2603.84it/s]


Extracting percent_compound_sentences


100%|██████████| 2596/2596 [00:01<00:00, 1472.93it/s]


Extracting percent_complex_sentences


100%|██████████| 2596/2596 [00:00<00:00, 2619.33it/s]


Extracting Sentences


100%|██████████| 2597/2597 [00:29<00:00, 87.46it/s] 


Extracting number_of_sentences


100%|██████████| 2597/2597 [00:00<00:00, 37102.31it/s]


Extracting number_of_unique_pos_tags


100%|██████████| 2597/2597 [00:00<00:00, 14618.89it/s]


Extracting number_of_unique_dep_tags


100%|██████████| 2597/2597 [00:00<00:00, 14662.26it/s]


Extracting number_of_unique_ents


100%|██████████| 2597/2597 [00:00<00:00, 58028.30it/s]


Extracting average_words_per_sentence


100%|██████████| 2597/2597 [00:00<00:00, 65621.28it/s]


Extracting percent_passive_sentences


100%|██████████| 2597/2597 [00:00<00:00, 9669.95it/s]


Extracting percent_simple_sentences


100%|██████████| 2597/2597 [00:01<00:00, 2495.87it/s]


Extracting percent_compound_sentences


100%|██████████| 2597/2597 [00:01<00:00, 1437.27it/s]


Extracting percent_complex_sentences


100%|██████████| 2597/2597 [00:01<00:00, 2497.38it/s]


In [11]:
# import pickle

# # Assuming train_dataset, test_dataset, and val_dataset are your dataset objects
# datasets = {
#     'train': train_ds,
#     'test': test_ds,
#     'val': val_ds
# }

# # Save each dataset to a separate .pkl file
# for name, dataset in datasets.items():
#     with open(f'{name}_dataset.pkl', 'wb') as f:
#         pickle.dump(dataset, f)

In [8]:
# import pickle

# # Assuming train_dataset, test_dataset, and val_dataset are your dataset objects
# datasets = {
#     'train': train_ds,
#     'test': test_ds,
#     'val': val_ds
# }

# # Save each dataset to a separate .pkl file
# for name, dataset in datasets.items():
#     with open(f'{name}_dataset.pkl', 'wb') as f:
#         pickle.dump(dataset, f)

NameError: name 'train_ds' is not defined

In [8]:
# import pickle

# # Load each dataset from the .pkl file
# loaded_datasets = {}
# for name in ['train', 'test', 'val']:
#     with open(f'{name}_dataset.pkl', 'rb') as f:
#         loaded_datasets[name] = pickle.load(f)

# # Access the loaded datasets
# train_ds = loaded_datasets['train']
# test_ds = loaded_datasets['test']
# val_ds = loaded_datasets['val']

  return torch.load(io.BytesIO(b))


In [15]:
# Sanity check: length of the stats feature vector of the first training sample.
# This must equal the num_stats_features value given to the regressor.
train_ds[0]['stats_features'].shape

torch.Size([9])

In [11]:
# Input sizes follow the dataset samples: 768-d word embeddings, 384-d
# sentence embeddings and a 9-element stats feature vector.
regressor_model = WordSentRegressorWithStats(
    word_embed_dim=768,
    sent_embed_dim=384,
    num_stats_features=9,
)

In [12]:
# Print the module tree to verify the layer sizes.
print(regressor_model)

WordSentRegressorWithStats(
  (word_lstm_1): LSTM(768, 128, batch_first=True)
  (word_lstm_2): LSTM(128, 64, batch_first=True)
  (sent_lstm_1): LSTM(384, 128, batch_first=True)
  (sent_lstm_2): LSTM(128, 64, batch_first=True)
  (stats_seq): Sequential(
    (0): Linear(in_features=9, out_features=8, bias=True)
    (1): ReLU()
    (2): Linear(in_features=8, out_features=2, bias=True)
    (3): ReLU()
  )
  (fc1): Linear(in_features=130, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
)


In [11]:
# regressor_model.load_state_dict(torch.load('./best_model.pth'))

  regressor_model.load_state_dict(torch.load('./best_model.pth'))


<All keys matched successfully>

In [16]:
# Reload model_utils so edits made to the module on disk take effect without
# restarting the kernel, then re-bind TrainerForEssayScoring from the
# reloaded module.
import importlib
import model_utils
importlib.reload(model_utils)

from model_utils import TrainerForEssayScoring

In [13]:
# Train and validation loaders built around the custom collator;
# only the training loader shuffles.
batch_size = 16
data_collator = DataCollatorForSequences(add_stats_feat=True)
train_dataloader = DataLoader(
    train_ds, batch_size=batch_size, shuffle=True, collate_fn=data_collator
)
val_dataloader = DataLoader(
    val_ds, batch_size=batch_size, shuffle=False, collate_fn=data_collator
)

In [14]:
# Pull a single batch from the training loader for inspection.
# next(iter(...)) fetches exactly one batch and raises StopIteration on an
# empty loader, instead of silently leaving `b` undefined.
b = next(iter(train_dataloader))

In [15]:
# Display the sampled batch (a dict of tensors keyed by feature name).
b

{'word_embeddings': tensor([[[-2.4498e-01,  1.5105e-01, -5.0275e-02,  ...,  1.3494e-02,
            6.1262e-01,  4.5011e-01],
          [-1.0392e+00, -2.3028e-01, -7.0552e-01,  ...,  1.0505e-01,
            4.5517e-01,  5.2601e-02],
          [-1.0891e+00, -6.2702e-01, -3.6885e-01,  ...,  3.3292e-01,
            5.6844e-01, -1.7009e-01],
          ...,
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00]],
 
         [[-5.9412e-02,  1.6419e-01,  1.4551e-01,  ..., -2.6199e-01,
            5.2251e-01,  3.7234e-01],
          [-2.6248e-01,  1.3539e-01,  4.1303e-01,  ..., -5.3538e-01,
            1.2222e-02,  2.6746e-01],
          [ 4.9204e-02,  4.3229e-01,  2.9315e-01,  ..., -2.7759e-01,
           -2.0797e-02, -2.3140e-01],


In [17]:
# Set up the trainer with the regressor and the train/validation loaders,
# with stats features enabled.
trainer = TrainerForEssayScoring(regressor_model, train_dataloader, val_dataloader, has_stats_features=True)

In [18]:
# Run training; the log prints "Epoch 1/1", so the argument is presumably
# the number of epochs (confirm in model_utils).
trainer.train(1)

Epoch 1/1, Batch 1/758, Loss: 7.9931
Epoch 1/1, Batch 101/758, Loss: 2.7026
Epoch 1/1, Batch 201/758, Loss: 0.9056


KeyboardInterrupt: 

In [13]:
# Print the embedding shapes of the first training sample only (the loop
# breaks after one item). In this run they are [178, 768] for words and
# [6, 384] for sentences; presumably the first axis is the number of
# tokens / sentences in the essay (confirm in MultiInputTextDs).
for row in train_ds:
    print(row['word_embeddings'].shape)
    print(row['sentence_embeddings'].shape)
    break

torch.Size([178, 768])
torch.Size([6, 384])


In [22]:
# Look at the sentence data stored for training sample 20.
train_ds.sentences[20]

['The United States has always had people complaining about the Electoral College, and that\'s exactly what I\'m about to do.\xa0 I think that the Electoral College was probably a good idea at first, but it\'s not very helpful for the elections.\xa0 The Electoral College, as you know, has people from every state, some states having more on it than others based on the population.\xa0 It\'s not right to have a few people have just as big of a say as hundreds of thousands of people.\n\nI read from "The Indefensible Electoral College: Why even the best-laid defenses of the system are wrong" and learned that in 2000, Al Gore won the popular vote from the people, but then lost the election because the Electoral College voted against him.\xa0 That means that the president was not picked based on who the people of this country wanted, but who only a small amount of people wanted.\xa0 How does that make any sense at all? There\'s no way to defend that.\xa0 If it\'s OUR country, WE should be the

In [13]:
from model_utils import Evaluator

<module 'model_utils' from '/home/giridhar/github/essay_scoring/model_utils.py'>

In [14]:
# Evaluate the regressor on the held-out test split; the evaluator prints
# the quadratic weighted kappa.
test_dataloader = DataLoader(
    test_ds, batch_size=16, shuffle=False, collate_fn=data_collator
)
evaluator = Evaluator(regressor_model, test_dataloader)
qwk = evaluator.evaluate()

100%|██████████| 163/163 [01:10<00:00,  2.30it/s]

Quadratic weighted kappa is : 0.7950





In [17]:
# Quick check of how rounding maps fractional values to integers.
# torch.round rounds halves to the nearest even integer, so 0.5 becomes 0, not 1.
torch.round(torch.tensor([0.5, 0.6, 0.7, 0.8, 0.9])).long().numpy()

array([0, 1, 1, 1, 1])

In [19]:
# Peek at the label of the first training sample (in this run, a scalar
# float tensor that already lives on the GPU).
train_ds[0]['label']

tensor(2., device='cuda:0')