In [1]:
!gdown 1dvWORIUL3zWi4Df4hNUeHYfzPM_YRRGs

Downloading...
From: https://drive.google.com/uc?id=1dvWORIUL3zWi4Df4hNUeHYfzPM_YRRGs
To: /home/yuuhanase/FPTU/EXE101/PaperClipAI_EnglishGrading/experiments/train.csv
100%|██████████████████████████████████████| 9.29M/9.29M [00:00<00:00, 9.34MB/s]


In [2]:
from transformers import BertModel
from torch import nn
class CustomBERTModel(nn.Module):
      def __init__(self):
            super(CustomBERTModel, self).__init__()
            self.bert = BertModel.from_pretrained("bert-base-uncased")
            ### New layers:
            self.linear1 = nn.Linear(768, 6)
            self.output = nn.Sigmoid()

      def forward(self, **inputs):
            bert_outputs = self.bert(**inputs)

            # sequence_output has the following shape: (batch_size, sequence_length, 768)
            linear1_output = self.linear1(bert_outputs.pooler_output) ## extract the 1st token's embeddings
            output = self.output(linear1_output)
            return output

[2024-01-06 20:43:57,588] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [3]:
from transformers import Trainer
from torch import nn
class CustomTrainer(Trainer):
    def __int__(self, *args, **kwargs):
        super().__int__(*args, **kwargs)
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs['target']
        # forward pass
        logits = model(
            input_ids=inputs['input_ids'].to('cuda'),
            attention_mask=inputs['attention_mask'].to('cuda'),
        )
        loss_fct = nn.MSELoss()
        loss = loss_fct(logits[-1], labels)
        return (loss, logits) if return_outputs else loss

In [4]:
from transformers import AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = CustomBERTModel() # You can pass the parameters if required to have more flexible model
model.to(torch.device("cuda")) ## can be gpu

CustomBERTModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [10]:
from unstructured.cleaners.core import clean_extra_whitespace

def clean_text(batch):
    text = batch['full_text']
    text = text.replace("\n", ' ')
    text = text.replace("\t", ' ')
    text = text.replace("\r", ' ')
    
    text = clean_extra_whitespace(text)
    batch['full_text'] = text
    return batch

def transform(batch):
    tokenized_input = tokenizer(batch['full_text'], return_tensors='pt', truncation=True)
    input_ids = tokenized_input['input_ids'][0]
    attention_mask = tokenized_input['attention_mask'][0]
    targets_feat = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    targets = []
    for feat in targets_feat:
        targets.append(torch.tensor(batch[feat])/5.0)
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'target': targets
    }

In [6]:
from datasets import load_dataset, Dataset
dataset = load_dataset("csv", data_files="train.csv", split='train')
dataset

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
    num_rows: 3911
})

In [11]:
clean_ds = dataset.map(clean_text)
transform_ds = clean_ds.map(transform)
train_ds = Dataset.from_dict({
    'input_ids': transform_ds['input_ids'],
    'attention_mask': transform_ds['attention_mask'],
    'target': transform_ds['target']
})
clean_ds, transform_ds, train_ds

Map:   0%|          | 0/3911 [00:00<?, ? examples/s]

(Dataset({
     features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
     num_rows: 3911
 }),
 Dataset({
     features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions', 'input_ids', 'attention_mask', 'target'],
     num_rows: 3911
 }),
 Dataset({
     features: ['input_ids', 'attention_mask', 'target'],
     num_rows: 3911
 }))

In [12]:
train_ds[0]

{'input_ids': [101,
  1045,
  2228,
  2008,
  2493,
  2052,
  5770,
  2013,
  4083,
  2012,
  2188,
  1010,
  2138,
  2027,
  2180,
  2102,
  2031,
  2000,
  2689,
  1998,
  2131,
  2039,
  2220,
  1999,
  1996,
  2851,
  2000,
  6457,
  1998,
  2079,
  2045,
  2606,
  1012,
  2635,
  2069,
  4280,
  7126,
  2068,
  2138,
  2012,
  2045,
  2160,
  2027,
  1005,
  2222,
  2022,
  3477,
  2062,
  3086,
  1012,
  2027,
  2097,
  2022,
  6625,
  2012,
  2188,
  1012,
  1996,
  18263,
  2112,
  1997,
  2082,
  2003,
  2893,
  3201,
  1012,
  2017,
  5256,
  2039,
  2175,
  8248,
  2115,
  4091,
  1998,
  2175,
  2000,
  2115,
  9346,
  1998,
  2298,
  2012,
  2115,
  8416,
  2015,
  1012,
  2044,
  2017,
  2228,
  2017,
  3856,
  1037,
  11018,
  1057,
  2175,
  2298,
  1999,
  1996,
  5259,
  1998,
  2017,
  3363,
  2593,
  2025,
  2066,
  2009,
  2030,
  2017,
  2298,
  1998,
  2156,
  1037,
  21101,
  1012,
  2059,
  2017,
  1005,
  2222,
  2031,
  2000,
  2689,
  1012,
  2007,
  1996,
 

In [13]:
from transformers import TrainingArguments, DataCollatorWithPadding
default_args = {
    "output_dir": "tmp",
    "evaluation_strategy": "no",
    "num_train_epochs": 5,
    "log_level": "error",
    "logging_steps": 1,
    "report_to": "none",
    "full_determinism": False
}
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    remove_unused_columns=False,
    **default_args)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = CustomTrainer(model=model, 
                        args=training_args,
                        train_dataset=train_ds,
                        data_collator=data_collator)
trainer.train()

  return F.mse_loss(input, target, reduction=self.reduction)


{'loss': 0.0188, 'learning_rate': 4.9989775051124746e-05, 'epoch': 0.0}
{'loss': 0.0229, 'learning_rate': 4.997955010224949e-05, 'epoch': 0.0}
{'loss': 0.0102, 'learning_rate': 4.996932515337424e-05, 'epoch': 0.0}
{'loss': 0.0098, 'learning_rate': 4.995910020449898e-05, 'epoch': 0.0}
{'loss': 0.0226, 'learning_rate': 4.9948875255623726e-05, 'epoch': 0.01}
{'loss': 0.0318, 'learning_rate': 4.993865030674847e-05, 'epoch': 0.01}
{'loss': 0.0175, 'learning_rate': 4.992842535787321e-05, 'epoch': 0.01}
{'loss': 0.0251, 'learning_rate': 4.9918200408997955e-05, 'epoch': 0.01}
{'loss': 0.0118, 'learning_rate': 4.9907975460122705e-05, 'epoch': 0.01}
{'loss': 0.0258, 'learning_rate': 4.989775051124745e-05, 'epoch': 0.01}
{'loss': 0.0149, 'learning_rate': 4.988752556237219e-05, 'epoch': 0.01}
{'loss': 0.0094, 'learning_rate': 4.9877300613496935e-05, 'epoch': 0.01}
{'loss': 0.0256, 'learning_rate': 4.986707566462168e-05, 'epoch': 0.01}
{'loss': 0.0203, 'learning_rate': 4.985685071574642e-05, 'epoch

KeyboardInterrupt: 