<a href="https://colab.research.google.com/github/iam-Dylan/automated-essay-scoring/blob/dylan/deberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /gdrive; later cells read the competition CSVs from
# /gdrive/MyDrive/... . force_remount=True is passed so the mount is redone
# even when the cell is re-run in the same session.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [2]:
!pip install datasets



In [3]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

import torch
from datasets import Dataset

import warnings
# NOTE(review): this silences *every* Python warning for the rest of the
# session, including deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [4]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    AdamW
)

In [5]:
# Tokenizer matching the checkpoint that is fine-tuned later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")


def tokenize(sample, max_length=1024):
    """Tokenize the essay text of one dataset row.

    Args:
        sample: Mapping with a ``'full_text'`` entry holding the essay text
            (this is what ``Dataset.map`` passes for each row).
        max_length: Maximum number of tokens to keep; longer essays are
            truncated. Defaults to 1024, the value used throughout this
            notebook.

    Returns:
        The tokenizer output for the essay (unpadded; padding is left to the
        data collator at batch time).
    """
    return tokenizer(sample['full_text'], max_length=max_length, truncation=True)

In [6]:
# Training essays live on the mounted Google Drive.
train_csv = '/gdrive/MyDrive/PTDLTM - Project /Notebook/learning-agency-lab-automated-essay-scoring-2/train.csv'
df_train = pd.read_csv(train_csv)

# Preview the first rows (essay_id, full_text, score).
df_train.head()

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


In [7]:
# Deterministic 5-way fold assignment: parse each essay_id as a hexadecimal
# number and take it modulo 5, so the split is stable across runs.
df_train['fold'] = df_train['essay_id'].map(lambda hex_id: int(hex_id, base=16) % 5)

# Class label fed to the model is score - 1 (0-based if the lowest score is 1;
# the prediction cell adds the 1 back).
df_train['labels'] = df_train['score'] - 1

In [8]:
# Fold held out for evaluation; every other fold is used for training.
FOLD = 0

# Row caps for quick experiments. Set either one to None to use all rows of
# that split (slicing with .iloc[:None] keeps everything).
N_TRAIN_ROWS = 3000
N_EVAL_ROWS = 200

# .copy() detaches each subset from df_train before conversion. Because the
# filtered frames keep their original (non-contiguous) index, from_pandas adds
# an '__index_level_0__' column, which the next cell removes.
ds_train = Dataset.from_pandas(df_train[df_train.fold != FOLD].iloc[:N_TRAIN_ROWS].copy())
ds_eval = Dataset.from_pandas(df_train[df_train.fold == FOLD].iloc[:N_EVAL_ROWS].copy())

In [9]:
# Columns that are only needed before tokenization; after mapping, the datasets
# keep just the tokenizer outputs plus 'labels'.
raw_columns = ['essay_id', 'full_text', 'score', 'fold', '__index_level_0__']

# NOTE: this cell rebinds ds_train / ds_eval, so it cannot be re-run without
# re-running the cell that builds them first.
ds_train = ds_train.map(tokenize)
ds_train = ds_train.remove_columns(raw_columns)

ds_eval = ds_eval.map(tokenize)
ds_eval = ds_eval.remove_columns(raw_columns)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [10]:
!pip install transformers[torch]



In [11]:
!pip install accelerate



In [12]:
def compute_metrics(p):
    """Compute quadratic weighted kappa (QWK) for an evaluation pass.

    Args:
        p: A pair that unpacks into ``(predictions, labels)``; the predicted
            class is taken as the argmax over the last axis of the
            predictions.

    Returns:
        A dict with the single key ``'qwk'`` holding the quadratic-weighted
        Cohen's kappa between the labels and the predicted classes.
    """
    logits, gold = p
    predicted = logits.argmax(-1)
    kappa = cohen_kappa_score(gold, predicted, weights='quadratic')
    return {'qwk': kappa}

# Training configuration for the fold-0 run.
# Effective batch size per device = 2 (batch) * 4 (accumulation) = 8.
# NOTE(review): output_dir is a Kaggle-style path although the notebook mounts
# Google Drive on Colab — confirm where checkpoints should be written.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; adjust if the installed version rejects the old name.
train_args = TrainingArguments(
    output_dir='/kaggle/working/deberta-small-fold0',
    fp16=True,                       # mixed precision
    learning_rate=2e-5,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    report_to="none",                # no external experiment tracker
    # Evaluate, checkpoint and log on the same 100-step cadence so that every
    # saved checkpoint has a QWK score attached to it.
    evaluation_strategy="steps",
    do_eval=True,
    eval_steps=100,
    save_total_limit=1,
    save_strategy="steps",
    save_steps=100,
    logging_steps=100,
    lr_scheduler_type='linear',
    # Model selection: without load_best_model_at_end the two settings below
    # never influence which weights are used for prediction (the last-step
    # weights would be used). Restoring the best-QWK checkpoint makes the
    # selection metric effective.
    load_best_model_at_end=True,
    metric_for_best_model="qwk",
    greater_is_better=True,
    warmup_ratio=0.1,
    weight_decay=0.01,
    save_safetensors=True
)

In [13]:
# Pretrained DeBERTa-v3-small with a 6-way classification head. The head is
# newly initialised (see the warning printed below), so it must be fine-tuned.
# Presumably 6 classes = essay scores 1..6 shifted to labels 0..5 — confirm
# against the data.
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-small",
    num_labels=6,
)

# Essays are tokenized without padding, so batches are padded at collate time.
padding_collator = DataCollatorWithPadding(tokenizer)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    data_collator=padding_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Fine-tune the model. With the visible training arguments this evaluates
# (loss + QWK) and logs every 100 optimizer steps; the returned TrainOutput is
# displayed as the cell result.
trainer.train()

Step,Training Loss,Validation Loss,Qwk
100,1.6054,1.315559,0.056859
200,1.2324,1.007828,0.682709
300,1.0445,0.984713,0.740933


TrainOutput(global_step=375, training_loss=1.2285215454101563, metrics={'train_runtime': 253.496, 'train_samples_per_second': 11.835, 'train_steps_per_second': 1.479, 'total_flos': 406780510564800.0, 'train_loss': 1.2285215454101563, 'epoch': 1.0})

In [15]:
# Load the test essays from the mounted Google Drive.
test_csv = '/gdrive/MyDrive/PTDLTM - Project /Notebook/learning-agency-lab-automated-essay-scoring-2/test.csv'
df_test = pd.read_csv(test_csv)

# Tokenize exactly like the training data, then drop the raw columns so only
# the tokenizer outputs are passed to the model.
ds = Dataset.from_pandas(df_test)
ds = ds.map(tokenize)
ds = ds.remove_columns(['essay_id', 'full_text'])

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [16]:
# Predict a class for every test essay (argmax over the last axis of the
# model outputs).
test_output = trainer.predict(ds)
predicted_labels = test_output.predictions.argmax(-1)

# Labels were built as score - 1, so add 1 to get back to the score scale.
df_test['score'] = predicted_labels + 1
df_test.head()

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4


In [17]:
# Write only the essay id and predicted score, without the DataFrame index.
submission = df_test[['essay_id', 'score']]
submission.to_csv('submission.csv', index=False)