In [1]:
!set CUDA_LAUNCH_BLOCKING=1

In [2]:
from utils import TextClassifDs, preprocess_text, compute_accuracy_metric
from tokens import WANDB_TOKEN
import pandas as pd
import os
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
import evaluate
import numpy as np
import wandb

torch.manual_seed(8)


<torch._C.Generator at 0x7d3e18332cf0>

In [3]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [4]:
# wandb.login(key=WANDB_TOKEN)

In [5]:
# wandb.init(
#     # set the wandb project where this run will be logged
#     project="automated-essay-scoring"
# )

### Loading data & quick view

In [6]:
data_folder = "./learning-agency-lab-automated-essay-scoring-2/"
os.listdir(data_folder)

['train.csv', 'sample_submission.csv', 'test.csv']

In [7]:

essays_data = pd.read_csv(os.path.join(data_folder,'train.csv'))

In [8]:
essays_data.head()

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


In [9]:
essays_data['full_text'].str.len().describe()

count    17307.000000
mean      2071.617265
std        925.910701
min        712.000000
25%       1397.000000
50%       1924.000000
75%       2541.000000
max      20459.000000
Name: full_text, dtype: float64

In [10]:
essays_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17307 entries, 0 to 17306
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   essay_id   17307 non-null  object
 1   full_text  17307 non-null  object
 2   score      17307 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 405.8+ KB


In [11]:
essays_data.score.value_counts()

score
3    6280
2    4723
4    3926
1    1252
5     970
6     156
Name: count, dtype: int64

In [None]:
# Since our labels start from 1 we need to set them to start from 0 to work with Cross entropy loss function
essays_data['score'] = essays_data['score'].apply(lambda x : x-1)

**We have around 6 classes & 17K samples to work with. Text length distribution is also bit skewed.**

In [12]:
X_train, X_val, y_train, y_val = train_test_split(essays_data['full_text'], essays_data['score'], test_size=0.3, random_state=42)

In [13]:
X_train = X_train.apply(preprocess_text)
X_val = X_val.apply(preprocess_text)

### Loading a pre-trained tokenizer

In [14]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased",
                                          padding=True, 
                                          truncation=True,
                                          )

In [15]:
train_ds = TextClassifDs(text=X_train.values.tolist(),
                         labels=y_train.values.tolist(),
                         tokenizer=tokenizer)
val_ds = TextClassifDs(text=X_val.values.tolist(),
                         labels=y_val.values.tolist(),
                         tokenizer=tokenizer)

In [16]:
X_train.apply(lambda x : type(x)).value_counts()

full_text
<class 'str'>    12114
Name: count, dtype: int64

**Loading a pretrianed model in text classification fashion**

In [17]:
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased", num_labels=6
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [19]:
training_args = TrainingArguments(
    output_dir="essay_scoring_bert_model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=True,
    # report_to="wandb"
)

In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_accuracy_metric,
    data_collator=padding_collator
)



In [21]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgiridharmunagala8[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112354955555443, max=1.0…

  0%|          | 0/7575 [00:00<?, ?it/s]

../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [5,0,0] Assertion `t >= 0 && t < n_classes` failed.


RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
tokenizer.model_max_length

512

In [None]:
tokenizer?

[0;31mSignature:[0m     
[0mtokenizer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtext[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtext_pair[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtext_target[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
