In [None]:
# !pip install transformers
# !pip install datasets

In [1]:
import os, gc, sys, time, collections, random
import numpy as np
import pandas as pd

from typing import Dict, Optional, Union, Any, List, Tuple

from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn

import torch.utils.data as D
from torch.utils.data.dataset import Dataset, IterableDataset
from torch.utils.data.dataloader import DataLoader

from transformers import DataCollatorForLanguageModeling
from transformers import BertTokenizerFast
from transformers import BertModel
from transformers import Trainer
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM
from transformers.data.processors.utils import InputFeatures
from transformers import TrainingArguments
from transformers.trainer_utils import EvalLoopOutput
from transformers.trainer import logging
from transformers.file_utils import is_torch_tpu_available, is_sagemaker_mp_enabled
from transformers.trainer_pt_utils import find_batch_size, nested_concat, nested_numpify, nested_truncate, nested_detach
from transformers import EarlyStoppingCallback

from datasets import load_dataset

### Folders and Dataframes

In [2]:
DATA_PATH = Path('/home/commonlit/data/')
assert DATA_PATH.exists()
MODELS_PATH = Path('/home/commonlit/models/')
if not MODELS_PATH.exists():
    os.mkdir(MODELS_PATH)
assert MODELS_PATH.exists()

In [3]:
!ls {DATA_PATH}

commonlit_lm		       test.csv        train_duo.csv
commonlitreadabilityprize.zip  train-orig.csv
sample_submission.csv	       train.csv


In [4]:
train_df = pd.read_csv(DATA_PATH/'train_duo.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
sample_df = pd.read_csv(DATA_PATH/'sample_submission.csv')

In [5]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,4626100d8,,,"The commutator is peculiar, consisting of only...",-3.676268,0.623621
1,493b80aa7,,,The Dunwich horror itself came between Lammas ...,-3.668360,0.571404
2,fe44cbd14,,,"The iron cylinder weighs 23 kilogrammes; but, ...",-3.642892,0.644398
3,284eaa5ad,,,As to surface-slope its measurement—from nearl...,-3.639936,0.603819
4,9e9eacb49,,,"The tree is dioecious, bearing male catkins on...",-3.636834,0.606822
...,...,...,...,...,...,...
5662,016913371,https://www.africanstorybook.org/,CC BY 4.0,Grandma's garden was wonderful. It was full of...,1.466629,0.599600
5663,7a1d484be,https://www.africanstorybook.org/,CC BY 4.0,More people came to the bus stop just before 9...,1.504669,0.606997
5664,8f35441e3,https://www.africanstorybook.org/#,CC BY 4.0,"Every day, Emeka's father took him to school i...",1.562759,0.624776
5665,849971671,https://www.africanstorybook.org/,CC BY 4.0,"For her last birthday, Sisanda had a special t...",1.590858,0.596349


In [6]:
train_df[train_df['excerpt'].notna()]

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,4626100d8,,,"The commutator is peculiar, consisting of only...",-3.676268,0.623621
1,493b80aa7,,,The Dunwich horror itself came between Lammas ...,-3.668360,0.571404
2,fe44cbd14,,,"The iron cylinder weighs 23 kilogrammes; but, ...",-3.642892,0.644398
3,284eaa5ad,,,As to surface-slope its measurement—from nearl...,-3.639936,0.603819
4,9e9eacb49,,,"The tree is dioecious, bearing male catkins on...",-3.636834,0.606822
...,...,...,...,...,...,...
5662,016913371,https://www.africanstorybook.org/,CC BY 4.0,Grandma's garden was wonderful. It was full of...,1.466629,0.599600
5663,7a1d484be,https://www.africanstorybook.org/,CC BY 4.0,More people came to the bus stop just before 9...,1.504669,0.606997
5664,8f35441e3,https://www.africanstorybook.org/#,CC BY 4.0,"Every day, Emeka's father took him to school i...",1.562759,0.624776
5665,849971671,https://www.africanstorybook.org/,CC BY 4.0,"For her last birthday, Sisanda had a special t...",1.590858,0.596349


In [7]:
train_df['target'].max()

1.711389827

In [8]:
train_df = train_df.drop(train_df[train_df['excerpt'].str.contains('White Hawk to pause')].index)

In [9]:
test_df['excerpt'].values.shape

(7,)

### Configuration

In [10]:
class CONFIG():
    model_name = 'distilroberta'
    batch_size = 14
    max_len = 512
    save_dir = f'trained/{model_name}'
    num_workers = 2
    epochs = 30
    pretrained_transformers_model = f'{model_name}-base'
    mlm_probability= 0.15
    preprocessing_num_workers = 2
    overwrite_cache = True
    do_train = True

In [11]:
cfg = CONFIG()

### Prepare Train / Validation Set

In [12]:
commonlit_lm_path = DATA_PATH/'commonlit_lm'

In [13]:
if not commonlit_lm_path.exists():
    commonlit_lm_path.mkdir()

In [14]:
all_text = train_df['excerpt'].values
valid_text = test_df['excerpt'].values

In [15]:
common_lit_text_file = commonlit_lm_path/'text.txt'
common_lit_valid_file = commonlit_lm_path/'valid.txt'

In [16]:
def write_to_text_file(data, file):
    with open(file, 'w') as f:
        for t in data:
            f.write(f'{t}\n')

In [17]:
write_to_text_file(all_text, common_lit_text_file)
write_to_text_file(valid_text, common_lit_valid_file)

In [18]:
dataset = load_dataset('text', data_files=[str(common_lit_text_file)])
valid_dataset = load_dataset('text', data_files=[str(common_lit_valid_file)])

Using custom data configuration default-21f0b1b8382867fd


Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/.cache/huggingface/datasets/text/default-21f0b1b8382867fd/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset text downloaded and prepared to /home/.cache/huggingface/datasets/text/default-21f0b1b8382867fd/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


Using custom data configuration default-2f60d4b47d0afb88


Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/.cache/huggingface/datasets/text/default-2f60d4b47d0afb88/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset text downloaded and prepared to /home/.cache/huggingface/datasets/text/default-2f60d4b47d0afb88/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


In [19]:
column_names = dataset["train"].column_names
column_names[0]

'text'

In [20]:
tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained_transformers_model)

In [21]:
def tokenize_function(examples):
    return tokenizer(examples[column_names[0]], return_special_tokens_mask=True)

In [22]:
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=cfg.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not cfg.overwrite_cache,
)

Token indices sequence length is longer than the specified maximum sequence length for this model (541 > 512). Running this sequence through the model will result in indexing errors






In [23]:
tokenized_valid_datasets = valid_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=cfg.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not cfg.overwrite_cache,
)





In [24]:
tokenized_datasets['train']

Dataset({
    features: ['attention_mask', 'input_ids', 'special_tokens_mask'],
    num_rows: 18441
})

In [25]:
tokenized_valid_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'special_tokens_mask'],
        num_rows: 21
    })
})

In [26]:
train_dataset = tokenized_datasets["train"]
valid_dataset = tokenized_valid_datasets["train"]

### Model

In [27]:
model = AutoModelForMaskedLM.from_pretrained(cfg.pretrained_transformers_model)

In [28]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=cfg.mlm_probability)
data_collator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizerFast(name_or_path='distilroberta-base', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}), mlm=True, mlm_probability=0.15, pad_to_multiple_of=None)

### Training

In [29]:
def create_training_args():
    training_args = TrainingArguments(
        output_dir=str(MODELS_PATH/f'{cfg.model_name}-lm'),
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=cfg.batch_size,
        per_device_eval_batch_size=cfg.batch_size,
        num_train_epochs=cfg.epochs,
        logging_strategy="epoch",
        logging_first_step=True,
        save_steps=40000,
#         fp16=True,
        evaluation_strategy="epoch",
        save_total_limit = 3,
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        greater_is_better=False,
        gradient_accumulation_steps=1,
        learning_rate=5e-5,
        no_cuda=False
    )
    return training_args

In [30]:
training_args = create_training_args()

In [31]:
training_args.output_dir

'/home/commonlit/models/distilroberta-lm'

In [32]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if cfg.do_train else None,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=9)]
)

In [33]:
# import wandb

In [34]:
!rm -rf {training_args.output_dir}

In [35]:
# train_dataloader = trainer.get_train_dataloader()

In [36]:
# inputs['input_ids'].device

In [37]:
# model(inputs['input_ids'][2:3].cuda(), inputs['attention_mask'][1:2].cuda())

In [38]:
# tokenizer.decode(inputs['input_ids'][1])

In [39]:
# for i, inputs in enumerate(train_dataloader):
#     inputs['input_ids'] = inputs['input_ids'].cuda()
#     inputs['attention_mask'] = inputs['attention_mask'].cuda()
#     inputs['labels'] = inputs['labels'].cuda()
#     model(**inputs)
#     if i % 100 == 0:
#         print(i)

In [40]:
%%time

trainer.train()
trainer.save_model()

[34m[1mwandb[0m: Currently logged in as: [33mgilf[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.32 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Epoch,Training Loss,Validation Loss


RuntimeError: CUDA error: device-side assert triggered

In [None]:
print('best_model_checkpoint', trainer.state.best_model_checkpoint)

In [None]:
import shutil

model_zip_file = shutil.make_archive(commonlit_lm_path, 'zip', trainer.state.best_model_checkpoint)

In [None]:
export_file_name = '/home/commonlit/models/distilroberta-lm/commonlit_distil_roberta_lm.zip'

In [None]:
!mv {model_zip_file} {export_file_name}
!du -h {export_file_name}

In [None]:
!mv {trainer.state.best_model_checkpoint} /home/commonlit/models/distilroberta-lm/best_model