In [None]:
# !pip install transformers
# !pip install datasets

In [1]:
import os, gc, sys, time, collections, random
import numpy as np
import pandas as pd

from typing import Dict, Optional, Union, Any, List, Tuple

from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn

import torch.utils.data as D
from torch.utils.data.dataset import Dataset, IterableDataset
from torch.utils.data.dataloader import DataLoader

from transformers import DataCollatorForLanguageModeling
from transformers import BertTokenizerFast
from transformers import BertModel
from transformers import Trainer
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM
from transformers.data.processors.utils import InputFeatures
from transformers import TrainingArguments
from transformers.trainer_utils import EvalLoopOutput
from transformers.trainer import logging
from transformers.file_utils import is_torch_tpu_available, is_sagemaker_mp_enabled
from transformers.trainer_pt_utils import find_batch_size, nested_concat, nested_numpify, nested_truncate, nested_detach
from transformers import EarlyStoppingCallback

from datasets import load_dataset

### Folders and Dataframes

In [2]:
# Root folders: DATA_PATH holds the input CSV files, MODELS_PATH receives trained models.
DATA_PATH = Path('/home/commonlit/data/')
# Fail fast when the data folder is missing; later cells read their inputs from it.
assert DATA_PATH.exists()

MODELS_PATH = Path('/home/commonlit/models/')
# Create the folder (and any missing parents) in one idempotent call instead of
# an exists()-then-mkdir sequence, which breaks when a parent directory is absent.
MODELS_PATH.mkdir(parents=True, exist_ok=True)
assert MODELS_PATH.exists()

In [3]:
!ls {DATA_PATH}

01_data_enhancements.ipynb     sample_submission.csv  train-orig.csv
02_synonymizer.ipynb	       test-enhanced.csv      train.csv
commonlit_lm		       test.csv		      train_duo.csv
commonlit_lm.zip	       thumbelina	      train_enhancements.csv
commonlitreadabilityprize.zip  tokenizer.vocab.txt
extra_data		       train-mix.csv


In [4]:
# Read the training, test and sample-submission CSV files from DATA_PATH.
train_df, test_df, sample_df = (
    pd.read_csv(DATA_PATH/csv_name)
    for csv_name in ('train-mix.csv', 'test.csv', 'sample_submission.csv')
)

In [5]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...,...,...
3109,659e6b1af,,,"Meanwhile they came to the palace, to the fath...",-0.095750,0.464406
3110,659e6b1af,,,"To which the shepherd replied, \n""If you wish ...",-0.095750,0.464406
3111,659e6b1af,,,"""Now you understand the language of animals, a...",-0.095750,0.464406
3112,659e6b1af,,,"""What if that shepherd only knew that undernea...",-0.095750,0.464406


In [6]:
train_df[train_df['id'] == '5127fb10f']['excerpt'].values

array(['The Battle of Waterloo was a battle that was fought mostly between French and British forces. Napoleon was crowned as Emperor of France in 1804. Then he launched many successful attacks on other countries in Europe. France soon had an empire that stretched from Spain to the Russian border. The only country that was still not captured was Great Britain. The Royal Navy had many ships, so invasion by France was not possible. However, Great Britain was not strong enough to stop Napoleon and his army from taking over most of mainland Europe.\nNapoleon seemed unstoppable until two separate campaigns caused his empire to fall apart. He gathered a huge army to invade and conquer Russia once and for all in 1812. However, he did not think that he would have very many difficulties and it turned out he did. His army was caught by the Russian winter and destroyed by the weather and lack of food.'],
      dtype=object)

In [7]:
test_df['excerpt'].values.shape

(7,)

### Configuration

In [46]:
import re

'deberta-large'

In [23]:
class CONFIG:
    """Settings read by the tokenization and training cells of this notebook."""

    # Checkpoint identifier; also reused to derive the output folder name.
    model_name = 'microsoft/deberta-large'
    # Name handed to the tokenizer and model loaders (identical to model_name).
    pretrained_transformers_model = model_name
    save_dir = 'trained/' + model_name

    # Per-device batch size for both training and evaluation.
    batch_size = 12
    max_len = 512
    epochs = 15
    # Masking probability given to the language-modeling data collator.
    mlm_probability = 0.15

    num_workers = 2
    # Number of processes used by Dataset.map during tokenization.
    preprocessing_num_workers = 2
    # When True, Dataset.map is told not to load results from its cache file.
    overwrite_cache = True
    # When False, the Trainer is built without a training dataset.
    do_train = True

In [24]:
cfg = CONFIG()

### Prepare Train / Validation Set

In [25]:
commonlit_lm_path = DATA_PATH/'commonlit_lm'

In [26]:
# Make sure the working folder exists. exist_ok=True keeps the cell safe to
# re-run and removes the separate existence check.
commonlit_lm_path.mkdir(exist_ok=True)

In [27]:
# Excerpt strings from the training table: the text the language model is fine-tuned on.
all_text = train_df['excerpt'].values
# Excerpts from the test table; these become the evaluation set passed to the Trainer.
valid_text = test_df['excerpt'].values

In [28]:
# def write_to_text_file(data, file):
#     with open(file, 'w') as f:
#         for t in data:
#             f.write(f'{t}\n')

In [29]:
# write_to_text_file(all_text, common_lit_text_file)
# write_to_text_file(valid_text, common_lit_valid_file)

In [30]:
# Wrap each excerpt array in a single-column mapping ('text' -> list of strings),
# the form consumed by datasets.Dataset.from_dict in the next cell.
train_text_dict = {'text': all_text.tolist()}
valid_text_dict = {'text': valid_text.tolist()}

In [31]:
import datasets

# Build in-memory datasets with a single 'text' column for training and validation.
dataset = datasets.Dataset.from_dict(train_text_dict)
valid_dataset = datasets.Dataset.from_dict(valid_text_dict)

In [32]:
dataset

Dataset({
    features: ['text'],
    num_rows: 3114
})

In [33]:
column_names = dataset.column_names

In [34]:
column_names[0]

'text'

In [35]:
tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained_transformers_model)

In [36]:
def tokenize_function(examples):
    """Tokenize one batch of rows for masked-language-model training.

    Args:
        examples: Batch handed over by ``Dataset.map(batched=True)``; the text
            is read from its first column (``column_names[0]``).

    Returns:
        The tokenizer output for the batch, including the special-tokens mask.
        Sequences longer than ``cfg.max_len`` tokens are truncated to that
        length so the configured maximum is actually enforced.
    """
    return tokenizer(
        examples[column_names[0]],
        truncation=True,
        max_length=cfg.max_len,
        return_special_tokens_mask=True,
    )

In [37]:
# Tokenize the training text in batches, spread over
# cfg.preprocessing_num_workers processes. The raw 'text' column is dropped so
# only tokenizer outputs remain. Cached results are reused only when
# cfg.overwrite_cache is False.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=cfg.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not cfg.overwrite_cache,
)





In [38]:
# Same tokenization as for the training text, applied to the validation text.
# NOTE: a later cell rebinds `valid_dataset` to the tokenized result, so
# re-running this cell after that point would map over data that no longer has
# a 'text' column.
tokenized_valid_datasets = valid_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=cfg.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not cfg.overwrite_cache,
)





In [39]:
# Sanity check: print the row index and token count of every tokenized example
# that exceeds the configured maximum length. The threshold comes from
# cfg.max_len so the check follows the configuration instead of a literal.
for i, inputs in enumerate(tokenized_datasets):
    input_length = len(inputs['input_ids'])
    if input_length > cfg.max_len:
        print(i, input_length)

In [40]:
tokenized_datasets

Dataset({
    features: ['attention_mask', 'input_ids', 'special_tokens_mask', 'token_type_ids'],
    num_rows: 3114
})

In [41]:
tokenized_valid_datasets

Dataset({
    features: ['attention_mask', 'input_ids', 'special_tokens_mask', 'token_type_ids'],
    num_rows: 7
})

In [42]:
# Names used by the Trainer cell below.
train_dataset = tokenized_datasets
# NOTE: this rebinds `valid_dataset`, which previously held the raw-text dataset.
valid_dataset = tokenized_valid_datasets

### Model

In [43]:
# Load the configured checkpoint as a masked-language-modeling model.
model = AutoModelForMaskedLM.from_pretrained(cfg.pretrained_transformers_model)

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaForMaskedLM: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'config', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForMaskedLM were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['cls.predictions.bias', '

In [47]:
# Collator that builds masked-language-modeling batches, using the masking
# probability from the configuration.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=cfg.mlm_probability)
# Echo the collator so its settings show in the cell output.
data_collator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizerFast(name_or_path='microsoft/deberta-large', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=True)}), mlm=True, mlm_probability=0.15, pad_to_multiple_of=None)

### Training

In [56]:
import os
# Switch off the Weights & Biases logging integration for this run.
# NOTE(review): the captured output below reports this variable as deprecated
# in favour of the `report_to` setting.
os.environ['WANDB_DISABLED'] = "true"

In [57]:
def create_training_args():
    """Build the TrainingArguments for masked-language-model fine-tuning.

    Batch size, number of epochs and the model name are read from the global
    ``cfg``. Checkpoints go to a ``<model>-lm`` folder below ``MODELS_PATH``.

    Returns:
        TrainingArguments: set up to log, evaluate and checkpoint once per
        epoch, and to reload the checkpoint with the lowest ``eval_loss`` when
        training ends.
    """
    # Drop any organisation prefix: "microsoft/deberta-large" -> "deberta-large".
    model_dir_name = re.sub(r'.+/', '', cfg.model_name)
    training_args = TrainingArguments(
        output_dir=str(MODELS_PATH/f"{model_dir_name}-lm"),
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=cfg.batch_size,
        per_device_eval_batch_size=cfg.batch_size,
        num_train_epochs=cfg.epochs,
        logging_strategy="epoch",
        logging_first_step=True,
        # Checkpoint on the same schedule as evaluation. Best-model selection
        # needs a checkpoint for every evaluated epoch, which a fixed
        # step interval (previously save_steps=40000) does not express.
        save_strategy="epoch",
        fp16=True,
        evaluation_strategy="epoch",
        save_total_limit=3,
        load_best_model_at_end=True,
        # Lower evaluation loss is better.
        metric_for_best_model='eval_loss',
        greater_is_better=False,
        gradient_accumulation_steps=1,
        learning_rate=5e-5
    )
    return training_args

In [58]:
training_args = create_training_args()

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [59]:
training_args.output_dir

'/home/commonlit/models/deberta-large-lm'

In [60]:
# Assemble the Trainer. The training set is only attached when cfg.do_train is
# set; the tokenized validation excerpts are used for evaluation.
# Early stopping is registered with a patience of 9 (presumably counted in
# evaluation rounds, i.e. epochs with the arguments used here; verify against
# the callback documentation).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if cfg.do_train else None,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=9)]
)

In [61]:
import shutil

# Start from a clean output folder. Deleting it from Python avoids passing an
# unquoted, interpolated path through the shell. A missing folder is ignored,
# matching `rm -rf`.
shutil.rmtree(training_args.output_dir, ignore_errors=True)

In [62]:
%%time

# Run the fine-tuning loop and time the whole cell.
trainer.train()
# save_model() is called without a path, so the Trainer picks the destination
# (presumably training_args.output_dir; confirm against the Trainer docs).
trainer.save_model()

Epoch,Training Loss,Validation Loss
1,4.1506,3.089156
2,2.7598,2.370032
3,2.3888,2.077657
4,2.1574,2.221605
5,1.9726,2.147031
6,1.8346,2.137856
7,1.7562,1.686588
8,1.6613,2.175741
9,1.5712,1.650662
10,1.5261,1.634042


CPU times: user 23min 6s, sys: 7min 53s, total: 31min
Wall time: 30min 57s


In [63]:
print('best_model_checkpoint', trainer.state.best_model_checkpoint)

best_model_checkpoint /home/commonlit/models/deberta-large-lm/checkpoint-3120


In [64]:
MODELS_PATH

PosixPath('/home/commonlit/models')

In [68]:
import shutil

# Zip the contents of the best checkpoint folder and keep the archive path.
# NOTE(review): the archive is written under MODELS_PATH/<cfg.model_name>/
# rather than inside the "<model>-lm" training output folder. The checkpoint
# listing further down also shows optimizer and scheduler state in that folder,
# which is presumably why the archive is several GB. Confirm both are intended.
model_zip_file = shutil.make_archive(f'{MODELS_PATH}/{cfg.model_name}/commonlit_lm', 'zip', trainer.state.best_model_checkpoint)

In [69]:
# Path of the zip archive: the same base name given to shutil.make_archive, plus ".zip".
export_file_name = f'{MODELS_PATH}/{cfg.model_name}/commonlit_lm.zip'

In [73]:
# Rename the best checkpoint folder to "best_lm" inside the same parent folder.
# NOTE: trainer.state.best_model_checkpoint still holds the old path afterwards,
# so cells that use it after this move point at a folder that no longer exists.
!mv {trainer.state.best_model_checkpoint} {Path(trainer.state.best_model_checkpoint).parent}/best_lm

In [79]:
!ls {MODELS_PATH}/{re.sub(r'.+/', '', cfg.model_name)}-lm/best_lm

config.json	   rng_state.pth	    tokenizer.json	   vocab.json
merges.txt	   scaler.pt		    tokenizer_config.json
optimizer.pt	   scheduler.pt		    trainer_state.json
pytorch_model.bin  special_tokens_map.json  training_args.bin


In [80]:
!echo {MODELS_PATH}/{re.sub(r'.+/', '', cfg.model_name)}-lm/best_lm

/home/commonlit/models/deberta-large-lm/best_lm


In [70]:
!du -h {export_file_name}

4.2G	/home/commonlit/models/microsoft/deberta-large/commonlit_lm.zip


In [70]:
!mkdir /home/commonlit/models/distilroberta_lm
!mv {trainer.state.best_model_checkpoint} /home/commonlit/models/distilroberta_lm/best_model

mkdir: cannot create directory ‘/home/commonlit/models/distilroberta_lm’: File exists
