In [1]:
# !pip install transformers
# !pip install datasets

In [2]:
import os, gc, sys, time, collections, random
import shutil
from pathlib import Path
from typing import Dict, Optional, Union, Any, List, Tuple

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn

import torch.utils.data as D
from torch.utils.data.dataset import Dataset, IterableDataset
from torch.utils.data.dataloader import DataLoader

from transformers import DataCollatorForLanguageModeling
from transformers import BertTokenizerFast
from transformers import BertModel
from transformers import Trainer
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM
from transformers.data.processors.utils import InputFeatures
from transformers import TrainingArguments
from transformers.trainer_utils import EvalLoopOutput
from transformers.trainer import logging
from transformers.file_utils import is_torch_tpu_available, is_sagemaker_mp_enabled
from transformers.trainer_pt_utils import find_batch_size, nested_concat, nested_numpify, nested_truncate, nested_detach
from transformers import EarlyStoppingCallback

from datasets import load_dataset

### Folders and Dataframes

In [3]:
# Root folders: the competition data must already exist, the models folder is
# created on demand.
DATA_PATH = Path('/home/commonlit/data/')
assert DATA_PATH.exists(), f'data folder not found: {DATA_PATH}'
MODELS_PATH = Path('/home/commonlit/models/')
# parents=True also creates missing intermediate folders (os.mkdir cannot), and
# exist_ok=True removes the check-then-create race of exists() + mkdir().
MODELS_PATH.mkdir(parents=True, exist_ok=True)
assert MODELS_PATH.is_dir(), f'models folder could not be created: {MODELS_PATH}'

In [4]:
# List the contents of the data folder.
!ls {DATA_PATH}

commonlit_lm		       sample_submission.csv  train-orig.csv
commonlitreadabilityprize.zip  test.csv		      train.csv


In [5]:
# Read the three competition CSV files; the order of the file names matches the
# order of the variables on the left-hand side.
train_df, test_df, sample_df = (
    pd.read_csv(DATA_PATH / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [6]:
# Display the training dataframe.
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
2,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
3,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
4,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
...,...,...,...,...,...,...
2836,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2837,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2838,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2839,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


In [7]:
# Inspect every row that carries the id 8f576a796.
train_df.loc[train_df['id'].eq('8f576a796')]

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
110,8f576a796,,,"Enda took the helmet, dress, and spear, and it...",-0.708095,0.456212
111,8f576a796,,,"Derin took the helmet, dress, and sword, and i...",-0.708095,0.456212


In [8]:
# Shape of the test-set excerpt array.
test_df['excerpt'].to_numpy().shape

(7,)

### Configuration

In [9]:
class CONFIG:
    """Settings for the masked-language-model run, read as class attributes."""

    # Checkpoint identifier; the tokenizer and the model are both loaded from
    # `pretrained_transformers_model`, which is the same string.
    model_name = 'valhalla/distilbart-mnli-12-9'
    pretrained_transformers_model = model_name
    # Relative folder used when exporting the trained model archive.
    save_dir = 'trained/' + model_name

    # Batching and preprocessing.
    batch_size = 16
    max_len = 256
    num_workers = 2
    preprocessing_num_workers = 2
    # True forces Dataset.map to recompute instead of reusing its cache file.
    overwrite_cache = True

    # Training schedule and masking rate.
    epochs = 30
    mlm_probability = 0.15
    do_train = True

In [10]:
# Configuration instance; the cells below read settings as cfg.<name>.
cfg = CONFIG()

### Prepare Train / Validation Set

In [11]:
# Folder inside the data directory that holds the plain-text language-model files.
commonlit_lm_path = DATA_PATH/'commonlit_lm'

In [12]:
# Create the folder if it is missing. exist_ok=True makes the cell idempotent
# without the exists()-then-mkdir() race; parents=True covers a missing parent.
commonlit_lm_path.mkdir(parents=True, exist_ok=True)

In [13]:
# Excerpt arrays: the training excerpts become the language-model training text,
# the test excerpts become its validation text.
valid_text = test_df['excerpt'].to_numpy()
all_text = train_df['excerpt'].to_numpy()

In [14]:
# Target files for the training text and the validation text.
common_lit_text_file = commonlit_lm_path/'text.txt'
common_lit_valid_file = commonlit_lm_path/'valid.txt'

In [15]:
def write_to_text_file(data, file):
    """Write each item of ``data`` to ``file``, followed by a newline.

    Args:
        data: Iterable of items; each one is rendered through an f-string.
        file: Path of the file to create or overwrite.

    The file is always written as UTF-8, so the result does not depend on the
    platform's default locale encoding. Items are written verbatim: an item that
    itself contains newline characters occupies several lines of the file.
    """
    with open(file, 'w', encoding='utf-8') as f:
        f.writelines(f'{t}\n' for t in data)

In [16]:
# Dump the training and validation excerpts to their text files.
write_to_text_file(all_text, common_lit_text_file)
write_to_text_file(valid_text, common_lit_valid_file)

In [17]:
# Load both text files with the `text` builder. Each call yields a DatasetDict
# whose only split is named "train" (that key is looked up in later cells).
dataset = load_dataset('text', data_files=[str(common_lit_text_file)])
valid_dataset = load_dataset('text', data_files=[str(common_lit_valid_file)])

Using custom data configuration default-f000a5052191079a


Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/.cache/huggingface/datasets/text/default-f000a5052191079a/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset text downloaded and prepared to /home/.cache/huggingface/datasets/text/default-f000a5052191079a/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


Using custom data configuration default-95e6fbc2b094c547


Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/.cache/huggingface/datasets/text/default-95e6fbc2b094c547/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset text downloaded and prepared to /home/.cache/huggingface/datasets/text/default-95e6fbc2b094c547/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


In [18]:
# Column names of the raw text dataset; the first one is the column to tokenize.
column_names = dataset["train"].column_names
column_names[0]

'text'

In [19]:
# Tokenizer that belongs to the configured pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained_transformers_model)

In [20]:
def tokenize_function(examples):
    """Tokenize one batch of raw text lines.

    Args:
        examples: Batch mapping handed over by ``Dataset.map(batched=True)``;
            the texts are read from the column named ``column_names[0]``.

    Returns:
        The tokenizer output, including ``special_tokens_mask``.
    """
    return tokenizer(
        examples[column_names[0]],
        # Bound the sequence length with the configured limit so that an
        # unexpectedly long line cannot exceed what the model accepts.
        truncation=True,
        max_length=cfg.max_len,
        return_special_tokens_mask=True,
    )

In [21]:
# Tokenize the training text in batches across worker processes and drop the raw
# text column from the result.
map_options = dict(
    batched=True,
    num_proc=cfg.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not cfg.overwrite_cache,
)
tokenized_datasets = dataset.map(tokenize_function, **map_options)





In [22]:
# Same tokenization as for the training text, applied to the validation text.
tokenized_valid_datasets = valid_dataset.map(
    function=tokenize_function,
    remove_columns=column_names,
    batched=True,
    num_proc=cfg.preprocessing_num_workers,
    load_from_cache_file=not cfg.overwrite_cache,
)





In [23]:
# Display the tokenized training split (features and row count).
tokenized_datasets['train']

Dataset({
    features: ['attention_mask', 'input_ids', 'special_tokens_mask'],
    num_rows: 7114
})

In [24]:
# Display the tokenized validation DatasetDict.
tokenized_valid_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'special_tokens_mask'],
        num_rows: 21
    })
})

In [25]:
# Pick the single "train" split out of each DatasetDict.
# NOTE: this rebinds `valid_dataset`, which until here held the raw DatasetDict
# returned by load_dataset; re-running earlier cells after this one will see the
# tokenized split instead.
train_dataset = tokenized_datasets["train"]
valid_dataset = tokenized_valid_datasets["train"]

### Model

In [26]:
# Load the pretrained checkpoint through the masked-LM auto class. For this
# checkpoint it resolves to BartForConditionalGeneration (see the load message).
model = AutoModelForMaskedLM.from_pretrained(cfg.pretrained_transformers_model)

Some weights of the model checkpoint at valhalla/distilbart-mnli-12-9 were not used when initializing BartForConditionalGeneration: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.weight', 'classification_head.out_proj.bias']
- This IS expected if you are initializing BartForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
# Collator that builds masked-language-model batches using the configured
# masking probability; the bare name below displays its settings.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=cfg.mlm_probability)
data_collator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizerFast(name_or_path='valhalla/distilbart-mnli-12-9', vocab_size=50265, model_max_len=1024, is_fast=True, padding_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)}), mlm=True, mlm_probability=0.15, pad_to_multiple_of=None)

### Training

In [28]:
def create_training_args():
    """Build the ``TrainingArguments`` for the masked-language-model run.

    Logging, evaluation and checkpointing all happen once per epoch. Saving on
    the same schedule as evaluation is what allows ``load_best_model_at_end`` to
    restore the checkpoint with the lowest ``eval_loss``.

    Returns:
        TrainingArguments: Settings that write to ``MODELS_PATH/<model_name>-lm``.
    """
    training_args = TrainingArguments(
        output_dir=str(MODELS_PATH/f'{cfg.model_name}-lm'),
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=cfg.batch_size,
        per_device_eval_batch_size=cfg.batch_size,
        num_train_epochs=cfg.epochs,
        logging_strategy="epoch",
        logging_first_step=True,
        # The save schedule must match the evaluation schedule for
        # load_best_model_at_end; a step-based save every 40000 steps does not.
        save_strategy="epoch",
        # Mixed precision needs a CUDA device; fall back to full precision otherwise.
        fp16=torch.cuda.is_available(),
        evaluation_strategy="epoch",
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        greater_is_better=False,
        gradient_accumulation_steps=1,
        learning_rate=5e-5,
    )
    return training_args

In [29]:
# Training settings used by the Trainer below.
training_args = create_training_args()

In [30]:
# Display the folder that receives the checkpoints.
training_args.output_dir

'/home/commonlit/models/valhalla/distilbart-mnli-12-9-lm'

In [31]:
# Early stopping with a patience of 9 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=9)

# The training split is only attached when cfg.do_train is set.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=(train_dataset if cfg.do_train else None),
    eval_dataset=valid_dataset,
    callbacks=[early_stopping],
)

In [32]:
# import wandb

In [33]:
# Start from a clean output folder. shutil.rmtree avoids interpolating the path
# into a shell command, which breaks on spaces or shell metacharacters;
# ignore_errors=True keeps the `rm -rf` behaviour of not failing on a missing folder.
shutil.rmtree(training_args.output_dir, ignore_errors=True)

In [34]:
%%time

# Run the training loop, then save the resulting model; save_model() without an
# argument writes to the Trainer's configured output_dir.
trainer.train()
trainer.save_model()

[34m[1mwandb[0m: Currently logged in as: [33mgilf[0m (use `wandb login --relogin` to force relogin)


Epoch,Training Loss,Validation Loss
1,4.4484,2.610618
2,3.1301,2.866814
3,2.8197,2.50631
4,2.5838,2.300451
5,2.4383,2.341337
6,2.2616,2.145563
7,2.1036,2.093242
8,2.015,2.617436
9,1.9186,1.989464
10,1.8442,1.955994


CPU times: user 3h 29min 39s, sys: 1h 1min 46s, total: 4h 31min 25s
Wall time: 2h 8min 36s


In [35]:
# Report which checkpoint the Trainer recorded as the best one.
print(f'best_model_checkpoint {trainer.state.best_model_checkpoint}')

best_model_checkpoint /home/commonlit/models/valhalla/distilbart-mnli-12-9-lm/checkpoint-10235


In [43]:
# Zip the best checkpoint; the archive is written to "<commonlit_lm_path>.zip".
best_checkpoint = trainer.state.best_model_checkpoint
# best_model_checkpoint is None when no checkpoint was recorded, and the folder is
# gone once it has been moved elsewhere; fail here with a readable message
# instead of deep inside shutil.
if best_checkpoint is None or not Path(best_checkpoint).is_dir():
    raise FileNotFoundError(f'best checkpoint folder not found: {best_checkpoint}')

model_zip_file = shutil.make_archive(str(commonlit_lm_path), 'zip', best_checkpoint)

FileNotFoundError: [Errno 2] No such file or directory: '/home/commonlit/models/valhalla/distilbart-mnli-12-9-lm/checkpoint-10235'

In [44]:
# Display the path that was used as the archive base name.
commonlit_lm_path

PosixPath('/home/commonlit/data/commonlit_lm')

In [41]:
# Destination of the exported archive; cfg.save_dir is a relative path, so this
# resolves against the current working directory.
export_file_name = f'{cfg.save_dir}/commonlit_distil_bart.zip'

In [42]:
!mv {model_zip_file} {export_file_name}
!du -h {export_file_name}

mv: cannot move '/home/commonlit/data/commonlit_lm.zip' to 'trained/valhalla/distilbart-mnli-12-9/commonlit_distil_bart.zip': No such file or directory
du: cannot access 'trained/valhalla/distilbart-mnli-12-9/commonlit_distil_bart.zip': No such file or directory


In [40]:
# Park the best checkpoint under this run's own output folder so the location
# always matches the model configured in cfg.model_name.
best_model_dir = Path(training_args.output_dir) / 'best_model'
shutil.move(trainer.state.best_model_checkpoint, str(best_model_dir))