In [1]:
# Uncomment to install the two third-party packages this notebook imports.
# !pip install transformers
# !pip install datasets

In [1]:
import os, gc, sys, time, collections, random
import numpy as np
import pandas as pd

from typing import Dict, Optional, Union, Any, List, Tuple

from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn

import torch.utils.data as D
from torch.utils.data.dataset import Dataset, IterableDataset
from torch.utils.data.dataloader import DataLoader

from transformers import DataCollatorForLanguageModeling
from transformers import BertTokenizerFast
from transformers import BertModel
from transformers import Trainer
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM
from transformers.data.processors.utils import InputFeatures
from transformers import TrainingArguments
from transformers.trainer_utils import EvalLoopOutput
from transformers.trainer import logging
from transformers.file_utils import is_torch_tpu_available, is_sagemaker_mp_enabled
from transformers.trainer_pt_utils import find_batch_size, nested_concat, nested_numpify, nested_truncate, nested_detach
from transformers import EarlyStoppingCallback

from datasets import load_dataset

### Folders and Dataframes

In [2]:
# Input data directory; fail fast if it is not where we expect it.
DATA_PATH = Path('/home/commonlit/data/')
assert DATA_PATH.exists()
# Directory that receives model checkpoints. `parents=True, exist_ok=True`
# makes this cell idempotent, removes the exists()/mkdir() race and also
# creates missing parent directories (which `os.mkdir` cannot do).
MODELS_PATH = Path('/home/commonlit/models/')
MODELS_PATH.mkdir(parents=True, exist_ok=True)
assert MODELS_PATH.exists()

In [3]:
# List the contents of the data directory.
!ls {DATA_PATH}

commonlit_lm		       test.csv        train_duo.csv
commonlitreadabilityprize.zip  train-orig.csv
sample_submission.csv	       train.csv


In [4]:
# Read the three CSV files from the data directory, in order: training set,
# test set and sample submission.
train_df, test_df, sample_df = (
    pd.read_csv(DATA_PATH / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [5]:
# Display the training frame (pandas elides the middle rows in the output).
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
2,c12129c31,,,"Patty concluded to move very slowly, thinking ...",-0.340259,0.464009
3,c12129c31,,,"Patty concluded to move very slowly, thinking ...",-0.340259,0.464009
4,c12129c31,,,"At last the game was concluded, as Roger Farri...",-0.340259,0.464009
...,...,...,...,...,...,...
2844,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2845,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2846,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2847,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


In [6]:
# Inspect the rows sharing one id; the output shows two different excerpts for it.
train_df[train_df['id'] == '8f576a796']

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
114,8f576a796,,,"Enda took the helmet, dress, and spear, and it...",-0.708095,0.456212
115,8f576a796,,,"Derin took the helmet, dress, and sword, and i...",-0.708095,0.456212


In [7]:
# Shape of the test-set excerpt array, i.e. how many test excerpts there are.
test_df['excerpt'].values.shape

(7,)

### Configuration

In [8]:
class CONFIG:
    """Settings shared by the cells of this notebook (read through ``cfg``)."""

    # --- checkpoint and export location ---------------------------------
    model_name = 'valhalla/distilbart-mnli-12-9'
    pretrained_transformers_model = f'{model_name}'
    save_dir = f'trained/{model_name}'

    # --- training schedule ----------------------------------------------
    do_train = True
    epochs = 30
    batch_size = 16

    # --- masking ----------------------------------------------------------
    mlm_probability = 0.15

    # --- dataset preprocessing ------------------------------------------
    preprocessing_num_workers = 2
    overwrite_cache = True

    # --- not referenced by the cells visible in this notebook ------------
    max_len = 256
    num_workers = 2

In [9]:
# Instance through which the following cells read the settings.
cfg = CONFIG()

### Prepare Train / Validation Set

In [10]:
# Directory that holds the plain-text files written from the excerpts.
commonlit_lm_path = DATA_PATH/'commonlit_lm'

In [11]:
# Create the text directory if it is missing. `exist_ok=True` keeps the cell
# re-runnable without a separate (race-prone) existence check.
commonlit_lm_path.mkdir(exist_ok=True)

In [12]:
# Training text is every excerpt in train_df; the validation text is taken
# from the excerpts of test_df.
all_text = train_df['excerpt'].values
valid_text = test_df['excerpt'].values

In [13]:
# Target files: one for the training text, one for the validation text.
common_lit_text_file = commonlit_lm_path/'text.txt'
common_lit_valid_file = commonlit_lm_path/'valid.txt'

In [14]:
def write_to_text_file(data, file, single_line=False):
    """Write every item of ``data`` to ``file``, each followed by a newline.

    The file is always written as UTF-8, so the result does not depend on the
    platform's locale encoding.

    Args:
        data: Iterable of items; each one is rendered with ``f'{item}'``.
        file: Path (``str`` or ``os.PathLike``) of the file to create or
            overwrite.
        single_line: When ``True``, every run of whitespace inside an item
            (including embedded line breaks) is collapsed to one space, so
            each item occupies exactly one line. Defaults to ``False``, which
            writes the items verbatim; an item containing line breaks then
            spans several lines of the file.
    """
    with open(file, 'w', encoding='utf-8') as f:
        for t in data:
            text = f'{t}'
            if single_line:
                # str.split() with no argument splits on any whitespace run.
                text = ' '.join(text.split())
            f.write(f'{text}\n')

In [15]:
# Write the training and validation excerpts to their respective text files.
write_to_text_file(all_text, common_lit_text_file)
write_to_text_file(valid_text, common_lit_valid_file)

In [16]:
# Load each text file with the 'text' dataset builder. Both results are later
# indexed with the "train" key to reach their rows.
dataset = load_dataset('text', data_files=[str(common_lit_text_file)])
valid_dataset = load_dataset('text', data_files=[str(common_lit_valid_file)])

Using custom data configuration default-18449568a8bcfbb2


Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/.cache/huggingface/datasets/text/default-18449568a8bcfbb2/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset text downloaded and prepared to /home/.cache/huggingface/datasets/text/default-18449568a8bcfbb2/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


Using custom data configuration default-1904499637fc0156


Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/.cache/huggingface/datasets/text/default-1904499637fc0156/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset text downloaded and prepared to /home/.cache/huggingface/datasets/text/default-1904499637fc0156/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


In [17]:
# Column names of the loaded training split; the first entry is used as the
# text column by the tokenization step.
column_names = dataset["train"].column_names
column_names[0]

'text'

In [18]:
# Tokenizer belonging to the checkpoint named in cfg.
tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained_transformers_model)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1389.0, style=ProgressStyle(description…




In [19]:
def tokenize_function(examples):
    """Tokenize the text column of ``examples``.

    Looks up the first entry of the module-level ``column_names`` in
    ``examples`` and passes it to the module-level ``tokenizer``, requesting
    the special-tokens mask in addition to the default outputs.
    """
    text_column = column_names[0]
    raw_texts = examples[text_column]
    return tokenizer(raw_texts, return_special_tokens_mask=True)

In [20]:
# Tokenize the training text in batches using cfg.preprocessing_num_workers
# processes, removing the raw text column from the result.
# `load_from_cache_file` is the negation of cfg.overwrite_cache, so with
# overwrite_cache=True the cache is not reused.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=cfg.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not cfg.overwrite_cache,
)





In [21]:
# Same tokenization as for the training text, applied to the validation text.
# `column_names` comes from the training dataset and is reused here.
tokenized_valid_datasets = valid_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=cfg.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not cfg.overwrite_cache,
)





In [22]:
# Summary (features and row count) of the tokenized training split.
tokenized_datasets['train']

Dataset({
    features: ['attention_mask', 'input_ids', 'special_tokens_mask'],
    num_rows: 7133
})

In [23]:
# Summary of the tokenized validation DatasetDict.
tokenized_valid_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'special_tokens_mask'],
        num_rows: 21
    })
})

In [24]:
# Take the "train" split out of each tokenized DatasetDict for the Trainer.
# NOTE(review): `valid_dataset` previously named the raw validation
# DatasetDict that the validation tokenization cell maps over; after this
# rebinding, re-running that cell would operate on the tokenized split
# instead. Consider a distinct name.
train_dataset = tokenized_datasets["train"]
valid_dataset = tokenized_valid_datasets["train"]

### Model

In [25]:
# Load the checkpoint through the masked-LM auto class. The load message in
# the output reports that this initializes BartForConditionalGeneration and
# that the checkpoint's classification-head weights are not used.
model = AutoModelForMaskedLM.from_pretrained(cfg.pretrained_transformers_model)

Some weights of the model checkpoint at valhalla/distilbart-mnli-12-9 were not used when initializing BartForConditionalGeneration: ['classification_head.dense.bias', 'classification_head.out_proj.weight', 'classification_head.dense.weight', 'classification_head.out_proj.bias']
- This IS expected if you are initializing BartForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
# Language-modeling collator configured with mlm=True and
# cfg.mlm_probability; the bare name on the last line displays its repr.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=cfg.mlm_probability)
data_collator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizerFast(name_or_path='valhalla/distilbart-mnli-12-9', vocab_size=50265, model_max_len=1024, is_fast=True, padding_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)}), mlm=True, mlm_probability=0.15, pad_to_multiple_of=None)

### Training

In [27]:
def create_training_args():
    """Build the ``TrainingArguments`` for the language-model training run.

    Reads ``cfg`` (model name, batch size, epoch count) and ``MODELS_PATH``
    from module scope. Checkpoints go to ``MODELS_PATH/<model_name>-lm``.

    Logging, evaluation and checkpointing all happen once per epoch.
    Checkpoints are saved on the same schedule as evaluation because
    ``load_best_model_at_end=True`` needs a checkpoint for every evaluated
    state; the schedule is stated explicitly with ``save_strategy="epoch"``
    rather than left to a step-based ``save_steps`` setting.

    Returns:
        transformers.TrainingArguments: the configured arguments object.
    """
    training_args = TrainingArguments(
        output_dir=str(MODELS_PATH/f'{cfg.model_name}-lm'),
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=cfg.batch_size,
        per_device_eval_batch_size=cfg.batch_size,
        num_train_epochs=cfg.epochs,
        logging_strategy="epoch",
        logging_first_step=True,
        # Must match `evaluation_strategy` so the best model can be reloaded.
        save_strategy="epoch",
        fp16=True,
        evaluation_strategy="epoch",
        save_total_limit=3,
        load_best_model_at_end=True,
        # Lower evaluation loss is better.
        metric_for_best_model='eval_loss',
        greater_is_better=False,
        gradient_accumulation_steps=1,
        learning_rate=5e-5
    )
    return training_args

In [28]:
# Build the arguments once; the Trainer and later cells reuse this object.
training_args = create_training_args()

In [29]:
# Show the directory that checkpoints will be written to.
training_args.output_dir

'/home/commonlit/models/valhalla/distilbart-mnli-12-9-lm'

In [30]:
# Wire model, arguments, datasets, tokenizer and collator together.
# - The training set is only attached when cfg.do_train is true.
# - EarlyStoppingCallback is configured with early_stopping_patience=9.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if cfg.do_train else None,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=9)]
)

In [31]:
# import wandb

In [32]:
# Remove any previous output directory before training starts.
# NOTE: `rm -rf` is irreversible; check training_args.output_dir before running.
!rm -rf {training_args.output_dir}

In [33]:
%%time

# Run training, then save the trainer's model. `%%time` reports the cost of
# the whole cell and must stay on the first line.
trainer.train()
trainer.save_model()

[34m[1mwandb[0m: Currently logged in as: [33mgilf[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.32 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Epoch,Training Loss,Validation Loss
1,4.4595,2.922998
2,3.1422,2.938402
3,2.8323,2.612397
4,2.6066,2.468287
5,2.459,2.594625
6,2.2854,2.035901
7,2.1496,2.037127
8,2.014,1.76729
9,1.9444,1.816922
10,1.8631,1.990374


CPU times: user 1h 59min 24s, sys: 35min 51s, total: 2h 35min 15s
Wall time: 1h 13min 40s


In [34]:
# Report which checkpoint the trainer recorded as the best one.
print('best_model_checkpoint', trainer.state.best_model_checkpoint)

best_model_checkpoint /home/commonlit/models/valhalla/distilbart-mnli-12-9-lm/checkpoint-3568


In [35]:
import shutil

# Zip the contents of the best checkpoint directory. The archive base name is
# `commonlit_lm_path`, so the file is created as `<commonlit_lm_path>.zip`;
# `make_archive` returns the archive's file name.
model_zip_file = shutil.make_archive(commonlit_lm_path, 'zip', trainer.state.best_model_checkpoint)

In [36]:
# Display the path that was used as the archive base name.
commonlit_lm_path

PosixPath('/home/commonlit/data/commonlit_lm')

In [41]:
# Final location of the zipped model. cfg.save_dir is a relative path, so it
# resolves against the notebook's current working directory.
export_file_name = f'{cfg.save_dir}/commonlit_distil_bart.zip'

In [42]:
!mv {model_zip_file} {export_file_name}
!du -h {export_file_name}

mv: cannot move '/home/commonlit/data/commonlit_lm.zip' to 'trained/valhalla/distilbart-mnli-12-9/commonlit_distil_bart.zip': No such file or directory
du: cannot access 'trained/valhalla/distilbart-mnli-12-9/commonlit_distil_bart.zip': No such file or directory


In [40]:
# Move the best checkpoint directory to a fixed location.
# NOTE(review): the destination is under `distilroberta-lm` although this
# notebook trains `valhalla/distilbart-mnli-12-9` — confirm this path is
# intended and that its parent directory exists (`mv` will not create it).
# NOTE(review): this cell's execution count (40) precedes the two cells above
# it; after the move, `trainer.state.best_model_checkpoint` no longer points
# at an existing directory, so the archive cell cannot be re-run afterwards.
!mv {trainer.state.best_model_checkpoint} /home/commonlit/models/distilroberta-lm/best_model