In [None]:
# !pip install transformers
# !pip install datasets

In [1]:
import os, gc, sys, time, collections, random
import numpy as np
import pandas as pd

from typing import Dict, Optional, Union, Any, List, Tuple

from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn

import torch.utils.data as D
from torch.utils.data.dataset import Dataset, IterableDataset
from torch.utils.data.dataloader import DataLoader

from transformers import DataCollatorForLanguageModeling
from transformers import BertTokenizerFast
from transformers import BertModel
from transformers import Trainer
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM
from transformers.data.processors.utils import InputFeatures
from transformers import TrainingArguments
from transformers.trainer_utils import EvalLoopOutput
from transformers.trainer import logging
from transformers.file_utils import is_torch_tpu_available, is_sagemaker_mp_enabled
from transformers.trainer_pt_utils import find_batch_size, nested_concat, nested_numpify, nested_truncate, nested_detach
from transformers import EarlyStoppingCallback

from datasets import load_dataset

### Folders and Dataframes

In [2]:
DATA_PATH = Path('/home/commonlit/data/')
assert DATA_PATH.exists()
MODELS_PATH = Path('/home/commonlit/models/')
if not MODELS_PATH.exists():
    os.mkdir(MODELS_PATH)
assert MODELS_PATH.exists()

In [3]:
!ls {DATA_PATH}

commonlit_lm		       test.csv        train_duo.csv
commonlitreadabilityprize.zip  train-orig.csv
sample_submission.csv	       train.csv


In [4]:
train_df = pd.read_csv(DATA_PATH/'train.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
sample_df = pd.read_csv(DATA_PATH/'sample_submission.csv')

In [5]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
2,c12129c31,,,"Patty concluded to move very slowly, thinking ...",-0.340259,0.464009
3,c12129c31,,,"Patty concluded to move very slowly, thinking ...",-0.340259,0.464009
4,c12129c31,,,"At last the game was concluded, as Roger Farri...",-0.340259,0.464009
...,...,...,...,...,...,...
2844,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2845,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2846,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2847,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


In [6]:
train_df[train_df['id'] == '5127fb10f']['excerpt'].values

array(['The Battle of Waterloo was a battle that was fought mostly between French and British forces. Napoleon was crowned as Emperor of France in 1804. Then he launched many successful attacks on other countries in Europe. France soon had an empire that stretched from Spain to the Russian border. The only country that was still not captured was Great Britain. The Royal Navy had many ships, so invasion by France was not possible. However, Great Britain was not strong enough to stop Napoleon and his army from taking over most of mainland Europe.\nNapoleon seemed unstoppable until two separate campaigns caused his empire to fall apart. He gathered a huge army to invade and conquer Russia once and for all in 1812. However, he did not think that he would have very many difficulties and it turned out he did. His army was caught by the Russian winter and destroyed by the weather and lack of food.',
       'Napoleon was crowned as Emperor of France in 1804, and then launched the successful Na

In [7]:
test_df['excerpt'].values.shape

(7,)

### Configuration

In [8]:
class CONFIG():
    model_name = 'distilroberta'
    batch_size = 48
    max_len = 256
    save_dir = f'trained/{model_name}'
    num_workers = 2
    epochs = 30
    pretrained_transformers_model = f'{model_name}-base'
    mlm_probability= 0.15
    preprocessing_num_workers = 2
    overwrite_cache = True
    do_train = True

In [9]:
cfg = CONFIG()

### Prepare Train / Validation Set

In [10]:
commonlit_lm_path = DATA_PATH/'commonlit_lm'

In [11]:
if not commonlit_lm_path.exists():
    commonlit_lm_path.mkdir()

In [12]:
all_text = train_df['excerpt'].values
valid_text = test_df['excerpt'].values

In [13]:
common_lit_text_file = commonlit_lm_path/'text.txt'
common_lit_valid_file = commonlit_lm_path/'valid.txt'

In [14]:
# def write_to_text_file(data, file):
#     with open(file, 'w') as f:
#         for t in data:
#             f.write(f'{t}\n')

In [15]:
# write_to_text_file(all_text, common_lit_text_file)
# write_to_text_file(valid_text, common_lit_valid_file)

In [16]:
train_text_dict = {'text': all_text.tolist()}
valid_text_dict = {'text': valid_text.tolist()}

In [17]:
import datasets

dataset = datasets.Dataset.from_dict(train_text_dict)
valid_dataset = datasets.Dataset.from_dict(valid_text_dict)

In [18]:
dataset

Dataset({
    features: ['text'],
    num_rows: 2849
})

In [19]:
column_names = dataset.column_names

In [20]:
column_names[0]

'text'

In [21]:
tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained_transformers_model)

In [22]:
def tokenize_function(examples):
    return tokenizer(examples[column_names[0]], return_special_tokens_mask=True)

In [23]:
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=cfg.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not cfg.overwrite_cache,
)





In [24]:
tokenized_valid_datasets = valid_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=cfg.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not cfg.overwrite_cache,
)





In [25]:
tokenized_datasets

Dataset({
    features: ['attention_mask', 'input_ids', 'special_tokens_mask'],
    num_rows: 2849
})

In [26]:
tokenized_valid_datasets

Dataset({
    features: ['attention_mask', 'input_ids', 'special_tokens_mask'],
    num_rows: 7
})

In [27]:
train_dataset = tokenized_datasets
valid_dataset = tokenized_valid_datasets

### Model

In [28]:
model = AutoModelForMaskedLM.from_pretrained(cfg.pretrained_transformers_model)

In [29]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=cfg.mlm_probability)
data_collator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizerFast(name_or_path='distilroberta-base', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}), mlm=True, mlm_probability=0.15, pad_to_multiple_of=None)

### Training

In [30]:
def create_training_args():
    training_args = TrainingArguments(
        output_dir=str(MODELS_PATH/f'{cfg.model_name}-lm'),
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=cfg.batch_size,
        per_device_eval_batch_size=cfg.batch_size,
        num_train_epochs=cfg.epochs,
        logging_strategy="epoch",
        logging_first_step=True,
        save_steps=40000,
        fp16=True,
        evaluation_strategy="epoch",
        save_total_limit = 3,
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        greater_is_better=False,
        gradient_accumulation_steps=1,
        learning_rate=5e-5
    )
    return training_args

In [31]:
training_args = create_training_args()

In [32]:
training_args.output_dir

'/home/commonlit/models/distilroberta-lm'

In [33]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if cfg.do_train else None,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=9)]
)

In [34]:
# import wandb

In [35]:
!rm -rf {training_args.output_dir}

In [36]:
%%time

trainer.train()
trainer.save_model()

[34m[1mwandb[0m: Currently logged in as: [33mgilf[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.32 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Epoch,Training Loss,Validation Loss
1,2.0532,1.649594
2,1.9659,1.641555
3,1.9451,1.850668
4,1.9077,1.557601
5,1.887,1.367707
6,1.8651,1.565059
7,1.856,1.686972
8,1.8032,1.469128
9,1.8198,1.55647
10,1.7942,1.565947


CPU times: user 11min 54s, sys: 2min 33s, total: 14min 28s
Wall time: 7min 55s


In [37]:
print('best_model_checkpoint', trainer.state.best_model_checkpoint)

best_model_checkpoint /home/commonlit/models/distilroberta-lm/checkpoint-300


In [39]:
MODELS_PATH

PosixPath('/home/commonlit/models')

In [40]:
import shutil

model_zip_file = shutil.make_archive(f'{MODELS_PATH}/{cfg.model_name}/commonlit_distil_roberta_lm', 'zip', trainer.state.best_model_checkpoint)

In [41]:
export_file_name = '/home/commonlit/models/distilroberta-lm/commonlit_distil_roberta_lm.zip'

In [42]:
!du -h {export_file_name}

du: cannot access '/home/commonlit/models/distilroberta-lm/commonlit_distil_roberta_lm.zip': No such file or directory


In [43]:
!mkdir /home/commonlit/models/distilroberta_lm
!mv {trainer.state.best_model_checkpoint} /home/commonlit/models/distilroberta_lm/best_model