In [None]:
# !pip install transformers
# !pip install datasets

In [1]:
import os, gc, sys, time, collections, random
import numpy as np
import pandas as pd

from typing import Dict, Optional, Union, Any, List, Tuple

from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn

import torch.utils.data as D
from torch.utils.data.dataset import Dataset, IterableDataset
from torch.utils.data.dataloader import DataLoader

from transformers import DataCollatorForLanguageModeling
from transformers import BertTokenizerFast
from transformers import BertModel
from transformers import Trainer
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM
from transformers.data.processors.utils import InputFeatures
from transformers import TrainingArguments
from transformers.trainer_utils import EvalLoopOutput
from transformers.trainer import logging
from transformers.file_utils import is_torch_tpu_available, is_sagemaker_mp_enabled
from transformers.trainer_pt_utils import find_batch_size, nested_concat, nested_numpify, nested_truncate, nested_detach
from transformers import EarlyStoppingCallback

from datasets import load_dataset

### Folders and Dataframes

In [2]:
# Root folder holding the competition data (must already exist).
DATA_PATH = Path('/home/commonlit/data/')
assert DATA_PATH.exists()
# Root folder for trained models; created on demand.
MODELS_PATH = Path('/home/commonlit/models/')
# parents/exist_ok keep the cell idempotent and avoid the check-then-create race
# (and the failure when the parent folder is missing) of a guarded os.mkdir.
MODELS_PATH.mkdir(parents=True, exist_ok=True)
assert MODELS_PATH.exists()

In [3]:
!ls {DATA_PATH}

about_hiv.txt		       stories_for_little_boys.txt
aunt_may_shirl.txt	       test-enhanced.csv
commonlit_lm		       test.csv
commonlit_lm.zip	       the_beekeeper.txt
commonlitreadabilityprize.zip  the_huge_hunter.txt
data_enhancements.ipynb        the_twin_stars.txt
jason_golden_fleece.txt        thumbelina
little_bear_story.txt	       train-orig.csv
mapping_the_oceans.txt	       train.csv
pecks_uncle_ike.txt	       train_duo.csv
rebel_of_the_school.txt        train_enhancements.csv
sample_submission.csv	       understood_betsy.txt
sports_in_adolescence.txt      why_the_swallows_tail_is_forked.txt


In [4]:
# Read the three CSV tables from DATA_PATH in one pass:
# training excerpts, the "enhanced" test excerpts and the sample submission.
train_df, test_df, sample_df = (
    pd.read_csv(DATA_PATH/csv_name)
    for csv_name in ('train.csv', 'test-enhanced.csv', 'sample_submission.csv')
)

In [5]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...,...,...
2922,09ee04799,,,"I thought I was to be shot, and tried to get a...",-0.672698,0.458735
2923,4be5d5b1b,,,Imagine that there was something about you tha...,0.472264,0.515370
2924,4be5d5b1b,,,If HIV is transmitted from mother to child (du...,0.472264,0.515370
2925,4be5d5b1b,,,All people with HIV have to make decisions abo...,0.472264,0.515370


In [6]:
train_df[train_df['id'] == '8f576a796']

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
107,8f576a796,,,"Enda took the helmet, dress, and spear, and it...",-0.708095,0.456212


In [7]:
test_df['excerpt'].values.shape

(13,)

In [8]:
test_df

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...
2,0df072751,,,It was a bright and cheerful scene that greete...
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...
5,12537fe78,,,"To explain transitivity, let us look first at ..."
6,965e592c0,https://www.africanstorybook.org/#,CC BY 4.0,Milka and John are playing in the garden. Her ...
7,f0953f0a5,,,""" WHAT are you hunting for on your hands and k..."
8,f0953f0a5,,,"Dotty was frowning at Prudy behind a chair. ""Y..."
9,f0953f0a5,,,Still the little girl did not understand. Her ...


### Configuration

In [9]:
class CONFIG:
    """Settings for the masked-language-model fine-tuning run.

    All values are plain class attributes; the notebook instantiates the
    class once and reads the attributes from that instance.
    """

    # Checkpoint identifier; the tokenizer and model are loaded from
    # `pretrained_transformers_model`, which is the same string.
    model_name = 'roberta-base'
    pretrained_transformers_model = model_name

    # Not referenced by the visible cells of this notebook.
    save_dir = 'trained/' + model_name
    max_len = 256
    num_workers = 2

    # Per-device batch size for both training and evaluation.
    batch_size = 32
    # Number of training epochs passed to the training arguments.
    epochs = 30
    # Masking probability handed to the language-modelling data collator.
    mlm_probability = 0.15

    # Number of processes used by the dataset `map` calls.
    preprocessing_num_workers = 2
    # When True the `map` calls are told not to load from the cache file.
    overwrite_cache = True
    # When False the trainer is built without a training dataset.
    do_train = True

In [10]:
# Single configuration instance read by the rest of the notebook.
cfg = CONFIG()

### Prepare Train / Validation Set

In [11]:
# Working folder for the language-model text files; the same path is later
# reused as the base name of the zip archive.
commonlit_lm_path = DATA_PATH/'commonlit_lm'

In [12]:
# Create the working folder if needed; exist_ok makes the cell safe to re-run
# without a separate (racy) existence check.
commonlit_lm_path.mkdir(exist_ok=True)

In [13]:
# Training corpus for the language model: the `excerpt` column of train.csv.
all_text = train_df['excerpt'].values
# Validation corpus: the `excerpt` column of test-enhanced.csv (loaded as test_df).
valid_text = test_df['excerpt'].values

In [14]:
# Plain-text files that will hold the training and validation corpora.
common_lit_text_file = commonlit_lm_path/'text.txt'
common_lit_valid_file = commonlit_lm_path/'valid.txt'

In [15]:
def write_to_text_file(data, file):
    """Write every item of `data` to `file`, each followed by a newline.

    Args:
        data: iterable of items; each one is rendered with an f-string, so
            non-string items are written using their string form.
        file: path of the output file; it is created or truncated.

    Note:
        Items that themselves contain newline characters span several lines
        of the resulting file.
    """
    # Explicit UTF-8 so the result does not depend on the platform's locale
    # encoding (non-ASCII text would otherwise fail or be mis-encoded there).
    with open(file, 'w', encoding='utf-8') as f:
        f.writelines(f'{t}\n' for t in data)

In [16]:
# Dump both corpora to disk; the files are loaded with the `text` dataset
# builder in the next step.
write_to_text_file(all_text, common_lit_text_file)
write_to_text_file(valid_text, common_lit_valid_file)

In [17]:
# Load each text file with the `text` builder. Both results are DatasetDicts
# whose only split is named "train" (also for the validation file).
# NOTE(review): the recorded outputs show more rows than there are excerpts
# (7403 train rows), so the builder presumably yields one example per line of
# the file rather than one per excerpt — confirm this is intended.
# NOTE(review): `valid_dataset` is rebound to a single split in a later cell,
# so re-running cells out of order can fail.
dataset = load_dataset('text', data_files=[str(common_lit_text_file)])
valid_dataset = load_dataset('text', data_files=[str(common_lit_valid_file)])

Using custom data configuration default-4d8aa0b9b9c5f386


Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/.cache/huggingface/datasets/text/default-4d8aa0b9b9c5f386/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset text downloaded and prepared to /home/.cache/huggingface/datasets/text/default-4d8aa0b9b9c5f386/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


Using custom data configuration default-b6db9da9cad725ca


Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/.cache/huggingface/datasets/text/default-b6db9da9cad725ca/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset text downloaded and prepared to /home/.cache/huggingface/datasets/text/default-b6db9da9cad725ca/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


In [18]:
# Column names of the raw training split; the first one ('text') is the
# column that gets tokenized and then removed.
column_names = dataset["train"].column_names
column_names[0]

'text'

In [19]:
# Tokenizer belonging to the pretrained checkpoint named in the config.
tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained_transformers_model)

In [20]:
def tokenize_function(examples):
    """Tokenize a batch of raw text rows for masked-language-model training.

    Args:
        examples: batch mapping handed over by a batched dataset `map`; the
            text is read from the column named `column_names[0]`.

    Returns:
        The tokenizer output for the batch, including `special_tokens_mask`.

    NOTE(review): `cfg.max_len` is defined but not applied here; pass it as
    `max_length` if a shorter cap than the model maximum is wanted.
    """
    return tokenizer(
        examples[column_names[0]],
        # Without an explicit max_length this caps sequences at the
        # tokenizer's model maximum, so an over-long line cannot exceed the
        # number of positions the model supports.
        truncation=True,
        return_special_tokens_mask=True,
    )

In [21]:
# Tokenize the training text in batches using several processes.
# The raw text column is removed, and the cache is bypassed whenever
# cfg.overwrite_cache is True.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=cfg.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not cfg.overwrite_cache,
)





In [22]:
# Same tokenization for the validation text. `column_names` was taken from the
# training DatasetDict; this relies on the validation set having the same
# column name.
tokenized_valid_datasets = valid_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=cfg.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not cfg.overwrite_cache,
)





In [23]:
tokenized_datasets['train']

Dataset({
    features: ['attention_mask', 'input_ids', 'special_tokens_mask'],
    num_rows: 7403
})

In [24]:
tokenized_valid_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'special_tokens_mask'],
        num_rows: 52
    })
})

In [25]:
# Pick the single "train" split out of each tokenized DatasetDict.
train_dataset = tokenized_datasets["train"]
# NOTE(review): this rebinds `valid_dataset`, which previously held the raw
# validation DatasetDict; re-running the validation tokenization cell after
# this one operates on a different object.
valid_dataset = tokenized_valid_datasets["train"]

### Model

In [26]:
# Pretrained checkpoint loaded with a masked-language-modelling head.
model = AutoModelForMaskedLM.from_pretrained(cfg.pretrained_transformers_model)

In [27]:
# Collator configured for masked-language modelling (mlm=True), with the
# masking probability taken from the config.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=cfg.mlm_probability)
data_collator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}), mlm=True, mlm_probability=0.15, pad_to_multiple_of=None)

### Training

In [28]:
# `os` is already imported in the first cell; the import here is redundant.
import os
# NOTE(review): the recorded output of the training-arguments cell warns that
# this environment variable is deprecated in favour of the `report_to` option.
os.environ['WANDB_DISABLED'] = "true"

In [29]:
def create_training_args():
    """Build the `TrainingArguments` for the masked-LM fine-tuning run.

    Checkpoints go to ``MODELS_PATH/<cfg.model_name>-lm``. Logging, evaluation
    and checkpointing all happen once per epoch, and the checkpoint with the
    lowest ``eval_loss`` is reloaded at the end of training.

    Returns:
        TrainingArguments built from the module-level ``cfg`` and
        ``MODELS_PATH``.
    """
    return TrainingArguments(
        output_dir=str(MODELS_PATH/f'{cfg.model_name}-lm'),
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=cfg.batch_size,
        per_device_eval_batch_size=cfg.batch_size,
        num_train_epochs=cfg.epochs,
        learning_rate=5e-5,
        gradient_accumulation_steps=1,
        fp16=True,
        logging_strategy="epoch",
        logging_first_step=True,
        evaluation_strategy="epoch",
        # load_best_model_at_end needs a checkpoint for every evaluation, so
        # the save strategy must match the evaluation strategy. The previous
        # step-based setting (save_steps=40000) did not match it, and newer
        # transformers releases reject that combination.
        save_strategy="epoch",
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        greater_is_better=False,
    )

In [30]:
# Training arguments shared by the trainer and the shell cells below.
training_args = create_training_args()

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [31]:
training_args.output_dir

'/home/commonlit/models/roberta-base-lm'

In [32]:
# Assemble the trainer. The training set is only attached when cfg.do_train is
# set; training stops early after 9 evaluations without improvement.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=(train_dataset if cfg.do_train else None),
    eval_dataset=valid_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=9)],
)

In [33]:
# import wandb

In [34]:
# Delete the whole output folder (checkpoints of earlier runs) before training.
!rm -rf {training_args.output_dir}

In [35]:
%%time

# Run the fine-tuning loop.
trainer.train()
# Persist the model currently held by the trainer.
# NOTE(review): with load_best_model_at_end=True this is presumably the best
# checkpoint's weights — confirm.
trainer.save_model()

Epoch,Training Loss,Validation Loss
1,1.7149,1.980046
2,1.6706,1.871126
3,1.6251,1.968279
4,1.5918,1.681834
5,1.5419,1.831322
6,1.5145,1.495426
7,1.4831,1.727448
8,1.4667,1.587577
9,1.43,1.863876
10,1.4009,1.67069


CPU times: user 13min 56s, sys: 3min 51s, total: 17min 47s
Wall time: 17min 45s


In [37]:
print('best_model_checkpoint', trainer.state.best_model_checkpoint)

best_model_checkpoint /home/commonlit/models/roberta-base-lm/checkpoint-2784


In [36]:
training_args.output_dir

'/home/commonlit/models/roberta-base-lm'

In [37]:
# Remove any previous export folder <model_name>_lm before the checkpoint is
# moved there.
!rm -rf {MODELS_PATH}/{cfg.model_name}_lm

In [38]:
# Move the best checkpoint folder to MODELS_PATH/<model_name>_lm.
# NOTE(review): after this move, trainer.state.best_model_checkpoint refers to
# a path that no longer exists; later cells must use the new location.
!mv {trainer.state.best_model_checkpoint} {MODELS_PATH}/{cfg.model_name}_lm

In [None]:
import shutil

# Zip the exported model. The best checkpoint was moved to
# MODELS_PATH/<model_name>_lm by the earlier `mv` cell, so the original
# trainer.state.best_model_checkpoint path no longer exists and cannot be used
# as the archive root. The archive is written to "<commonlit_lm_path>.zip".
model_zip_file = shutil.make_archive(
    str(commonlit_lm_path),
    'zip',
    str(MODELS_PATH/f'{cfg.model_name}_lm'),
)

In [None]:
commonlit_lm_path

In [None]:
# Final location of the zipped model inside the models folder.
export_file_name = str(MODELS_PATH/f'commonlit_{cfg.model_name}.zip')

In [None]:
!ls -la {model_zip_file}

In [None]:
# Move the archive to its export location and report its size on disk.
!mv {model_zip_file} {export_file_name}
!du -h {export_file_name}

In [None]:
!mkdir /home/commonlit/models/{cfg.model_name}
!mv {trainer.state.best_model_checkpoint} /home/commonlit/models/{cfg.model_name}-lm/best_model