In [None]:
# !pip install transformers
# !pip install datasets

In [1]:
import os, gc, sys, time, collections, random
import numpy as np
import pandas as pd

from typing import Dict, Optional, Union, Any, List, Tuple

from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn

import torch.utils.data as D
from torch.utils.data.dataset import Dataset, IterableDataset
from torch.utils.data.dataloader import DataLoader

from transformers import DataCollatorForLanguageModeling
from transformers import BertTokenizerFast
from transformers import BertModel
from transformers import Trainer
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM
from transformers.data.processors.utils import InputFeatures
from transformers import TrainingArguments
from transformers.trainer_utils import EvalLoopOutput
from transformers.trainer import logging
from transformers.file_utils import is_torch_tpu_available, is_sagemaker_mp_enabled
from transformers.trainer_pt_utils import find_batch_size, nested_concat, nested_numpify, nested_truncate, nested_detach
from transformers import EarlyStoppingCallback

from datasets import load_dataset

### Folders and Dataframes

In [2]:
DATA_PATH = Path('/home/commonlit/data/')
assert DATA_PATH.exists()
MODELS_PATH = Path('/home/commonlit/models/')
if not MODELS_PATH.exists():
    os.mkdir(MODELS_PATH)
assert MODELS_PATH.exists()

In [3]:
!ls {DATA_PATH}

commonlit_lm		       test.csv        train_duo.csv
commonlitreadabilityprize.zip  train-orig.csv
sample_submission.csv	       train.csv


In [4]:
train_df = pd.read_csv(DATA_PATH/'train.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
sample_df = pd.read_csv(DATA_PATH/'sample_submission.csv')

In [5]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
2,c12129c31,,,"Patty concluded to move very slowly, thinking ...",-0.340259,0.464009
3,c12129c31,,,"Patty concluded to move very slowly, thinking ...",-0.340259,0.464009
4,c12129c31,,,"At last the game was concluded, as Roger Farri...",-0.340259,0.464009
...,...,...,...,...,...,...
2844,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2845,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2846,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2847,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


In [6]:
train_df[train_df['id'] == '5127fb10f']['excerpt'].values

array(['The Battle of Waterloo was a battle that was fought mostly between French and British forces. Napoleon was crowned as Emperor of France in 1804. Then he launched many successful attacks on other countries in Europe. France soon had an empire that stretched from Spain to the Russian border. The only country that was still not captured was Great Britain. The Royal Navy had many ships, so invasion by France was not possible. However, Great Britain was not strong enough to stop Napoleon and his army from taking over most of mainland Europe.\nNapoleon seemed unstoppable until two separate campaigns caused his empire to fall apart. He gathered a huge army to invade and conquer Russia once and for all in 1812. However, he did not think that he would have very many difficulties and it turned out he did. His army was caught by the Russian winter and destroyed by the weather and lack of food.',
       'Napoleon was crowned as Emperor of France in 1804, and then launched the successful Na

In [7]:
test_df['excerpt'].values.shape

(7,)

### Duobert

In [8]:
train_df.sort_values(['target'], inplace=True)

In [9]:
train_df['excerpt_shifted'] = train_df['excerpt'].shift(1)

In [10]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,excerpt_shifted
1720,4626100d8,,,"The commutator is peculiar, consisting of only...",-3.676268,0.623621,
1142,493b80aa7,,,The Dunwich horror itself came between Lammas ...,-3.668360,0.571404,"The commutator is peculiar, consisting of only..."
1769,fe44cbd14,,,"The iron cylinder weighs 23 kilogrammes; but, ...",-3.642892,0.644398,The Dunwich horror itself came between Lammas ...
1767,284eaa5ad,,,As to surface-slope its measurement—from nearl...,-3.639936,0.603819,"The iron cylinder weighs 23 kilogrammes; but, ..."
1735,9e9eacb49,,,"The tree is dioecious, bearing male catkins on...",-3.636834,0.606822,As to surface-slope its measurement—from nearl...
...,...,...,...,...,...,...,...
1080,016913371,https://www.africanstorybook.org/,CC BY 4.0,Grandma's garden was wonderful. It was full of...,1.467665,0.599600,There was once a young man who spent all his t...
1028,7a1d484be,https://www.africanstorybook.org/,CC BY 4.0,More people came to the bus stop just before 9...,1.541672,0.606997,Grandma's garden was wonderful. It was full of...
822,8f35441e3,https://www.africanstorybook.org/#,CC BY 4.0,"Every day, Emeka's father took him to school i...",1.583847,0.624776,More people came to the bus stop just before 9...
995,849971671,https://www.africanstorybook.org/,CC BY 4.0,"For her last birthday, Sisanda had a special t...",1.597870,0.596349,"Every day, Emeka's father took him to school i..."


In [11]:
train_df['target_shifted'] = train_df['target'].shift(1)

In [12]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,excerpt_shifted,target_shifted
1720,4626100d8,,,"The commutator is peculiar, consisting of only...",-3.676268,0.623621,,
1142,493b80aa7,,,The Dunwich horror itself came between Lammas ...,-3.668360,0.571404,"The commutator is peculiar, consisting of only...",-3.676268
1769,fe44cbd14,,,"The iron cylinder weighs 23 kilogrammes; but, ...",-3.642892,0.644398,The Dunwich horror itself came between Lammas ...,-3.668360
1767,284eaa5ad,,,As to surface-slope its measurement—from nearl...,-3.639936,0.603819,"The iron cylinder weighs 23 kilogrammes; but, ...",-3.642892
1735,9e9eacb49,,,"The tree is dioecious, bearing male catkins on...",-3.636834,0.606822,As to surface-slope its measurement—from nearl...,-3.639936
...,...,...,...,...,...,...,...,...
1080,016913371,https://www.africanstorybook.org/,CC BY 4.0,Grandma's garden was wonderful. It was full of...,1.467665,0.599600,There was once a young man who spent all his t...,1.465592
1028,7a1d484be,https://www.africanstorybook.org/,CC BY 4.0,More people came to the bus stop just before 9...,1.541672,0.606997,Grandma's garden was wonderful. It was full of...,1.467665
822,8f35441e3,https://www.africanstorybook.org/#,CC BY 4.0,"Every day, Emeka's father took him to school i...",1.583847,0.624776,More people came to the bus stop just before 9...,1.541672
995,849971671,https://www.africanstorybook.org/,CC BY 4.0,"For her last birthday, Sisanda had a special t...",1.597870,0.596349,"Every day, Emeka's father took him to school i...",1.583847


In [13]:
train_df = train_df[train_df['target_shifted'].notna()]

In [14]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,excerpt_shifted,target_shifted
1142,493b80aa7,,,The Dunwich horror itself came between Lammas ...,-3.668360,0.571404,"The commutator is peculiar, consisting of only...",-3.676268
1769,fe44cbd14,,,"The iron cylinder weighs 23 kilogrammes; but, ...",-3.642892,0.644398,The Dunwich horror itself came between Lammas ...,-3.668360
1767,284eaa5ad,,,As to surface-slope its measurement—from nearl...,-3.639936,0.603819,"The iron cylinder weighs 23 kilogrammes; but, ...",-3.642892
1735,9e9eacb49,,,"The tree is dioecious, bearing male catkins on...",-3.636834,0.606822,As to surface-slope its measurement—from nearl...,-3.639936
1854,466e33a64,,,The copper even of such a conductor has been m...,-3.596751,0.567050,"The tree is dioecious, bearing male catkins on...",-3.636834
...,...,...,...,...,...,...,...,...
1080,016913371,https://www.africanstorybook.org/,CC BY 4.0,Grandma's garden was wonderful. It was full of...,1.467665,0.599600,There was once a young man who spent all his t...,1.465592
1028,7a1d484be,https://www.africanstorybook.org/,CC BY 4.0,More people came to the bus stop just before 9...,1.541672,0.606997,Grandma's garden was wonderful. It was full of...,1.467665
822,8f35441e3,https://www.africanstorybook.org/#,CC BY 4.0,"Every day, Emeka's father took him to school i...",1.583847,0.624776,More people came to the bus stop just before 9...,1.541672
995,849971671,https://www.africanstorybook.org/,CC BY 4.0,"For her last birthday, Sisanda had a special t...",1.597870,0.596349,"Every day, Emeka's father took him to school i...",1.583847


In [15]:
train_df['target_mean'] = np.mean(np.array([train_df['target'], train_df['target_shifted']]), axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['target_mean'] = np.mean(np.array([train_df['target'], train_df['target_shifted']]), axis=0)


In [16]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,excerpt_shifted,target_shifted,target_mean
1142,493b80aa7,,,The Dunwich horror itself came between Lammas ...,-3.668360,0.571404,"The commutator is peculiar, consisting of only...",-3.676268,-3.672314
1769,fe44cbd14,,,"The iron cylinder weighs 23 kilogrammes; but, ...",-3.642892,0.644398,The Dunwich horror itself came between Lammas ...,-3.668360,-3.655626
1767,284eaa5ad,,,As to surface-slope its measurement—from nearl...,-3.639936,0.603819,"The iron cylinder weighs 23 kilogrammes; but, ...",-3.642892,-3.641414
1735,9e9eacb49,,,"The tree is dioecious, bearing male catkins on...",-3.636834,0.606822,As to surface-slope its measurement—from nearl...,-3.639936,-3.638385
1854,466e33a64,,,The copper even of such a conductor has been m...,-3.596751,0.567050,"The tree is dioecious, bearing male catkins on...",-3.636834,-3.616792
...,...,...,...,...,...,...,...,...,...
1080,016913371,https://www.africanstorybook.org/,CC BY 4.0,Grandma's garden was wonderful. It was full of...,1.467665,0.599600,There was once a young man who spent all his t...,1.465592,1.466629
1028,7a1d484be,https://www.africanstorybook.org/,CC BY 4.0,More people came to the bus stop just before 9...,1.541672,0.606997,Grandma's garden was wonderful. It was full of...,1.467665,1.504669
822,8f35441e3,https://www.africanstorybook.org/#,CC BY 4.0,"Every day, Emeka's father took him to school i...",1.583847,0.624776,More people came to the bus stop just before 9...,1.541672,1.562759
995,849971671,https://www.africanstorybook.org/,CC BY 4.0,"For her last birthday, Sisanda had a special t...",1.597870,0.596349,"Every day, Emeka's father took him to school i...",1.583847,1.590858


In [17]:
train_df.loc[:, 'excerpt_duo'] = train_df['excerpt'] + '\n' +  (train_df['excerpt_shifted'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [18]:
train_df['excerpt'].values[0]



In [19]:
train_df['excerpt_shifted'].values[0]

'The commutator is peculiar, consisting of only three segments of a copper ring, while in the simplest of other continuous current generators several times that number exist, and frequently 120! segments are to be found. These three segments are made so as to be removable in a moment for cleaning or replacement. They are mounted upon a metal support, and are surrounded on all sides by a free air space, and cannot, therefore, lose their insulated condition. This feature of air insulation is peculiar to this system, and is very important as a factor in the durability of the commutator. Besides this, the commutator is sustained by supports carried in flanges upon the shaft, which flanges, as an additional safeguard, are coated all over with hard rubber, one of the finest known insulators. It may be stated, without fear of contradiction, that no other commutator made is so thoroughly insulated and protected. The three commutator segments virtually constitute a single copper ring, mounted i

In [20]:
train_df['excerpt_duo'].values[0]



In [21]:
train_duo_df = train_df[['id', 'url_legal', 'license', 'standard_error', 'excerpt_duo', 'target_mean']]

In [22]:
train_duo_df.columns = ['id', 'url_legal', 'license', 'standard_error', 'excerpt', 'target']

In [23]:
train_duo_df

Unnamed: 0,id,url_legal,license,standard_error,excerpt,target
1142,493b80aa7,,,0.571404,The Dunwich horror itself came between Lammas ...,-3.672314
1769,fe44cbd14,,,0.644398,"The iron cylinder weighs 23 kilogrammes; but, ...",-3.655626
1767,284eaa5ad,,,0.603819,As to surface-slope its measurement—from nearl...,-3.641414
1735,9e9eacb49,,,0.606822,"The tree is dioecious, bearing male catkins on...",-3.638385
1854,466e33a64,,,0.567050,The copper even of such a conductor has been m...,-3.616792
...,...,...,...,...,...,...
1080,016913371,https://www.africanstorybook.org/,CC BY 4.0,0.599600,Grandma's garden was wonderful. It was full of...,1.466629
1028,7a1d484be,https://www.africanstorybook.org/,CC BY 4.0,0.606997,More people came to the bus stop just before 9...,1.504669
822,8f35441e3,https://www.africanstorybook.org/#,CC BY 4.0,0.624776,"Every day, Emeka's father took him to school i...",1.562759
995,849971671,https://www.africanstorybook.org/,CC BY 4.0,0.596349,"For her last birthday, Sisanda had a special t...",1.590858


In [24]:
train_duo_df['excerpt'].values[0]



In [25]:
train_duo_df.reset_index(inplace=True)

In [26]:
import re

In [28]:
train_duo_df['excerpt_shortened'] = train_duo_df['excerpt'].apply(lambda t: re.sub(r'(.+\.).+', r'\1',  t))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_duo_df['excerpt_shortened'] = train_duo_df['excerpt'].apply(lambda t: re.sub(r'(.+\.).+', r'\1',  t))


In [29]:
train_duo_df.iloc[18]['excerpt']

'Bull, John, a fine, fat, American-beef fed individual who inhabits a suffragette-infested island somewhere in the North Atlantic. Born several hundred years ago and is beginning to show his age. Is fond of the sea and is said to have a fine fleet. This has had off years, notably 1812. B. has had trouble with a son who wishes to leave the paternal protection. Is fearless except when faced by a hunger strike, the Pankhurst family, and thoughts of Germany. Patronizes a costly social organization known as the Royal Family, or a reception committee for American heiresstocracy, which also dedicates buildings, poses for stamps, post-cards, motion pictures and raises princesses of Wales for magazine articles and crowning purposes. B. is a monitor of English style; wears a monocle, spats, \'i \'at, cane, pipe, awful accent, and never makes his appearance without a cawld bawth. He detests the word "egotism." Is a celebrated humorist, seeing through all jokes but himself. Ambition: \'Ome sweet \

In [30]:
train_duo_df.iloc[18]['excerpt_shortened']

'Bull, John, a fine, fat, American-beef fed individual who inhabits a suffragette-infested island somewhere in the North Atlantic. Born several hundred years ago and is beginning to show his age. Is fond of the sea and is said to have a fine fleet. This has had off years, notably 1812. B. has had trouble with a son who wishes to leave the paternal protection. Is fearless except when faced by a hunger strike, the Pankhurst family, and thoughts of Germany. Patronizes a costly social organization known as the Royal Family, or a reception committee for American heiresstocracy, which also dedicates buildings, poses for stamps, post-cards, motion pictures and raises princesses of Wales for magazine articles and crowning purposes. B. is a monitor of English style; wears a monocle, spats, \'i \'at, cane, pipe, awful accent, and never makes his appearance without a cawld bawth. He detests the word "egotism." Is a celebrated humorist, seeing through all jokes but himself. Ambition: \'Ome sweet \

In [31]:
train_duo_df.to_csv(DATA_PATH/'train_duo.csv')

### Configuration

In [32]:
class CONFIG():
    model_name = 'distilroberta'
    batch_size = 12
    max_len = 512
    save_dir = f'trained/{model_name}'
    num_workers = 2
    epochs = 15
    pretrained_transformers_model = f'{model_name}-base'
    mlm_probability= 0.15
    preprocessing_num_workers = 2
    overwrite_cache = True
    do_train = True

In [33]:
cfg = CONFIG()

### Prepare Train / Validation Set

In [34]:
commonlit_lm_path = DATA_PATH/'commonlit_lm'

In [None]:
if not commonlit_lm_path.exists():
    commonlit_lm_path.mkdir()

In [35]:
all_text = train_duo_df['excerpt_shortened'].values
valid_text = test_df['excerpt'].values

In [36]:
# def write_to_text_file(data, file):
#     with open(file, 'w') as f:
#         for t in data:
#             f.write(f'{t}\n')

In [37]:
# write_to_text_file(all_text, common_lit_text_file)
# write_to_text_file(valid_text, common_lit_valid_file)

In [38]:
train_text_dict = {'text': all_text.tolist()}
valid_text_dict = {'text': valid_text.tolist()}

In [39]:
import datasets

dataset = datasets.Dataset.from_dict(train_text_dict)
valid_dataset = datasets.Dataset.from_dict(valid_text_dict)

In [40]:
dataset

Dataset({
    features: ['text'],
    num_rows: 2848
})

In [41]:
column_names = dataset.column_names

In [42]:
column_names[0]

'text'

In [43]:
tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained_transformers_model)

In [44]:
def tokenize_function(examples):
    return tokenizer(examples[column_names[0]], return_special_tokens_mask=True)

In [45]:
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=cfg.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not cfg.overwrite_cache,
)





In [46]:
tokenized_valid_datasets = valid_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=cfg.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not cfg.overwrite_cache,
)





In [47]:
for i, inputs in enumerate(tokenized_datasets):
    input_length = len(inputs['input_ids'])
    if input_length > 512:
        print(i, input_length)

In [50]:
tokenized_datasets

Dataset({
    features: ['attention_mask', 'input_ids', 'special_tokens_mask'],
    num_rows: 2848
})

In [48]:
tokenized_valid_datasets

Dataset({
    features: ['attention_mask', 'input_ids', 'special_tokens_mask'],
    num_rows: 7
})

In [51]:
train_dataset = tokenized_datasets
valid_dataset = tokenized_valid_datasets

### Model

In [52]:
model = AutoModelForMaskedLM.from_pretrained(cfg.pretrained_transformers_model)

In [53]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=cfg.mlm_probability)
data_collator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizerFast(name_or_path='distilroberta-base', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}), mlm=True, mlm_probability=0.15, pad_to_multiple_of=None)

### Training

In [54]:
def create_training_args():
    training_args = TrainingArguments(
        output_dir=str(MODELS_PATH/f'{cfg.model_name}-lm'),
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=cfg.batch_size,
        per_device_eval_batch_size=cfg.batch_size,
        num_train_epochs=cfg.epochs,
        logging_strategy="epoch",
        logging_first_step=True,
        save_steps=40000,
        fp16=True,
        evaluation_strategy="epoch",
        save_total_limit = 3,
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        greater_is_better=False,
        gradient_accumulation_steps=1,
        learning_rate=5e-5
    )
    return training_args

In [55]:
training_args = create_training_args()

In [56]:
training_args.output_dir

'/home/commonlit/models/distilroberta-lm'

In [57]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if cfg.do_train else None,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=9)]
)

In [58]:
# import wandb

In [59]:
!rm -rf {training_args.output_dir}

In [60]:
%%time

trainer.train()
trainer.save_model()

[34m[1mwandb[0m: Currently logged in as: [33mgilf[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.32 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Epoch,Training Loss,Validation Loss
1,2.0595,1.734749
2,1.9503,1.778868
3,1.8985,1.743046
4,1.8422,1.722952
5,1.8144,1.727978
6,1.7604,1.826205
7,1.7192,1.396691
8,1.7018,1.597278
9,1.6551,2.027725
10,1.6323,1.680905


CPU times: user 24min 28s, sys: 5min 13s, total: 29min 42s
Wall time: 16min 29s


In [61]:
print('best_model_checkpoint', trainer.state.best_model_checkpoint)

best_model_checkpoint /home/commonlit/models/distilroberta-lm/checkpoint-3332


In [62]:
MODELS_PATH

PosixPath('/home/commonlit/models')

In [63]:
import shutil

model_zip_file = shutil.make_archive(f'{MODELS_PATH}/{cfg.model_name}/commonlit_distil_roberta_lm', 'zip', trainer.state.best_model_checkpoint)

In [68]:
export_file_name = f'{MODELS_PATH}/{cfg.model_name}/commonlit_distil_roberta_lm.zip'

In [69]:
!du -h {export_file_name}

873M	/home/commonlit/models/distilroberta/commonlit_distil_roberta_lm.zip


In [70]:
!mkdir /home/commonlit/models/distilroberta_lm
!mv {trainer.state.best_model_checkpoint} /home/commonlit/models/distilroberta_lm/best_model

mkdir: cannot create directory ‘/home/commonlit/models/distilroberta_lm’: File exists
