In [None]:
# !pip install transformers
# !pip install datasets

In [1]:
import os, gc, sys, time, collections, random
import numpy as np
import pandas as pd

from typing import Dict, Optional, Union, Any, List, Tuple

from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn

import torch.utils.data as D
from torch.utils.data.dataset import Dataset, IterableDataset
from torch.utils.data.dataloader import DataLoader

from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
from transformers import BertTokenizerFast
from transformers import BertModel, PegasusForConditionalGeneration
from transformers import Trainer
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM, AutoModelWithLMHead, AutoModelForSeq2SeqLM
from transformers.data.processors.utils import InputFeatures
from transformers import TrainingArguments
from transformers.trainer_utils import EvalLoopOutput
from transformers.trainer import logging
from transformers.file_utils import is_torch_tpu_available, is_sagemaker_mp_enabled
from transformers.trainer_pt_utils import find_batch_size, nested_concat, nested_numpify, nested_truncate, nested_detach
from transformers import EarlyStoppingCallback

from datasets import load_dataset

### Folders and Dataframes

In [2]:
# Root folders: competition data (must already exist) and trained-model artifacts.
DATA_PATH = Path('/home/commonlit/data/')
assert DATA_PATH.exists(), f'Data folder not found: {DATA_PATH}'
MODELS_PATH = Path('/home/commonlit/models/')
# parents=True / exist_ok=True: also creates missing parent folders and is safe
# to re-run, unlike a separate exists() check followed by os.mkdir().
MODELS_PATH.mkdir(parents=True, exist_ok=True)
assert MODELS_PATH.exists()

In [3]:
# Show which files are present in the data folder.
!ls {DATA_PATH}

commonlit_lm		       sample_submission.csv  train-orig.csv
commonlitreadabilityprize.zip  test.csv		      train.csv


In [4]:
# Load the three competition tables from the data folder in one pass.
train_df, test_df, sample_df = (
    pd.read_csv(DATA_PATH / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [5]:
# Preview the training table.
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
2,c12129c31,,,"Patty concluded to move very slowly, thinking ...",-0.340259,0.464009
3,c12129c31,,,"Patty concluded to move very slowly, thinking ...",-0.340259,0.464009
4,c12129c31,,,"At last the game was concluded, as Roger Farri...",-0.340259,0.464009
...,...,...,...,...,...,...
2843,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2844,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2845,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2846,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


In [6]:
# Look up every row that carries this particular id.
train_df[train_df['id'] == '8f576a796']

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
114,8f576a796,,,"Enda took the helmet, dress, and spear, and it...",-0.708095,0.456212
115,8f576a796,,,"Derin took the helmet, dress, and sword, and i...",-0.708095,0.456212


In [7]:
# Shape of the test-set excerpt array.
test_df['excerpt'].values.shape

(7,)

### Configuration

In [8]:
class CONFIG:
    """Settings shared by the cells of this notebook."""

    # Pretrained checkpoint and the names derived from it
    model_name = 'google/pegasus-xsum'
    pretrained_transformers_model = model_name
    save_dir = 'trained/' + model_name

    # Batching and sequence length
    batch_size = 6
    max_len = 256
    num_workers = 2
    preprocessing_num_workers = 2

    # Training schedule
    epochs = 2
    mlm_probability = 0.15

    # Switches
    overwrite_cache = True
    do_train = True

In [9]:
# Configuration instance read by the cells below.
cfg = CONFIG()

### Prepare Train / Validation Set

In [10]:
# Folder (inside the data folder) that holds the plain-text files written from the excerpts.
commonlit_lm_path = DATA_PATH/'commonlit_lm'

In [11]:
# Create the text-file folder if needed; exist_ok makes the cell safe to re-run
# and parents=True covers a missing parent folder.
commonlit_lm_path.mkdir(parents=True, exist_ok=True)

In [12]:
# Training texts come from train.csv; the test.csv excerpts serve as the validation texts.
all_text = train_df['excerpt'].values
valid_text = test_df['excerpt'].values

In [13]:
# Output locations for the training and validation text files.
common_lit_text_file = commonlit_lm_path/'text.txt'
common_lit_valid_file = commonlit_lm_path/'valid.txt'

In [14]:
def write_to_text_file(data, file, flatten_newlines=False):
    """Write every item of `data` to `file`, each followed by a newline.

    Args:
        data: Iterable of items; each one is formatted with ``f'{item}'``.
        file: Destination path (``str`` or ``Path``); overwritten if it exists.
        flatten_newlines: When True, line breaks inside an item are replaced by
            single spaces so that each item occupies exactly one line of the
            file. Defaults to False, which writes items unchanged (an item that
            contains line breaks then spans several lines, and a line-based
            reader will see it as several records).
    """
    # Explicit UTF-8 so non-ASCII characters are written the same way regardless
    # of the platform's default locale encoding.
    with open(file, 'w', encoding='utf-8') as f:
        for item in data:
            text = f'{item}'
            if flatten_newlines:
                text = ' '.join(text.splitlines())
            f.write(f'{text}\n')

In [15]:
# Dump the training and validation excerpts to their text files.
write_to_text_file(all_text, common_lit_text_file)
write_to_text_file(valid_text, common_lit_valid_file)

In [16]:
# Load each text file as its own dataset through the `text` loader.
# Both are accessed through their 'train' split further down.
dataset = load_dataset('text', data_files=[str(common_lit_text_file)])
valid_dataset = load_dataset('text', data_files=[str(common_lit_valid_file)])

Using custom data configuration default-bac0048c1cb90b68


Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/.cache/huggingface/datasets/text/default-bac0048c1cb90b68/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset text downloaded and prepared to /home/.cache/huggingface/datasets/text/default-bac0048c1cb90b68/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


Using custom data configuration default-1797d8a26ed75996


Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/.cache/huggingface/datasets/text/default-1797d8a26ed75996/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset text downloaded and prepared to /home/.cache/huggingface/datasets/text/default-1797d8a26ed75996/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


In [17]:
# Column names of the loaded training split; the first one is displayed.
column_names = dataset["train"].column_names
column_names[0]

'text'

In [18]:
# Tokenizer belonging to the pretrained checkpoint named in the config.
tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained_transformers_model)

In [19]:
# Use the GPU when one is available, otherwise fall back to the CPU.
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Pretrained Pegasus model that `summarize` generates with; moved to the selected device.
summary_model = PegasusForConditionalGeneration.from_pretrained(cfg.pretrained_transformers_model).to(torch_device)

In [20]:
# Sample input used for a quick check of `summarize`.
src_text = ["PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."]

def summarize(src_text):
    """Generate summaries for the given texts with the module-level `summary_model`.

    Texts are encoded, generated and decoded one at a time using the
    module-level `tokenizer` and `torch_device`. A progress line is printed
    for every 100th text (starting with the first).

    Args:
        src_text: Sequence of strings to summarize.

    Returns:
        List with the decoded generations (special tokens stripped), collected
        in the order the texts were processed.
    """
    summaries = []
    for i, txt in enumerate(src_text):
        # Call the tokenizer directly: `prepare_seq2seq_batch` is deprecated and,
        # when no target texts are given, only forwards to this same call.
        batch = tokenizer([txt], truncation=True, padding='longest', return_tensors='pt').to(torch_device)
        translated = summary_model.generate(**batch)
        summaries.extend(tokenizer.batch_decode(translated, skip_special_tokens=True))
        if i % 100 == 0:
            print(f'Processed {i} summaries')
    return summaries

In [21]:
# Smoke-test the summarizer on the sample text.
summarize(src_text)



Processed 0 summaries


["California's largest electricity provider has turned off power to hundreds of thousands of customers."]

In [22]:
def preprocess_function(examples, max_input_length=256, max_target_length=256):
    """Tokenize a batch of texts and attach model-generated summaries as labels.

    Args:
        examples: Batch mapping with a "text" entry holding the source strings.
        max_input_length: Truncation length for the encoded source texts.
        max_target_length: Truncation length for the encoded summaries.

    Returns:
        The tokenizer output for the source texts with an extra "labels" entry
        holding the token ids of the summaries returned by `summarize`.
    """
    inputs = examples["text"]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Generate the summaries BEFORE switching the tokenizer to target mode:
    # `summarize` encodes the *source* texts, which must not happen inside the
    # `as_target_tokenizer()` context.
    summaries = summarize(inputs)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(summaries, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [23]:
# Apply `preprocess_function` batch-wise to the training texts.
# NOTE: every batch calls `summarize`, so this cell runs generation for each text.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

Processed 0 summaries
Processed 100 summaries
Processed 200 summaries
Processed 300 summaries
Processed 400 summaries
Processed 500 summaries
Processed 600 summaries
Processed 700 summaries
Processed 800 summaries
Processed 900 summaries
Processed 0 summaries
Processed 100 summaries
Processed 200 summaries
Processed 300 summaries
Processed 400 summaries
Processed 500 summaries
Processed 600 summaries
Processed 700 summaries
Processed 800 summaries
Processed 900 summaries
Processed 0 summaries
Processed 100 summaries
Processed 200 summaries
Processed 300 summaries
Processed 400 summaries
Processed 500 summaries
Processed 600 summaries
Processed 700 summaries
Processed 800 summaries
Processed 900 summaries
Processed 0 summaries
Processed 100 summaries
Processed 200 summaries
Processed 300 summaries
Processed 400 summaries
Processed 500 summaries
Processed 600 summaries
Processed 700 summaries
Processed 800 summaries
Processed 900 summaries
Processed 0 summaries
Processed 100 summaries
Pr

In [24]:
# Inspect the first processed training example.
tokenized_datasets['train'][0]

{'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'input_ids': [434,
  109,
  758,
  200,
  2491,
  112,
  109,
  27007,
  108,
  126,
  1848,
  114,
  28977,
  1545,
  2436,
  107,
  3054,
  113,
  142,
  1987,
  2166,
  108,
  126,
  140,
  114,
  1582,
  2646,
  107,
  1],
 'labels': [614,
  113,
  109,
  205,
  1808,
  27007,
  116,
  115,
  109,
  278,
  108,
  109,
  3084,
  8205,
  1087,
  115,
  1169,
  108,
  140,
  2127,
  112,
  109,
  481,
  333,
  109,
  36847,
  107,
  1],
 'text': 'When the young people returned to the ballroom, it presented a decidedly changed appearance. Instead of an interior scene, it was a winter landscape.'}

In [25]:
# Same preprocessing for the validation texts.
tokenized_valid_datasets = valid_dataset.map(preprocess_function, batched=True)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Processed 0 summaries



In [26]:
def tokenize_function(examples):
    """Tokenize the first dataset column of a batch, without a special-tokens mask."""
    text_column = column_names[0]
    batch_texts = examples[text_column]
    return tokenizer(batch_texts, return_special_tokens_mask=False)

In [27]:
# Splits available in the processed training dataset.
tokenized_datasets.keys()

dict_keys(['train'])

In [28]:
# Decode the first example's input ids back to text as a sanity check.
tokenizer.decode(tokenized_datasets['train'][0]['input_ids'])

'When the young people returned to the ballroom, it presented a decidedly changed appearance. Instead of an interior scene, it was a winter landscape.</s>'

### Model

In [29]:
# Seq2seq model loaded from the pretrained checkpoint; this instance is the one handed to the Trainer.
model = AutoModelForSeq2SeqLM.from_pretrained(cfg.pretrained_transformers_model)

In [30]:
# Seq2seq data collator built from the tokenizer; displayed for inspection.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)
data_collator

DataCollatorForSeq2Seq(tokenizer=PreTrainedTokenizerFast(name_or_path='google/pegasus-xsum', vocab_size=96103, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'mask_token': '<mask_2>', 'additional_special_tokens': ['<mask_1>', '<unk_2>', '<unk_3>', '<unk_4>', '<unk_5>', '<unk_6>', '<unk_7>', '<unk_8>', '<unk_9>', '<unk_10>', '<unk_11>', '<unk_12>', '<unk_13>', '<unk_14>', '<unk_15>', '<unk_16>', '<unk_17>', '<unk_18>', '<unk_19>', '<unk_20>', '<unk_21>', '<unk_22>', '<unk_23>', '<unk_24>', '<unk_25>', '<unk_26>', '<unk_27>', '<unk_28>', '<unk_29>', '<unk_30>', '<unk_31>', '<unk_32>', '<unk_33>', '<unk_34>', '<unk_35>', '<unk_36>', '<unk_37>', '<unk_38>', '<unk_39>', '<unk_40>', '<unk_41>', '<unk_42>', '<unk_43>', '<unk_44>', '<unk_45>', '<unk_46>', '<unk_47>', '<unk_48>', '<unk_49>', '<unk_50>', '<unk_51>', '<unk_52>', '<unk_53>', '<unk_54>', '<unk_55>', '<unk_56>', '<unk_57>', '<unk_58>', '<unk_59>

### Training

In [31]:
def create_training_args():
    """Build the `TrainingArguments` for fine-tuning from the module-level `cfg`.

    Returns:
        TrainingArguments configured to log, evaluate and checkpoint once per
        epoch and to reload the checkpoint with the lowest `eval_loss` when
        training ends. Output goes to `MODELS_PATH/<cfg.model_name>_lm`.
    """
    training_args = TrainingArguments(
        output_dir=str(MODELS_PATH/f'{cfg.model_name}_lm'),
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=cfg.batch_size,
        per_device_eval_batch_size=cfg.batch_size,
        num_train_epochs=cfg.epochs,
        logging_strategy="epoch",
        logging_first_step=True,
        # Checkpoint on the same schedule as evaluation: `load_best_model_at_end`
        # can only pick a best checkpoint if one is saved at each evaluation
        # (the former step-based `save_steps=40000` did not line up with it).
        save_strategy="epoch",
        # Mixed precision requires CUDA; fall back to fp32 on CPU-only machines.
        # NOTE(review): the Pegasus documentation lists FP16 as unsupported —
        # confirm the training loss stays finite with fp16 enabled.
        fp16=torch.cuda.is_available(),
        evaluation_strategy="epoch",
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        greater_is_better=False,
        gradient_accumulation_steps=1,
        learning_rate=1e-5
    )
    return training_args

In [32]:
# Instantiate the training arguments.
training_args = create_training_args()

In [33]:
# Output directory configured for the Trainer.
training_args.output_dir

'/home/commonlit/models/google/pegasus-xsum_lm'

In [34]:
# Assemble the Trainer: model, arguments, processed train/validation data,
# tokenizer, seq2seq collator and an early-stopping callback.
# NOTE(review): early_stopping_patience=9 is larger than cfg.epochs (2) with
# per-epoch evaluation, so early stopping presumably never triggers — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    # Training data is only attached when cfg.do_train is set.
    train_dataset=tokenized_datasets['train'] if cfg.do_train else None,
    # The validation file was loaded as its own dataset, hence the 'train' split here.
    eval_dataset=tokenized_valid_datasets['train'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=9)]
)

In [35]:
# Re-display the output directory before it is deleted in the next cell.
training_args.output_dir

'/home/commonlit/models/google/pegasus-xsum_lm'

In [36]:
# Remove the output directory of any previous run before training.
# NOTE(review): irreversible `rm -rf` on an interpolated path — check the value displayed above before running.
!rm -rf {training_args.output_dir}

In [37]:
%%time

# Fine-tune the model, then save it through the Trainer (the whole cell is timed).
trainer.train()
trainer.save_model()

[34m[1mwandb[0m: Currently logged in as: [33mgilf[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.32 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Epoch,Training Loss,Validation Loss
1,,0.374587


KeyboardInterrupt: 

In [None]:
# Check that a model can be loaded back from the output directory; the trailing ';' suppresses the cell output.
AutoModel.from_pretrained(training_args.output_dir);

In [None]:
# Path of the best checkpoint recorded in the trainer state.
print('best_model_checkpoint', trainer.state.best_model_checkpoint)

In [None]:
# Make sure the destination folder exists before moving the best checkpoint
# into it: `mv` does not create missing parent directories, and the '-lm'
# folder is not the Trainer output directory (which ends in '_lm').
!mkdir -p {MODELS_PATH}/{cfg.model_name}-lm
!mv {trainer.state.best_model_checkpoint} {MODELS_PATH}/{cfg.model_name}-lm/best_lm

In [None]:
import shutil

# Zip the contents of the best_lm folder. The archive is created at
# `commonlit_lm_path` + '.zip' and its path is kept in `model_zip_file`.
model_zip_file = shutil.make_archive(commonlit_lm_path, 'zip', f'{MODELS_PATH}/{cfg.model_name}-lm/best_lm')

In [None]:
# Rename the '-lm' folder to the '_lm' name.
# NOTE(review): the '_lm' folder is the Trainer output directory; if it already
# exists, `mv` presumably places the source inside it instead of renaming — confirm.
!mv {MODELS_PATH}/{cfg.model_name}-lm {MODELS_PATH}/{cfg.model_name}_lm

In [None]:
# cfg.model_name may contain '/' (e.g. 'google/pegasus-xsum'). Replace it so the
# archive is a single file directly under the models folder instead of a path
# into a sub-directory that does not exist.
export_file_name = f"/home/commonlit/models/commonlit_{cfg.model_name.replace('/', '_')}.zip"

In [None]:
# List the generated archive with details.
!ls -la {model_zip_file}

In [None]:
# Move the archive to its export location and report its size on disk.
!mv {model_zip_file} {export_file_name}
!du -h {export_file_name}

In [None]:
# Create the folder the `mv` below actually targets (note the '-lm' suffix) in
# addition to the plain model folder; -p also creates missing parents and
# tolerates re-runs.
!mkdir -p /home/commonlit/models/{cfg.model_name} /home/commonlit/models/{cfg.model_name}-lm
!mv {trainer.state.best_model_checkpoint} /home/commonlit/models/{cfg.model_name}-lm/best_model