In [1]:
# !pip install transformers

In [2]:
import os, gc, sys, time, collections
import numpy as np
import pandas as pd

from typing import Dict, Optional, Union, Any, List, Tuple

from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn

import torch.utils.data as D
from torch.utils.data.dataset import Dataset, IterableDataset
from torch.utils.data.dataloader import DataLoader

from transformers import DistilBertTokenizerFast
from transformers import DistilBertModel
from transformers import Trainer
from transformers.data.processors.utils import InputFeatures
from transformers import TrainingArguments
from transformers.trainer_utils import EvalLoopOutput
from transformers.trainer import logging
from transformers.file_utils import is_torch_tpu_available, is_sagemaker_mp_enabled
from transformers.trainer_pt_utils import find_batch_size, nested_concat, nested_numpify, nested_truncate, nested_detach

### Folders and Dataframes

In [3]:
DATA_PATH = Path('/home/commonlit/data/')
assert DATA_PATH.exists()
MODELS_PATH = Path('/home/commonlit/models/')
if not MODELS_PATH.exists():
    os.mkdir(MODELS_PATH)
assert MODELS_PATH.exists()

In [4]:
!ls {DATA_PATH}

commonlitreadabilityprize.zip  sample_submission.csv  test.csv	train.csv


In [5]:
train_df = pd.read_csv(DATA_PATH/'train.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
sample_df = pd.read_csv(DATA_PATH/'sample_submission.csv')

In [6]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...,...,...
2829,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2830,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2831,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2832,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


### Prepare Cross Validation

In [7]:
target = train_df['target'].to_numpy()

In [8]:
num_bins = int(np.floor(np.log2(len(train_df))) + 1)
train_df.loc[:, 'bins'] = pd.cut(train_df['target'], bins=num_bins, labels=False)

In [9]:
train_df[['target', 'bins']].groupby(['bins']).agg(['mean', 'count'])

Unnamed: 0_level_0,target,target
Unnamed: 0_level_1,mean,count
bins,Unnamed: 1_level_2,Unnamed: 2_level_2
0,-3.413097,43
1,-2.969369,79
2,-2.526589,172
3,-2.106393,269
4,-1.652726,366
5,-1.201502,418
6,-0.748738,481
7,-0.3098,405
8,0.130016,312
9,0.560407,183


In [10]:
kf = StratifiedKFold(n_splits=num_bins)

In [11]:
for i, (t_, v_) in enumerate(kf.split(X=train_df, y=train_df.bins.values)):
    train_df.loc[v_, 'kfold'] = i

In [12]:
train_df['kfold'] = train_df['kfold'].astype(np.uint8)

In [13]:
train_df = train_df.drop('bins', axis=1)

In [14]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,kfold
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009,0
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,0
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676,0
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007,0
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845,0
...,...,...,...,...,...,...,...
2829,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900,11
2830,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648,11
2831,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866,11
2832,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128,11


### Metrics

In [15]:
def rmse_score(y_true, y_pred):
    return np.sqrt(np.mean((y_pred - y_true) ** 2))

def rmse_score_2(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [16]:
a = np.random.rand(10)
b = np.random.rand(10)

In [17]:
rmse_score(a, b), rmse_score_2(a, b)

(0.3839645882532589, 0.3839645882532589)

### Configuration

In [18]:
class CONFIG():
    batch_size = 64
    max_len = 256
    save_dir = 'trained/distilbert'
    num_workers = 2

In [19]:
cfg = CONFIG()

### Prepare train test split

In [20]:
def create_split(fold = [1]):
    valid_df = train_df[train_df['kfold'].isin(fold)]
    valid_text = valid_df['excerpt'].values
    valid_target = valid_df['target'].values
    training_df = train_df[~train_df['kfold'].isin(fold)]
    train_text = training_df['excerpt'].values
    train_target = training_df['target'].values
    return train_text, train_target, valid_text, valid_target

In [21]:
train_text, train_target, valid_text, valid_target = create_split([1])

In [22]:
len(train_text), len(valid_text)

(2597, 237)

### Prepare Tokenizers

In [23]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# Save the tokenizer so that you can download the files and move it to a Kaggle dataset.
tokenizer.save_pretrained(cfg.save_dir)

('trained/distilbert/tokenizer_config.json',
 'trained/distilbert/special_tokens_map.json',
 'trained/distilbert/vocab.txt',
 'trained/distilbert/added_tokens.json',
 'trained/distilbert/tokenizer.json')

In [24]:
tokenizer(train_df['excerpt'].values[0],
                                return_tensors='pt',
                                max_length=cfg.max_len,
                                padding='max_length',
                                truncation=True)

{'input_ids': tensor([[  101,  2043,  1996,  2402,  2111,  2513,  2000,  1996, 14307,  1010,
          2009,  3591,  1037, 27873,  2904,  3311,  1012,  2612,  1997,  2019,
          4592,  3496,  1010,  2009,  2001,  1037,  3467,  5957,  1012,  1996,
          2723,  2001,  3139,  2007,  4586,  1011,  2317, 10683,  1010,  2025,
          4201,  2006, 15299,  1010,  2021, 19379, 21132,  2058, 18548,  1998,
          2940, 25384,  1010,  2066,  1037,  2613,  4586,  2492,  1012,  1996,
          3365,  9486,  1998, 16899,  2015,  2008,  2018,  7429,  1996,  2282,
          1010,  2020,  9898,  2098,  2007, 13724,  1998, 25259,  2007, 25252,
          1997,  6557,  1010,  2066,  4586,  1012,  2036,  6323,  6497,  2018,
          2042,  8217, 11867,  6657, 19859,  2006,  2068,  1010,  1998, 20332,
          6121, 24582, 20921,  5112,  2013,  1996,  5628,  1012,  2012,  2169,
          2203,  1997,  1996,  2282,  1010,  2006,  1996,  2813,  1010,  5112,
          1037,  3376,  4562,  1011,  

In [25]:
# train_encodings = tokenizer(list(train_text), truncation=True, padding=True, max_length=cfg.max_len, return_tensors='pt')
# valid_encodings = tokenizer(list(valid_text), truncation=True, padding=True, max_length=cfg.max_len, return_tensors='pt')

### DataSet

In [26]:
def convert_to_list(t):
    return t.flatten().long()

class CommonLitDataset(nn.Module):
    def __init__(self, text, target, tokenizer, max_len=128):
        self.excerpt = text
        self.target = target
        self.max_len = max_len
        self.tokenizer = tokenizer
    
    def __getitem__(self,idx):
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)
        return InputFeatures(input_ids=convert_to_list(encode['input_ids']),
                      attention_mask=convert_to_list(encode['attention_mask']),
                      label=torch.tensor(self.target[idx]))
    
    def __len__(self):
        return len(self.excerpt)

In [27]:
train_ds = CommonLitDataset(train_text, train_target, tokenizer, cfg.max_len)
valid_ds = CommonLitDataset(valid_text, valid_target, tokenizer, cfg.max_len)

In [28]:
# train_dl = D.DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=cfg.num_workers)
# train_dl = D.DataLoader(valid_ds, batch_size=cfg.batch_size, shuffle=False, num_workers=cfg.num_workers)

In [29]:
# encode, target = next(iter(train_dl))

In [30]:
# encode.keys(), target.shape, encode['input_ids'].shape, encode['attention_mask'].shape

In [31]:
# encode['input_ids'][0].squeeze()

### Model

In [32]:
# You can use a Transformer model of your choice.
transformer_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [33]:
# transformer_out = transformer_model(input_ids=encode['input_ids'].squeeze(), attention_mask=encode['attention_mask'].squeeze())

In [34]:
# dict(transformer_out)['last_hidden_state'].shape

In [35]:
# torch.mean(transformer_out.last_hidden_state, axis=1).shape

In [36]:
# sample_layer = nn.Linear(768, 1)

In [37]:
# sample_layer(torch.mean(transformer_out.last_hidden_state, axis=1)).shape

In [38]:
class CommonLitModel(nn.Module):
    def __init__(self):
        super(CommonLitModel, self).__init__()
        self.transformer_model = transformer_model
        self.drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, 1)
        
    def forward(self, input_ids, attention_mask):
        transformer_out = transformer_model(input_ids=input_ids.squeeze(), attention_mask=attention_mask.squeeze())
        x = torch.mean(transformer_out.last_hidden_state, axis=1)
        x = self.drop(x)
        x = self.out(x)
        return x

In [39]:
model = CommonLitModel()

### Trainer

In [40]:
loss_fct = nn.MSELoss()

In [41]:
a = torch.randn([128, 1])
b = torch.randn([128, 1])

In [42]:
logger = logging.get_logger(__name__)

class CommonLitTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        input_ids = inputs.pop("input_ids")
        attention_mask = inputs.pop("attention_mask")
        outputs = model(input_ids, attention_mask)
        logits = outputs
        loss = loss_fct(logits.flatten(),
                        labels.float().flatten())
        zero_cat = torch.zeros([1, 1]).to(outputs.device)
        return (loss, torch.cat([zero_cat, outputs])) if return_outputs else loss

In [43]:
training_args = TrainingArguments(
    output_dir=str(MODELS_PATH/'distilbert-1'),
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_gpu_train_batch_size=cfg.batch_size,
    per_gpu_eval_batch_size=cfg.batch_size,
    num_train_epochs=10,
    logging_strategy="epoch",
    logging_first_step=True,
    save_steps=40000,
    fp16=True,
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='mse'
)

In [45]:
eval_pred_global = {}
def compute_metrics(eval_pred):
    global eval_pred_global
    logits, labels = eval_pred
    eval_pred_global = eval_pred
    return {'mse': mean_squared_error(logits, labels)}

In [46]:
trainer = CommonLitTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    compute_metrics=compute_metrics
)

In [47]:
%%time
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


Epoch,Training Loss,Validation Loss,Mse
1,0.7167,0.414813,0.414813
2,0.2732,0.35254,0.35254
3,0.1765,0.325181,0.325181
4,0.1138,0.356465,0.356465
5,0.0762,0.308585,0.308585
6,0.0529,0.36494,0.36494
7,0.0391,0.310718,0.310718
8,0.0283,0.307912,0.307912
9,0.0232,0.365395,0.365395
10,0.0193,0.347911,0.34791


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_de

CPU times: user 3min 6s, sys: 36.7 s, total: 3min 43s
Wall time: 1min 59s


TrainOutput(global_step=410, training_loss=0.15461654706699093, metrics={'train_runtime': 119.5782, 'train_samples_per_second': 3.429, 'total_flos': 0.0, 'epoch': 10.0, 'init_mem_cpu_alloc_delta': 2056634368, 'init_mem_gpu_alloc_delta': 266587648, 'init_mem_cpu_peaked_delta': 250880000, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 23638016, 'train_mem_gpu_alloc_delta': 802396160, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 6896181760})

In [48]:
# !cat /opt/conda/lib/python3.8/site-packages/transformers/trainer.py

In [49]:
trainer.save_model()
# For convenience, we also re-save the tokenizer to the same directory,
# so that you can share your model easily on huggingface.co/models =)

tokenizer.save_pretrained(training_args.output_dir)

('/home/commonlit/models/distilbert-1/tokenizer_config.json',
 '/home/commonlit/models/distilbert-1/special_tokens_map.json',
 '/home/commonlit/models/distilbert-1/vocab.txt',
 '/home/commonlit/models/distilbert-1/added_tokens.json',
 '/home/commonlit/models/distilbert-1/tokenizer.json')

In [50]:
result = trainer.evaluate()

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
