In [1]:
import os, gc, sys, time, collections, random
import numpy as np
import pandas as pd

from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn

import torch.utils.data as D
from torch.utils.data.dataset import Dataset, IterableDataset
from torch.utils.data.dataloader import DataLoader

from transformers import Trainer
from transformers import AutoModel, AutoTokenizer
from transformers.data.processors.utils import InputFeatures
from transformers import TrainingArguments
from transformers.trainer_utils import EvalLoopOutput
from transformers.trainer import logging
from transformers.file_utils import is_torch_tpu_available, is_sagemaker_mp_enabled
from transformers.trainer_pt_utils import find_batch_size, nested_concat, nested_numpify, nested_truncate, nested_detach

from tqdm.notebook import tqdm

### Folders and Dataframes

In [2]:
# Competition data must already be on disk; fail fast with a readable message.
DATA_PATH = Path('/home/commonlit/data/')
assert DATA_PATH.exists(), f'Data directory not found: {DATA_PATH}'

# Directory for model artefacts. mkdir(parents=True, exist_ok=True) creates any
# missing parent folders and avoids the race between a separate exists() check
# and the directory creation.
MODELS_PATH = Path('/home/commonlit/models/')
MODELS_PATH.mkdir(parents=True, exist_ok=True)
assert MODELS_PATH.is_dir(), f'Models directory could not be created: {MODELS_PATH}'

In [3]:
!ls {DATA_PATH}

commonlitreadabilityprize.zip  sample_submission.csv  test.csv	train.csv


In [4]:
# Read the three CSV tables that live in DATA_PATH: the training table, the
# test table and the sample submission.
train_df, test_df, sample_df = (
    pd.read_csv(DATA_PATH / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [5]:
sample_df

Unnamed: 0,id,target
0,c0f722661,0.0
1,f0953f0a5,0.0
2,0df072751,0.0
3,04caf4e0c,0.0
4,0e63f8bea,0.0
5,12537fe78,0.0
6,965e592c0,0.0


In [6]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...,...,...
2829,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2830,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2831,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2832,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


In [7]:
train_df['target'][2829]

1.7113898269999999

In [8]:
# Character count of every excerpt (plain len() of each value, not a token count).
train_df['excerpt_len'] = train_df['excerpt'].map(len)

In [9]:
train_df['excerpt_len'].describe()

count    2834.000000
mean      971.732886
std       117.257578
min       669.000000
25%       885.000000
50%       971.000000
75%      1058.000000
max      1341.000000
Name: excerpt_len, dtype: float64

In [10]:
train_df['excerpt_len'].argmax()

2805

In [11]:
class CONFIG:
    """Settings shared across the notebook, stored as class attributes."""

    # Short model name; the two derived strings below are built from it.
    model_name = 'distilbert'
    pretrained_transformers_model = f'{model_name}-base-uncased'
    save_dir = f'trained/{model_name}'

    # Tokenisation length and data-loading settings.
    max_len = 256
    batch_size = 128
    num_workers = 2

    # Number of epochs and number of cross-validation folds.
    epochs = 20
    n_folds = 5

In [12]:
# Instantiate the settings container; values are read as cfg.<name> below.
cfg = CONFIG()

In [13]:
# Sanity check: the instance exposes the class-level attribute unchanged.
assert cfg.model_name == CONFIG.model_name

### Read Existing Models

In [14]:
# Directory holding the saved checkpoints and tokenizers. The model folder is
# derived from cfg.model_name so the path cannot drift from the configuration.
model_path = MODELS_PATH / cfg.model_name / 'best'
assert model_path.exists(), f'Checkpoint directory not found: {model_path}'

In [15]:
!ls {model_path}

0_pytorch_model.bin   4_pytorch_model.bin  tokenizer-0	 tokenizer-4
10_pytorch_model.bin  5_pytorch_model.bin  tokenizer-1	 tokenizer-5
11_pytorch_model.bin  6_pytorch_model.bin  tokenizer-10  tokenizer-6
1_pytorch_model.bin   7_pytorch_model.bin  tokenizer-11  tokenizer-7
2_pytorch_model.bin   8_pytorch_model.bin  tokenizer-2	 tokenizer-8
3_pytorch_model.bin   9_pytorch_model.bin  tokenizer-3	 tokenizer-9


In [16]:
from transformers import PreTrainedModel
from typing import Dict, Any, Union

class CommonLitModel(PreTrainedModel):
    """Pretrained transformer encoder followed by a pooled linear head.

    The encoder's last hidden state is mean-pooled and max-pooled over the
    sequence axis, the two pooled vectors are concatenated, passed through
    dropout and projected to a single output value per example.

    Note: pooling runs over every sequence position; ``attention_mask`` is
    only forwarded to the encoder and is not used to exclude positions from
    the mean/max.
    """

    def __init__(self):
        # super(PreTrainedModel, self) starts the MRO lookup *after*
        # PreTrainedModel, so PreTrainedModel.__init__ itself is not run;
        # the config attribute is attached manually at the end instead.
        super(PreTrainedModel, self).__init__()
        self.transformer_model = AutoModel.from_pretrained(cfg.pretrained_transformers_model)
        self.drop = nn.Dropout(0.5)
        # Head input is [mean ; max], i.e. twice the encoder width. Read the
        # width from the encoder config and fall back to the previous
        # hard-coded 768 when the config does not expose hidden_size.
        hidden_size = getattr(self.transformer_model.config, 'hidden_size', 768)
        self.out = nn.Linear(hidden_size * 2, 1)
        self.config = self.transformer_model.config

    @staticmethod
    def _as_batch(t):
        """Squeeze singleton dims of ``t`` but always keep a leading batch axis."""
        t = t.squeeze()
        # A bare squeeze() also removes the batch axis when the batch holds a
        # single example; restore it so the encoder still sees a 2-D input.
        return t.unsqueeze(0) if t.dim() < 2 else t

    def forward(self, input_ids, attention_mask):
        """Return one prediction per example.

        Args:
            input_ids: Token-id tensor; singleton dimensions are squeezed away
                (a batch of one keeps its batch axis).
            attention_mask: Mask tensor handled the same way as ``input_ids``.

        Returns:
            Tensor produced by the final linear layer, one output feature per
            example.
        """
        transformer_out = self.transformer_model(
            input_ids=self._as_batch(input_ids),
            attention_mask=self._as_batch(attention_mask),
            output_hidden_states=False,
        )
        hidden = transformer_out.last_hidden_state
        mean_pooling = torch.mean(hidden, dim=1)
        max_pooling, _ = torch.max(hidden, dim=1)
        x = torch.cat([mean_pooling, max_pooling], dim=1)
        x = self.drop(x)
        x = self.out(x)
        return x

    def floating_point_ops(self, inputs: Dict[str, Union[torch.Tensor, Any]]):
        """Return 0 instead of estimating floating-point operations.

        Args:
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model (ignored).
        Returns:
            :obj:`int`: Always ``0``.
        """
        # NOTE(review): presumably overrides the parent-class implementation so
        # that callers relying on this method keep working - confirm.
        return 0

In [17]:
# Build the network and restore the weights stored in '0_pytorch_model.bin'.
# The checkpoint is deserialised onto the CPU first (map_location='cpu') so
# loading does not depend on the device it was saved from; the model is moved
# to the GPU only afterwards.
inference_model = CommonLitModel()
inference_model.load_state_dict(
    torch.load(str(model_path/'0_pytorch_model.bin'), map_location='cpu')
)
inference_model = inference_model.cuda()
# Inference mode (disables dropout); the trailing ';' suppresses the module repr.
inference_model.eval();

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### DataSet and Tokenizers

In [18]:
def convert_to_list(t):
    """Collapse tensor ``t`` to one dimension and cast it to int64.

    Despite the name, the return value is a tensor, not a Python list.
    """
    flat = t.reshape(-1)
    return flat.to(torch.long)

class CommonLitDataset(Dataset):
    """Map-style dataset that tokenizes one excerpt per item.

    Subclasses ``torch.utils.data.Dataset`` rather than ``nn.Module``: the
    object holds no parameters or sub-modules, it only serves items by index.

    Args:
        text: Indexable collection of excerpts.
        test_id: Indexable collection of ids, indexed in parallel with ``text``.
        tokenizer: Callable invoked on one excerpt with ``return_tensors='pt'``,
            ``max_length``, ``padding='max_length'`` and ``truncation=True``.
        max_len: Value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, text, test_id, tokenizer, max_len=128):
        super().__init__()
        self.excerpt = text
        self.test_id = test_id
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return a dict with flattened ``input_ids``/``attention_mask`` and the ``id``."""
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)
        # convert_to_list flattens each tensor to 1-D int64, so the default
        # collate function stacks items along a new leading batch axis.
        return {'input_ids': convert_to_list(encode['input_ids']),
                'attention_mask': convert_to_list(encode['attention_mask']),
                'id': self.test_id[idx]}

    def __len__(self):
        """Number of excerpts held by the dataset."""
        return len(self.excerpt)

In [19]:
# Load the tokenizer saved in the 'tokenizer-0' sub-directory of model_path.
tokenizer = AutoTokenizer.from_pretrained(model_path/'tokenizer-0')

In [20]:
tokenizer

PreTrainedTokenizerFast(name_or_path='/home/commonlit/models/distilbert/best/tokenizer-0', vocab_size=30522, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [21]:
# Sanity-check tokenisation on the longest excerpt (by character count). The
# row position is looked up from 'excerpt_len' instead of hard-coding the
# index, so the check still targets the longest row if the data changes.
test_excerpt = train_df['excerpt'].values[train_df['excerpt_len'].argmax()]
encoded_dict = tokenizer(
    test_excerpt,
    return_tensors='pt',
    max_length=cfg.max_len,
    padding='max_length',
    truncation=True,
)
# Decode the (possibly truncated and padded) ids back to text for inspection.
decoded = tokenizer.decode(encoded_dict["input_ids"].squeeze())
decoded

"[CLS] a major step forward in pacemaker function has been to attempt to mimic nature by utilizing various inputs to produce a rate - responsive pacemaker using parameters such as the qt interval, po2 – pco2 ( dissolved oxygen or carbon dioxide levels ) in the arterial - venous system, physical activity as determined by an accelerometer, body temperature, atp levels, adrenaline, etc. instead of producing a static, predetermined heart rate, or intermittent control, such a pacemaker, a'dynamic pacemaker ', could compensate for both actual respiratory loading and potentially anticipated respiratory loading. the first dynamic pacemaker was invented by anthony rickards of the national heart hospital, london, uk, in 1982. dynamic pacemaking technology could also be applied to future artificial hearts. advances in transitional tissue welding would support this and other artificial organ / joint / tissue replacement efforts. stem cells may be of interest in transitional tissue welding. many ad

In [22]:
len(test_excerpt)

1341

In [23]:
test_excerpt

"A major step forward in pacemaker function has been to attempt to mimic nature by utilizing various inputs to produce a rate-responsive pacemaker using parameters such as the QT interval, pO2 – pCO2 (dissolved oxygen or carbon dioxide levels) in the arterial-venous system, physical activity as determined by an accelerometer, body temperature, ATP levels, adrenaline, etc. Instead of producing a static, predetermined heart rate, or intermittent control, such a pacemaker, a 'Dynamic Pacemaker', could compensate for both actual respiratory loading and potentially anticipated respiratory loading. The first dynamic pacemaker was invented by Anthony Rickards of the National Heart Hospital, London, UK, in 1982.\nDynamic pacemaking technology could also be applied to future artificial hearts. Advances in transitional tissue welding would support this and other artificial organ/joint/tissue replacement efforts. Stem cells may be of interest in transitional tissue welding.\nMany advancements hav

In [24]:
def create_dl(df, batch_size=None, num_workers=1):
    """Wrap a DataFrame in an unshuffled ``DataLoader`` of tokenized excerpts.

    Args:
        df: DataFrame providing the ``'excerpt'`` and ``'id'`` columns.
        batch_size: Examples per batch; ``None`` (default) uses
            ``cfg.batch_size``.
        num_workers: Number of loader worker processes (default ``1``, the
            value that was previously hard-coded).

    Returns:
        DataLoader that yields batches in DataFrame row order
        (``shuffle=False``) and keeps the final partial batch
        (``drop_last=False``).
    """
    if batch_size is None:
        batch_size = cfg.batch_size
    text = df['excerpt'].values
    ids = df['id'].values
    ds = CommonLitDataset(text, ids, tokenizer, max_len=cfg.max_len)
    return DataLoader(ds,
                      batch_size=batch_size,
                      shuffle=False,
                      num_workers=num_workers,
                      pin_memory=True,
                      drop_last=False
                     )

In [25]:
# Loaders over both tables; create_dl passes shuffle=False, so batches follow
# DataFrame row order.
test_dl = create_dl(test_df)
train_dl = create_dl(train_df)

In [26]:
# Bare encoder of the loaded model (without its dropout/linear head); this is
# the module called directly when extracting embeddings below.
transformer_model = inference_model.transformer_model

In [27]:
transformer_model.device

device(type='cuda', index=0)

#### Extract Embeddings

In [28]:
def get_cls_embeddings(dl, model=None):
    """Collect the first-token hidden state of every example in ``dl``.

    Args:
        dl: Iterable of batches, each a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors; must support ``len()``.
        model: Encoder to run. ``None`` (default) uses the notebook-level
            ``transformer_model``.

    Returns:
        NumPy array with one row per example, taken from position 0 of the
        sequence axis of the model's first output. An empty loader yields an
        empty array.
    """
    if model is None:
        model = transformer_model
    # Run on whatever device the model lives on rather than assuming CUDA.
    device = next(model.parameters()).device

    # Force eval mode for deterministic features, then restore the prior mode.
    was_training = model.training
    model.eval()
    batches = []
    try:
        with torch.no_grad():
            for input_features in tqdm(dl, total=len(dl)):
                # Keyword arguments avoid depending on the positional order of
                # the encoder's forward() signature.
                output = model(input_ids=input_features['input_ids'].to(device),
                               attention_mask=input_features['attention_mask'].to(device))
                batches.append(output[0][:, 0, :].cpu().numpy())
    finally:
        model.train(was_training)

    if not batches:
        return np.array([])
    # One concatenation per call instead of growing a list row by row.
    return np.concatenate(batches, axis=0)

In [29]:
%%time

# Embeddings for every training row (timed by the %%time magic above).
train_embeddings_array = get_cls_embeddings(train_dl)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=23.0), HTML(value='')))


CPU times: user 6.58 s, sys: 163 ms, total: 6.74 s
Wall time: 6.87 s


In [30]:
# Embeddings for every test row, extracted the same way as for training.
test_embeddings_array = get_cls_embeddings(test_dl)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




#### Extract Number of Bins

In [31]:
# Number of bins: ceil(log2(number of training rows)).
num_bins = int(np.ceil(np.log2(len(train_df))))
# Discretise the continuous target into num_bins intervals; labels=False makes
# pd.cut return integer bin indices rather than interval labels.
train_df['bins'] = pd.cut(train_df['target'], bins=num_bins, labels=False)
# Plain array of bin indices, later passed to kfold.split as the stratification labels.
bins = train_df['bins'].values

In [32]:
bins

array([7, 7, 6, ..., 8, 7, 8])

In [33]:
# Regression targets as a NumPy array, in train_df row order.
train_target = train_df['target'].values

#### Training

In [34]:
from sklearn.metrics import mean_squared_error

def rmse_score(X, y):
    """Root-mean-squared error between ``X`` and ``y``.

    Computed as the square root of ``mean_squared_error(X, y)``.
    """
    mse = mean_squared_error(X, y)
    return np.sqrt(mse)

In [35]:
from sklearn.svm import SVR
from sklearn.model_selection import StratifiedKFold

In [36]:
# Stratified splitter with cfg.n_folds folds. No shuffle/random_state is
# passed, so the library defaults apply.
# NOTE(review): if the default is an unshuffled split, fold membership depends
# on the row order of train.csv - confirm that this is intended.
kfold = StratifiedKFold(n_splits=cfg.n_folds)

In [37]:
# Short aliases used by the cross-validation loop below.
X = train_embeddings_array
y = train_target
X_test = test_embeddings_array

In [47]:
# Despite the name, `scores` collects one array of test-set predictions per
# fold; the per-fold validation RMSE is only printed.
scores = []
for k, (train_idx, valid_idx) in enumerate(kfold.split(X, bins)):
    print('Fold', k, train_idx.shape, valid_idx.shape)

    # Slice out this fold's training and validation rows.
    X_train = X[train_idx]
    y_train = y[train_idx]
    X_valid = X[valid_idx]
    y_valid = y[valid_idx]

    # A fresh RBF-kernel SVR is fitted on every fold.
    model = SVR(kernel='rbf', C=10, gamma='auto')
    model.fit(X_train, y_train)

    # Report the validation error of this fold.
    prediction = model.predict(X_valid)
    print('rmse_score', rmse_score(prediction, y_valid))

    # Keep this fold's predictions for the test rows; they are averaged later.
    scores.append(model.predict(X_test))

Fold 0 (2267,) (567,)
rmse_score 0.4492405579198507
Fold 1 (2267,) (567,)
rmse_score 0.35553832810703057
Fold 2 (2267,) (567,)
rmse_score 0.35085656296059303
Fold 3 (2267,) (567,)
rmse_score 0.3626662587678037
Fold 4 (2268,) (566,)
rmse_score 0.373989350409932


In [56]:
# Average the per-fold test predictions element-wise (axis 0 runs over folds).
sample_df['target'] = np.asarray(scores).mean(axis=0)

In [57]:
sample_df

Unnamed: 0,id,target
0,c0f722661,-0.384063
1,f0953f0a5,-0.616383
2,0df072751,-0.55753
3,04caf4e0c,-2.567553
4,0e63f8bea,-1.850043
5,12537fe78,-1.004356
6,965e592c0,0.236562


In [58]:
# Write the submission file without the index column; sample_df comes from
# pd.read_csv and is already a DataFrame, so it is written directly.
sample_df.to_csv('submission.csv', index=False)

In [59]:
!cat submission.csv

id,target
c0f722661,-0.38406342950091055
f0953f0a5,-0.6163831246876486
0df072751,-0.5575295642876805
04caf4e0c,-2.567552600658923
0e63f8bea,-1.8500426314170995
12537fe78,-1.0043563930612398
965e592c0,0.23656155728070188
