In [1]:
# !pip uninstall fsspec -y
# !pip install --upgrade fsspec
# !pip install transformers accelerate datasets

# Definitions: imports, configuration, dataset, model

In [2]:
import os
import math
import random
import time
from tqdm import tqdm

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from transformers import get_cosine_schedule_with_warmup

from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold
import gc
gc.enable()

In [3]:
# test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
# submission_df = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")

In [4]:
# Run-wide constants.
# NOTE(review): NUM_FOLDS, NUM_EPOCHS and EVAL_SCHEDULE are not referenced by any
# code visible in this notebook (they look like leftovers from a training
# notebook - confirm before relying on them). config['nfolds'] (10) is a
# separate setting.
NUM_FOLDS = 5
NUM_EPOCHS = 3
# Mini-batch size; copied into config['batch_size'].
BATCH_SIZE = 16
# Padding/truncation length for the tokenizer; copied into config['max_len'].
MAX_LEN = 248
EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]
# The RoBERTa config/weights and the tokenizer files are read from the same directory.
ROBERTA_PATH = "../input/clrp-roberta-base/clrp_roberta_base/"
TOKENIZER_PATH = "../input/clrp-roberta-base/clrp_roberta_base/"
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
# Load the tokenizer stored next to the pretrained RoBERTa files.
# NOTE(review): get_embeddings() builds its own tokenizer from TOKENIZER_PATH, so
# this module-level instance is not used by the code visible in this notebook.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

In [6]:
class CLRPDataset(Dataset):
    """Tokenizes the ``excerpt`` column of a DataFrame on demand.

    Args:
        df: DataFrame with an ``excerpt`` column of raw text.
        tokenizer: Callable tokenizer that accepts the keyword arguments
            ``return_tensors``, ``max_length``, ``padding`` and ``truncation``
            (e.g. a Hugging Face tokenizer).
        max_len: Length every excerpt is padded/truncated to. When omitted,
            the module-level ``config['max_len']`` is looked up at item-access
            time, which is what this class did before the parameter existed.
    """

    def __init__(self, df, tokenizer, max_len=None):
        self.excerpt = df['excerpt'].to_numpy()
        self.tokenizer = tokenizer
        # Explicit length so the dataset no longer has to rely on a global that
        # is defined in a later notebook cell.
        self.max_len = max_len

    def __getitem__(self, idx):
        """Return the tokenizer output for the excerpt at position ``idx``.

        NOTE(review): with ``return_tensors='pt'`` on a single string the
        tensors presumably carry a leading dimension of 1; get_embeddings()
        reshapes each batch accordingly - confirm against the tokenizer used.
        """
        if self.max_len is not None:
            max_len = self.max_len
        else:
            # Backward-compatible fallback to the notebook-wide setting.
            max_len = config['max_len']
        return self.tokenizer(self.excerpt[idx],
                              return_tensors='pt',
                              max_length=max_len,
                              padding='max_length',
                              truncation=True)

    def __len__(self):
        """Number of excerpts in the dataset."""
        return len(self.excerpt)

In [7]:
class Model(nn.Module):
    """Pretrained encoder + attention pooling + linear regression head.

    The encoder is loaded from ``ROBERTA_PATH``. Its last-layer hidden states
    are reduced to one context vector per sequence by a small attention
    network, and a single linear layer maps that vector to one score.
    """

    def __init__(self):
        super().__init__()

        # Named ``roberta_config`` so it does not shadow the notebook-level
        # ``config`` dict.
        roberta_config = AutoConfig.from_pretrained(ROBERTA_PATH)
        roberta_config.update({"output_hidden_states": True,
                               "hidden_dropout_prob": 0.0,
                               "layer_norm_eps": 1e-7})

        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=roberta_config)

        # Take the hidden width from the loaded config instead of hard-coding
        # 768, so the heads also fit encoders of a different size. For
        # roberta-base this is 768, which leaves existing checkpoints loadable.
        hidden_size = roberta_config.hidden_size

        # Scores every token position, then normalises the scores over the
        # sequence dimension (dim=1).
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )

        self.regressor = nn.Sequential(
            nn.Linear(hidden_size, 1)
        )

    def forward(self, input_ids, attention_mask):
        """Return one regression score per sequence.

        Args:
            input_ids: Token ids, shape (batch, seq_len).
            attention_mask: Mask forwarded to the encoder, same shape.

        Returns:
            Tensor of shape (batch, 1).
        """
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)

        # hidden_states holds the embedding-layer output followed by one entry
        # per encoder layer; [-1] is the output of the last encoder layer.
        last_layer_hidden_states = roberta_output.hidden_states[-1]

        # weights: (batch, seq_len, 1), summing to 1 over seq_len.
        # NOTE(review): attention_mask is only given to the encoder; this
        # softmax runs over every position, padding included. The checkpoints
        # loaded by this notebook were presumably trained that way, so masking
        # here would need retraining - confirm before changing.
        weights = self.attention(last_layer_hidden_states)

        # Weighted average of the token states: (batch, hidden_size).
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)

        # Reduce the context vector to the prediction score.
        return self.regressor(context_vector)

In [8]:
def predict(model, data_loader, device=None):
    """Returns an np.array with predictions of the |model| on |data_loader|.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one value per sample.
        data_loader: Loader yielding either ``(input_ids, attention_mask)``
            pairs or mappings with ``"input_ids"`` and ``"attention_mask"``
            keys (the form produced by ``CLRPDataset``).
        device: Device the batches are moved to. Defaults to the notebook-wide
            ``DEVICE``.

    Returns:
        1-D float array of length ``len(data_loader.dataset)``.
    """
    if device is None:
        device = DEVICE

    model.eval()

    result = np.zeros(len(data_loader.dataset))
    index = 0

    with torch.no_grad():
        for batch in data_loader:
            if isinstance(batch, (tuple, list)):
                input_ids, attention_mask = batch
            else:
                # Mapping-style batch: per-item tensors keep their own leading
                # dimension after collation, so flatten to (batch, seq_len)
                # exactly as get_embeddings does.
                input_ids = batch["input_ids"]
                attention_mask = batch["attention_mask"]
                input_ids = input_ids.reshape(input_ids.shape[0], -1)
                attention_mask = attention_mask.reshape(attention_mask.shape[0], -1)

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            pred = model(input_ids, attention_mask)

            batch_len = pred.shape[0]
            result[index : index + batch_len] = pred.flatten().cpu().numpy()
            index += batch_len

    return result

# Predict and submit

## Old approach: plain average of the five fold models (commented out)

In [9]:
# all_predictions = np.zeros((5, len(test_df)))

# test_dataset = LitDataset(test_df, inference_only=True)
# test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
#                          drop_last=False, shuffle=False, num_workers=2)

# for index in range(5):
#     #CHANGEME
#     model_path = f"../input/roberta-base-20210711202147-sche/model_{index + 1}.pth"
#     print(f"\nUsing {model_path}")
                        
#     model = LitModel()
#     model.load_state_dict(torch.load(model_path))    
#     model.to(DEVICE)
    
#     all_predictions[index] = predict(model, test_loader)
    
#     del model
#     gc.collect()
    
# predictions = all_predictions.mean(axis=0)
# submission_df.target = predictions
# print(submission_df)
# submission_df.to_csv("submission.csv", index=False)

## New approach: fold-model outputs + SVR / Ridge / Lasso stacking

In [10]:
# Competition files: training data, test data and the sample submission.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')


# Discretise the continuous target into floor(1 + log2(n_rows)) equal-width
# bins; the integer bin index is stored in a new 'bins' column.
num_bins = int(np.floor(1 + np.log2(len(train_data))))
train_data.loc[:,'bins'] = pd.cut(train_data['target'],bins=num_bins,labels=False)

# NumPy copies of the regression target and of the bin labels
# (the latter is the default `bins` argument of get_preds_svm).
target = train_data['target'].to_numpy()
bins = train_data.bins.to_numpy()


def rmse_score(y_true,y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Runtime settings read by CLRPDataset, get_embeddings and get_preds_svm.
config = {
    'batch_size': BATCH_SIZE,
    'max_len': MAX_LEN,
    # Number of cross-validation folds.
    # NOTE(review): differs from NUM_FOLDS (5) - confirm which one is intended.
    'nfolds':10,
    # Seed passed to seed_everything() and used as the StratifiedKFold random_state.
    'seed':42,
}

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # The variable was misspelled 'PYTHONASSEED', which Python ignores.
    # PYTHONHASHSEED is read at interpreter start-up, so it only takes effect
    # for Python processes started after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Both CUDA calls are silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN pick kernels by timing them, which can vary
    # between runs; it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False

# Seed the Python, NumPy and PyTorch RNGs before any model code runs.
seed_everything(seed=config['seed'])

In [11]:
def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Run one fine-tuned checkpoint over ``df['excerpt']`` and collect its outputs.

    Args:
        df: DataFrame with an ``excerpt`` column.
        path: Path to a ``state_dict`` checkpoint that fits ``Model``.
        plot_losses: Unused; kept so existing calls stay valid.
        verbose: When true (the default) print the device in use and show a
            progress bar.

    Returns:
        np.ndarray with one row per row of ``df`` holding whatever
        ``Model.forward`` returns.
        NOTE(review): ``Model.forward`` as defined in this notebook returns the
        regressor output, so each "embedding" is a single predicted score
        (shape ``(len(df), 1)``), not the pooled context vector - confirm this
        is what the stacking step is meant to consume.
    """
    # Use CUDA when it is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if verbose:
        print(f"{device} is used")

    model = Model()
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

    ds = CLRPDataset(df, tokenizer)
    dl = DataLoader(ds,
                    batch_size=config["batch_size"],
                    shuffle=False,
                    num_workers=4,
                    pin_memory=True,
                    drop_last=False)

    # Same inference loop as a plain prediction pass; the per-sample model
    # outputs are collected and returned as the "embeddings".
    embeddings = list()
    with torch.no_grad():
        for inputs in tqdm(dl, disable=not verbose):
            # Flatten each tensor to (batch, seq_len) before the forward pass.
            inputs = {key: val.reshape(val.shape[0], -1).to(device)
                      for key, val in inputs.items()}
            outputs = model(**inputs)
            embeddings.extend(outputs.detach().cpu().numpy())

    # This function is called once per checkpoint and dataset; release the
    # model so its memory does not pile up across calls.
    del model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return np.array(embeddings)

In [12]:
# Collect the model outputs for the train and test sets from each of the five
# fold checkpoints. Order is unchanged: train then test, fold by fold.
CHECKPOINT_TEMPLATE = '../input/roberta-base-20210711202147-sche/model_{}.pth'

_fold_outputs = []
for _fold in range(1, 6):
    _checkpoint = CHECKPOINT_TEMPLATE.format(_fold)
    _fold_outputs.append((get_embeddings(train_data, _checkpoint),
                          get_embeddings(test_data, _checkpoint)))

# Keep the numbered names that the later cells reference.
((train_embeddings1, test_embeddings1),
 (train_embeddings2, test_embeddings2),
 (train_embeddings3, test_embeddings3),
 (train_embeddings4, test_embeddings4),
 (train_embeddings5, test_embeddings5)) = _fold_outputs

cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
178it [00:24,  7.25it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  6.05it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
178it [00:23,  7.50it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  5.37it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
178it [00:23,  7.52it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  5.35it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
178it [00:23,  7.53it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  4.95it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
178it [00:23,  7.51it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  5.39it/s]


In [13]:
# Sanity check: number of training targets.
len(target)

2834

In [14]:
# Sanity check: rows produced for the training set by the first checkpoint
# (should equal len(target)).
len(train_embeddings1)

2834

In [15]:
# Sanity check: rows produced for the test set by the first checkpoint.
len(test_embeddings1)

7

In [16]:
# Ensemble SVR, Ridge and Lasso on top of the transformer outputs.
def get_preds_svm(X,y,X_test,bins=bins,nfolds=10,C=5,kernel='rbf'):
    """Fit SVR, Ridge and Lasso with stratified K-fold CV and average their test predictions.

    For every fold the three regressors are fitted on the training split, the
    mean of their validation predictions is scored with RMSE (printed), and
    their test predictions are accumulated.

    Args:
        X: Training features, indexable by integer arrays, first axis = samples.
        y: Training targets aligned with ``X``.
        X_test: Test features with the same number of columns as ``X``.
        bins: Per-sample labels the folds are stratified on. Defaults to the
            module-level ``bins`` as it was when this function was defined.
        nfolds: Number of folds. It now drives both the split and the final
            average; the split used to read ``config['nfolds']`` while the
            average divided by ``nfolds``, which only agreed when both were 10.
        C: SVR regularisation strength. The argument used to be ignored in
            favour of a hard-coded 5, so the default is 5 to keep the results
            of default calls unchanged.
        kernel: SVR kernel.

    Returns:
        1-D array of length ``X_test.shape[0]``: the mean over the three
        models of their fold-averaged test predictions.
    """
    scores = list()
    preds = np.zeros((X_test.shape[0]))
    preds_ridge = np.zeros((X_test.shape[0]))
    preds_lasso = np.zeros((X_test.shape[0]))

    kfold = StratifiedKFold(n_splits=nfolds,shuffle=True,random_state=config['seed'])
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):
        model = SVR(C=C,kernel=kernel,gamma='auto')
        model_ridge = Ridge(alpha=20)
        model_lasso = Lasso(alpha=0.05)

        X_train,y_train = X[train_idx], y[train_idx]
        X_valid,y_valid = X[valid_idx], y[valid_idx]

        model.fit(X_train,y_train)
        model_ridge.fit(X_train, y_train)
        model_lasso.fit(X_train,y_train)

        prediction = model.predict(X_valid)
        pred_ridge = model_ridge.predict(X_valid)
        pred_lasso = model_lasso.predict(X_valid)

        # Score the blended validation prediction, not the SVR alone.
        pred_mean = (prediction + pred_ridge + pred_lasso)/3

        score = rmse_score(y_valid, pred_mean)
        print(f'Fold {k} , rmse score: {score}')

        scores.append(score)
        preds += model.predict(X_test)
        preds_ridge += model_ridge.predict(X_test)
        preds_lasso += model_lasso.predict(X_test)

    print("mean rmse",np.mean(scores))
    # Average over folds per model, then over the three models.
    return (preds/nfolds + preds_ridge/nfolds + preds_lasso/nfolds)/3
#     return (np.array(preds)/nfolds + np.array(preds_ridge)/nfolds)/2

In [17]:


# One stacked prediction vector per fold checkpoint, computed in fold order.
svm_preds1, svm_preds2, svm_preds3, svm_preds4, svm_preds5 = (
    get_preds_svm(fold_train, target, fold_test)
    for fold_train, fold_test in (
        (train_embeddings1, test_embeddings1),
        (train_embeddings2, test_embeddings2),
        (train_embeddings3, test_embeddings3),
        (train_embeddings4, test_embeddings4),
        (train_embeddings5, test_embeddings5),
    )
)

Fold 0 , rmse score: 0.3285651438956938
Fold 1 , rmse score: 0.3029792457327855
Fold 2 , rmse score: 0.35447089363498874
Fold 3 , rmse score: 0.3362108589664572
Fold 4 , rmse score: 0.32410958774096893
Fold 5 , rmse score: 0.3297230391591488
Fold 6 , rmse score: 0.3185978833156834
Fold 7 , rmse score: 0.3017912288240006
Fold 8 , rmse score: 0.35286312766052275
Fold 9 , rmse score: 0.3464304850294086
mean rmse 0.3295741493959658
Fold 0 , rmse score: 0.3200832028531208
Fold 1 , rmse score: 0.3082577203255921
Fold 2 , rmse score: 0.3279085728218055
Fold 3 , rmse score: 0.2924494066072712
Fold 4 , rmse score: 0.2901116618653528
Fold 5 , rmse score: 0.29874962004491723
Fold 6 , rmse score: 0.2936847318402417
Fold 7 , rmse score: 0.29110866731537965
Fold 8 , rmse score: 0.3187259015387141
Fold 9 , rmse score: 0.29699939829499905
mean rmse 0.30380788835073946
Fold 0 , rmse score: 0.30625173941414546
Fold 1 , rmse score: 0.2764677547304623
Fold 2 , rmse score: 0.3296070676539521
Fold 3 , rmse 

In [18]:
# Final test prediction: unweighted mean of the five per-checkpoint stacked predictions.
svm_preds = (svm_preds1 + svm_preds2 + svm_preds3 + svm_preds4 + svm_preds5)/5

In [19]:
# Item assignment is used on purpose: attribute assignment (sample.target = ...)
# only updates a column that already exists and otherwise just sets a Python
# attribute, which to_csv would silently leave out.
sample['target'] = svm_preds
sample.to_csv('submission.csv',index=False)

In [20]:
# Display the submission table that was just written to submission.csv.
sample

Unnamed: 0,id,target
0,c0f722661,-0.519064
1,f0953f0a5,-0.655383
2,0df072751,-0.42362
3,04caf4e0c,-2.476178
4,0e63f8bea,-1.73717
5,12537fe78,-1.366135
6,965e592c0,0.046722
