In [1]:
# !pip uninstall fsspec -y
# !pip install --upgrade fsspec
# !pip install transformers accelerate datasets

# Define

In [1]:
import os
import math
import random
import time
from tqdm import tqdm

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from transformers import get_cosine_schedule_with_warmup

from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold
import gc
gc.enable()

In [2]:
test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
submission_df = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")

In [3]:
NUM_FOLDS = 5
NUM_EPOCHS = 3
BATCH_SIZE = 16
MAX_LEN = 248
EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]
ROBERTA_PATH = "../input/clrp-roberta-base/clrp_roberta_base/"
TOKENIZER_PATH = "../input/clrp-roberta-base/clrp_roberta_base/"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

In [5]:
class CLRPDataset(Dataset):
    def __init__(self,df,tokenizer):
        self.excerpt = df['excerpt'].to_numpy()
        self.tokenizer = tokenizer
    
    def __getitem__(self,idx):
        encode = self.tokenizer(self.excerpt[idx],return_tensors='pt',
                                max_length=config['max_len'],
                                padding='max_length',truncation=True)
        return encode
    
    def __len__(self):
        return len(self.excerpt)

In [6]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(ROBERTA_PATH)
        config.update({"output_hidden_states":True, 
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})                       
        
        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=config)  
            
        self.attention = nn.Sequential(            
            nn.Linear(768, 512),            
            nn.Tanh(),                       
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )        

        self.regressor = nn.Sequential(                        
            nn.Linear(768, 1)                        
        )
        

    def forward(self, input_ids, attention_mask):
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)        

        # There are a total of 13 layers of hidden states.
        # 1 for the embedding layer, and 12 for the 12 Roberta layers.
        # We take the hidden states from the last Roberta layer.
        last_layer_hidden_states = roberta_output.hidden_states[-1]

        # The number of cells is MAX_LEN.
        # The size of the hidden state of each cell is 768 (for roberta-base).
        # In order to condense hidden states of all cells to a context vector,
        # we compute a weighted average of the hidden states of all cells.
        # We compute the weight of each cell, using the attention neural network.
        weights = self.attention(last_layer_hidden_states)
                
        # weights.shape is BATCH_SIZE x MAX_LEN x 1
        # last_layer_hidden_states.shape is BATCH_SIZE x MAX_LEN x 768        
        # Now we compute context_vector as the weighted average.
        # context_vector.shape is BATCH_SIZE x 768
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)        
        
        # Now we reduce the context vector to the prediction score.
        return self.regressor(context_vector)

In [7]:
def predict(model, data_loader):
    """Returns an np.array with predictions of the |model| on |data_loader|"""
    model.eval()

    result = np.zeros(len(data_loader.dataset))    
    index = 0
    
    with torch.no_grad():
        for batch_num, (input_ids, attention_mask) in enumerate(data_loader):
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
                        
            pred = model(input_ids, attention_mask)                        

            result[index : index + pred.shape[0]] = pred.flatten().to("cpu")
            index += pred.shape[0]

    return result

# Predict and submit

## + SVM

In [8]:
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')


num_bins = int(np.floor(1 + np.log2(len(train_data))))
train_data.loc[:,'bins'] = pd.cut(train_data['target'],bins=num_bins,labels=False)

target = train_data['target'].to_numpy()
bins = train_data.bins.to_numpy()


def rmse_score(y_true,y_pred):
    return np.sqrt(mean_squared_error(y_true,y_pred))

config = {
    'batch_size': BATCH_SIZE,
    'max_len': MAX_LEN,
    'nfolds':10,
    'seed':42,
}

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONASSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(seed=config['seed'])

In [9]:
##############################
# make kfold
##############################


train_data.drop(train_data[(train_data.target == 0) & (train_data.standard_error == 0)].index,
              inplace=True)
train_data.reset_index(drop=True, inplace=True)

NUM_FOLDS = 5
NUM_EPOCHS = 3
BATCH_SIZE = 16
MAX_LEN = 248
SEED = 1000


kfold = KFold(n_splits=NUM_FOLDS, random_state=SEED, shuffle=True)

for fold, (train_indices, val_indices) in enumerate(kfold.split(train_data)): 
    
    print("********",fold,"********")
    train_data.loc[val_indices, 'fold'] = fold
    # traindf1,val_df1 = train_df.iloc[train_indices],train_df.iloc[val_indices]

train_data

******** 0 ********
******** 1 ********
******** 2 ********
******** 3 ********
******** 4 ********


Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,bins,fold
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009,7,3.0
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,7,1.0
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676,6,4.0
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007,5,0.0
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845,8,0.0
...,...,...,...,...,...,...,...,...
2828,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900,11,2.0
2829,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648,8,4.0
2830,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866,8,3.0
2831,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128,7,1.0


In [10]:
def get_embeddings(df,path,plot_losses=True, verbose=True):
    #cuda使えたら使う構文
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")
            
    model = Model()
    model.load_state_dict(torch.load(path))
    model.to(device)
    model.eval()
    
    #tokenizer = AutoTokenizer.from_pretrained('../input/clrp-roberta-base/clrp_roberta_base')
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
    
    ds = CLRPDataset(df, tokenizer)
    dl = DataLoader(ds,
                  batch_size = config["batch_size"],
                  shuffle=False,
                  num_workers = 4,
                  pin_memory=True,
                  drop_last=False
                 )
        
    #以下でpredictionsを抽出するために使った構文を使ってembeddingsをreturnしている.
    #SVMの手法とは、embeddingsの意味は？
    embeddings = list()
    with torch.no_grad():
        for i, inputs in tqdm(enumerate(dl)):
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            outputs = outputs.detach().cpu().numpy()
            embeddings.extend(outputs)
    
    gc.collect
    return np.array(embeddings)

In [11]:
# #train/testでembeddingsを取得している

# train1 = train_data[train_data["fold"] != 0]
# train_embeddings1 =  get_embeddings(train1,'../input/roberta-base-20210711202147-sche/model_1.pth')
# test_embeddings1 = get_embeddings(test_data,'../input/roberta-base-20210711202147-sche/model_1.pth')


# train2 = train_data[train_data["fold"] != 1]
# train_embeddings2 =  get_embeddings(train2,'../input/roberta-base-20210711202147-sche/model_2.pth')
# test_embeddings2 = get_embeddings(test_data,'../input/roberta-base-20210711202147-sche/model_2.pth')


# train3 = train_data[train_data["fold"] != 2]
# train_embeddings3 =  get_embeddings(train3,'../input/roberta-base-20210711202147-sche/model_3.pth')
# test_embeddings3 = get_embeddings(test_data,'../input/roberta-base-20210711202147-sche/model_3.pth')


# train4 = train_data[train_data["fold"] != 3]
# train_embeddings4 =  get_embeddings(train4,'../input/roberta-base-20210711202147-sche/model_4.pth')
# test_embeddings4 = get_embeddings(test_data,'../input/roberta-base-20210711202147-sche/model_4.pth')


# train5 = train_data[train_data["fold"] != 4]
# train_embeddings5 =  get_embeddings(train5,'../input/roberta-base-20210711202147-sche/model_5.pth')
# test_embeddings5 = get_embeddings(test_data,'../input/roberta-base-20210711202147-sche/model_5.pth')

## CV & pred

In [12]:
### SVM、Ridgeの予測結果提出と　CV計測
from xgboost import XGBRegressor
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
# from sklearn.ensemble import RandomForestRegressor

def run():
    preds = []
    scores = []

    svmpreds_list = []
    ridgepreds_list = []
    lassopreds_list = []

    for fold in range(5):

        predssvm = np.zeros((test_df.shape[0]))
        predsridge = np.zeros((test_df.shape[0]))
        predslasso= np.zeros((test_df.shape[0]))

        print('fold  :  ',fold)
        X_train = train_data[train_data["fold"] != fold]
        y_train = train_data[train_data["fold"] != fold]['target']
        X_valid = train_data[train_data["fold"] == fold]
        y_valid = train_data[train_data["fold"] == fold]['target']

        train_embeddings = get_embeddings(X_train,f'../input/roberta-base-20210711202147-sche/model_{fold + 1}.pth')
        valid_embeddings = get_embeddings(X_valid,f'../input/roberta-base-20210711202147-sche/model_{fold + 1}.pth')
        test_embeddings = get_embeddings(test_data,f'../input/roberta-base-20210711202147-sche/model_{fold + 1}.pth')


        model = SVR(C=5,kernel='rbf',gamma='auto')
        model_ridge = Ridge(alpha=20)
        model_lasso = Lasso(alpha=0.05)
#         model_xgb = XGBRegressor(booster = 'gblinear',lamdba = 2)#min_child_weight=0.5

        model.fit(train_embeddings,y_train)
        model_ridge.fit(train_embeddings,y_train)
        model_lasso.fit(train_embeddings,y_train)

        prediction_svm = model.predict(valid_embeddings)
        prediction_ridge = model_ridge.predict(valid_embeddings)
        prediction_lasso = model_lasso.predict(valid_embeddings)

    #     preds += model.predict(X_test)
    #     preds_ridge += model_ridge.predict(X_test)

#         pred_mean = (prediction_svm + prediction_ridge)/2
        pred_mean = (prediction_svm + prediction_ridge + prediction_lasso)/3
        score = rmse_score(y_valid, pred_mean)

        preds.append(pred_mean)

        score = rmse_score(y_valid, pred_mean)
        scores.append(score)
        print(f'fold {fold} score is  : ',score)
        print(scores)

        predssvm += model.predict(test_embeddings)
        predsridge += model_ridge.predict(test_embeddings)
        predslasso += model_lasso.predict(test_embeddings)

        svmpreds_list.append(predssvm)
        ridgepreds_list.append(predsridge)
        lassopreds_list.append(predslasso)
        
    print('mean  :  ',np.array(scores).mean())
    
    return (np.array(svmpreds_list).mean(axis=0) + np.array(ridgepreds_list).mean(axis=0) + np.array(lassopreds_list).mean(axis=0))/3

In [13]:
pred = run()

sample.target = pred
sample.to_csv('submission.csv',index=False)

fold  :   0
cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
142it [00:19,  7.12it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
36it [00:04,  7.34it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  3.95it/s]


fold 0 score is  :  0.4855494404796277
[0.4855494404796277]
fold  :   1
cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
142it [00:19,  7.46it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
36it [00:04,  7.33it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  3.96it/s]


fold 1 score is  :  0.4637120314760152
[0.4855494404796277, 0.4637120314760152]
fold  :   2
cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
142it [00:19,  7.46it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
36it [00:04,  7.41it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  4.43it/s]


fold 2 score is  :  0.48204202987789674
[0.4855494404796277, 0.4637120314760152, 0.48204202987789674]
fold  :   3
cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
142it [00:19,  7.45it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
36it [00:04,  7.34it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  4.60it/s]


fold 3 score is  :  0.48605628658460814
[0.4855494404796277, 0.4637120314760152, 0.48204202987789674, 0.48605628658460814]
fold  :   4
cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
142it [00:19,  7.44it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
36it [00:05,  7.18it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  4.18it/s]


fold 4 score is  :  0.4729072127812824
[0.4855494404796277, 0.4637120314760152, 0.48204202987789674, 0.48605628658460814, 0.4729072127812824]
mean  :   0.478053400239886


In [14]:
sample

Unnamed: 0,id,target
0,c0f722661,-0.517493
1,f0953f0a5,-0.65457
2,0df072751,-0.421579
3,04caf4e0c,-2.481508
4,0e63f8bea,-1.74005
5,12537fe78,-1.36786
6,965e592c0,0.050517


In [None]:
# np.savetxt('train1.csv',train_embeddings1, delimiter=',')
# np.savetxt('train2.csv',train_embeddings2, delimiter=',')
# np.savetxt('train3.csv',train_embeddings3, delimiter=',')
# np.savetxt('train4.csv',train_embeddings4, delimiter=',')
# np.savetxt('train5.csv',train_embeddings5, delimiter=',')


In [None]:
#SVMをアンサンブル処理している
def get_preds_svm(X,y,X_test,bins=bins,nfolds=10,C=10,kernel='rbf'):
    scores = list()
    preds = np.zeros((X_test.shape[0]))
    preds_ridge = np.zeros((X_test.shape[0]))
    
#     kfold = StratifiedKFold(n_splits=config['nfolds'],shuffle=True,random_state=config['seed'])
    kfold = KFold(n_splits=5, random_state=1000, shuffle=True)
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):
        model = SVR(C=5,kernel=kernel,gamma='auto')
        model_ridge = Ridge(alpha=10)
        #print(train_idx)
        #print(valid_idx)
        X_train,y_train = X[train_idx], y[train_idx]
        X_valid,y_valid = X[valid_idx], y[valid_idx]
        
        model.fit(X_train,y_train)
        model_ridge.fit(X_train, y_train)
        
        prediction = model.predict(X_valid)
        pred_ridge = model_ridge.predict(X_valid)
        
        #score = rmse_score(prediction,y_valid)
        score = rmse_score(y_valid, prediction)
        print(f'Fold {k} , rmse score: {score}')
        
        scores.append(score)
        preds += model.predict(X_test)
        preds_ridge += model_ridge.predict(X_test)
        
    print("mean rmse",np.mean(scores))
    return (np.array(preds)/nfolds + np.array(preds_ridge)/nfolds)/2

In [None]:
from sklearn.linear_model import Ridge

target1 = train_data[train_data["fold"] != 0]['target']
target2 = train_data[train_data["fold"] != 1]['target']
target3 = train_data[train_data["fold"] != 2]['target']
target4 = train_data[train_data["fold"] != 3]['target']
target5 = train_data[train_data["fold"] != 4]['target']

svm_preds1 = get_preds_svm(train_embeddings1,target1,test_embeddings1,bins=bins1)
svm_preds2 = get_preds_svm(train_embeddings2,target2,test_embeddings2,bins=bins2)
svm_preds3 = get_preds_svm(train_embeddings3,target3,test_embeddings3,bins=bins3)
svm_preds4 = get_preds_svm(train_embeddings4,target4,test_embeddings4,bins=bins4)
svm_preds5 = get_preds_svm(train_embeddings5,target5,test_embeddings5,bins=bins5)

In [None]:
# svm_preds = (svm_preds1 + svm_preds2 + svm_preds3 + svm_preds4 + svm_preds5)/5

In [None]:
# sample.target = svm_preds
# sample.to_csv('submission.csv',index=False)

In [None]:
sample