テスト




# Define

In [1]:
import os
import math
import random
import time
import glob
import re
import gc; gc.enable()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm
import gc


from sklearn.model_selection import KFold,StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.linear_model import Lasso

import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import torch
import torch.nn as nn
# from torch.utils.data import Dataset
# from torch.utils.data import DataLoader
from torch.utils.data import Dataset, SequentialSampler, DataLoader


import transformers
from transformers import get_cosine_schedule_with_warmup
from transformers import AutoConfig, AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, logging

import gc
gc.enable()


import tensorflow as tf 
from tensorflow.keras.layers import Input,LSTM,Bidirectional,Embedding,Dense, Conv1D, Dropout , MaxPool1D , MaxPooling1D, GlobalAveragePooling2D , GlobalAveragePooling1D , GlobalMaxPooling1D , concatenate , Flatten
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.models import Model,load_model,save_model , model_from_json
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau,ModelCheckpoint, EarlyStopping ,LearningRateScheduler
from tensorflow.keras.utils import plot_model
from tensorflow.keras import backend as K

from transformers import TFBertModel, BertTokenizerFast , BertTokenizer , RobertaTokenizerFast , TFRobertaModel , RobertaConfig , TFAutoModel , AutoTokenizer



In [2]:
# Load the competition tables from the Kaggle input directory.
# test_df is read by run(); submission_df is re-read in the final cell.
train_df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
submission_df = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")

# roberta base embedding Lasso & Ridge CV: 0.4753
https://www.kaggle.com/iamnishipy/submit-my-roberta-base-svm?scriptVersionId=69168038

In [3]:
# Notebook-wide constants.
NUM_FOLDS = 5
NUM_EPOCHS = 3
BATCH_SIZE = 16
MAX_LEN = 248
# NOTE(review): EVAL_SCHEDULE and NUM_EPOCHS are not referenced by any visible
# cell; presumably carried over from the training notebook -- confirm.
EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]
# The model weights/config and the tokenizer are read from the same directory.
ROBERTA_PATH = "../input/clrp-roberta-base/clrp_roberta_base/"
TOKENIZER_PATH = "../input/clrp-roberta-base/clrp_roberta_base/"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

# Second copy of the competition tables under the *_data names used by the
# helpers defined below.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')


# Discretise the continuous target into 1 + log2(n) equal-width bins
# (Sturges' formula) so it can be used as a stratification label.
num_bins = int(np.floor(1 + np.log2(len(train_data))))
train_data.loc[:,'bins'] = pd.cut(train_data['target'],bins=num_bins,labels=False)

# NumPy views of the target and its bin index; `bins` becomes the default
# stratification label of get_preds_svm.
target = train_data['target'].to_numpy()
bins = train_data.bins.to_numpy()

def rmse_score(y_true,y_pred):
    """Return the root-mean-squared error between targets and predictions."""
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

# Shared settings looked up by key in the dataset, data-loader and CV helpers.
# NOTE: 'nfolds' (10) is independent of NUM_FOLDS (5) defined above.
config = {
    'batch_size': BATCH_SIZE,
    'max_len': MAX_LEN,
    'nfolds':10,
    'seed':42,
}

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    random.seed(seed)
    # The variable Python honours is PYTHONHASHSEED. Setting it here only
    # affects interpreters started afterwards (e.g. spawned subprocesses);
    # the running process fixed its hash seed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call without CUDA: the calls are ignored when no GPU is present.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode autotunes convolution algorithms and may select different
    # kernels from run to run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, before any model or CV split is created.
seed_everything(seed=config['seed'])


class CLRPDataset(Dataset):
    """Map-style dataset that tokenizes the ``excerpt`` column on access.

    Each item is the tokenizer's output for one excerpt, padded/truncated to
    ``config['max_len']`` and returned as PyTorch tensors.
    """

    def __init__(self, df, tokenizer):
        # Keep the texts as a NumPy array so items are addressed by position.
        self.tokenizer = tokenizer
        self.excerpt = df['excerpt'].to_numpy()

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self, idx):
        text = self.excerpt[idx]
        return self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=config['max_len'],
            return_tensors='pt',
        )
    
    
class Model(nn.Module):
    """Transformer encoder followed by attention pooling and a linear regressor.

    The head sizes are hard-coded (768 -> 512 -> 1 for the attention scorer,
    768 -> 1 for the regressor), so the encoder loaded from ROBERTA_PATH must
    emit 768-wide hidden states. The submodule names (`roberta`, `attention`,
    `regressor`) are the keys checkpoints are loaded against.
    """
    def __init__(self):
        super().__init__()

        # NOTE: this local `config` (a transformers config object) shadows the
        # module-level `config` dict, but only inside __init__.
        config = AutoConfig.from_pretrained(ROBERTA_PATH)
        # Return every layer's hidden states, disable hidden-layer dropout and
        # override the layer-norm epsilon.
        config.update({"output_hidden_states":True, 
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})                       
        
        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=config)  
            
        # Scores each position with a small MLP and normalises the scores with
        # a softmax over dim 1 (the sequence axis of the hidden states).
        self.attention = nn.Sequential(            
            nn.Linear(768, 512),            
            nn.Tanh(),                       
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )        

        # Maps the pooled 768-wide vector to a single regression output.
        self.regressor = nn.Sequential(                        
            nn.Linear(768, 1)                        
        )
        

    def forward(self, input_ids, attention_mask):
        """Return the regressor output for a batch of tokenized excerpts.

        Args:
            input_ids: Token ids, forwarded to the encoder.
            attention_mask: Attention mask, forwarded to the encoder.

        Returns:
            The output of `self.regressor` applied to the attention-pooled
            last-layer hidden states (BATCH_SIZE x 1).
        """
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)        

        # There are a total of 13 layers of hidden states.
        # 1 for the embedding layer, and 12 for the 12 Roberta layers.
        # We take the hidden states from the last Roberta layer.
        last_layer_hidden_states = roberta_output.hidden_states[-1]

        # The number of cells is MAX_LEN.
        # The size of the hidden state of each cell is 768 (for roberta-base).
        # In order to condense hidden states of all cells to a context vector,
        # we compute a weighted average of the hidden states of all cells.
        # We compute the weight of each cell, using the attention neural network.
        # NOTE(review): attention_mask is only passed to the encoder; the
        # pooling softmax below runs over every position, padding included.
        weights = self.attention(last_layer_hidden_states)
                
        # weights.shape is BATCH_SIZE x MAX_LEN x 1
        # last_layer_hidden_states.shape is BATCH_SIZE x MAX_LEN x 768        
        # Now we compute context_vector as the weighted average.
        # context_vector.shape is BATCH_SIZE x 768
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)        
        
        # Now we reduce the context vector to the prediction score.
        return self.regressor(context_vector)
    

def predict(model, data_loader):
    """Run ``model`` over ``data_loader`` and return one prediction per sample.

    Batches may be either an ``(input_ids, attention_mask)`` pair or a mapping
    with ``'input_ids'`` and ``'attention_mask'`` entries (what a DataLoader
    yields for tokenizer outputs such as those produced by CLRPDataset).

    Args:
        model: Callable module invoked as ``model(input_ids, attention_mask)``.
        data_loader: Iterable of batches exposing a sized ``.dataset``.

    Returns:
        A 1-D float NumPy array of length ``len(data_loader.dataset)``.
    """
    from collections.abc import Mapping

    model.eval()

    result = np.zeros(len(data_loader.dataset))
    index = 0

    with torch.no_grad():
        for batch in data_loader:
            if isinstance(batch, Mapping):
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                # Tokenizing one text with return_tensors='pt' adds a leading
                # singleton axis per sample; fold it away so the model sees
                # (batch, seq_len), as get_embeddings does.
                input_ids = input_ids.reshape(input_ids.shape[0], -1)
                attention_mask = attention_mask.reshape(attention_mask.shape[0], -1)
            else:
                input_ids, attention_mask = batch

            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)

            pred = model(input_ids, attention_mask)

            batch_len = pred.shape[0]
            result[index : index + batch_len] = pred.flatten().detach().cpu().numpy()
            index += batch_len

    return result

def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Load a fine-tuned ``Model`` checkpoint and run it over ``df``.

    NOTE: despite the name, the returned rows are whatever ``Model.forward``
    returns. With the ``Model`` defined in this file that is the regressor
    output (one value per excerpt), not the pooled 768-wide context vector.

    Args:
        df: DataFrame with an ``excerpt`` column.
        path: Path to a ``state_dict`` saved for ``Model``.
        plot_losses: Unused; kept for interface compatibility.
        verbose: Unused; kept for interface compatibility.

    Returns:
        ``np.ndarray`` with one row per row of ``df``, in the original order.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    model = Model()
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only host.
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

    ds = CLRPDataset(df, tokenizer)
    dl = DataLoader(ds,
                  batch_size = config["batch_size"],
                  shuffle=False,
                  num_workers = 4,
                  pin_memory=True,
                  drop_last=False
                 )

    # Same loop shape as used for extracting predictions; the per-sample model
    # outputs are collected and returned as the "embeddings".
    # (Original author's open question: what do "embeddings" mean in the SVM
    # approach?)
    embeddings = list()
    with torch.no_grad():
        for inputs in tqdm(dl):
            # Each tokenized sample carries a leading singleton axis; flatten
            # every tensor to (batch, seq_len) before the forward pass.
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs)
            outputs = outputs.detach().cpu().numpy()
            embeddings.extend(outputs)
    return np.array(embeddings)


def get_preds_svm(X,y,X_test,bins=bins,nfolds=10,C=10,kernel='rbf'):
    """Cross-validate an SVR + Ridge + Lasso blend and predict ``X_test``.

    For each stratified fold the three regressors are fitted on the training
    part, their averaged prediction is scored (RMSE) on the validation part,
    and their test predictions are accumulated. The fold-averaged test
    predictions of the three models are then averaged with equal weight.

    Args:
        X: Training feature matrix, indexable by integer arrays.
        y: Training targets, indexable by integer arrays.
        X_test: Test feature matrix.
        bins: Stratification label per training row. The default is the
            module-level ``bins`` array captured when the function is defined.
        nfolds: Number of stratified folds; also the divisor used to average
            the accumulated test predictions.
        C: NOTE(review): currently ignored -- the SVR is built with ``C=5``,
            as in the original code. Honouring it would change results for
            callers relying on the default, so it is left as is; confirm.
        kernel: SVR kernel name.

    Returns:
        1-D ``np.ndarray`` of blended test predictions.
    """
    scores = list()
    n_test = X_test.shape[0]
    preds = np.zeros(n_test)
    preds_ridge = np.zeros(n_test)
    preds_lasso = np.zeros(n_test)

    # Split into exactly `nfolds` folds so the division by `nfolds` at the end
    # is a true mean over folds (the split previously used config['nfolds']).
    kfold = StratifiedKFold(n_splits=nfolds,shuffle=True,random_state=config['seed'])
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):
        model = SVR(C=5,kernel=kernel,gamma='auto')
        model_ridge = Ridge(alpha=20)
        model_lasso = Lasso(alpha=0.05)

        X_train,y_train = X[train_idx], y[train_idx]
        X_valid,y_valid = X[valid_idx], y[valid_idx]

        model.fit(X_train,y_train)
        model_ridge.fit(X_train, y_train)
        model_lasso.fit(X_train,y_train)

        prediction = model.predict(X_valid)
        pred_ridge = model_ridge.predict(X_valid)
        pred_lasso = model_lasso.predict(X_valid)

        # The fold score is for the equal-weight blend, not a single model.
        pred_mean = (prediction + pred_ridge + pred_lasso)/3

        score = rmse_score(y_valid, pred_mean)
        print(f'Fold {k} , rmse score: {score}')

        scores.append(score)
        preds += model.predict(X_test)
        preds_ridge += model_ridge.predict(X_test)
        preds_lasso += model_lasso.predict(X_test)

    print("mean rmse",np.mean(scores))
    return (np.array(preds)/nfolds + np.array(preds_ridge)/nfolds + np.array(preds_lasso)/nfolds)/3

In [4]:
# Replace train_data with a pre-split copy of the training set; run() reads
# its `kfold` column to pick the validation rows of each fold.
train_data = pd.read_csv('../input/commonlit-train-dataset/train_stratiKfold.csv')

# Drop rows whose target and standard_error are both exactly 0, then renumber
# the index so it is contiguous again.
train_data.drop(train_data[(train_data.target == 0) & (train_data.standard_error == 0)].index,
              inplace=True)
train_data.reset_index(drop=True, inplace=True)

# Re-declared with the same values as in the earlier cell. SEED is referenced
# only by the commented-out KFold code below.
NUM_FOLDS = 5
NUM_EPOCHS = 3
BATCH_SIZE = 16
MAX_LEN = 248
SEED = 1000


# Disabled: on-the-fly fold assignment (the CSV above already has `kfold`).
# kfold = KFold(n_splits=NUM_FOLDS, random_state=SEED, shuffle=True)

# for fold, (train_indices, val_indices) in enumerate(kfold.split(train_data)): 
    
#     print("********",fold,"********")
#     train_data.loc[val_indices, 'fold'] = fold
#     # traindf1,val_df1 = train_df.iloc[train_indices],train_df.iloc[val_indices]

# Display the first rows as a quick sanity check.
train_data.head()

Unnamed: 0.1,Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,kfold,bins
0,0,56a925239,https://kids.frontiersin.org/article/10.3389/f...,CC BY 4.0,What makes epilepsy and seizures so mysterious...,0.105749,0.492565,1,8
1,1,bf24448fb,,,"Anywhere there is a frontier, where there are ...",-1.866238,0.510911,3,4
2,2,7cad0f936,,,"A great violinist, Ole Bull by name, visited t...",-0.578482,0.471768,2,6
3,3,284eaa5ad,,,As to surface-slope its measurement—from nearl...,-3.639936,0.603819,1,0
4,4,91e87e7dc,,,Hans stopped snoring and awoke at supper-time....,-0.186015,0.492731,2,7


## inference

In [5]:
from sklearn.linear_model import Lasso

def run():
    """Fit Ridge and Lasso on per-fold model outputs and predict the test set.

    For each of the 5 folds, the matching checkpoint (``model_{fold + 1}.pth``)
    is run over the training rows, the validation rows (``kfold == fold``) and
    ``test_data`` via ``get_embeddings``. Ridge and Lasso are fitted on the
    training outputs, their averaged validation prediction is scored with
    RMSE, and their test predictions are stored. Each ``get_embeddings`` call
    loads the checkpoint again, so it is loaded three times per fold.

    Returns:
        1-D ``np.ndarray``: mean over folds of the Ridge test predictions,
        averaged with the mean over folds of the Lasso test predictions.
    """
    scores = []
    ridgepreds_list = []
    lassopreds_list = []

    for fold in range(5):
        print('fold  :  ',fold)

        is_valid = train_data["kfold"] == fold
        X_train = train_data[~is_valid]
        X_valid = train_data[is_valid]
        y_train = X_train['target']
        y_valid = X_valid['target']

        checkpoint = f'../input/roberta-base-20210730175534-stk/model_{fold + 1}.pth'
        train_embeddings = get_embeddings(X_train, checkpoint)
        valid_embeddings = get_embeddings(X_valid, checkpoint)
        test_embeddings = get_embeddings(test_data, checkpoint)

        # The SVR member of the blend is intentionally disabled here; only
        # Ridge and Lasso are used.
        model_ridge = Ridge(alpha=20)
        model_lasso = Lasso(alpha=0.05)

        model_ridge.fit(train_embeddings,y_train)
        model_lasso.fit(train_embeddings,y_train)

        prediction_ridge = model_ridge.predict(valid_embeddings)
        prediction_lasso = model_lasso.predict(valid_embeddings)

        pred_mean = (prediction_ridge + prediction_lasso)/2
        score = rmse_score(y_valid, pred_mean)
        scores.append(score)
        print(f'fold {fold} score is  : ',score)
        print(scores)

        # Stored as float64, matching the float64 accumulators the original
        # code added the predictions into.
        ridgepreds_list.append(
            np.asarray(model_ridge.predict(test_embeddings), dtype=np.float64))
        lassopreds_list.append(
            np.asarray(model_lasso.predict(test_embeddings), dtype=np.float64))

    print('mean  :  ',np.array(scores).mean())

    return (np.array(ridgepreds_list).mean(axis=0) + np.array(lassopreds_list).mean(axis=0))/2
#     return (np.array(svmpreds_list).mean(axis=0) + np.array(ridgepreds_list).mean(axis=0) + np.array(lassopreds_list).mean(axis=0))/3

In [6]:
# Run the 5-fold Ridge/Lasso pipeline; the bare name on the last line makes
# the notebook display the resulting test predictions.
roberta_lassoridge_pred = run()
roberta_lassoridge_pred


fold  :   0
cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
142it [00:20,  6.91it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
36it [00:04,  7.24it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  5.01it/s]


fold 0 score is  :  0.46943715167658834
[0.46943715167658834]
fold  :   1
cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
142it [00:19,  7.35it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
36it [00:05,  7.13it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  4.46it/s]


fold 1 score is  :  0.47681086999077194
[0.46943715167658834, 0.47681086999077194]
fold  :   2
cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
142it [00:19,  7.28it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
36it [00:05,  7.10it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  4.40it/s]


fold 2 score is  :  0.45202914859632504
[0.46943715167658834, 0.47681086999077194, 0.45202914859632504]
fold  :   3
cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
142it [00:19,  7.36it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
36it [00:05,  6.72it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  3.91it/s]


fold 3 score is  :  0.4631896225857212
[0.46943715167658834, 0.47681086999077194, 0.45202914859632504, 0.4631896225857212]
fold  :   4
cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
142it [00:19,  7.30it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
36it [00:05,  7.00it/s]


cuda is used


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1it [00:00,  3.55it/s]

fold 4 score is  :  0.5150616164873529
[0.46943715167658834, 0.47681086999077194, 0.45202914859632504, 0.4631896225857212, 0.5150616164873529]
mean  :   0.4753056818673519





array([-0.47041614, -0.65597635, -0.32244206, -2.42862616, -1.64267496,
       -1.37886902,  0.12796896])

# Ensemble

In [7]:
# Start from a fresh copy of the sample submission.
submission_df = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# NOTE(review): `predictions` is not used again in this cell; the blends that
# assigned to it are all commented out below.
predictions = pd.DataFrame()
# predictions = y_test * 0.6 + svm_ridge_preds * 0.2 + roberta_svm_ridge_preds * 0.2

## ver10: uses t5_large_svm
# predictions = y_test * 0.6 + svm_ridge_pred * 0.2 + roberta_svm_ridge_preds * 0.2 

# ## ver12: uses roberta meanpooling, stacking + blending + t5_large_svm + nishipy roberta svm
# blending_pred =  (nishipy_roberta + nishipy_robertalarge + robertalarge_meanpool + robertabase_meanpool)/3
# predictions = stacking_pred * 0.3 + blending_pred * 0.3 + svm_ridge_pred * 0.2 + roberta_svm_ridge_preds * 0.2 

## ver14: investigating the cause of an error -- removed the SVM model to isolate whether Roberta mean is at fault
# blending_pred =  (nishipy_roberta + nishipy_robertalarge + robertalarge_meanpool + robertabase_meanpool)/4
# predictions = stacking_pred * 0.3 + blending_pred * 0.3 + large_svmridge_pred * 0.2 + roberta_svmridge_pred * 0.2

# ## ver17
# blending_pred =  (nishipy_roberta + nishipy_robertalarge + robertalarge_meanpool)/3
# predictions = stacking_pred * 0.3 + blending_pred * 0.3 + large_svmridge_pred * 0.2 + roberta_svmridge_pred * 0.2

# ## ver17
# blending_pred =  (nishipy_roberta + nishipy_robertalarge + robertalarge_meanpool)/3
# predictions = stacking_pred * 0.3 + blending_pred * 0.3 + ((large_svmridge_pred + roberta_svmridge_pred + t5_embedding_pred)/3) * 0.4

## ver22
# blending_pred =  (nishipy_roberta + nishipy_robertalarge + robertalarge_meanpool)/3
# predictions = stacking_pred * 0.2 + blending_pred * 0.25 + roberta_svmridge_pred * 0.2 + t5_embedding_pred * 0.35

## ver22
# blending_pred =  (nishipy_roberta + nishipy_robertalarge + robertalarge_meanpool + robertabase_meanpool)/4
# predictions = stacking_pred * 0.25 + blending_pred * 0.25 + ((roberta_svmridge_pred + large_svmridge_pred + t5_embedding_pred  + svm_ridge_preds)/4) * 0.5

# blending_pred =  (nishipy_roberta + nishipy_robertalarge + robertalarge_meanpool)/3
# predictions = stacking_pred * 0.25 + blending_pred * 0.25 + ((roberta_lassoridge_pred + t5_embedding_pred)/2) * 0.5

# Current submission: the Ridge/Lasso predictions from run() only, with no
# blending against other models.
submission_df.target = roberta_lassoridge_pred
print(submission_df)
submission_df.to_csv("submission.csv", index=False)

          id    target
0  c0f722661 -0.470416
1  f0953f0a5 -0.655976
2  0df072751 -0.322442
3  04caf4e0c -2.428626
4  0e63f8bea -1.642675
5  12537fe78 -1.378869
6  965e592c0  0.127969


# Ensemble