# Imports

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install transformers



In [None]:
import pandas as pd
import numpy as np

import random
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn

from sklearn.metrics import mean_squared_error

import os
from collections import defaultdict
from tqdm import tqdm_notebook

from transformers import AutoConfig, AutoTokenizer, AutoModel, ElectraConfig, ElectraModel, ElectraTokenizer
import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

import torch.nn.init as init
import torch.nn.functional as F
from torch.nn import Parameter
from torch.autograd.function import InplaceFunction
import math

from torch.utils.data import Sampler, Dataset, DataLoader
import random

from more_itertools import chunked, flatten

In [None]:
%cd drive/MyDrive/CommonLit

/content/drive/MyDrive/CommonLit


# Get folds

In [None]:
df = pd.read_csv("train_folds.csv")

# Seed Everything

In [None]:
def seed_everything(seed=12):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices),
    exports PYTHONHASHSEED, and configures cuDNN for repeatable runs.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick a different kernel on each run, which
    # defeats the deterministic flag above, so it has to stay disabled.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=12)

# Configuration

In [None]:
# Longest token sequence fed to the encoders; the dataset classes truncate to
# this length and zero-pad every example up to it.
MAX_LEN = 256
# NOTE(review): not referenced by any cell visible in this notebook —
# presumably a leftover from the training notebook; confirm before removing.
EPOCHS = 4
# Hugging Face identifiers of the pretrained backbones loaded by the model classes.
ROBERTA_LARGE_PATH = "roberta-large"
ROBERTA_BASE_PATH = "roberta-base"
ELECTRA_PATH = 'google/electra-large-discriminator'
# Batch size of the validation DataLoaders built in create_dataloader.
TEST_BATCH_SIZE = 32
# Module-level tokenizers shared by the dataset classes.
# NOTE(review): the roberta-large tokenizer also feeds the roberta-base model —
# presumably both checkpoints share one vocabulary; confirm.
TOKENIZER_ROBERTA = transformers.AutoTokenizer.from_pretrained(ROBERTA_LARGE_PATH)
TOKENIZER_ELECTRA = ElectraTokenizer.from_pretrained(ELECTRA_PATH)

# Dataset

In [None]:
class RobertaDataset:
    """Indexable dataset that tokenizes excerpts for the RoBERTa models.

    Reads the `excerpt` and `target` columns of the given frame and yields one
    dict of fixed-length tensors (`ids`, `mask`, `targets`) per row.
    """

    def __init__(self,df):
        self.excerpt = df.excerpt.values
        self.target = df.target.values

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self,item):
        # Collapse every run of whitespace into a single space before tokenizing.
        text = " ".join(str(self.excerpt[item]).split())
        encoded = TOKENIZER_ROBERTA(text, add_special_tokens=True, max_length=MAX_LEN, padding=True, truncation=True)

        # Right-pad ids and mask with zeros up to MAX_LEN.
        # NOTE(review): the filler id is a literal 0 rather than the tokenizer's
        # pad token id — presumably the checkpoints were trained this way; confirm
        # before changing it.
        filler = [0] * (MAX_LEN - len(encoded["input_ids"]))
        token_ids = encoded["input_ids"] + filler
        attention = encoded["attention_mask"] + filler

        sample = {}
        sample["ids"] = torch.tensor(token_ids, dtype=torch.long)
        sample["mask"] = torch.tensor(attention, dtype=torch.long)
        sample["targets"] = torch.tensor(self.target[item], dtype=torch.float)
        return sample

In [None]:
class ElectraDataset:
    """Indexable dataset that tokenizes excerpts for the ELECTRA model.

    Reads the `excerpt` and `target` columns of the given frame and yields one
    dict of fixed-length tensors (`ids`, `mask`, `token_type_ids`, `targets`)
    per row.
    """

    def __init__(self,df):
        self.excerpt = df.excerpt.values
        self.target = df.target.values

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self,item):
        # Collapse every run of whitespace into a single space before tokenizing.
        text = " ".join(str(self.excerpt[item]).split())
        encoded = TOKENIZER_ELECTRA(text, add_special_tokens=True, max_length=MAX_LEN, padding=True, truncation=True)

        # Right-pad all three sequences with zeros up to MAX_LEN.
        filler = [0] * (MAX_LEN - len(encoded["input_ids"]))
        token_ids = encoded["input_ids"] + filler
        attention = encoded["attention_mask"] + filler
        segment_ids = encoded["token_type_ids"] + filler

        sample = {}
        sample["ids"] = torch.tensor(token_ids, dtype=torch.long)
        sample["mask"] = torch.tensor(attention, dtype=torch.long)
        sample["token_type_ids"] = torch.tensor(segment_ids, dtype=torch.long)
        sample["targets"] = torch.tensor(self.target[item], dtype=torch.float)
        return sample

# Model Building

In [None]:
# path : Models/CodeRobertaBaseAttentionNorm/model{fold}.bin
class RobertaBaseAttention(nn.Module):
    """roberta-base encoder followed by a pooled regression head.

    Args:
        model_type: "attention" pools the token states with learned softmax
            weights; "mean" averages them over the attention mask. Both feed
            LayerNorm -> Linear -> LayerNorm -> Linear producing one value
            per example.
    """

    def __init__(self, model_type="attention"):
        super(RobertaBaseAttention,self).__init__()

        self.model_type = model_type

        self.config = AutoConfig.from_pretrained(ROBERTA_BASE_PATH)
        self.config.update({"output_hidden_states":True,
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})
        self.roberta = transformers.AutoModel.from_pretrained(ROBERTA_BASE_PATH, config=self.config)

        # Width of the encoder's token states (768 for roberta-base). Taking it
        # from the config keeps the head in sync with the backbone; the "mean"
        # head used to be hard-coded to 1024 and could not accept these states.
        hidden_size = self.config.hidden_size

        if model_type == "attention":

            # Scores each token; softmax over dim=1 normalises along the sequence.
            self.attention = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.Tanh(),
            nn.Linear(256, 1),
            nn.Softmax(dim=1)
            )

            self.layer_norm1 = nn.LayerNorm(hidden_size)
            self.linear1 = nn.Linear(hidden_size, 256)
            self.linear2 = nn.Linear(256, 1)
            self.layer_norm2 = nn.LayerNorm(256)

        elif model_type == "mean":

            self.layer_norm1 = nn.LayerNorm(hidden_size)
            self.linear1 = nn.Linear(hidden_size, 256)
            self.linear2 = nn.Linear(256, 1)
            self.layer_norm2 = nn.LayerNorm(256)

    def freeze(self):
        """Stop gradients from flowing into the encoder's parameters."""
        for child in self.roberta.children():
            for param in child.parameters():
                param.requires_grad = False

    def unfreeze(self):
        """Make the encoder's parameters trainable again."""
        for child in self.roberta.children():
            for param in child.parameters():
                param.requires_grad = True

    def forward(self, ids, mask, loss_fn = None, targets = None):
        """Return one regression value per example.

        Args:
            ids: Token id tensor passed to the encoder as input_ids.
            mask: Attention mask tensor matching `ids`.
            loss_fn: Unused; kept for call compatibility.
            targets: Unused; kept for call compatibility.

        Raises:
            ValueError: If `model_type` is neither "mean" nor "attention".
        """
        if self.model_type not in ("mean", "attention"):
            # Previously this surfaced as an unbound `logits` at the return.
            raise ValueError(f"Unknown model_type: {self.model_type!r}")

        if self.model_type == "mean":

            outputs = self.roberta(ids, mask)
            last_hidden_state = outputs[0]
            # Zero out masked positions, then divide by the count of kept tokens.
            input_mask_expanded = mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
            sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
            sum_mask = input_mask_expanded.sum(1)
            # Guards against division by zero for an all-zero mask.
            sum_mask = torch.clamp(sum_mask, min=1e-9)
            mean_embeddings = sum_embeddings / sum_mask
            norm_mean_embeddings = self.layer_norm1(mean_embeddings)
            logits = self.linear1(norm_mean_embeddings)
            logits = self.linear2(self.layer_norm2(logits))

        else:

            roberta_output = self.roberta(input_ids=ids,
                                  attention_mask=mask)
            last_layer_hidden_states = roberta_output.last_hidden_state
            # NOTE(review): the pooling weights are computed over every position,
            # masked ones included — presumably how the checkpoints were trained;
            # confirm before changing.
            weights = self.attention(last_layer_hidden_states)
            context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)
            norm_context_vector = self.layer_norm1(context_vector)
            logits = self.linear1(norm_context_vector)
            logits = self.linear2(self.layer_norm2(logits))

        return logits

In [None]:
# path : Models/CodeRobertaLargeAttentionNorm2/model{fold}.bin
class RobertaLargeAttention(nn.Module):
    """roberta-large encoder with a pooled regression head.

    `model_type` selects the pooling: "attention" weights the token states with
    a learned softmax score, "mean" averages them over the attention mask. The
    pooled vector goes through LayerNorm -> Linear -> LayerNorm -> Linear.
    """

    def __init__(self, model_type="attention"):
        super().__init__()
        self.model_type = model_type

        backbone_overrides = {"output_hidden_states":True,
                              "hidden_dropout_prob": 0.0,
                              "layer_norm_eps": 1e-7}
        self.config = AutoConfig.from_pretrained(ROBERTA_LARGE_PATH)
        self.config.update(backbone_overrides)
        self.roberta = transformers.AutoModel.from_pretrained(ROBERTA_LARGE_PATH, config=self.config)

        if model_type == "attention":
            # Per-token score, normalised along the sequence dimension (dim=1).
            self.attention = nn.Sequential(
                nn.Linear(1024, 256),
                nn.Tanh(),
                nn.Linear(256, 1),
                nn.Softmax(dim=1),
            )
            self._build_head()
        elif model_type == "mean":
            self._build_head()

    def _build_head(self):
        # Regression head shared by both pooling modes; attribute names are the
        # keys the saved state dicts are matched against.
        self.layer_norm1 = nn.LayerNorm(1024)
        self.linear1 = nn.Linear(1024, 256)
        self.linear2 = nn.Linear(256, 1)
        self.layer_norm2 = nn.LayerNorm(256)

    def _set_backbone_trainable(self, trainable):
        # Walk the encoder's direct children and flip requires_grad on each parameter.
        for block in self.roberta.children():
            for weight in block.parameters():
                weight.requires_grad = trainable

    def freeze(self):
        """Disable gradient updates for the encoder."""
        self._set_backbone_trainable(False)

    def unfreeze(self):
        """Re-enable gradient updates for the encoder."""
        self._set_backbone_trainable(True)

    def _regress(self, pooled):
        # LayerNorm -> Linear -> LayerNorm -> Linear, one output per example.
        projected = self.linear1(self.layer_norm1(pooled))
        return self.linear2(self.layer_norm2(projected))

    def forward(self, ids, mask, loss_fn = None, targets = None):
        """Return the regression output for a batch; `loss_fn` and `targets` are unused."""
        if self.model_type == "mean":
            token_states = self.roberta(ids, mask)[0]
            # Broadcast the mask over the hidden dimension so masked tokens add nothing.
            keep = mask.unsqueeze(-1).expand(token_states.size()).float()
            masked_total = torch.sum(token_states * keep, 1)
            # Clamp keeps the division finite when a mask is entirely zero.
            kept_count = torch.clamp(keep.sum(1), min=1e-9)
            logits = self._regress(masked_total / kept_count)

        elif self.model_type=="attention":
            encoded = self.roberta(input_ids=ids, attention_mask=mask)
            token_states = encoded.last_hidden_state
            token_weights = self.attention(token_states)
            logits = self._regress(torch.sum(token_weights * token_states, dim=1))

        return logits

In [None]:
# path : Models/CodeRobertaLargeMean/model{fold}.bin
class RobertaLargeMean(nn.Module):
    """roberta-large encoder with a regression head, mean-pooled by default.

    With `model_type="mean"` the masked average of the token states goes through
    LayerNorm -> Linear(1024, 768) -> LayerNorm -> Linear(768, 1). With
    `model_type="attention"` a learned softmax weighting pools the token states
    and a single Linear(1024, 1) maps the result to the output.
    """

    def __init__(self, model_type="mean"):
        super().__init__()
        self.model_type = model_type

        backbone_overrides = {"output_hidden_states":True,
                              "hidden_dropout_prob": 0.0,
                              "layer_norm_eps": 1e-7}
        self.config = AutoConfig.from_pretrained(ROBERTA_LARGE_PATH)
        self.config.update(backbone_overrides)
        self.roberta = transformers.AutoModel.from_pretrained(ROBERTA_LARGE_PATH, config=self.config)

        if model_type == "attention":
            # Per-token score, normalised along the sequence dimension (dim=1).
            self.attention = nn.Sequential(
                nn.Linear(1024, 256),
                nn.Tanh(),
                nn.Linear(256, 1),
                nn.Softmax(dim=1),
            )
            self.linear = nn.Linear(1024, 1)
        elif model_type == "mean":
            # Attribute names double as the state-dict keys of the saved checkpoints.
            self.layer_norm1 = nn.LayerNorm(1024)
            self.linear1 = nn.Linear(1024, 768)
            self.linear2 = nn.Linear(768, 1)
            self.layer_norm2 = nn.LayerNorm(768)

    def _set_backbone_trainable(self, trainable):
        # Walk the encoder's direct children and flip requires_grad on each parameter.
        for block in self.roberta.children():
            for weight in block.parameters():
                weight.requires_grad = trainable

    def freeze(self):
        """Disable gradient updates for the encoder."""
        self._set_backbone_trainable(False)

    def unfreeze(self):
        """Re-enable gradient updates for the encoder."""
        self._set_backbone_trainable(True)

    def forward(self, ids, mask, loss_fn = None, targets = None):
        """Return the regression output for a batch; `loss_fn` and `targets` are unused."""
        if self.model_type == "mean":
            token_states = self.roberta(ids, mask)[0]
            # Broadcast the mask over the hidden dimension so masked tokens add nothing.
            keep = mask.unsqueeze(-1).expand(token_states.size()).float()
            masked_total = torch.sum(token_states * keep, 1)
            # Clamp keeps the division finite when a mask is entirely zero.
            kept_count = torch.clamp(keep.sum(1), min=1e-9)
            pooled = self.layer_norm1(masked_total / kept_count)
            projected = self.linear1(pooled)
            logits = self.linear2(self.layer_norm2(projected))

        elif self.model_type=="attention":
            encoded = self.roberta(input_ids=ids, attention_mask=mask)
            token_states = encoded.last_hidden_state
            token_weights = self.attention(token_states)
            logits = self.linear(torch.sum(token_weights * token_states, dim=1))

        return logits

In [None]:
# ELECTRA-large with mean pooling, path : Models/CodeElectraLargeBaseline/model{fold}.bin
class ElectraLarge(nn.Module):
    """ELECTRA-large discriminator encoder with a pooled regression head.

    Args:
        model_type: "mean" (default) averages the token states over the
            attention mask and applies LayerNorm -> Linear(1024, 768) ->
            LayerNorm -> Linear(768, 1). "attention" pools the token states
            with learned softmax weights and applies a single Linear(1024, 1).
    """

    def __init__(self, model_type="mean"):
        super(ElectraLarge,self).__init__()

        self.model_type = model_type

        self.config = ElectraConfig.from_pretrained(ELECTRA_PATH)
        self.config.update({
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7
                       })

        self.electra = ElectraModel.from_pretrained(ELECTRA_PATH, config=self.config)

        if model_type == "attention":

            # Scores each token; softmax over dim=1 normalises along the sequence.
            self.attention = nn.Sequential(
            nn.Linear(1024, 256),
            nn.Tanh(),
            nn.Linear(256, 1),
            nn.Softmax(dim=1)
            )

            self.linear = (nn.Linear(1024, 1))

        elif model_type == "mean":

            self.layer_norm1 = nn.LayerNorm(1024)
            self.linear1 = nn.Linear(1024, 768)
            self.linear2 = nn.Linear(768, 1)
            self.layer_norm2 = nn.LayerNorm(768)

    def forward(self, ids, mask, token_type_ids, loss_fn = None, targets = None):
        """Return one regression value per example.

        Args:
            ids: Token id tensor passed to the encoder as input_ids.
            mask: Attention mask tensor matching `ids`.
            token_type_ids: Segment id tensor matching `ids`.
            loss_fn: Unused; kept for call compatibility.
            targets: Unused; kept for call compatibility.

        Raises:
            ValueError: If `model_type` is neither "mean" nor "attention".
        """
        if self.model_type not in ("mean", "attention"):
            raise ValueError(f"Unknown model_type: {self.model_type!r}")

        outputs = self.electra(input_ids=ids, attention_mask=mask, token_type_ids = token_type_ids)
        last_hidden_state = outputs.last_hidden_state

        if self.model_type == "mean":

            # Zero out masked positions, then divide by the count of kept tokens.
            input_mask_expanded = mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
            sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
            sum_mask = input_mask_expanded.sum(1)
            # Guards against division by zero for an all-zero mask.
            sum_mask = torch.clamp(sum_mask, min=1e-9)
            mean_embeddings = sum_embeddings / sum_mask
            norm_mean_embeddings = self.layer_norm1(mean_embeddings)
            logits = self.linear1(norm_mean_embeddings)
            logits = self.linear2(self.layer_norm2(logits))

        else:

            # The constructor builds `attention` and `linear` for this mode, but
            # forward used to have no matching branch and failed on an unbound
            # `logits`. This mirrors the attention branch of RobertaLargeMean.
            weights = self.attention(last_hidden_state)
            context_vector = torch.sum(weights * last_hidden_state, dim=1)
            logits = self.linear(context_vector)

        return logits

# Create Predictions

In [None]:
def inf_fn(data_loader_electra,data_loader_roberta, model, device):
    """Run `model` over its matching loader and return flat predictions.

    Models whose class name contains "Electra" read `data_loader_electra` and
    also receive `token_type_ids`; every other model reads `data_loader_roberta`.

    Args:
        data_loader_electra: Iterable of batches with "ids", "mask" and
            "token_type_ids" tensors.
        data_loader_roberta: Iterable of batches with "ids" and "mask" tensors.
        model: Module to evaluate; it is switched to eval mode.
        device: Device the batch tensors are moved to.

    Returns:
        A 1-D NumPy array holding every model output in loader order. The
        array is 1-D for any number of batches and empty when the loader
        yields nothing.
    """
    model.eval()

    is_electra = "Electra" in model.__class__.__name__
    data_loader = data_loader_electra if is_electra else data_loader_roberta

    batch_preds = []
    with torch.no_grad():
        for batch in data_loader:
            inputs = {"ids": batch["ids"].to(device, dtype=torch.long),
                      "mask": batch["mask"].to(device, dtype=torch.long)}
            if is_electra:
                inputs["token_type_ids"] = batch["token_type_ids"].to(device, dtype=torch.long)

            outputs = model(**inputs)
            # Flatten each batch so a single batch yields the same shape as several.
            batch_preds.append(outputs.cpu().detach().numpy().ravel())

    if not batch_preds:
        return np.empty(0, dtype=np.float32)

    # One concatenation at the end instead of re-copying the array per batch.
    return np.concatenate(batch_preds)

# Useful Functions

In [None]:
# create pytorch dataloader
def create_dataloader(fold):
    """Build the validation loaders and result frame for one fold.

    Reads "train_folds.csv", keeps the rows whose `kfold` equals `fold`, and
    wraps them in one ELECTRA and one RoBERTa DataLoader of TEST_BATCH_SIZE.

    Args:
        fold: Fold id to select from the `kfold` column.

    Returns:
        Tuple `(electra_loader, roberta_loader, frame)`, where `frame` holds
        the `predictions` (initialised to 0) and `target` columns of the fold.
    """
    df = pd.read_csv("train_folds.csv")
    valid = df[df.kfold==fold].reset_index(drop=True)
    valid["predictions"] = 0
    valid_electra_dataset = ElectraDataset(valid)
    valid_electra_dataloader = torch.utils.data.DataLoader(valid_electra_dataset, batch_size= TEST_BATCH_SIZE)
    valid_roberta_dataset = RobertaDataset(valid)
    valid_roberta_dataloader = torch.utils.data.DataLoader(valid_roberta_dataset, batch_size= TEST_BATCH_SIZE)

    # Callers add one column per model to this frame; returning an explicit copy
    # keeps those writes off a slice of `valid`.
    return valid_electra_dataloader, valid_roberta_dataloader, valid[["predictions", "target"]].copy()

In [None]:
def create_model(device, name, path):
    """Instantiate a model by name and load its saved weights.

    Args:
        device: Device the model is moved to and the checkpoint is mapped onto.
        name: One of "RobertaLargeAttention", "RobertaLargeMean",
            "ElectraLarge" or "RobertaBaseAttention".
        path: Path of the state-dict file to load.

    Returns:
        The model on `device` with the checkpoint weights loaded.

    Raises:
        ValueError: If `name` is not a known model (a subclass of the
            `Exception` raised previously, so existing handlers still match).
    """
    # Resolve the class before building anything, so an unknown name fails fast.
    if name == "RobertaLargeAttention":
        model_cls = RobertaLargeAttention
    elif name == "RobertaLargeMean":
        model_cls = RobertaLargeMean
    elif name == "ElectraLarge":
        model_cls = ElectraLarge
    elif name == "RobertaBaseAttention":
        model_cls = RobertaBaseAttention
    else:
        raise ValueError(f"Unknown model: {name}")

    model = model_cls().to(device)
    # map_location lets a checkpoint saved on one device load on another
    # (for example GPU-trained weights on a CPU-only runtime).
    model.load_state_dict(torch.load(path, map_location=device))

    return model

# Prepare Models

In [None]:
def prepare_models(folds=range(3, 5), n_folds=5):
    """Load the four model types for each requested fold.

    Args:
        folds: Fold ids whose checkpoints are loaded (defaults to folds 3-4).
        n_folds: Length of the returned lists; slots of folds that are not
            loaded stay `None`.

    Returns:
        Tuple `(electra, robertalargemean, robertalargeattention, robertabase,
        device)`; each of the first four is a list indexed by fold id.
    """
    # Use the GPU when there is one, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Explicit None placeholders; the previous `[_ for i in range(5)]` filled
    # the lists with whatever IPython's `_` last-output variable held.
    electra = [None] * n_folds
    robertalargemean = [None] * n_folds
    robertalargeattention = [None] * n_folds
    robertabase = [None] * n_folds

    for fold in folds:
        print(f"Creating models for fold {fold} ...")
        electra[fold] = create_model(device, "ElectraLarge", f"Models/CodeElectraLargeBaseline/model{fold}.bin")
        robertalargemean[fold] = create_model(device, "RobertaLargeMean", f"Models/CodeRobertaLargeMean/model{fold}.bin")
        robertalargeattention[fold] = create_model(device, "RobertaLargeAttention", f"Models/CodeRobertaLargeAttentionNorm2/model{fold}.bin")
        robertabase[fold] = create_model(device, "RobertaBaseAttention", f"Models/CodeRobertaBaseAttentionNorm/model{fold}.bin")

    return electra, robertalargemean, robertalargeattention, robertabase, device

# Get all our predictions

In [None]:
def create_predictions(models, fold, device=None):
    """Collect the validation predictions of every model for one fold.

    Args:
        models: Iterable of models to evaluate on the fold.
        fold: Fold id passed to `create_dataloader`.
        device: Device for the input batches. When omitted, each model's own
            parameter device is used.

    Returns:
        The fold's result frame with one extra column per model, named after
        the model's class.
    """
    dataloader_electra, dataloader_roberta, valid_df = create_dataloader(fold)

    # Iterate the `models` argument; the loop used to read the global `mymodels`
    # and the global `device`, so the parameter was silently ignored.
    for model in models:
        model_device = device if device is not None else next(model.parameters()).device
        predictions = inf_fn(dataloader_electra, dataloader_roberta, model, model_device)
        valid_df[str(model.__class__.__name__)] = predictions

    return valid_df

# Run

In [None]:
# Create all the models for the different folds and for each model types 
electra, robertalargemean, robertalargeattention, robertabase, device = prepare_models()

Creating models for fold 3 ...


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_

Creating models for fold 4 ...


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_

In [None]:
device = torch.device("cuda")
# One slot per fold; folds that are not evaluated stay None. The previous
# `[ _ for i in range(5)]` depended on IPython's `_` last-output variable.
valids = [None] * 5
for fold in range(3,5):

  # Models evaluated on this fold, in the order their columns are added.
  mymodels = [electra[fold], robertalargemean[fold], robertalargeattention[fold], robertabase[fold]]
  valids[fold] = create_predictions(mymodels, fold)

In [None]:
# Stack the out-of-fold predictions of each evaluated fold exactly once.
# Starting from valids[3] and then concatenating folds 3-4 again put fold 3
# into the frame twice and skewed the scores below.
dataset = pd.concat([valids[fold] for fold in range(3,5)])

# RMSE per model. np.sqrt over the MSE avoids the `squared=False` keyword,
# which newer scikit-learn releases no longer accept.
for model_name in ("ElectraLarge", "RobertaLargeMean", "RobertaLargeAttention", "RobertaBaseAttention"):
  rmse = np.sqrt(mean_squared_error(dataset["target"], dataset[model_name]))
  print(f"{model_name} : {rmse}")

dataset.to_csv("tuning2.csv")

ElectraLarge : 0.4897325125403068
RobertaLargeMean : 0.47503992885684787
RobertaLargeAttention : 0.4810795474955003
RobertaBaseAttention : 0.48282229526676057


# Import the dataset

In [None]:
# Merge the two prediction dumps into one tuning set.
df = pd.read_csv("tuning1.csv")
df2 = pd.read_csv("tuning2.csv")

df = pd.concat([df,df2])

# Shape before and after removing rows that appear in both files.
print(df.shape)
df = df.drop_duplicates()
print(df.shape)

print("\n")

# Score the merged frame. These lines used to read `dataset`, the frame left
# over from the previous cell, so they repeated its numbers instead.
for model_name in ("ElectraLarge", "RobertaLargeMean", "RobertaLargeAttention", "RobertaBaseAttention"):
  rmse = np.sqrt(mean_squared_error(df["target"], df[model_name]))
  print(f"{model_name} : {rmse}")

df.to_csv("tuning.csv")

(3968, 7)
(2834, 7)


ElectraLarge : 0.4897325125403068
RobertaLargeMean : 0.47503992885684787
RobertaLargeAttention : 0.4810795474955003
RobertaBaseAttention : 0.48282229526676057


# Optuna Tuning Function

In [None]:
pip install optuna

Collecting optuna
  Downloading optuna-2.8.0-py3-none-any.whl (301 kB)
[K     |████████████████████████████████| 301 kB 15.3 MB/s 
[?25hCollecting cliff
  Downloading cliff-3.8.0-py3-none-any.whl (80 kB)
[K     |████████████████████████████████| 80 kB 7.0 MB/s 
Collecting colorlog
  Downloading colorlog-5.0.1-py2.py3-none-any.whl (10 kB)
Collecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting alembic
  Downloading alembic-1.6.5-py2.py3-none-any.whl (164 kB)
[K     |████████████████████████████████| 164 kB 23.2 MB/s 
Collecting Mako
  Downloading Mako-1.1.4-py2.py3-none-any.whl (75 kB)
[K     |████████████████████████████████| 75 kB 3.8 MB/s 
[?25hCollecting python-editor>=0.3
  Downloading python_editor-1.0.4-py3-none-any.whl (4.9 kB)
Collecting stevedore>=2.0.1
  Downloading stevedore-3.3.0-py3-none-any.whl (49 kB)
[K     |████████████████████████████████| 49 kB 5.2 MB/s 
[?25hCollecting pbr!=2.1.0,>=2.0.0
  Downloading pbr-5.6.0-py2.py3-none-any.

In [None]:
import optuna
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

In [None]:
%cd drive/MyDrive/CommonLit
df = pd.read_csv("tuning.csv")

/content/drive/MyDrive/CommonLit


In [None]:
def objective(trial,df = df):
    """Optuna objective: RMSE of a weighted sum of the four model predictions.

    Args:
        trial: Optuna trial used to sample the weights `a`, `b`, `c`, `d`,
            each in [0, 1].
        df: Frame with a `target` column and one prediction column per model.
            It is only read; the blend is no longer written back into it.

    Returns:
        Root mean squared error between `df.target` and the blended prediction.
    """
    r_min = 0
    r_max = 1

    # Trial parameter name -> prediction column it weights.
    weighted_columns = {
        "a": "ElectraLarge",
        "b": "RobertaLargeMean",
        "c": "RobertaLargeAttention",
        "d": "RobertaBaseAttention",
    }

    blended = 0
    for weight_name, column in weighted_columns.items():
        # suggest_float replaces the deprecated suggest_uniform.
        weight = trial.suggest_float(weight_name, r_min, r_max)
        blended = blended + weight * df[column]

    # Square root of the MSE avoids the `squared=False` keyword, which newer
    # scikit-learn releases no longer accept.
    return float(np.sqrt(mean_squared_error(df.target, blended)))

# Optuna Blending

In [None]:
# Search for the blend weights that minimise the RMSE returned by `objective`.
# The log below shows the study is created in memory, so nothing is persisted.
study = optuna.create_study(direction='minimize')
# NOTE(review): no sampler seed is passed, so presumably the best weights vary
# between runs — confirm whether that matters for the submission.
study.optimize(objective, n_trials=100)
# Report the trial count plus the best weights and their score.
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best Score:', study.best_value)

[32m[I 2021-08-01 16:49:27,782][0m A new study created in memory with name: no-name-17fc7957-9239-4909-8937-c67c0c1646eb[0m
[32m[I 2021-08-01 16:49:27,790][0m Trial 0 finished with value: 2.9014326648196715 and parameters: {'a': 0.9850085718612486, 'b': 0.7935180485128943, 'c': 0.9437979298665712, 'd': 0.5062935762571645}. Best is trial 0 with value: 2.9014326648196715.[0m
[32m[I 2021-08-01 16:49:27,795][0m Trial 1 finished with value: 1.854555386263828 and parameters: {'a': 0.6336315690622176, 'b': 0.1512905136892917, 'c': 0.8300966125877, 'd': 0.7820130282678791}. Best is trial 1 with value: 1.854555386263828.[0m
[32m[I 2021-08-01 16:49:27,803][0m Trial 2 finished with value: 0.4750958480624098 and parameters: {'a': 0.03573373223063214, 'b': 0.11323692497135895, 'c': 0.07960684273683805, 'd': 0.7796845201859508}. Best is trial 2 with value: 0.4750958480624098.[0m
[32m[I 2021-08-01 16:49:27,808][0m Trial 3 finished with value: 0.6718657388614296 and parameters: {'a': 0.2

Number of finished trials: 100
Best trial: {'a': 0.1654968128943937, 'b': 0.18849169201290306, 'c': 0.22798289283669515, 'd': 0.450639158367492}
Best Score: 0.4676331389980447


In [None]:
# Best trial: {'a': 0.1654968128943937, 'b': 0.18849169201290306, 'c': 0.22798289283669515, 'd': 0.450639158367492}
# Best Score: 0.4676331389980447 LB ?

In [None]:
# ElectraLarge : 0.4897325125403068
# RobertaLargeMean : 0.47503992885684787
# RobertaLargeAttention : 0.4810795474955003
# RobertaBaseAttention : 0.48282229526676057