In [None]:
!pip install transformers datasets sentencepiece optuna ray opendatasets iterative-stratification wandb
!nvidia-smi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 24.5 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 91.0 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 84.5 MB/s 
[?25hCollecting optuna
  Downloading optuna-3.0.3-py3-none-any.whl (348 kB)
[K     |████████████████████████████████| 348 kB 100.1 MB/s 
[?25hCollecting ray
  Downloading ray-2.1.0-cp37-cp37m-manylinux2014_x86_64.whl (59.1 MB)
[K     |████████████████████████████████| 59.1 MB 104.1 MB/s 
[?25hCollecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Collecting iterative-stratification
  

In [None]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DataCollatorWithPadding
%env TOKENIZERS_PARALLELISM=false
# Presumably silences the advisory warnings emitted by `transformers` — confirm against the library docs.
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.13.2
transformers.__version__: 4.24.0
env: TOKENIZERS_PARALLELISM=false


# CFG

In [None]:
# Mount Google Drive so that the /content/drive/... paths used throughout this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Names of the six regression targets; prediction columns use the same names prefixed with "pred_".
target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
# Other OOF pickles that can be scored with the check below (kept for reference):
#/content/drive/MyDrive/Kaggle Training Results/English Language Learning/trained/robertabasemeanpooling/oof_df.pkl
#/content/drive/MyDrive/Kaggle Training Results/English Language Learning/trained/robertabaseattentionpooling/oof_df.pkl

def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    Computed with NumPy directly instead of
    ``sklearn.metrics.mean_squared_error(..., squared=False)``: the ``squared``
    argument is deprecated in recent scikit-learn releases and later removed,
    so the original call stops working after an upgrade.

    Args:
        y_trues: 2-D array-like of shape (n_samples, n_targets) with ground truth.
        y_preds: 2-D array-like of the same shape with predictions.

    Returns:
        Tuple ``(mcrmse_score, scores)`` where ``scores`` is a list holding the
        RMSE of each target column and ``mcrmse_score`` is their mean.

    Raises:
        ValueError: if the inputs are not 2-D, differ in shape, are empty, or
            contain NaN/inf (scikit-learn rejected such inputs as well).
    """
    y_trues = np.asarray(y_trues, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)
    if y_trues.ndim != 2 or y_trues.shape != y_preds.shape:
        raise ValueError(
            f"expected two 2-D arrays of identical shape, got {y_trues.shape} and {y_preds.shape}"
        )
    if y_trues.shape[0] == 0:
        raise ValueError("cannot score an empty set of samples")
    if not (np.isfinite(y_trues).all() and np.isfinite(y_preds).all()):
        raise ValueError("inputs contain NaN or infinite values")

    # One RMSE per target column (axis 0 runs over samples).
    scores = np.sqrt(np.mean((y_trues - y_preds) ** 2, axis=0)).tolist()
    mcrmse_score = float(np.mean(scores))
    return mcrmse_score, scores

def get_score(y_trues, y_preds):
    """Score predictions with MCRMSE.

    Thin alias over ``MCRMSE``; returns its ``(mean score, per-column scores)``
    tuple unchanged.
    """
    return MCRMSE(y_trues, y_preds)

# Sanity check: score the out-of-fold predictions of a single model.
# NOTE: read_pickle executes pickle under the hood — only point it at files you trust.
oof_path = "/content/drive/MyDrive/Kaggle Training Results/English Language Learning/trained/robertabaseattentionpooling/oof_df.pkl"
oof_df = pd.read_pickle(oof_path)
pred_cols = ["pred_" + col for col in target_cols]
cv = get_score(oof_df[target_cols].values, oof_df[pred_cols].values)
print(cv)

(0.45207626539736295, [0.48569662419065474, 0.44481766183957777, 0.412822196189108, 0.4525196407436513, 0.4766498200374196, 0.4399516493837667])


In [None]:
# Root folder on the mounted Drive; each CFG class below appends its own model sub-folder to it.
base = "/content/drive/MyDrive/Kaggle Training Results/English Language Learning/trained/"

In [None]:
class _CommonCFG:
    """Settings shared by every ensemble member."""
    num_workers = 4
    gradient_checkpointing = False
    # One list object shared by all configs through inheritance — treat it as read-only.
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed = 42
    n_fold = 4
    trn_fold = list(range(n_fold))


class _Model1CFG(_CommonCFG):
    """Extra settings read by Model1 (head initialisation and multi-sample dropout)."""
    init_weight = "normal"  # xavier_uniform, xavier_normal, kaiming_uniform, kaiming_normal, orthogonal, normal
    msd_num = 8
    msd = False


class _Model2CFG(_CommonCFG):
    """Extra settings read by Model2."""
    layer_start = 4


# Each concrete config only states what differs between models: where the
# artefacts live, which backbone they belong to, batch size and pooling.
# `tokenizer` is loaded eagerly when the class body executes.

class CFG1(_Model1CFG):
    path = base + "debertabasev3meanpooling/"
    config_path = path + "config.pth"
    model = "microsoft/deberta-v3-base"
    tokenizer = AutoTokenizer.from_pretrained(path + "tokenizer")
    batch_size = 48
    pooling = "mean"  # mean, attention, cls, concat


class CFG2(_Model1CFG):
    path = base + "debertabasev3attentionpooling/"
    config_path = path + "config.pth"
    model = "microsoft/deberta-v3-base"
    tokenizer = AutoTokenizer.from_pretrained(path + "tokenizer")
    batch_size = 48
    pooling = "attention"  # mean, attention, cls, concat


class CFG3(_Model1CFG):
    path = base + "debertabasev3clspooling/"
    config_path = path + "config.pth"
    model = "microsoft/deberta-v3-base"
    tokenizer = AutoTokenizer.from_pretrained(path + "tokenizer")
    batch_size = 48
    pooling = "cls"  # mean, attention, cls, concat


class CFG4(_Model2CFG):
    path = base + "debertabasev3attentionpoolingfgm/"
    config_path = path + 'config/config.json'
    model = "microsoft/deberta-v3-base"
    tokenizer = AutoTokenizer.from_pretrained(path + 'tokenizer')
    batch_size = 48
    pooling = 'attention'


class CFG5(_Model2CFG):
    path = base + "debertabasev3meanpoolingfgm/"
    config_path = path + 'config/config.json'
    model = "microsoft/deberta-v3-base"
    tokenizer = AutoTokenizer.from_pretrained(path + 'tokenizer')
    batch_size = 16
    pooling = 'mean'


class CFG6(_Model1CFG):
    path = base + "debertalargev3meanpooling/"
    config_path = path + "config.pth"
    model = "microsoft/deberta-v3-large"
    tokenizer = AutoTokenizer.from_pretrained(path + "tokenizer")
    batch_size = 32
    pooling = "mean"  # mean, attention, cls, concat


class CFG7(_Model1CFG):
    path = base + "debertalargev3attentionpooling/"
    config_path = path + "config.pth"
    model = "microsoft/deberta-v3-large"
    tokenizer = AutoTokenizer.from_pretrained(path + "tokenizer")
    batch_size = 32
    pooling = "attention"  # mean, attention, cls, concat


class CFG8(_Model1CFG):
    path = base + "debertalargev3clspooling/"
    config_path = path + "config.pth"
    model = "microsoft/deberta-v3-large"
    tokenizer = AutoTokenizer.from_pretrained(path + "tokenizer")
    batch_size = 32
    pooling = "cls"  # mean, attention, cls, concat


# Configs whose checkpoints are loaded into Model1 / Model2 respectively.
CFG_list1 = [CFG1, CFG2, CFG3, CFG6, CFG7, CFG8]
CFG_list2 = [CFG5, CFG4]
# The order matters: the ensemble weights are applied positionally to this
# list, and the inference loop distinguishes the two model families by it.
CFG_list = CFG_list1 + CFG_list2

# Utils

In [None]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    Computed with NumPy directly instead of
    ``sklearn.metrics.mean_squared_error(..., squared=False)``: the ``squared``
    argument is deprecated in recent scikit-learn releases and later removed,
    so the original call stops working after an upgrade.

    Args:
        y_trues: 2-D array-like of shape (n_samples, n_targets) with ground truth.
        y_preds: 2-D array-like of the same shape with predictions.

    Returns:
        Tuple ``(mcrmse_score, scores)`` where ``scores`` is a list holding the
        RMSE of each target column and ``mcrmse_score`` is their mean.

    Raises:
        ValueError: if the inputs are not 2-D, differ in shape, are empty, or
            contain NaN/inf (scikit-learn rejected such inputs as well).
    """
    y_trues = np.asarray(y_trues, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)
    if y_trues.ndim != 2 or y_trues.shape != y_preds.shape:
        raise ValueError(
            f"expected two 2-D arrays of identical shape, got {y_trues.shape} and {y_preds.shape}"
        )
    if y_trues.shape[0] == 0:
        raise ValueError("cannot score an empty set of samples")
    if not (np.isfinite(y_trues).all() and np.isfinite(y_preds).all()):
        raise ValueError("inputs contain NaN or infinite values")

    # One RMSE per target column (axis 0 runs over samples).
    scores = np.sqrt(np.mean((y_trues - y_preds) ** 2, axis=0)).tolist()
    mcrmse_score = float(np.mean(scores))
    return mcrmse_score, scores

def get_score(y_trues, y_preds):
    """Score predictions with MCRMSE.

    Thin alias over ``MCRMSE``; returns its ``(mean score, per-column scores)``
    tuple unchanged.
    """
    return MCRMSE(y_trues, y_preds)

def get_logger(filename='inference'):
    """Return this module's logger, writing bare messages to the console and to ``<filename>.log``.

    ``getLogger(__name__)`` hands back the same logger object on every call, so
    handlers left over from a previous call (e.g. when the notebook cell is
    re-run) are removed and closed first. Without that, every message would be
    emitted once per call made so far and the old log files would stay open.

    Args:
        filename: path of the log file without the ``.log`` extension.

    Returns:
        The configured ``logging.Logger`` at INFO level with exactly two handlers.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # Drop handlers installed by earlier calls so output is not duplicated.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# Notebook-wide logger with the default file stem; Model2 logs its backbone config through it.
LOGGER = get_logger()

def get_oof(CFG, train_path='/content/drive/MyDrive/Kaggle Training Results/English Language Learning/data/train.csv'):
    """Load a model's out-of-fold frame, re-ordered to follow the rows of the train CSV.

    The train CSV only contributes ``text_id`` and its row order: its target
    columns and ``full_text`` are dropped before the left merge, so targets and
    predictions in the result come from the OOF pickle.

    Args:
        CFG: config object exposing ``path`` (folder containing ``oof_df.pkl``,
            with trailing separator) and ``target_cols``.
        train_path: location of the train CSV; defaults to the Drive path the
            notebook has always used.

    Returns:
        DataFrame with one row per train row, left-merged with the OOF frame on ``text_id``.
    """
    # NOTE: read_pickle runs pickle under the hood — only load files you trust.
    oof = pd.read_pickle(CFG.path+'oof_df.pkl')
    # Use the target columns of the config being loaded rather than reaching for the global CFG1.
    train = pd.read_csv(train_path).drop(columns=list(CFG.target_cols) + ["full_text"])
    merged = pd.merge(train, oof, how="left", on="text_id")
    return merged

def seed_everything(seed=42):
    """Seed Python, NumPy and torch (CPU and current CUDA device) and request deterministic cuDNN.

    Also exports ``PYTHONHASHSEED`` with the same value.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed every RNG once, before any model or data loader is created.
seed_everything(seed=42)

# OOF

In [None]:
# ====================================================
# oof
# ====================================================
# Ensemble weights are applied positionally: weights[i] belongs to CFG_list[i].
weights = np.load("/content/drive/MyDrive/Kaggle Training Results/English Language Learning/trained/ensemble_weights.npy")
if len(weights) != len(CFG_list):
    raise ValueError(
        f"ensemble_weights.npy holds {len(weights)} weights but CFG_list has {len(CFG_list)} models"
    )

oof_preds = []
for CFG in CFG_list:
    oof = get_oof(CFG)
    # Prediction columns are the target names prefixed with "pred_".
    target = [f"pred_{c}" for c in CFG.target_cols]
    oof_preds.append(oof[target].values)
    # Every OOF frame is aligned to the train CSV by get_oof, so the labels of
    # the last frame are used to score the blend.
    labels = oof[CFG.target_cols].values

# Weighted blend across models (axis 0), clipped to the 1-5 range.
ensemble_preds = np.clip(np.average(oof_preds, weights=weights, axis=0), 1, 5)
score = get_score(labels, ensemble_preds)[0]
print(f"cv score: {score}")

cv score: 0.44169211414706844


# Dataset

In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    """Tokenize one text with ``cfg.tokenizer`` and convert every field to a ``torch.long`` tensor.

    Special tokens are added; no max length, truncation or padding is requested
    here (the DataLoader in this notebook pads each batch in its collate function).

    Returns:
        The tokenizer's output mapping, with each value replaced by a tensor.
    """
    encoded = cfg.tokenizer.encode_plus(text, add_special_tokens=True, return_tensors=None)
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)
    return encoded


class TestDataset(Dataset):
    """Dataset over the ``full_text`` column of a frame; items are tokenized on access."""

    def __init__(self, cfg, df):
        # cfg supplies the tokenizer used by prepare_input.
        self.cfg = cfg
        self.texts = df['full_text'].values

    def __len__(self):
        """Number of texts."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tokenized tensors for the text at position ``item``."""
        return prepare_input(self.cfg, self.texts[item])

# Model

In [None]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    """Average of the token embeddings, counting only positions where the mask is non-zero."""

    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the masked mean over dim 1 of ``last_hidden_state``."""
        # Broadcast the mask over the hidden dimension so it can weight every feature.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_sum = (last_hidden_state * mask).sum(1)
        # Clamp so a fully masked row divides by a tiny number instead of zero.
        token_count = mask.sum(1).clamp(min=1e-9)
        return token_sum / token_count

class MaxPooling(nn.Module):
    """Element-wise maximum over the tokens, with masked-out positions forced to -1e4."""

    def __init__(self):
        super(MaxPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the maximum over dim 1; the input tensor is left untouched."""
        is_padding = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float() == 0
        # masked_fill is out-of-place, so no explicit clone is needed.
        filled = last_hidden_state.masked_fill(is_padding, -1e4)
        return filled.max(dim=1).values
    
class MinPooling(nn.Module):
    """Element-wise minimum over the tokens, ignoring masked-out positions.

    Masked positions are overwritten with a large positive value (+1e4, the
    mirror image of MaxPooling's -1e4) so they can never win the minimum. The
    previous fill value of 1e-4 was a small number, not a large one: whenever
    all real activations of a feature were above 1e-4 the padding was
    returned instead of the true minimum.

    NOTE: a checkpoint trained with the old fill value saw different pooled
    features; re-validate such a model before relying on it.
    """

    def __init__(self):
        super(MinPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the minimum over dim 1; the input tensor is left untouched."""
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = 1e4
        min_embeddings, _ = torch.min(embeddings, dim = 1)
        return min_embeddings

#Attention pooling
class AttentionPooling(nn.Module):
    """Weighted sum of token embeddings with weights produced by a small scoring MLP.

    The scorer (Linear -> LayerNorm -> GELU -> Linear to one logit) rates each
    token; logits at masked positions are set to -inf before the softmax over
    dim 1, so those tokens get zero weight.
    """

    def __init__(self, in_dim):
        super().__init__()
        scorer_layers = [
            nn.Linear(in_dim, in_dim),
            nn.LayerNorm(in_dim),
            nn.GELU(),
            nn.Linear(in_dim, 1),
        ]
        self.attention = nn.Sequential(*scorer_layers)

    def forward(self, last_hidden_state, attention_mask):
        """Return the attention-weighted sum over dim 1 of ``last_hidden_state``."""
        scores = self.attention(last_hidden_state).float()
        scores = scores.masked_fill((attention_mask == 0).unsqueeze(-1), float('-inf'))
        token_weights = torch.softmax(scores, 1)
        return (token_weights * last_hidden_state).sum(dim=1)

# Author's caveat: this pooling was reported not to work well and may contain a bug.
class WeightedLayerPooling(nn.Module):
    """Weighted average of the hidden states of layers ``layer_start`` and above.

    ``forward`` takes the sequence of per-layer hidden states (not a single
    tensor plus mask) and averages across layers only: the result keeps the
    shape of one layer's hidden state, so nothing is reduced over tokens.

    NOTE(review): Model2.feature in this notebook calls its pool as
    ``pool(last_hidden_states, attention_mask)``, which does not match this
    ``forward`` signature — confirm how this class is meant to be wired in.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super(WeightedLayerPooling, self).__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            # One learnable weight per pooled layer, initialised to 1; the +1
            # accounts for an extra entry ahead of the transformer layers.
            n_pooled = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.ones(n_pooled, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, ft_all_layers):
        """Return the weight-normalised sum of the selected layers."""
        stacked = torch.stack(ft_all_layers)[self.layer_start:, :, :, :]
        per_layer = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(stacked.size())
        weighted_sum = (per_layer * stacked).sum(dim=0)
        return weighted_sum / self.layer_weights.sum()

# Model wrappers (backbone + pooling + regression head)

In [None]:
class Model1(nn.Module):
    """Transformer backbone + selectable pooling + linear head with 6 outputs.

    ``cfg.pooling`` picks the feature fed to the head: ``"mean"``,
    ``"attention"``, ``"cls"``; any other value selects the concatenation of
    all three projected back to ``hidden_size``. With ``cfg.msd`` enabled each
    pooled feature is averaged over ``cfg.msd_num`` passes through a p=0.5
    dropout (multi-sample dropout).

    Changes relative to the previous revision:
      * weight initialisation reads ``self.cfg.init_weight`` instead of a
        global named ``CFG`` that merely happened to exist;
      * in the multi-sample-dropout path the attention feature is computed
        with the attention pooler (it used the mean pooler by mistake).
        NOTE: a checkpoint trained with ``msd=True`` on the old code
        effectively learned mean pooling on that path — re-validate it;
      * only the feature(s) needed for the configured pooling are computed.

    Args:
        cfg: config object (``model``, ``gradient_checkpointing``, ``pooling``,
            ``msd``, ``msd_num``, ``init_weight``).
        config_path: optional path to a backbone config saved with ``torch.save``;
            when None the config is fetched for ``cfg.model`` with dropout disabled.
        pretrained: load pretrained backbone weights instead of building the
            architecture from the config alone.
    """

    # init_weight name -> in-place initialiser; "normal" is handled separately
    # because its std comes from the backbone config.
    _WEIGHT_INITIALIZERS = {
        'xavier_uniform': nn.init.xavier_uniform_,
        'xavier_normal': nn.init.xavier_normal_,
        'kaiming_uniform': nn.init.kaiming_uniform_,
        'kaiming_normal': nn.init.kaiming_normal_,
        'orthogonal': nn.init.orthogonal_,
    }

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
        else:
            # NOTE: torch.load unpickles the file — only load configs you trust.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Attribute names below are part of the checkpoint (state_dict keys) — do not rename.
        self.pool = MeanPooling()
        self.attention = AttentionPooling(self.config.hidden_size)
        self.dropout = nn.Dropout(p=0.2)
        self.high_dropout = nn.Dropout(p=0.5)
        self.concat_pool = nn.Linear(self.config.hidden_size*3, self.config.hidden_size)
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)
        self._init_weights(self.concat_pool)

    def _init_weight_tensor(self, weight):
        """Initialise one weight parameter according to ``self.cfg.init_weight``."""
        scheme = self.cfg.init_weight
        if scheme == 'normal':
            weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif scheme in self._WEIGHT_INITIALIZERS:
            weight.data = self._WEIGHT_INITIALIZERS[scheme](weight.data)
        # Any other value keeps the layer's existing initialisation, as before.

    def _init_weights(self, module):
        """Initialise a Linear, Embedding or LayerNorm module in place."""
        if isinstance(module, nn.Linear):
            self._init_weight_tensor(module.weight)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            self._init_weight_tensor(module.weight)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def _pooled(self, kind, hidden_states, attention_mask):
        """Pool ``hidden_states`` with one of the three basic strategies."""
        if kind == "mean":
            return self.pool(hidden_states, attention_mask)
        if kind == "attention":
            return self.attention(hidden_states, attention_mask)
        # "cls": representation of the first token only
        return hidden_states[:, 0, :]

    def _single_feature(self, kind, hidden_states, attention_mask):
        """Return one pooled feature, averaged over dropout samples when msd is on."""
        if self.cfg.msd:
            samples = [
                self._pooled(kind, self.high_dropout(hidden_states), attention_mask)
                for _ in range(self.cfg.msd_num)
            ]
            return torch.mean(torch.stack(samples, dim=0), dim=0)
        return self._pooled(kind, hidden_states, attention_mask)

    def feature(self, inputs):
        """Run the backbone and return the sentence feature selected by ``cfg.pooling``."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs.last_hidden_state
        attention_mask = inputs['attention_mask']

        if self.cfg.pooling in ("mean", "attention", "cls"):
            return self._single_feature(self.cfg.pooling, last_hidden_states, attention_mask)

        # Any other pooling value: concatenate mean, attention and CLS features
        # and project them back to hidden_size.
        combine_feature = torch.cat(
            [self._single_feature(kind, last_hidden_states, attention_mask)
             for kind in ("mean", "attention", "cls")],
            dim=-1,
        )
        return self.concat_pool(combine_feature)

    def forward(self, inputs):
        """Return the head's output for a batch of tokenized inputs (6 values per sample)."""
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output
    
class Model2(nn.Module):
    """Transformer backbone + a single pooling layer + linear head with 6 outputs.

    The pooling layer is now chosen from the ``cfg`` passed to the constructor.
    It used to be read from a global named ``CFG``, which only worked because
    the inference loop happens to call its loop variable ``CFG``. An
    unsupported pooling name now raises immediately instead of leaving
    ``self.pool`` undefined until the first forward pass.

    NOTE(review): ``pooling='weightedlayer'`` can be constructed but
    ``feature`` calls the pool as ``pool(last_hidden_states, attention_mask)``,
    which does not match ``WeightedLayerPooling.forward(ft_all_layers)``. Left
    unchanged pending a decision on how the per-token output should be reduced.

    Args:
        cfg: config object (``model``, ``gradient_checkpointing``, ``pooling``
            and, for weighted-layer pooling, ``layer_start``).
        config_path: optional path passed to ``AutoConfig.from_pretrained``;
            when None the config is fetched for ``cfg.model`` with dropout disabled.
        pretrained: load pretrained backbone weights instead of building the
            architecture from the config alone.

    Raises:
        ValueError: if ``cfg.pooling`` is not one of the supported names.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = AutoConfig.from_pretrained(config_path, output_hidden_states=True)
            LOGGER.info(self.config)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # `pool` and `fc` are state_dict key prefixes of the saved checkpoints — do not rename.
        self.pool = self._build_pool(cfg)
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)

    def _build_pool(self, cfg):
        """Create the pooling module named by ``cfg.pooling``."""
        if cfg.pooling == 'mean':
            return MeanPooling()
        if cfg.pooling == 'max':
            return MaxPooling()
        if cfg.pooling == 'min':
            return MinPooling()
        if cfg.pooling == 'attention':
            return AttentionPooling(self.config.hidden_size)
        if cfg.pooling == 'weightedlayer':
            return WeightedLayerPooling(self.config.num_hidden_layers, layer_start = cfg.layer_start, layer_weights = None)
        raise ValueError(
            f"unsupported pooling {cfg.pooling!r}; expected one of "
            "'mean', 'max', 'min', 'attention', 'weightedlayer'"
        )

    def _init_weights(self, module):
        """Initialise a Linear, Embedding or LayerNorm module in place (normal, std from the config)."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and pool its first output with the configured pooling layer."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        """Return the head's output for a batch of tokenized inputs (6 values per sample)."""
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

# inference

In [None]:
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device):
    """Run ``model`` in eval mode over every batch of ``test_loader`` without gradients.

    Each batch is a mapping of tensors; its values are moved to ``device``
    before the forward pass.

    Returns:
        NumPy array with the per-batch outputs concatenated in loader order.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    progress = tqdm(test_loader, total=len(test_loader))
    for inputs in progress:
        on_device = {name: tensor.to(device) for name, tensor in inputs.items()}
        with torch.no_grad():
            y_preds = model(on_device)
        batch_outputs.append(y_preds.cpu().numpy())
    return np.concatenate(batch_outputs)

In [None]:
# Texts to pseudo-label. Read once: the file does not change between models,
# so re-reading it twice per config was wasted I/O.
PSEUDO_SOURCE_CSV = '/content/drive/MyDrive/Kaggle Training Results/English Language Learning/data/prev_train.csv'
pseudo_source = pd.read_csv(PSEUDO_SOURCE_CSV, index_col=0).drop_duplicates(subset=['text_id'])

final = []
for CFG in CFG_list:
    test = pseudo_source.copy()
    # sort by length to speed up inference
    test['tokenize_length'] = [len(CFG.tokenizer(text)['input_ids']) for text in test['full_text'].values]
    test = test.sort_values('tokenize_length', ascending=True).reset_index(drop=True)

    test_dataset = TestDataset(CFG, test)
    test_loader = DataLoader(test_dataset,
                             batch_size=CFG.batch_size,
                             shuffle=False,
                             collate_fn=DataCollatorWithPadding(tokenizer=CFG.tokenizer, padding='longest'),
                             num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # Pick the model family from the list the config belongs to, not from its
    # position in CFG_list (the old `_idx < 6` broke silently on reordering).
    uses_model1 = CFG in CFG_list1

    predictions = []
    for fold in CFG.trn_fold:
        if uses_model1:
            # Model1 checkpoints: 1-based fold number in the file name, bare state_dict.
            model = Model1(CFG, config_path=CFG.config_path, pretrained=False)
            state = torch.load(CFG.path+f"modelfold{fold + 1}normalllrdnomsdnormal.pth",
                           map_location=torch.device('cpu'))
            model.load_state_dict(state)
        else:
            # Model2 checkpoints: 0-based fold number, weights stored under the 'model' key.
            model = Model2(CFG, config_path=CFG.config_path, pretrained=False)
            state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                           map_location=torch.device('cpu'))
            model.load_state_dict(state['model'])
        prediction = inference_fn(test_loader, model, device)
        predictions.append(prediction)
        del model, state, prediction; gc.collect()
        torch.cuda.empty_cache()

    # Average the folds, then map the predictions from length-sorted order
    # back to the original row order via a left merge on text_id.
    predictions = np.mean(predictions, axis=0)
    test[CFG.target_cols] = predictions
    submission = pd.merge(pseudo_source[["text_id"]], test[["text_id"] + CFG.target_cols], how="left")
    final.append(submission[CFG.target_cols].values)
    del test, submission, predictions, test_dataset, test_loader; gc.collect()
    torch.cuda.empty_cache()

# Ensemble

In [None]:
# Blend the per-model predictions with the ensemble weights and clip to the 1-5 range.
ens = np.average(final, weights=weights, axis=0)
ens = np.clip(ens, 1, 5)
np.save("/content/drive/MyDrive/Kaggle Training Results/English Language Learning/trained/ens.npy", ens)
print(len(ens))

# Attach the blended scores to the de-duplicated source texts and export them as pseudo labels.
pseudo = pd.read_csv('/content/drive/MyDrive/Kaggle Training Results/English Language Learning/data/prev_train.csv', index_col=0)
pseudo = pseudo.drop_duplicates(subset=['text_id'])
pseudo[CFG1.target_cols] = ens
display(pseudo)
pseudo.to_csv('/content/drive/MyDrive/Kaggle Training Results/English Language Learning/trained/pseudo.csv', index=False)

15142


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,423A1CA112E2,Phones\n\nModern humans today are always on th...,3.056048,3.112964,3.345940,3.206383,3.084330,3.322059
1,A8445CABFECE,Phones & Driving\n\nDrivers should not be able...,3.710860,3.760511,3.786949,3.763001,3.782464,3.918429
2,6B4F7A0165B9,Cell Phone Operation While Driving\n\nThe abil...,4.053563,4.069672,4.481086,4.328255,4.303818,4.165030
3,E05C7F5C1156,People are debating whether if drivers should ...,4.110247,4.031701,4.162265,4.095508,3.993166,4.082003
4,50B3435E475B,Texting and driving\n\nOver half of drivers in...,3.845428,3.925613,4.090289,4.076712,4.128270,3.972077
...,...,...,...,...,...,...,...,...
15137,0814426B27DF,Most people ask more than one person for advic...,3.305353,3.320829,3.459901,3.451811,3.593226,3.524821
15138,8F4B595CF9E7,Do you ever want more opinions and options whe...,3.959244,3.955805,4.054248,4.072804,4.118625,4.013373
15139,6B5809C83978,Has anyone ever gave you advice? Was the advic...,4.187916,4.175071,4.120436,4.159891,4.297247,4.299526
15140,AFEC37C2D43F,There has been at least one point in everyone'...,4.045924,4.010312,4.133028,4.121380,4.199779,3.930347
