In [1]:
import os
import gc
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import warnings
from torch.optim.swa_utils import AveragedModel
from transformers import AutoTokenizer, AutoConfig, AutoModel
from text_unidecode import unidecode
from typing import Tuple
import codecs
import re
from transformers import Trainer, TrainingArguments
from functools import partial
import datasets
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold

# Global notebook setup: runtime switches, input location, and the fold
# assignment that selects which rows are scored by this run.

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
gc.collect()
# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Turn off the tokenizers library's own parallelism
# (presumably to avoid clashes with DataLoader workers — confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Root directory of the competition data (train.csv and the essay folders).
INPUT_DIR = "../input/feedback-prize-effectiveness/"

# Index of the fold whose rows are held out and scored by this notebook.
FOLD = 4

train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
# 5-way split grouped by essay_id, so all rows of one essay share a fold.
# NOTE(review): this must match the split used at training time for the
# held-out rows to be truly unseen by the fold-FOLD checkpoint — confirm.
gkf = GroupKFold(n_splits=5)
for fold, (train_id, val_id) in enumerate(gkf.split(X=train, y=train.discourse_effectiveness, groups=train.essay_id)):
    # val_id holds positional row indices; using them with .loc relies on
    # `train` still having the default 0..n-1 index from read_csv.
    train.loc[val_id, "fold"] = int(fold)
# The column is created by partial assignment above, so cast it to int
# once every row has received a fold id.
train["fold"] = train["fold"].astype(int)

# Rows of the selected fold act as the "test" set for the rest of the notebook.
test_origin = train[train.fold == FOLD].reset_index(drop=True)

# test_origin = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))


def softmax(z):
    """Return the row-wise softmax of a 2-D array.

    Each row's maximum is subtracted before exponentiating, so large
    logits do not overflow.

    Args:
        z: array with exactly two dimensions; softmax is taken along axis 1.

    Returns:
        Array of the same shape as ``z`` whose rows each sum to 1.

    Raises:
        AssertionError: if ``z`` is not 2-D.
    """
    assert len(z.shape) == 2
    # keepdims leaves a trailing axis of size 1 so the row statistics
    # broadcast against the (rows, cols) input.
    row_max = np.max(z, axis=1, keepdims=True)
    exp_shifted = np.exp(z - row_max)
    row_total = np.sum(exp_shifted, axis=1, keepdims=True)
    return exp_shifted / row_total

# Token-level models

In [2]:
# class CFG:
#     model = ""
#     batch_size = 2
#     max_len = 2048
#     trn_fold = []
#     sp_fold = []
#     num_workers = 1
#     layer_cls = -4

# def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
#     return error.object[error.start: error.end].encode("utf-8"), error.end

# def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
#     return error.object[error.start: error.end].decode("cp1252"), error.end

# # Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
# codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
# codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

# def resolve_encodings_and_normalize(text: str) -> str:
#     """Resolve the encoding problems and normalize the abnormal characters."""
#     text = (
#         text.encode("raw_unicode_escape")
#             .decode("utf-8", errors="replace_decoding_with_cp1252")
#             .encode("cp1252", errors="replace_encoding_with_utf8")
#             .decode("utf-8", errors="replace_decoding_with_cp1252")
#     )
#     text = unidecode(text)
#     return text

# def get_essay(essay_id, is_train=True):
#     parent_path = INPUT_DIR + 'train' if is_train else INPUT_DIR + 'test'
#     essay_path = os.path.join(parent_path, f"{essay_id}.txt")
#     essay_text = open(essay_path, 'r').read()
#     return essay_text

# class Collate:
#     def __init__(self, tokenizer, isTrain=True):
#         self.tokenizer = tokenizer
#         self.isTrain = isTrain

#     def __call__(self, batch):
#         output = dict()
#         output["input_ids"] = [sample["input_ids"] for sample in batch]
#         output["attention_mask"] = [sample["attention_mask"] for sample in batch]
#         if self.isTrain:
#             output["target"] = [sample["target"] for sample in batch]

#         # calculate max token length of this batch
#         batch_max = max([len(ids) for ids in output["input_ids"]])

#         # add padding
#         output["input_ids"] = [s + (batch_max - len(s)) * [self.tokenizer.pad_token_id] for s in
#                                output["input_ids"]]
#         output["attention_mask"] = [s + (batch_max - len(s)) * [0] for s in output["attention_mask"]]

#         # convert to tensors
#         output["input_ids"] = torch.tensor(output["input_ids"], dtype=torch.long)
#         output["attention_mask"] = torch.tensor(output["attention_mask"], dtype=torch.long)

#         if self.isTrain:
#             output["target"] = [s + (batch_max - len(s)) * [-100] for s in output["target"]]
#             output["target"] = torch.tensor(output["target"], dtype=torch.long)

#         return output

# test = test_origin.copy()
# test['essay_text'] = test['essay_id'].apply(lambda x: get_essay(x, is_train=True))
# test["discourse_text"] = [resolve_encodings_and_normalize(x) for x in test["discourse_text"]]
# test["essay_text"] = [resolve_encodings_and_normalize(x) for x in test["essay_text"]]

# discourse_text_values = test['discourse_text'].values
# essay_text_values = test['essay_text'].values
# matches = []
# for i, dt in enumerate(discourse_text_values):
#     if dt.strip() in essay_text_values[i]:
#         matches.append(1)
#     else:
#         matches.append(0)
# test['match'] = matches

# test_grouped_df = test.groupby(["essay_id"]).agg(list)

# disc_types = [
#     "Claim",
#     "Concluding Statement",
#     "Counterclaim",
#     "Evidence",
#     "Lead",
#     "Position",
#     "Rebuttal",
# ]
# cls_tokens_map = {label: f"[CLS_{label.upper()}]" for label in disc_types}
# end_tokens_map = {label: f"[END_{label.upper()}]" for label in disc_types}

# label2id = {
#     "Adequate": 0,
#     "Effective": 1,
#     "Ineffective": 2,
# }

# def find_positions(text, discourse_text):

#     # keeps track of what has already
#     # been located
#     min_idx = 0

#     # stores start and end indexes of discourse_texts
#     idxs = []

#     for dt in discourse_text:
#         # calling strip is essential
#         matches = list(re.finditer(re.escape(dt.strip()), text))

#         # If there are multiple matches, take the first one
#         # that is past the previous discourse texts.
#         if len(matches) > 1:
#             for m in matches:
#                 if m.start() >= min_idx:
#                     break
#         # If no matches are found
#         elif len(matches) == 0:
#             idxs.append([-1])  # will filter out later
#             continue
#             # If one match is found
#         else:
#             m = matches[0]

#         idxs.append([m.start(), m.end()])

#         min_idx = m.start()

#     return idxs

# class TestDataset(Dataset):
#     def __init__(self, df, tokenizer):
#         self.df = df
#         self.discourse_type = df['discourse_type'].values
#         self.discourse_text = df['discourse_text'].values
#         self.essay_text = df['essay_text'].values
#         self.essay_ids = df.index.values
#         self.tokenizer = tokenizer

#     def __len__(self):
#         return len(self.df)

#     def __getitem__(self, index):
#         text = self.essay_text[index][0]
#         discourse_text = self.discourse_text[index]

#         chunks = []
#         prev = 0

#         zipped = zip(
#             find_positions(text, discourse_text),
#             self.discourse_type[index],
#         )

#         for idxs, disc_type in zipped:
#             # when the discourse_text wasn't found
#             if idxs == [-1]:
#                 continue
#             s, e = idxs
#             # if the start of the current discourse_text is not
#             # at the end of the previous one.
#             # (text in between discourse_texts)
#             if s != prev:
#                 chunks.append(text[prev:s])
#                 prev = s
#             # if the start of the current discourse_text is
#             # the same as the end of the previous discourse_text
#             if s == prev:
#                 chunks.append(cls_tokens_map[disc_type])
#                 chunks.append(text[s:e])
#                 chunks.append(end_tokens_map[disc_type])
#             prev = e

#         tokenized = self.tokenizer(
#             " ".join(chunks),
#             truncation=True,
#             add_special_tokens=True,
#             max_length=CFG.max_len,
#         )

#         return {
#             'input_ids': tokenized['input_ids'],
#             'attention_mask': tokenized['attention_mask'],
#             'essay_id': self.essay_ids[index]
#         }


# def inference_fn(test_loader, model):
#     preds = []
#     model.eval()
#     model.to(device)
#     for data in test_loader:
#         ids = data['input_ids'].to(device, dtype=torch.long)
#         mask = data['attention_mask'].to(device, dtype=torch.long)
#         with torch.no_grad():
#             y_preds = model(ids, mask).to('cpu').numpy()
#             y_preds = np.pad(y_preds,((0,0),(0,CFG.max_len-y_preds.shape[1]),(0,0)),'constant',constant_values = (0.,0.))
#         preds.append(y_preds)
#     predictions = np.concatenate(preds)
    
#     head_preds = []
#     for i, sample in enumerate(test_dataset):
#         sample_pred = []
#         sample_ids = sample['input_ids']
#         for j, tk_id in enumerate(sample_ids):
#             if tk_id in cls_ids:
#                 sample_pred.append(predictions[i][j])
#         head_preds.append(sample_pred)
    
#     final_preds = []
#     ordered_essay_ids = test['essay_id'].values
#     disordered_essay_matches = test_grouped_df['match'].values
    
#     pre_essay_id = ''
#     for essay_id in ordered_essay_ids:
#         if essay_id == pre_essay_id:
#             continue
#         pre_essay_id = essay_id
#         essay_pred = head_preds[essay_id_map[essay_id]]
#         essay_macth = disordered_essay_matches[essay_id_map[essay_id]]
#         for i, discourse_match in enumerate(essay_macth):
#             if discourse_match == 1:
#                 final_preds.append(essay_pred[i])
#             else:
#                 final_preds.append([0., 0., 0.])
    
#     return softmax(np.array(final_preds))

### Deberta v2 xlarge finetuned

In [3]:
# MODEL_PATH = "../input/tk-deberta-v2-xlarge-finetuned/"
# CONFIG_PATH = MODEL_PATH + 'config.pth'

# CFG.model = "deberta-v2-xlarge"
# CFG.batch_size = 2
# CFG.trn_fold = [FOLD]


# class FeedBackModel(nn.Module):
#     def __init__(self, config_path):
#         super(FeedBackModel, self).__init__()
#         self.config = torch.load(config_path)
#         self.config.update({"output_hidden_states": False})
#         self.model = AutoModel.from_config(self.config)
#         self.fc = nn.Linear(self.config.hidden_size, 3)

#     def forward(self, ids, mask):
#         out = self.model(input_ids=ids, attention_mask=mask)
#         cls_embeddings = out.last_hidden_state
#         outputs = self.fc(cls_embeddings)
#         return outputs
    
    
# tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH + 'tokenizer', use_fast=True)

# cls_id_map = {
#     label: tokenizer.encode(tkn)[1] for label, tkn in cls_tokens_map.items()
# }

# collate_fn = Collate(tokenizer, isTrain=False)

# test_dataset = TestDataset(test_grouped_df, tokenizer)
# test_loader = DataLoader(test_dataset,
#                           batch_size=CFG.batch_size,
#                           shuffle=False,
#                           collate_fn=collate_fn,
#                           num_workers=CFG.num_workers,
#                           pin_memory=True,
#                           drop_last=False)


# cls_ids = set(list(cls_id_map.values()))

# essay_id_map = {v['essay_id'] : k for k, v in enumerate(test_dataset)}
    
# tk_deberta_preds_v2 = []
# for fold in CFG.trn_fold:
#     print("Fold {}".format(fold))

#     model = FeedBackModel(config_path=CONFIG_PATH)
#     state = torch.load(MODEL_PATH + f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
#                        map_location=torch.device('cpu'))
#     model.load_state_dict(state['model'])
#     prediction = inference_fn(test_loader, model)
#     tk_deberta_preds_v2.append(prediction)
#     del model, state, prediction
#     gc.collect()
#     torch.cuda.empty_cache()
            
# model_preds_1 = np.mean(tk_deberta_preds_v2, axis=0)

### Deberta v3 large finetuned

In [4]:
# MODEL_PATH = "../input/tk-deberta-v3-large-finetuned/"
# CONFIG_PATH = MODEL_PATH + 'config.pth'

# CFG.model = "deberta-v3-large"
# CFG.batch_size = 2
# CFG.trn_fold = [FOLD]


# class FeedBackModel(nn.Module):
#     def __init__(self, config_path):
#         super(FeedBackModel, self).__init__()
#         self.config = torch.load(config_path)
#         self.config.update({"output_hidden_states": False})
#         self.model = AutoModel.from_config(self.config)
#         self.fc = nn.Linear(self.config.hidden_size, 3)

#     def forward(self, ids, mask):
#         out = self.model(input_ids=ids, attention_mask=mask)
#         cls_embeddings = out.last_hidden_state
#         outputs = self.fc(cls_embeddings)
#         return outputs
    
    
# tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH + 'tokenizer', use_fast=True)

# cls_id_map = {
#     label: tokenizer.encode(tkn)[1] for label, tkn in cls_tokens_map.items()
# }

# collate_fn = Collate(tokenizer, isTrain=False)

# test_dataset = TestDataset(test_grouped_df, tokenizer)
# test_loader = DataLoader(test_dataset,
#                           batch_size=CFG.batch_size,
#                           shuffle=False,
#                           collate_fn=collate_fn,
#                           num_workers=CFG.num_workers,
#                           pin_memory=True,
#                           drop_last=False)


# cls_ids = set(list(cls_id_map.values()))

# essay_id_map = {v['essay_id'] : k for k, v in enumerate(test_dataset)}
    
# tk_deberta_preds = []
# for fold in CFG.trn_fold:
#     print("Fold {}".format(fold))

#     model = FeedBackModel(config_path=CONFIG_PATH)
#     state = torch.load(MODEL_PATH + f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
#                        map_location=torch.device('cpu'))
#     model.load_state_dict(state['model'])
#     prediction = inference_fn(test_loader, model)
#     tk_deberta_preds.append(prediction)
#     del model, state, prediction
#     gc.collect()
#     torch.cuda.empty_cache()
            
# model_preds_2 = np.mean(tk_deberta_preds, axis=0)

### Longformer large

In [5]:
# MODEL_PATH = "../input/tk-longformer-large-finetuned/"
# CONFIG_PATH = MODEL_PATH + 'config.pth'

# CFG.model = "longformer-large"
# CFG.batch_size = 2
# CFG.trn_fold = [FOLD]
# CFG.sp_fold = [4]


# class FeedBackModel(nn.Module):
#     def __init__(self, config_path):
#         super(FeedBackModel, self).__init__()
#         self.config = torch.load(config_path)
#         self.config.update({"output_hidden_states": False})
#         self.model = AutoModel.from_config(self.config)
#         self.fc = nn.Linear(self.config.hidden_size, 3)

#     def forward(self, ids, mask):
#         out = self.model(input_ids=ids, attention_mask=mask)
#         cls_embeddings = out.last_hidden_state
#         outputs = self.fc(cls_embeddings)
#         return outputs

    
# tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH + 'tokenizer', use_fast=True)

# cls_id_map = {
#     label: tokenizer.encode(tkn)[1] for label, tkn in cls_tokens_map.items()
# }

# collate_fn = Collate(tokenizer, isTrain=False)

# test_dataset = TestDataset(test_grouped_df, tokenizer)
# test_loader = DataLoader(test_dataset,
#                           batch_size=CFG.batch_size,
#                           shuffle=False,
#                           collate_fn=collate_fn,
#                           num_workers=CFG.num_workers,
#                           pin_memory=True,
#                           drop_last=False)

# cls_ids = set(list(cls_id_map.values()))

# essay_id_map = {v['essay_id'] : k for k, v in enumerate(test_dataset)}

# longformer_preds = []
# for fold in CFG.trn_fold:
#     print("Fold {}".format(fold))
    
#     model = FeedBackModel(config_path=CONFIG_PATH)
#     if fold in CFG.sp_fold:
#         state = torch.load(f"../input/new-tk-longformer-large/{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
#                            map_location=torch.device('cpu'))
#     else:
#         state = torch.load(MODEL_PATH + f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
#                            map_location=torch.device('cpu'))
#     model.load_state_dict(state['model'])
#     prediction = inference_fn(test_loader, model)
#     longformer_preds.append(prediction)
#     del model, state, prediction
#     gc.collect()
#     torch.cuda.empty_cache()
            
# model_preds_3 = np.mean(longformer_preds, axis=0)

# Sequence-level models

In [6]:
class CFG:
    """Shared inference settings for the sequence-level models.

    Used as a plain namespace: the per-model cells overwrite these class
    attributes (model, batch_size, trn_fold, sp_fold) before building
    their loaders.
    """
    # Model identifier; used to build checkpoint file names.
    model = ""
    # Number of samples per DataLoader batch.
    batch_size = 16
    # Maximum token length passed to the tokenizer (longer inputs are truncated).
    max_len = 512
    # Folds whose checkpoints are run by the current model cell.
    trn_fold = []
    # Folds whose checkpoint is stored as an SWA (AveragedModel) state dict.
    # Declared here so cells that read it never depend on a value leaked
    # from a previously executed cell.
    sp_fold = []
    # Worker processes for the DataLoader.
    num_workers = 1
    # Index into the model's hidden-state tuple from which the first-token
    # embedding is taken.
    layer_cls = -4

def get_essay(essay_id, is_train=True):
    """Read the full text of one essay from the competition data.

    Args:
        essay_id: essay identifier; the file ``<essay_id>.txt`` is read.
        is_train: read from the ``train`` sub-folder of ``INPUT_DIR`` when
            True, from ``test`` otherwise.

    Returns:
        The file content as a single string.

    Raises:
        OSError: if the essay file cannot be opened.
    """
    split_dir = 'train' if is_train else 'test'
    # os.path.join keeps the path valid whether or not INPUT_DIR ends with
    # a separator (plain concatenation silently required the trailing "/").
    essay_path = os.path.join(INPUT_DIR, split_dir, f"{essay_id}.txt")
    # Context manager closes the handle; this is called once per row, so
    # unclosed files would otherwise pile up.
    with open(essay_path, 'r') as essay_file:
        return essay_file.read()

# Work on a copy so the extra columns added below never reach test_origin,
# which is what gets exported at the end of the notebook.
test = test_origin.copy()
# test_origin is a held-out fold of train.csv, so the essays live in the
# train folder (hence is_train=True). Reads one file per row.
test['essay_text'] = test['essay_id'].apply(lambda x: get_essay(x, is_train=True))

class TestDataset(Dataset):
    """Dataset that tokenizes the ``text`` column of a frame on demand.

    Each item is tokenized when requested, truncated to ``CFG.max_len``
    tokens. No padding is requested here; batches are padded later by the
    collate function.
    """

    def __init__(self, df, tokenizer):
        """Keep the raw texts of ``df['text']`` and the tokenizer to apply."""
        self.tokenizer = tokenizer
        self.text = df['text'].values

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize text number ``item``.

        Returns:
            dict with the tokenizer's ``input_ids`` and ``attention_mask``
            for that text; any other tokenizer outputs are dropped.
        """
        encoded = self.tokenizer.encode_plus(
            self.text[item],
            truncation=True,
            add_special_tokens=True,
            max_length=CFG.max_len,
        )
        return {key: encoded[key] for key in ('input_ids', 'attention_mask')}

class Collate:
    """Collate function that pads a batch to its longest sample.

    Padding goes on the side given by ``tokenizer.padding_side`` and uses
    ``tokenizer.pad_token_id`` for ids and 0 for the attention mask.
    """

    def __init__(self, tokenizer, isTrain=True):
        """Store the tokenizer and whether samples carry a ``target``."""
        self.isTrain = isTrain
        self.tokenizer = tokenizer

    def __call__(self, batch):
        """Turn a list of sample dicts into a dict of ``torch.long`` tensors.

        Args:
            batch: list of dicts with ``input_ids`` and ``attention_mask``
                lists (plus ``target`` when ``isTrain`` is set).

        Returns:
            dict with ``input_ids`` and ``attention_mask`` tensors padded to
            the longest ``input_ids`` in the batch, and ``target`` when
            ``isTrain`` is set.
        """
        id_seqs = [sample["input_ids"] for sample in batch]
        mask_seqs = [sample["attention_mask"] for sample in batch]
        targets = [sample["target"] for sample in batch] if self.isTrain else None

        # Pad only up to the longest sequence in this batch.
        longest = max([len(seq) for seq in id_seqs])
        pad_on_right = self.tokenizer.padding_side == "right"
        pad_id = self.tokenizer.pad_token_id

        def pad(seq, fill):
            # Filler goes after the sequence for right padding, before it otherwise.
            filler = (longest - len(seq)) * [fill]
            return seq + filler if pad_on_right else filler + seq

        output = dict()
        output["input_ids"] = torch.tensor(
            [pad(seq, pad_id) for seq in id_seqs], dtype=torch.long
        )
        output["attention_mask"] = torch.tensor(
            [pad(seq, 0) for seq in mask_seqs], dtype=torch.long
        )
        if self.isTrain:
            output["target"] = torch.tensor(targets, dtype=torch.long)
        return output

def inference_fn(test_loader, model):
    """Run ``model`` over every batch of ``test_loader`` and return probabilities.

    The model is switched to eval mode and moved to the global ``device``.
    Each batch's output is converted to NumPy and passed through ``softmax``.

    Args:
        test_loader: iterable of dicts holding ``input_ids`` and
            ``attention_mask`` tensors.
        model: callable as ``model(ids, mask)``; its output must be 2-D,
            since ``softmax`` asserts that.

    Returns:
        NumPy array with the per-batch softmax outputs concatenated along
        axis 0, in loader order.
    """
    model.eval()
    model.to(device)

    batch_probs = []
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device, dtype=torch.long)
        attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
        # Forward pass only; no gradients are needed.
        with torch.no_grad():
            logits = model(input_ids, attention_mask)
        batch_probs.append(softmax(logits.to('cpu').numpy()))
    return np.concatenate(batch_probs)

### Deberta v2 xlarge

In [7]:
# MODEL_PATH = "../input/sq-deberta-v2-xlarge/"
# CONFIG_PATH = MODEL_PATH + 'config.pth'

# CFG.model = "microsoft/deberta-v2-xlarge"
# CFG.batch_size = 16
# CFG.trn_fold = [FOLD]


# class FeedBackModel(nn.Module):
#     def __init__(self, config_path):
#         super(FeedBackModel, self).__init__()
#         self.config = torch.load(config_path)
#         self.config.update({"output_hidden_states": True})
#         self.model = AutoModel.from_config(self.config)
#         self.fc = nn.Linear(self.config.hidden_size, 3)

#     def forward(self, ids, mask):
#         out = self.model(input_ids=ids, attention_mask=mask)
#         all_hidden_states = torch.stack(out.hidden_states)
#         cls_embeddings = all_hidden_states[CFG.layer_cls, :, 0]
#         outputs = self.fc(cls_embeddings)
#         return outputs


# tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH + 'tokenizer', use_fast=True)

# SEP = tokenizer.sep_token
# test['text'] = test['discourse_type'] + ' ' + test['discourse_text'] + SEP + test['essay_text']

# collate_fn = Collate(tokenizer, isTrain=False)

# test_dataset = TestDataset(test, tokenizer)
# test_loader = DataLoader(test_dataset,
#                          batch_size=CFG.batch_size,
#                          shuffle=False,
#                          collate_fn=collate_fn,
#                          num_workers=CFG.num_workers, pin_memory=True, drop_last=False)


# sq_deberta_preds_v2 = []
# for fold in CFG.trn_fold:
#     print("Fold {}".format(fold))

#     model = FeedBackModel(config_path=CONFIG_PATH)
#     state = torch.load(MODEL_PATH + f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
#                        map_location=torch.device('cpu'))
#     model.load_state_dict(state['model'])
#     prediction = inference_fn(test_loader, model)
#     sq_deberta_preds_v2.append(prediction)
#     del model, state, prediction
#     gc.collect()
#     torch.cuda.empty_cache()

# model_preds_4 = np.mean(sq_deberta_preds_v2, axis=0)

### Deberta v3 large

In [8]:
# Settings for the sequence-level DeBERTa v3 large run.
# Folder holding the saved config, tokenizer and per-fold checkpoints.
MODEL_PATH = "../input/sq-deberta-v3-large-new/"
CONFIG_PATH = MODEL_PATH + 'config.pth'

# Also determines the checkpoint file name ("/" is replaced by "-" below).
CFG.model = "microsoft/deberta-v3-large"
CFG.batch_size = 16
# Only the checkpoint of the held-out fold is run.
CFG.trn_fold = [FOLD]
# Folds loaded from the "_best_swa.pth" checkpoint through an AveragedModel wrapper.
CFG.sp_fold = [0]


class FeedBackModel(nn.Module):
    """Transformer backbone plus a 3-way linear head on the first token.

    The backbone is built from a saved config only (no pretrained weights
    are loaded here); weights are expected to come from a checkpoint via
    ``load_state_dict``.
    """

    def __init__(self, config_path):
        """Build the backbone and the classification head.

        Args:
            config_path: path to a config object saved with ``torch.save``.
        """
        super().__init__()
        # NOTE(review): torch.load unpickles arbitrary objects — only use
        # config files from a trusted source. Recent PyTorch releases may
        # also need weights_only=False to load a non-tensor object; confirm
        # against the installed version.
        self.config = torch.load(config_path)
        # forward() reads an intermediate layer, so hidden states are required.
        self.config.update({"output_hidden_states": True})
        self.model = AutoModel.from_config(self.config)
        self.fc = nn.Linear(self.config.hidden_size, 3)

    def forward(self, ids, mask):
        """Return 3-class logits for each sequence in the batch.

        Args:
            ids: token id tensor passed to the backbone as ``input_ids``.
            mask: attention mask tensor passed as ``attention_mask``.

        Returns:
            Output of the linear head applied to the first-token embedding
            of hidden-state layer ``CFG.layer_cls``.
        """
        out = self.model(input_ids=ids, attention_mask=mask)
        # Index the per-layer tuple directly instead of stacking every
        # layer into one tensor: same values, without copying all layers'
        # activations on every batch.
        cls_embeddings = out.hidden_states[CFG.layer_cls][:, 0]
        outputs = self.fc(cls_embeddings)
        return outputs


# Build the model input text and the DataLoader for this run.
# The tokenizer is loaded from the copy saved next to the checkpoints.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH + 'tokenizer', use_fast=True)

SEP = tokenizer.sep_token
# One input per discourse row: "<type> <discourse text><SEP><full essay>".
# NOTE(review): presumably this has to match the training-time format — confirm.
test['text'] = test['discourse_type'] + ' ' + test['discourse_text'] + SEP + test['essay_text']

# Inference only: samples carry no "target", so isTrain must be False.
collate_fn = Collate(tokenizer, isTrain=False)

test_dataset = TestDataset(test, tokenizer)
# shuffle=False and drop_last=False keep one prediction per row, in row
# order, so they can be written back onto test_origin column-wise.
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         collate_fn=collate_fn,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)


# Run every configured fold checkpoint over the loader and average the
# per-fold class probabilities.
sq_deberta_preds = []
for fold in CFG.trn_fold:
    print("Fold {}".format(fold))

    model = FeedBackModel(config_path=CONFIG_PATH)
    checkpoint_stem = MODEL_PATH + f"{CFG.model.replace('/', '-')}_fold{fold}_best"
    if fold in CFG.sp_fold:
        # These folds are stored as "_best_swa.pth" and loaded through an
        # AveragedModel wrapper (presumably how they were saved — confirm).
        net = AveragedModel(model)
        checkpoint_path = checkpoint_stem + "_swa.pth"
    else:
        net = model
        checkpoint_path = checkpoint_stem + ".pth"

    # Load on CPU first; inference_fn moves the network to `device`.
    state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    net.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, net)
    sq_deberta_preds.append(prediction)

    # Drop both names: AveragedModel keeps its own copy of the network, so
    # deleting only the wrapper would leave the base model in memory.
    del model, net, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

# Mean over folds, giving one row of class probabilities per test row.
model_preds_5 = np.mean(sq_deberta_preds, axis=0)

Fold 4


In [9]:
# MODEL_PATH = "../input/feedback-deberta-v3-large-sep/"
# CONFIG_PATH = MODEL_PATH + 'config.pth'

# CFG.model = "microsoft/deberta-v3-large"
# CFG.batch_size = 16
# CFG.trn_fold = [FOLD]


# class FeedBackModel(nn.Module):
#     def __init__(self, config_path):
#         super(FeedBackModel, self).__init__()
#         self.config = torch.load(config_path)
#         self.config.update({"output_hidden_states": True})
#         self.model = AutoModel.from_config(self.config)
#         self.fc = nn.Linear(self.config.hidden_size, 3)

#     def forward(self, ids, mask):
#         out = self.model(input_ids=ids, attention_mask=mask)
#         all_hidden_states = torch.stack(out.hidden_states)
#         cls_embeddings = all_hidden_states[CFG.layer_cls, :, 0]
#         outputs = self.fc(cls_embeddings)
#         return outputs


# tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH + 'tokenizer', use_fast=True)

# SEP = tokenizer.sep_token
# test['text'] = test['discourse_type'] + ' ' + test['discourse_text'] + SEP + test['essay_text']

# collate_fn = Collate(tokenizer, isTrain=False)

# test_dataset = TestDataset(test, tokenizer)
# test_loader = DataLoader(test_dataset,
#                          batch_size=CFG.batch_size,
#                          shuffle=False,
#                          collate_fn=collate_fn,
#                          num_workers=CFG.num_workers, pin_memory=True, drop_last=False)


# sq_deberta_preds = []
# for fold in CFG.trn_fold:
#     print("Fold {}".format(fold))

#     model = FeedBackModel(config_path=CONFIG_PATH)
#     swa_model = AveragedModel(model)
#     state = torch.load(MODEL_PATH + f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
#                        map_location=torch.device('cpu'))
#     swa_model.load_state_dict(state['model'])
#     prediction = inference_fn(test_loader, swa_model)
#     sq_deberta_preds.append(prediction)
#     del model, swa_model, state, prediction
#     gc.collect()
#     torch.cuda.empty_cache()

# model_preds_5 = np.mean(sq_deberta_preds, axis=0)

### RoBERTa large

In [10]:
# MODEL_PATH = "../input/sq-roberta-large/"
# CONFIG_PATH = MODEL_PATH + 'config.pth'

# CFG.model = "roberta-large"
# CFG.batch_size = 16
# CFG.trn_fold = [FOLD]


# class FeedBackModel(nn.Module):
#     def __init__(self, config_path):
#         super(FeedBackModel, self).__init__()
#         self.config = torch.load(config_path)
#         self.config.update({"output_hidden_states": False})
#         self.model = AutoModel.from_config(self.config)
#         self.fc = nn.Linear(self.config.hidden_size, 3)

#     def forward(self, ids, mask):
#         out = self.model(input_ids=ids, attention_mask=mask)
#         cls_embeddings = out.last_hidden_state[:, 0]
#         outputs = self.fc(cls_embeddings)
#         return outputs


# tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH + 'tokenizer', use_fast=True)

# SEP = tokenizer.sep_token
# test['text'] = test['discourse_type'] + ' ' + test['discourse_text'] + SEP + test['essay_text']

# collate_fn = Collate(tokenizer, isTrain=False)

# test_dataset = TestDataset(test, tokenizer)
# test_loader = DataLoader(test_dataset,
#                          batch_size=CFG.batch_size,
#                          shuffle=False,
#                          collate_fn=collate_fn,
#                          num_workers=CFG.num_workers, pin_memory=True, drop_last=False)


# roberta_predictions = []
# for fold in CFG.trn_fold:
#     print("Fold {}".format(fold))

#     model = FeedBackModel(config_path=CONFIG_PATH)
#     state = torch.load(MODEL_PATH + f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
#                        map_location=torch.device('cpu'))
#     model.load_state_dict(state['model'])
#     prediction = inference_fn(test_loader, model)
#     roberta_predictions.append(prediction)
#     del model, state, prediction
#     gc.collect()
#     torch.cuda.empty_cache()

# model_preds_6 = np.mean(roberta_predictions, axis=0)

# Ensemble

In [11]:
# mpdel_preds = [model_preds_1, model_preds_2, model_preds_3, model_preds_4, model_preds_5]
# for i, preds in enumerate(mpdel_preds):
#     col_1 = 'preds' + str(i + 1) + '_' + str(1)
#     col_2 = 'preds' + str(i + 1) + '_' + str(2)
#     col_3 = 'preds' + str(i + 1) + '_' + str(3)
#     test_origin[col_1] = preds[:, 0]
#     test_origin[col_2] = preds[:, 1]
#     test_origin[col_3] = preds[:, 2]

# Attach this model's class probabilities to the hold-out frame.
# Column names are 'preds<model number>_<1-based class column>'; only
# model 5 is active in this run.
col_1, col_2, col_3 = ['preds' + str(5) + '_' + str(k) for k in (1, 2, 3)]
for class_idx, col_name in enumerate((col_1, col_2, col_3)):
    test_origin[col_name] = model_preds_5[:, class_idx]

In [12]:
# Save the hold-out rows of fold FOLD together with the prediction columns
# added above; the file name carries the fold index.
test_origin.to_csv(f'train_further_deb_{FOLD}.csv', index=False)

In [13]:
# m1 = 0.22  # v2 tk
# m2 = 0.22  # v3 tk
# m3 = 0.06  # longformer tk
# m4 = 0.22  # v2 sq
# m5 = 0.22  # v3 sq

# preds_Ineffective = model_preds_1[:, 2] * m1 + model_preds_2[:, 2] * m2 + model_preds_4[:, 0] * m4 + model_preds_5[:, 0] * m5 + model_preds_3[:, 2] * m3
# preds_Adequate = model_preds_1[:, 0] * m1 + model_preds_2[:, 0] * m2 + model_preds_4[:, 1] * m4 + model_preds_5[:, 1] * m5 + model_preds_3[:, 0] * m3
# preds_Effective = model_preds_1[:, 1] * m1 + model_preds_2[:, 1] * m2 + model_preds_4[:, 2] * m4 + model_preds_5[:, 2] * m5 + model_preds_3[:, 1] * m3

# sample = pd.read_csv(INPUT_DIR + 'sample_submission.csv')

# sample['Ineffective'] = preds_Ineffective
# sample['Adequate'] = preds_Adequate
# sample['Effective'] = preds_Effective

# sample.to_csv('submission.csv', index=False)

# display(sample)