In [1]:
import os
import gc
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import warnings
from torch.optim.swa_utils import AveragedModel
from transformers import AutoTokenizer, AutoConfig, AutoModel
from text_unidecode import unidecode
from typing import Tuple
import codecs
import re

# Silence every warning for the rest of the notebook.
warnings.filterwarnings("ignore")
gc.collect()
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Root directory of the input data (test.csv, essay text files, sample submission).
INPUT_DIR = "../input/feedback-prize-effectiveness/"

# Test table as read from disk; later cells copy it or add feature columns to it.
test_origin = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))

def softmax(z):
    """Return the row-wise softmax of a 2-D array.

    Each row's maximum is subtracted before exponentiating so that large
    logits do not overflow; the result of every row sums to 1.

    Args:
        z: array with exactly two dimensions (rows are independent samples).

    Returns:
        Array of the same shape as ``z`` holding the per-row probabilities.

    Raises:
        AssertionError: if ``z`` is not 2-D.
    """
    assert len(z.shape) == 2
    row_max = np.max(z, axis=1, keepdims=True)
    exps = np.exp(z - row_max)
    return exps / np.sum(exps, axis=1, keepdims=True)

# Token-level models

In [2]:
class CFG:
    # Shared settings for the token-level models. The per-model cells below
    # overwrite `model`, `batch_size`, `trn_fold` and `sp_fold` before running.

    # Stem of the checkpoint file names loaded in the fold loops.
    model = ""
    # DataLoader batch size.
    batch_size = 2
    # Tokenizer truncation length; per-token logits are also padded to this length.
    max_len = 2048
    # Fold indices whose checkpoints are loaded.
    trn_fold = []
    # Folds whose checkpoint is loaded from a different location (see the Longformer cell).
    sp_fold = []
    # DataLoader worker processes.
    num_workers = 1
    # Not read by the token-level cells visible in this notebook.
    layer_cls = -4

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Encoding error handler: emit the characters that failed to encode as UTF-8 bytes.

    Returns the replacement bytes and the position at which encoding resumes.
    """
    failed_chars = error.object[error.start: error.end]
    return failed_chars.encode("utf-8"), error.end

def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Decoding error handler: decode the bytes that failed as cp1252 instead.

    Returns the replacement text and the position at which decoding resumes.
    """
    failed_bytes = error.object[error.start: error.end]
    return failed_bytes.decode("cp1252"), error.end

# Register the two handlers above under the names that
# `resolve_encodings_and_normalize` passes as `errors=` when encoding/decoding.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed utf-8 / cp1252 encoding damage, then transliterate with `unidecode`.

    The text goes through an encode/decode round trip that relies on the two
    custom error handlers registered with `codecs.register_error`; whatever
    fails in one codec is retried in the other instead of raising.
    """
    escaped_bytes = text.encode("raw_unicode_escape")
    first_pass = escaped_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)

def get_essay(essay_id, is_train=True):
    """Read the full text of one essay from disk.

    Args:
        essay_id: essay identifier; the file read is ``<essay_id>.txt``.
        is_train: read from the ``train`` sub-directory of ``INPUT_DIR`` when
            True, from ``test`` otherwise.

    Returns:
        The file contents as a string.

    Raises:
        OSError: if the essay file cannot be opened.
    """
    parent_path = INPUT_DIR + ('train' if is_train else 'test')
    essay_path = os.path.join(parent_path, f"{essay_id}.txt")
    # The context manager closes the handle; the bare open().read() left one
    # file descriptor open per essay until garbage collection.
    with open(essay_path, 'r') as essay_file:
        return essay_file.read()

class Collate:
    """Batch collator that right-pads variable-length samples to the longest one.

    `input_ids` are padded with the tokenizer's pad token id, `attention_mask`
    with 0 and (when `isTrain` is set) per-token `target` lists with -100.
    Every field is returned as a `torch.long` tensor.
    """

    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain

    def __call__(self, batch):
        fields = ["input_ids", "attention_mask"]
        if self.isTrain:
            fields.append("target")
        output = {field: [sample[field] for sample in batch] for field in fields}

        # Longest token sequence in this batch decides the padded width.
        longest = max(len(ids) for ids in output["input_ids"])

        fill_values = {
            "input_ids": self.tokenizer.pad_token_id,
            "attention_mask": 0,
            "target": -100,
        }
        for field in fields:
            padded = [seq + (longest - len(seq)) * [fill_values[field]] for seq in output[field]]
            output[field] = torch.tensor(padded, dtype=torch.long)

        return output

# Work on a copy so the raw test table stays available to later cells.
test = test_origin.copy()
test['essay_text'] = test['essay_id'].apply(lambda x: get_essay(x, is_train=False))
# Apply the same encoding repair to the discourse snippets and the full essays.
for text_col in ("discourse_text", "essay_text"):
    test[text_col] = [resolve_encodings_and_normalize(x) for x in test[text_col]]

# Flag (1/0) whether each stripped discourse text occurs verbatim in its essay.
discourse_text_values = test['discourse_text'].values
essay_text_values = test['essay_text'].values
matches = [
    int(dt.strip() in essay)
    for dt, essay in zip(discourse_text_values, essay_text_values)
]
test['match'] = matches

# One row per essay; every other column becomes a list over that essay's discourses.
test_grouped_df = test.groupby(["essay_id"]).agg(list)

# Discourse types that get their own start/end marker tokens.
disc_types = [
    "Claim",
    "Concluding Statement",
    "Counterclaim",
    "Evidence",
    "Lead",
    "Position",
    "Rebuttal",
]
# Marker text wrapped around each discourse span, e.g. "[CLS_CLAIM]" ... "[END_CLAIM]".
cls_tokens_map = {label: f"[CLS_{label.upper()}]" for label in disc_types}
end_tokens_map = {label: f"[END_{label.upper()}]" for label in disc_types}

# Class name -> column index. The Submit cell reads the token models' output
# columns in this order (0 Adequate, 1 Effective, 2 Ineffective).
label2id = {
    "Adequate": 0,
    "Effective": 1,
    "Ineffective": 2,
}

def find_positions(text, discourse_text):
    """Locate each discourse text inside the essay.

    Args:
        text: the full essay.
        discourse_text: iterable of discourse strings, in essay order.

    Returns:
        One entry per discourse: ``[start, end]`` character offsets of the
        chosen occurrence, or ``[-1]`` when the (stripped) text is not found.
    """
    # Start offset of the most recently located discourse.
    cursor = 0
    spans = []

    for raw in discourse_text:
        # Stripping is essential: surrounding whitespace is not part of the match.
        found = [(m.start(), m.end()) for m in re.finditer(re.escape(raw.strip()), text)]

        if not found:
            spans.append([-1])  # filtered out by the caller
            continue

        if len(found) == 1:
            start, end = found[0]
        else:
            # Several occurrences: take the first one at or after the previous
            # discourse's start; if none qualifies, fall back to the last one.
            start, end = next((span for span in found if span[0] >= cursor), found[-1])

        spans.append([start, end])
        cursor = start

    return spans

class TestDataset(Dataset):
    """One sample per essay: the essay text with marker tokens around each discourse.

    `df` is the per-essay grouped frame (index = essay id, list-valued columns).
    """

    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
        self.essay_ids = df.index.values
        self.essay_text = df['essay_text'].values
        self.discourse_text = df['discourse_text'].values
        self.discourse_type = df['discourse_type'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Every row of a group carries the same essay, so the first copy is enough.
        essay = self.essay_text[index][0]
        spans = find_positions(essay, self.discourse_text[index])

        pieces = []
        cursor = 0
        for span, kind in zip(spans, self.discourse_type[index]):
            # Discourse text that was not found in the essay gets no markers.
            if span == [-1]:
                continue
            start, end = span
            # Keep any essay text lying between the previous discourse and this one.
            if start != cursor:
                pieces.append(essay[cursor:start])
            pieces.extend([cls_tokens_map[kind], essay[start:end], end_tokens_map[kind]])
            cursor = end

        encoded = self.tokenizer(
            " ".join(pieces),
            truncation=True,
            add_special_tokens=True,
            max_length=CFG.max_len,
        )

        return {
            'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask'],
            'essay_id': self.essay_ids[index]
        }


def inference_fn(test_loader, model):
    """Run a token-level model and return one probability row per test discourse.

    Relies on notebook globals: `device`, `CFG`, `test_dataset`, `cls_ids`,
    `essay_id_map`, `test` and `test_grouped_df`. `test_loader` must iterate
    `test_dataset` in order (no shuffling), because predictions are matched to
    samples by position.

    Args:
        test_loader: DataLoader yielding dicts with `input_ids` and `attention_mask`.
        model: module called as `model(ids, mask)` that returns per-token logits.

    Returns:
        Array with one softmax-normalised row per row of `test`, in `test` order.
        Discourses without a usable prediction get all-zero logits, i.e. a
        uniform distribution after softmax.
    """
    preds = []
    model.eval()
    model.to(device)
    for data in test_loader:
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        with torch.no_grad():
            y_preds = model(ids, mask).to('cpu').numpy()
        # Pad the token axis to CFG.max_len so batches of different widths concatenate.
        y_preds = np.pad(y_preds, ((0, 0), (0, CFG.max_len - y_preds.shape[1]), (0, 0)),
                         'constant', constant_values=(0., 0.))
        preds.append(y_preds)
    predictions = np.concatenate(preds)

    # For every essay, collect the logits at the positions of discourse start markers.
    head_preds = []
    for i, sample in enumerate(test_dataset):
        head_preds.append([
            predictions[i][j]
            for j, tk_id in enumerate(sample['input_ids'])
            if tk_id in cls_ids
        ])

    final_preds = []
    ordered_essay_ids = test['essay_id'].values
    disordered_essay_matches = test_grouped_df['match'].values

    # Walk essays in `test` order; rows of one essay are assumed to be contiguous.
    pre_essay_id = ''
    for essay_id in ordered_essay_ids:
        if essay_id == pre_essay_id:
            continue
        pre_essay_id = essay_id
        group_idx = essay_id_map[essay_id]
        # Only matched discourses have a marker, so marker logits are consumed
        # one by one instead of being indexed by discourse position: positional
        # indexing misaligns (or overruns) as soon as one discourse is unmatched.
        marker_logits = iter(head_preds[group_idx])
        for discourse_match in disordered_essay_matches[group_idx]:
            # `None` also covers markers lost to truncation at CFG.max_len.
            row_pred = next(marker_logits, None) if discourse_match == 1 else None
            final_preds.append([0., 0., 0.] if row_pred is None else row_pred)

    return softmax(np.array(final_preds))

### Deberta v2 xlarge finetuned

In [3]:
# Directory holding the fold checkpoints, the tokenizer and the saved model config.
MODEL_PATH = "../input/tk-deberta-v2-xlarge-finetuned/"
CONFIG_PATH = MODEL_PATH + 'config.pth'

# `CFG.model` is the stem of the checkpoint file names loaded in the fold loop below.
CFG.model = "deberta-v2-xlarge"
CFG.batch_size = 2
# Folds whose checkpoints are used (fold 2 is not loaded).
CFG.trn_fold = [0, 1, 3, 4]


class FeedBackModel(nn.Module):
    """Transformer backbone followed by a 3-way linear head applied at every token."""

    def __init__(self, config_path):
        super().__init__()
        # The saved config object is loaded from disk; hidden states of the
        # intermediate layers are not needed by this head.
        backbone_config = torch.load(config_path)
        backbone_config.update({"output_hidden_states": False})
        self.config = backbone_config
        self.model = AutoModel.from_config(backbone_config)
        self.fc = nn.Linear(backbone_config.hidden_size, 3)

    def forward(self, ids, mask):
        token_states = self.model(input_ids=ids, attention_mask=mask).last_hidden_state
        return self.fc(token_states)
    
    
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH + 'tokenizer', use_fast=True)

# Vocabulary id of each discourse start marker. Index 1 is taken from the
# encoded sequence -- presumably index 0 is the tokenizer's own leading
# special token; TODO confirm.
cls_id_map = {
    label: tokenizer.encode(tkn)[1] for label, tkn in cls_tokens_map.items()
}

collate_fn = Collate(tokenizer, isTrain=False)

test_dataset = TestDataset(test_grouped_df, tokenizer)
test_loader = DataLoader(test_dataset,
                          batch_size=CFG.batch_size,
                          shuffle=False,
                          collate_fn=collate_fn,
                          num_workers=CFG.num_workers,
                          pin_memory=True,
                          drop_last=False)

cls_ids = set(cls_id_map.values())

# Essay id -> position in `test_dataset`. The ids are read straight from the
# dataset; iterating the dataset itself would tokenize every essay just to
# recover the same ids.
essay_id_map = {essay_id: position for position, essay_id in enumerate(test_dataset.essay_ids)}

tk_deberta_preds_v2 = []
for fold in CFG.trn_fold:
    print("Fold {}".format(fold))

    model = FeedBackModel(config_path=CONFIG_PATH)
    state = torch.load(MODEL_PATH + f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model)
    tk_deberta_preds_v2.append(prediction)
    # Release the fold's weights before the next checkpoint is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

# Keep the per-fold predictions (not their mean): the stacking cells consume them fold by fold.
model_preds_1 = np.array(tk_deberta_preds_v2)

Fold 0
Fold 1
Fold 3
Fold 4


### Deberta v3 large finetuned

In [4]:
# Directory holding the fold checkpoints, the tokenizer and the saved model config.
MODEL_PATH = "../input/tk-deberta-v3-large-finetuned/"
CONFIG_PATH = MODEL_PATH + 'config.pth'

# `CFG.model` is the stem of the checkpoint file names loaded in the fold loop below.
CFG.model = "deberta-v3-large"
CFG.batch_size = 2
# Folds whose checkpoints are used (fold 2 is not loaded).
CFG.trn_fold = [0, 1, 3, 4]


class FeedBackModel(nn.Module):
    """Backbone built from a saved config plus a linear layer giving 3 logits per token."""

    def __init__(self, config_path):
        super().__init__()
        loaded_config = torch.load(config_path)
        # Only the final layer's states are used, so intermediate ones are switched off.
        loaded_config.update({"output_hidden_states": False})
        self.config = loaded_config
        self.model = AutoModel.from_config(loaded_config)
        self.fc = nn.Linear(loaded_config.hidden_size, 3)

    def forward(self, ids, mask):
        backbone_out = self.model(input_ids=ids, attention_mask=mask)
        return self.fc(backbone_out.last_hidden_state)
    
    
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH + 'tokenizer', use_fast=True)

# Vocabulary id of each discourse start marker. Index 1 is taken from the
# encoded sequence -- presumably index 0 is the tokenizer's own leading
# special token; TODO confirm.
cls_id_map = {
    label: tokenizer.encode(tkn)[1] for label, tkn in cls_tokens_map.items()
}

collate_fn = Collate(tokenizer, isTrain=False)

test_dataset = TestDataset(test_grouped_df, tokenizer)
test_loader = DataLoader(test_dataset,
                          batch_size=CFG.batch_size,
                          shuffle=False,
                          collate_fn=collate_fn,
                          num_workers=CFG.num_workers,
                          pin_memory=True,
                          drop_last=False)

cls_ids = set(cls_id_map.values())

# Essay id -> position in `test_dataset`. The ids are read straight from the
# dataset; iterating the dataset itself would tokenize every essay just to
# recover the same ids.
essay_id_map = {essay_id: position for position, essay_id in enumerate(test_dataset.essay_ids)}

tk_deberta_preds = []
for fold in CFG.trn_fold:
    print("Fold {}".format(fold))

    model = FeedBackModel(config_path=CONFIG_PATH)
    state = torch.load(MODEL_PATH + f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model)
    tk_deberta_preds.append(prediction)
    # Release the fold's weights before the next checkpoint is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

# Keep the per-fold predictions (not their mean): the stacking cells consume them fold by fold.
model_preds_2 = np.array(tk_deberta_preds)

Fold 0
Fold 1
Fold 3
Fold 4


### Longformer large

In [5]:
# Directory holding the fold checkpoints, the tokenizer and the saved model config.
MODEL_PATH = "../input/tk-longformer-large-finetuned/"
CONFIG_PATH = MODEL_PATH + 'config.pth'

# `CFG.model` is the stem of the checkpoint file names loaded in the fold loop below.
CFG.model = "longformer-large"
CFG.batch_size = 2
# Folds whose checkpoints are used (fold 2 is not loaded).
CFG.trn_fold = [0, 1, 3, 4]
# Folds whose checkpoint is read from a separate dataset directory in the fold loop below.
CFG.sp_fold = [4]


class FeedBackModel(nn.Module):
    """Per-token classifier: backbone from a saved config, then a 3-output linear layer."""

    def __init__(self, config_path):
        super().__init__()
        saved_config = torch.load(config_path)
        # The head reads the last layer only; no need to return every layer.
        saved_config.update({"output_hidden_states": False})
        self.config = saved_config
        self.model = AutoModel.from_config(saved_config)
        self.fc = nn.Linear(saved_config.hidden_size, 3)

    def forward(self, ids, mask):
        last_layer = self.model(input_ids=ids, attention_mask=mask).last_hidden_state
        logits = self.fc(last_layer)
        return logits

    
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH + 'tokenizer', use_fast=True)

# Vocabulary id of each discourse start marker. Index 1 is taken from the
# encoded sequence -- presumably index 0 is the tokenizer's own leading
# special token; TODO confirm.
cls_id_map = {
    label: tokenizer.encode(tkn)[1] for label, tkn in cls_tokens_map.items()
}

collate_fn = Collate(tokenizer, isTrain=False)

test_dataset = TestDataset(test_grouped_df, tokenizer)
test_loader = DataLoader(test_dataset,
                          batch_size=CFG.batch_size,
                          shuffle=False,
                          collate_fn=collate_fn,
                          num_workers=CFG.num_workers,
                          pin_memory=True,
                          drop_last=False)

cls_ids = set(cls_id_map.values())

# Essay id -> position in `test_dataset`. The ids are read straight from the
# dataset; iterating the dataset itself would tokenize every essay just to
# recover the same ids.
essay_id_map = {essay_id: position for position, essay_id in enumerate(test_dataset.essay_ids)}

longformer_preds = []
for fold in CFG.trn_fold:
    print("Fold {}".format(fold))

    model = FeedBackModel(config_path=CONFIG_PATH)
    # Folds listed in CFG.sp_fold keep their checkpoint in a separate dataset directory.
    if fold in CFG.sp_fold:
        state = torch.load(f"../input/new-tk-longformer-large/{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                           map_location=torch.device('cpu'))
    else:
        state = torch.load(MODEL_PATH + f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                           map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model)
    longformer_preds.append(prediction)
    # Release the fold's weights before the next checkpoint is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

# Keep the per-fold predictions (not their mean): the stacking cells consume them fold by fold.
model_preds_3 = np.array(longformer_preds)

Fold 0
Fold 1
Fold 3
Fold 4


# Sequence-level models

In [6]:
class CFG:
    # Settings for the sequence-level models; this replaces the earlier `CFG`
    # class. The per-model cells below overwrite `model`, `batch_size` and `trn_fold`.

    # Model name; "/" is replaced by "-" to build checkpoint file names.
    model = ""
    # DataLoader batch size.
    batch_size = 16
    # Tokenizer truncation length.
    max_len = 512
    # Fold indices whose checkpoints are loaded.
    trn_fold = []
    # DataLoader worker processes.
    num_workers = 1
    # Index into the stacked hidden states used by the sequence models' forward().
    layer_cls = -4

def get_essay(essay_id, is_train=True):
    """Read the full text of one essay from disk.

    Args:
        essay_id: essay identifier; the file read is ``<essay_id>.txt``.
        is_train: read from the ``train`` sub-directory of ``INPUT_DIR`` when
            True, from ``test`` otherwise.

    Returns:
        The file contents as a string.

    Raises:
        OSError: if the essay file cannot be opened.
    """
    parent_path = INPUT_DIR + ('train' if is_train else 'test')
    essay_path = os.path.join(parent_path, f"{essay_id}.txt")
    # The context manager closes the handle; the bare open().read() left one
    # file descriptor open per essay until garbage collection.
    with open(essay_path, 'r') as essay_file:
        return essay_file.read()

# Rebuild `test` from the raw table for the sequence models. Unlike the token
# pipeline above, no encoding normalisation is applied to the texts here.
test = test_origin.copy()
test['essay_text'] = test['essay_id'].apply(lambda x: get_essay(x, is_train=False))

class TestDataset(Dataset):
    """One sample per row of `df`: the tokenized contents of its `text` column."""

    def __init__(self, df, tokenizer):
        self.tokenizer = tokenizer
        self.text = df['text'].values

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # Truncate to CFG.max_len; padding is left to the collate function.
        encoded = self.tokenizer.encode_plus(
            self.text[item],
            truncation=True,
            add_special_tokens=True,
            max_length=CFG.max_len
        )
        return {field: encoded[field] for field in ('input_ids', 'attention_mask')}

class Collate:
    """Batch collator that pads samples to the longest one in the batch.

    Padding goes on the side given by `tokenizer.padding_side` ("right" pads
    at the end, anything else pads at the front). `input_ids` are filled with
    the tokenizer's pad token id and `attention_mask` with 0. When `isTrain`
    is set, each sample's `target` is passed through unpadded. All fields are
    returned as `torch.long` tensors.
    """

    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain

    def __call__(self, batch):
        output = {
            "input_ids": [sample["input_ids"] for sample in batch],
            "attention_mask": [sample["attention_mask"] for sample in batch],
        }
        if self.isTrain:
            output["target"] = [sample["target"] for sample in batch]

        # Longest token sequence in this batch decides the padded width.
        longest = max(len(ids) for ids in output["input_ids"])
        pad_at_end = self.tokenizer.padding_side == "right"

        def pad(seq, fill):
            filler = (longest - len(seq)) * [fill]
            return seq + filler if pad_at_end else filler + seq

        output["input_ids"] = torch.tensor(
            [pad(seq, self.tokenizer.pad_token_id) for seq in output["input_ids"]],
            dtype=torch.long,
        )
        output["attention_mask"] = torch.tensor(
            [pad(seq, 0) for seq in output["attention_mask"]],
            dtype=torch.long,
        )
        if self.isTrain:
            output["target"] = torch.tensor(output["target"], dtype=torch.long)

        return output

def inference_fn(test_loader, model):
    """Run a sequence-level model over `test_loader` and return softmax probabilities.

    The model is switched to eval mode and moved to the global `device`; each
    batch's logits are converted with the notebook's `softmax` and the batches
    are concatenated in loader order.
    """
    model.eval()
    model.to(device)

    batch_probs = []
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device, dtype=torch.long)
        attention = batch['attention_mask'].to(device, dtype=torch.long)
        with torch.no_grad():
            logits = model(input_ids, attention)
        batch_probs.append(softmax(logits.to('cpu').numpy()))

    return np.concatenate(batch_probs)

### Deberta v2 xlarge

In [7]:
# Directory holding the fold checkpoints, the tokenizer and the saved model config.
MODEL_PATH = "../input/sq-deberta-v2-xlarge/"
CONFIG_PATH = MODEL_PATH + 'config.pth'

# The fold loop replaces "/" with "-" in this name to build checkpoint file names.
CFG.model = "microsoft/deberta-v2-xlarge"
CFG.batch_size = 16
# Folds whose checkpoints are used (fold 2 is not loaded).
CFG.trn_fold = [0, 1, 3, 4]


class FeedBackModel(nn.Module):
    """Sequence classifier: 3 logits from the first token of one chosen hidden layer."""

    def __init__(self, config_path):
        super().__init__()
        backbone_config = torch.load(config_path)
        # Every layer's states must be returned so forward() can pick one by index.
        backbone_config.update({"output_hidden_states": True})
        self.config = backbone_config
        self.model = AutoModel.from_config(backbone_config)
        self.fc = nn.Linear(backbone_config.hidden_size, 3)

    def forward(self, ids, mask):
        hidden_states = self.model(input_ids=ids, attention_mask=mask).hidden_states
        # Layer CFG.layer_cls, first token position of every sequence in the batch.
        first_token = hidden_states[CFG.layer_cls][:, 0]
        return self.fc(first_token)


tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH + 'tokenizer', use_fast=True)

# Model input per row: "<discourse type> <discourse text><SEP><full essay>".
SEP = tokenizer.sep_token
test['text'] = test['discourse_type'] + ' ' + test['discourse_text'] + SEP + test['essay_text']

collate_fn = Collate(tokenizer, isTrain=False)

test_dataset = TestDataset(test, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=CFG.num_workers,
    pin_memory=True,
    drop_last=False,
)

sq_deberta_preds_v2 = []
for fold in CFG.trn_fold:
    print("Fold {}".format(fold))

    model = FeedBackModel(config_path=CONFIG_PATH)
    checkpoint = torch.load(
        MODEL_PATH + f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
        map_location=torch.device('cpu'),
    )
    model.load_state_dict(checkpoint['model'])
    fold_probs = inference_fn(test_loader, model)
    sq_deberta_preds_v2.append(fold_probs)
    # Drop this fold's objects before the next checkpoint is loaded.
    del model, checkpoint, fold_probs
    gc.collect()
    torch.cuda.empty_cache()

# Per-fold predictions are kept un-averaged for the stacking cells.
model_preds_4 = np.array(sq_deberta_preds_v2)

Fold 0
Fold 1
Fold 3
Fold 4


### Deberta v3 large

In [8]:
# Directory holding the fold checkpoints, the tokenizer and the saved model config.
MODEL_PATH = "../input/sq-deberta-v3-large-new/"
CONFIG_PATH = MODEL_PATH + 'config.pth'

# The fold loop replaces "/" with "-" in this name to build checkpoint file names.
CFG.model = "microsoft/deberta-v3-large"
CFG.batch_size = 16
# Folds whose checkpoints are used (fold 2 is not loaded).
CFG.trn_fold = [0, 1, 3, 4]
# Folds whose checkpoint is an SWA one, loaded through `AveragedModel` in the fold loop below.
CFG.sp_fold = [0]


class FeedBackModel(nn.Module):
    """Sequence classifier reading the first token of the hidden layer `CFG.layer_cls`."""

    def __init__(self, config_path):
        super().__init__()
        loaded_config = torch.load(config_path)
        # forward() selects a layer by index, so all hidden states must be returned.
        loaded_config.update({"output_hidden_states": True})
        self.config = loaded_config
        self.model = AutoModel.from_config(loaded_config)
        self.fc = nn.Linear(loaded_config.hidden_size, 3)

    def forward(self, ids, mask):
        backbone_out = self.model(input_ids=ids, attention_mask=mask)
        chosen_layer = backbone_out.hidden_states[CFG.layer_cls]
        # Position 0 of every sequence feeds the 3-way linear head.
        return self.fc(chosen_layer[:, 0])


tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH + 'tokenizer', use_fast=True)

# Model input per row: "<discourse type> <discourse text><SEP><full essay>".
SEP = tokenizer.sep_token
test['text'] = test['discourse_type'] + ' ' + test['discourse_text'] + SEP + test['essay_text']

collate_fn = Collate(tokenizer, isTrain=False)

test_dataset = TestDataset(test, tokenizer)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         collate_fn=collate_fn,
                         num_workers=CFG.num_workers, pin_memory=True, drop_last=False)


sq_deberta_preds = []
for fold in CFG.trn_fold:
    print("Fold {}".format(fold))

    model = FeedBackModel(config_path=CONFIG_PATH)
    if fold in CFG.sp_fold:
        # SWA checkpoints store the weights of an AveragedModel wrapper, so the
        # state dict has to be loaded into (and run through) that wrapper.
        net = AveragedModel(model)
        state = torch.load(MODEL_PATH + f"{CFG.model.replace('/', '-')}_fold{fold}_best_swa.pth",
                           map_location=torch.device('cpu'))
    else:
        net = model
        state = torch.load(MODEL_PATH + f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                           map_location=torch.device('cpu'))
    net.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, net)
    sq_deberta_preds.append(prediction)
    # Release the base model as well as the wrapper: the SWA branch used to
    # keep `model` alive until the next fold had already built its own.
    del model, net, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

# Keep the per-fold predictions (not their mean): the stacking cells consume them fold by fold.
model_preds_5 = np.array(sq_deberta_preds)

Fold 0
Fold 1
Fold 3
Fold 4


# Ensemble

In [9]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from collections import Counter
from nltk.corpus import stopwords
from stop_words import get_stop_words
import joblib
import nltk
import tensorflow as tf

# Rebinds `stopwords` (imported from nltk.corpus above) to a plain list of
# English stop words from the `stop_words` package.
stopwords = list(get_stop_words('en'))

def max_repeated_word_count(text):
    """Return how often the most frequent non-stop-word occurs in `text`.

    Words are whitespace-separated tokens compared case-sensitively against
    the global `stopwords` list.

    Args:
        text: the string to analyse.

    Returns:
        The highest occurrence count, or 0 when no word is left after
        removing stop words (including for empty text).
    """
    words = [word for word in text.split() if word not in stopwords]
    # Explicit empty check replaces the IndexError-based control flow; the
    # unreachable `return max_count` (an undefined name) is dropped.
    if not words:
        return 0
    return Counter(words).most_common(1)[0][1]

def get_pos_tags(x):
    """Count the universal part-of-speech tags of the tokens in `x`.

    Returns a `Counter` mapping each tag to its number of occurrences.
    """
    tagged_tokens = nltk.pos_tag(nltk.tokenize.word_tokenize(x), tagset='universal')
    tag_counts = Counter()
    for _word, tag in tagged_tokens:
        tag_counts[tag] += 1
    return tag_counts

In [10]:
# char length
test_origin['char_length'] = test_origin['discourse_text'].apply(len)
# word count
test_origin['word_count'] = test_origin['discourse_text'].apply(lambda x: len(x.split()))
# avg word length
test_origin['avg_word_length'] = test_origin['char_length'] / test_origin['word_count']
# contains 'source'
test_origin['contains_source'] = test_origin['discourse_text'].apply(lambda x: 'source' in x.lower().split())
# contains 'i'
test_origin['contains_I'] = test_origin['discourse_text'].apply(lambda x: 'i' in x.lower().split())
# repeated word count
test_origin['max_repeated_word_count'] = test_origin['discourse_text'].apply(max_repeated_word_count)

# POS: one column per universal tag, holding the tag's count in each discourse text.
pos_counts = test_origin['discourse_text'].apply(get_pos_tags).apply(pd.Series).fillna(0)
# The stacking cells select these tag columns by name. A tag that never occurs
# in the test data would otherwise be missing and raise a KeyError there.
for required_tag in ('ADJ', 'ADV', 'PRON', 'NUM', 'CONJ'):
    if required_tag not in pos_counts.columns:
        pos_counts[required_tag] = 0.0
# Dropping any tag columns left by a previous run keeps the cell re-runnable
# without producing duplicate columns.
test_origin = pd.concat(
    [test_origin.drop(columns=pos_counts.columns, errors='ignore'), pos_counts],
    axis=1,
)

In [11]:
# Columns that are label-encoded before scaling; encoder_{i}.pkl belongs to encode_cols[i].
encode_cols = ['discourse_type', 'contains_source', 'contains_I']


# standard scaler
ss_X = joblib.load(f'../input/tf-model-0/scaler.pkl')
# tf model
model_tf = tf.keras.models.load_model(f'../input/tf-model-0/model')
# The encoders do not depend on the fold, so they are loaded once instead of on every iteration.
label_encoders = [joblib.load(f'../input/tf-model-0/encoder_{i}.pkl') for i in range(len(encode_cols))]

# Feature columns of this stacker: hand-crafted features, base models 2-5, POS counts.
cols = ['discourse_type', 'char_length', 'word_count', 'avg_word_length', 'contains_source', 'contains_I', 'max_repeated_word_count',
        'preds2_1', 'preds2_2', 'preds2_3',
        'preds3_1', 'preds3_2', 'preds3_3',
        'preds4_1', 'preds4_2', 'preds4_3',
        'preds5_1', 'preds5_2', 'preds5_3',
        'ADJ', 'ADV', 'PRON', 'NUM', 'CONJ']

y_preds_0 = []
for fold in range(4):

    data = test_origin.copy()

    # Attach this fold's three probability columns of every base model as preds<model>_<column>.
    fold_preds = [model_preds_1[fold], model_preds_2[fold], model_preds_3[fold], model_preds_4[fold], model_preds_5[fold]]
    for i, preds in enumerate(fold_preds):
        for k in range(3):
            data['preds' + str(i + 1) + '_' + str(k + 1)] = preds[:, k]

    # Explicit copy: the encoded columns are written into X below, and writing
    # into a bare column selection is chained assignment.
    X = data[cols].copy()

    for col, labelencoder in zip(encode_cols, label_encoders):
        X[col] = labelencoder.transform(X[col])

    X = ss_X.transform(X)

    # predict
    y_preds_0.append(model_tf.predict(X))

del model_tf
gc.collect()

# Average the stacker's output over the folds.
y_preds_0 = np.mean(y_preds_0, axis=0)

2022-08-23 08:49:12.055908: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-23 08:49:12.057024: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-23 08:49:12.057777: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-23 08:49:12.058888: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [12]:
# standard scaler
ss_X = joblib.load(f'../input/tf-model-1/scaler.pkl')
# tf model
model_tf = tf.keras.models.load_model(f'../input/tf-model-1/model')
# The encoders do not depend on the fold, so they are loaded once instead of on every iteration.
label_encoders = [joblib.load(f'../input/tf-model-1/encoder_{i}.pkl') for i in range(len(encode_cols))]

# Feature columns of this stacker: hand-crafted features, base models 1 and 3-5, POS counts.
cols = ['discourse_type', 'char_length', 'word_count', 'avg_word_length', 'contains_source', 'contains_I', 'max_repeated_word_count',
        'preds1_1', 'preds1_2', 'preds1_3',
        'preds3_1', 'preds3_2', 'preds3_3',
        'preds4_1', 'preds4_2', 'preds4_3',
        'preds5_1', 'preds5_2', 'preds5_3',
        'ADJ', 'ADV', 'PRON', 'NUM', 'CONJ']

y_preds_1 = []
for fold in range(4):

    data = test_origin.copy()

    # Attach this fold's three probability columns of every base model as preds<model>_<column>.
    fold_preds = [model_preds_1[fold], model_preds_2[fold], model_preds_3[fold], model_preds_4[fold], model_preds_5[fold]]
    for i, preds in enumerate(fold_preds):
        for k in range(3):
            data['preds' + str(i + 1) + '_' + str(k + 1)] = preds[:, k]

    # Explicit copy: the encoded columns are written into X below, and writing
    # into a bare column selection is chained assignment.
    X = data[cols].copy()

    for col, labelencoder in zip(encode_cols, label_encoders):
        X[col] = labelencoder.transform(X[col])

    X = ss_X.transform(X)

    # predict
    y_preds_1.append(model_tf.predict(X))

del model_tf
gc.collect()

# Average the stacker's output over the folds.
y_preds_1 = np.mean(y_preds_1, axis=0)

In [13]:
# standard scaler
ss_X = joblib.load(f'../input/tf-model-2/scaler.pkl')
# tf model
model_tf = tf.keras.models.load_model(f'../input/tf-model-2/model')
# The encoders do not depend on the fold, so they are loaded once instead of on every iteration.
label_encoders = [joblib.load(f'../input/tf-model-2/encoder_{i}.pkl') for i in range(len(encode_cols))]

# Feature columns of this stacker: hand-crafted features, base models 1-4, POS counts.
cols = ['discourse_type', 'char_length', 'word_count', 'avg_word_length', 'contains_source', 'contains_I', 'max_repeated_word_count',
        'preds1_1', 'preds1_2', 'preds1_3',
        'preds2_1', 'preds2_2', 'preds2_3',
        'preds3_1', 'preds3_2', 'preds3_3',
        'preds4_1', 'preds4_2', 'preds4_3',
        'ADJ', 'ADV', 'PRON', 'NUM', 'CONJ']

y_preds_2 = []
for fold in range(4):

    data = test_origin.copy()

    # Attach this fold's three probability columns of every base model as preds<model>_<column>.
    fold_preds = [model_preds_1[fold], model_preds_2[fold], model_preds_3[fold], model_preds_4[fold], model_preds_5[fold]]
    for i, preds in enumerate(fold_preds):
        for k in range(3):
            data['preds' + str(i + 1) + '_' + str(k + 1)] = preds[:, k]

    # Explicit copy: the encoded columns are written into X below, and writing
    # into a bare column selection is chained assignment.
    X = data[cols].copy()

    for col, labelencoder in zip(encode_cols, label_encoders):
        X[col] = labelencoder.transform(X[col])

    X = ss_X.transform(X)

    # predict
    y_preds_2.append(model_tf.predict(X))

del model_tf
gc.collect()

# Average the stacker's output over the folds.
y_preds_2 = np.mean(y_preds_2, axis=0)

## Submit

In [14]:
# Each stacker's output (weight 0.75) is blended with the fold-mean of the one
# base model that stacker did not see as a feature (weight 0.25).
# Stacker outputs are read as columns 0/1/2 = Ineffective/Adequate/Effective.
# Token-model outputs are read as columns 0/1/2 = Adequate/Effective/Ineffective.
model_preds_1_mean = np.mean(model_preds_1, axis=0)
preds_Ineffective_0 = y_preds_0[:, 0] * 0.75 + model_preds_1_mean[:, 2] * 0.25
preds_Adequate_0 = y_preds_0[:, 1] * 0.75 + model_preds_1_mean[:, 0] * 0.25
preds_Effective_0 = y_preds_0[:, 2] * 0.75 + model_preds_1_mean[:, 1] * 0.25

model_preds_2_mean = np.mean(model_preds_2, axis=0)
preds_Ineffective_1 = y_preds_1[:, 0] * 0.75 + model_preds_2_mean[:, 2] * 0.25
preds_Adequate_1 = y_preds_1[:, 1] * 0.75 + model_preds_2_mean[:, 0] * 0.25
preds_Effective_1 = y_preds_1[:, 2] * 0.75 + model_preds_2_mean[:, 1] * 0.25

# NOTE(review): the sequence model's columns are read here as
# 0/1/2 = Ineffective/Adequate/Effective, unlike the token models above --
# presumably it was trained with that label order; confirm against training.
model_preds_5_mean = np.mean(model_preds_5, axis=0)
preds_Ineffective_2 = y_preds_2[:, 0] * 0.75 + model_preds_5_mean[:, 0] * 0.25
preds_Adequate_2 = y_preds_2[:, 1] * 0.75 + model_preds_5_mean[:, 1] * 0.25
preds_Effective_2 = y_preds_2[:, 2] * 0.75 + model_preds_5_mean[:, 2] * 0.25

# Near-equal weighted average of the three blends (weights sum to 1).
preds_Ineffective = preds_Ineffective_0 * 0.33 + preds_Ineffective_1 * 0.34 + preds_Ineffective_2 * 0.33
preds_Adequate = preds_Adequate_0 * 0.33 + preds_Adequate_1 * 0.34 + preds_Adequate_2 * 0.33
preds_Effective = preds_Effective_0 * 0.33 + preds_Effective_1 * 0.34 + preds_Effective_2 * 0.33

# Fill the sample submission by position: predictions are assumed to be in the
# same row order as sample_submission.csv -- TODO confirm.
sample = pd.read_csv(INPUT_DIR + 'sample_submission.csv')

sample['Ineffective'] = preds_Ineffective
sample['Adequate'] = preds_Adequate
sample['Effective'] = preds_Effective

sample.to_csv('submission.csv', index=False)

display(sample)

Unnamed: 0,discourse_id,Ineffective,Adequate,Effective
0,a261b6e14276,0.015175,0.497955,0.48687
1,5a88900e7dc1,0.028157,0.781238,0.190605
2,9790d835736b,0.027453,0.420099,0.552448
3,75ce6d68b67b,0.080789,0.435147,0.484064
4,93578d946723,0.074161,0.510287,0.415552
5,2e214524dbe3,0.021663,0.500101,0.478236
6,84812fc2ab9f,0.021637,0.450413,0.527949
7,c668ff840720,0.060464,0.614413,0.325123
8,739a6d00f44a,0.052531,0.53701,0.410459
9,bcfae2c9a244,0.02306,0.608356,0.368584
