# RuATD22 Competition (binary task)

* https://github.com/dialogue-evaluation/RuATD
* Leaderboard: https://www.kaggle.com/c/ruatd-2022-bi/leaderboard

# Colab env

In [None]:
!pip install transformers tokenizers datasets

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd /content/drive/MyDrive/Colab\ Notebooks/kaggle/ruatd22_competion

/content/drive/MyDrive/Colab Notebooks/kaggle/ruatd22_competion


# Common env

In [1]:
import pandas as pd
import numpy as np
import torch
import pickle
from tqdm import tqdm
import os
import time
from joblib import dump, load

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import VotingClassifier, HistGradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

from scipy import stats
from transformers import (AutoModelForSequenceClassification, AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, EarlyStoppingCallback, AutoConfig)
from datasets import Dataset, load_metric, Features, ClassLabel, Value
from tokenizers.decoders import ByteLevel

from typing import Optional, Callable, Tuple, List

In [24]:
# Подготовка набора данных для BERT

def build_dataset(data: pd.DataFrame, tokenizer: AutoTokenizer, max_length=512, with_label=True):
    if with_label:
        class_names = ["M", "H"]
        features = Features({'Text': Value('string'), 'label': ClassLabel(names=class_names, num_classes=2)})
        dataset = Dataset.from_pandas(data, preserve_index=False, features=features)
        dataset = dataset.map(lambda e: tokenizer(e['Text'], truncation=True, padding='max_length', max_length=max_length), batched=True)    
        dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
    else:
        dataset = Dataset.from_pandas(data, preserve_index=False)
        dataset = dataset.map(lambda e: tokenizer(e['Text'], truncation=True, padding='max_length', max_length=max_length), batched=True)    
        dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask'])
    return dataset

In [2]:
# Сохраняем файл с предсказаниями переданного классификатора

def submission(clf, X: np.ndarray, df: pd.DataFrame, out_suffix: str=""):
    clf_pred = clf.predict(X)
    df['label'] = clf_pred
    df.loc[df['label'] == 0, 'Class'] = 'M'
    df.loc[df['label'] == 1, 'Class'] = 'H'
    df.to_csv(f'submission_{out_suffix}.csv', columns=['Id','Class'], index=False)  

In [3]:
ds_path = './dataset/'
cp_path = 'test_trainer/checkpoint-8068'

# RuATD Dataset

In [4]:
df_train = pd.read_csv(os.path.join(ds_path, 'train.csv'))
df_train.loc[df_train['Class'] == 'M', 'label'] = 0
df_train.loc[df_train['Class'] == 'H', 'label'] = 1
df_train = df_train.convert_dtypes()

y_train = df_train['label'].to_numpy(dtype=np.int8)

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129066 entries, 0 to 129065
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Id      129066 non-null  Int64 
 1   Text    129066 non-null  string
 2   Class   129066 non-null  string
 3   label   129066 non-null  Int64 
dtypes: Int64(2), string(2)
memory usage: 4.2 MB


In [5]:
df_val = pd.read_csv(os.path.join(ds_path, 'val.csv'))
df_val.loc[df_val['Class'] == 'M', 'label'] = 0
df_val.loc[df_val['Class'] == 'H', 'label'] = 1
df_val = df_val.convert_dtypes()

y_val = df_val['label'].to_numpy(dtype=np.int8)

df_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21511 entries, 0 to 21510
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Id      21511 non-null  Int64 
 1   Text    21511 non-null  string
 2   Class   21511 non-null  string
 3   label   21511 non-null  Int64 
dtypes: Int64(2), string(2)
memory usage: 714.4 KB


In [6]:
df_test = pd.read_csv(os.path.join(ds_path, 'test.csv'))

## Features Dataset

In [7]:
feats_path = './others/Data/features'

In [8]:
# train

with open(os.path.join(feats_path, 'train_feats.pkl'), 'rb') as f:
    train_feats = pickle.load(f)

with open(os.path.join(feats_path, 'train_QFT.pkl'), 'rb') as f:
    train_qfeats = pickle.load(f)

x_train_feats = np.hstack((train_feats, train_qfeats))

In [9]:
# val

with open(os.path.join(feats_path, 'val_feats.pkl'), 'rb') as f:
    val_feats = pickle.load(f)

with open(os.path.join(feats_path, 'val_QFT.pkl'), 'rb') as f:
    val_qfeats = pickle.load(f)

x_val_feats = np.hstack((val_feats, val_qfeats))


In [10]:
# test

with open(os.path.join(feats_path, 'test_feats.pkl'), 'rb') as f:
    test_feats = pickle.load(f)

with open(os.path.join(feats_path, 'test_QFT.pkl'), 'rb') as f:
   test_qfeats = pickle.load(f)

x_test_feats = np.hstack((test_feats, test_qfeats))

# 1. GLTR

In [3]:
def top_k_logits(logits, k):
    """
    Filters logits to only the top k choices
    from https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_gpt2.py
    """
    if k == 0:
        return logits
    values, _ = torch.topk(logits, k)
    min_values = values[:, -1]
    return torch.where(logits < min_values,
                       torch.ones_like(logits, dtype=logits.dtype) * -1e10,
                       logits)

In [4]:
class AbstractLanguageChecker:
    """
    Abstract Class that defines the Backend API of GLTR.

    To extend the GLTR interface, you need to inherit this and
    fill in the defined functions.
    """

    def __init__(self):
        """
        In the subclass, you need to load all necessary components
        for the other functions.
        Typically, this will comprise a tokenizer and a model.
        """
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

    def check_probabilities(self, in_text, topk=40):
        """
        Function that GLTR interacts with to check the probabilities of words

        Params:
        - in_text: str -- The text that you want to check
        - topk: int -- Your desired truncation of the head of the distribution

        Output:
        - payload: dict -- The wrapper for results in this function, described below

        Payload values
        ==============
        bpe_strings: list of str -- Each individual token in the text
        real_topk: list of tuples -- (ranking, prob) of each token
        pred_topk: list of list of tuple -- (word, prob) for all topk
        """
        raise NotImplementedError

    def postprocess(self, token):
        """
        clean up the tokens from any special chars and encode
        leading space by UTF-8 code '\u0120', linebreak with UTF-8 code 266 '\u010A'
        :param token:  str -- raw token text
        :return: str -- cleaned and re-encoded token text
        """
        raise NotImplementedError




In [5]:
class RuLM(AbstractLanguageChecker):
    def __init__(self, model_name_or_path="sberbank-ai/rugpt3small_based_on_gpt2"):
        super(RuLM, self).__init__()
        self.enc = AutoTokenizer.from_pretrained(model_name_or_path)
        self.enc.add_special_tokens({'bos_token': '<s>'})

        self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
        self.model.resize_token_embeddings(len(self.enc))
        print("Device:", self.device)
        self.model.to(self.device)
        self.model.eval()
        self.start_token = self.enc('<s>', return_tensors='pt').data['input_ids'][0]
        self.decoder = ByteLevel()
        # self.start_token = self.enc.eos_token_id
        print("Loaded GPT-3 model!")

    def check_probabilities(self, in_text, topk=40):
        # Process input
        token_ids = self.enc(in_text, return_tensors='pt').data['input_ids'][0]
        token_ids = torch.concat([self.start_token, token_ids])
        # Forward through the model
        output = self.model(token_ids.to(self.device))
        all_logits = output.logits[:-1].detach().squeeze()
        # construct target and pred
        # yhat = torch.softmax(logits[0, :-1], dim=-1)
        all_probs = torch.softmax(all_logits, dim=1)

        y = token_ids[1:]
        # Sort the predictions for each timestep
        sorted_preds = torch.argsort(all_probs, dim=1, descending=True).cpu()
        # [(pos, prob), ...]
        real_topk_pos = list(
            [int(np.where(sorted_preds[i] == y[i].item())[0][0])
             for i in range(y.shape[0])])
        real_topk_probs = all_probs[np.arange(0, y.shape[0], 1), y].data.cpu().numpy().tolist()
        real_topk_probs = list(map(lambda x: round(x, 5), real_topk_probs))

        real_topk = list(zip(real_topk_pos, real_topk_probs))
        # [str, str, ...]
        bpe_strings = [self.decoder.decode([self.enc.convert_ids_to_tokens(tok.item())]) for tok in token_ids[:]]

        bpe_strings = [self.postprocess(s) for s in bpe_strings]

        topk_prob_values, topk_prob_inds = torch.topk(all_probs, k=topk, dim=1)

        pred_topk = [list(zip([self.decoder.decode(self.enc.convert_ids_to_tokens(tok.item())) for tok in topk_prob_inds[i]] ,
                              topk_prob_values[i].data.cpu().numpy().tolist()
                              )) for i in range(y.shape[0])]
        pred_topk = [[(self.postprocess(t[0]), t[1]) for t in pred] for pred in pred_topk]


        # pred_topk = []
        payload = {'bpe_strings': bpe_strings,
                   'real_topk': real_topk,
                   'pred_topk': pred_topk}
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return payload

    def sample_unconditional(self, length=100, topk=5, temperature=1.0):
        '''
        Sample `length` words from the model.
        Code strongly inspired by
        https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_gpt2.py

        '''
        context = torch.full((1, 1),
                             self.enc.encoder['<s>'],
                             device=self.device,
                             dtype=torch.long)
        prev = context
        output = context
        # Forward through the model
        with torch.no_grad():
            for i in range(length):
                logits = self.model(prev).logits
                logits = logits[:, -1, :] / temperature
                # Filter predictions to topk and softmax
                probs = torch.softmax(top_k_logits(logits, k=topk),
                                      dim=-1)
                # Sample
                prev = torch.multinomial(probs, num_samples=1)
                # Construct output
                output = torch.cat((output, prev), dim=1)

        output_text = self.enc.decode(output[0].tolist())
        return output_text

    def postprocess(self, token):
        with_space = False
        with_break = False
        if token.startswith('Ġ'):
            with_space = True
            token = token[1:]
            # print(token)
        elif token.startswith('â'):
            token = ' '
        elif token.startswith('Ċ'):
            token = ' '
            with_break = True

        token = '-' if token.startswith('â') else token
        token = '“' if token.startswith('ľ') else token
        token = '”' if token.startswith('Ŀ') else token
        token = "'" if token.startswith('Ļ') else token

        if with_space:
            token = '\u0120' + token
        if with_break:
            token = '\u010A' + token

        return token

In [6]:
def real_topk_count(payload: dict) -> dict:    
    ids = np.array([x[0] for x in payload['real_topk']])
    topk_10 = ids[ids < 10]
    topk_100 = ids[np.where(np.logical_and(ids >= 10, ids < 100))]
    topk_1000 = ids[np.where(np.logical_and(ids >= 100, ids < 1000))]
    topk_over_1000 = ids[ids > 1000]

    frac_p = [payload['real_topk'][i][1] / np.max([x[1] for x in payload['pred_topk'][i]]) for i in range(len(payload['real_topk']))]

    threshold = 10
    pred_probs_normal = [[ x[1] for x in payload['pred_topk'][i][:threshold] ] for i in range(len(payload['pred_topk']))] 
    pred_probs_normal = [[y / sum(x) for y in x] for x in pred_probs_normal]  
    frac_entr = stats.entropy(pred_probs_normal, axis=1)

    return {'topk_10': np.round(len(topk_10) / len(ids), 4),
            'topk_100': np.round(len(topk_100) / len(ids), 4),
            'topk_1000': np.round(len(topk_1000) / len(ids), 4),
            'topk_over_1000': np.round(len(topk_over_1000) / len(ids), 4),
            'frac_p_median': np.round(np.median(frac_p), 4),
            'frac_entr_median': np.round(np.median(frac_entr), 4),
            'tokens_size': len(ids)}

In [None]:
# Выбираем часть примеров из выборки  в одинаковой пропорции классов

idx_limit = 10000
df_train_limited = pd.concat([df_train[df_train['Class'] == 'M'][:idx_limit], df_train[df_train['Class'] == 'H'][:idx_limit]])

In [None]:
# Получаем распределения токенов по примерам выборки

lm = RuLM()
train_payload = []
for raw_text in tqdm(df_train_limited['Text']):
    payload = lm.check_probabilities(raw_text, topk=20)
    train_payload.append(payload)

In [None]:
with open(os.path.join(ds_path, 'train_payload.pkl'), 'wb') as f:    
    pickle.dump(train_payload, f)

In [None]:
with open(os.path.join(ds_path, 'train_payload.pkl'), 'rb') as f:    
    new_train_payload = pickle.load(f)

print(len(new_train_payload))

In [None]:
# извлекаем признаки из распределений токенов

df_topk = pd.DataFrame.from_dict([real_topk_count(payload) for payload in tqdm(new_train_payload)])

In [None]:
df_train_limited.reset_index(drop=True, inplace=True)

In [None]:
df_train_topk = pd.DataFrame.join(df_train_limited, df_topk)

In [None]:
df_train_topk.to_pickle(os.path.join(ds_path, 'df_train_topk.pkl'))

In [12]:
df_new = pd.read_pickle(os.path.join(ds_path, 'df_train_topk.pkl')) 

In [13]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Id                20000 non-null  int64  
 1   Text              20000 non-null  object 
 2   Class             20000 non-null  object 
 3   topk_10           20000 non-null  float64
 4   topk_100          20000 non-null  float64
 5   topk_1000         20000 non-null  float64
 6   topk_over_1000    20000 non-null  float64
 7   frac_p_median     20000 non-null  float64
 8   frac_entr_median  20000 non-null  float64
 9   tokens_size       20000 non-null  int64  
dtypes: float64(6), int64(2), object(2)
memory usage: 1.5+ MB


In [14]:
df_new.head()

Unnamed: 0,Id,Text,Class,topk_10,topk_100,topk_1000,topk_over_1000,frac_p_median,frac_entr_median,tokens_size
0,3,Минстрой обозначил способы снижения энергоемко...,M,0.4167,0.3333,0.0833,0.1667,0.1913,1.9226,12
1,4,В конце 1873 года военный суд вынес решение по...,M,0.5789,0.1053,0.1579,0.1579,0.0352,1.6375,19
2,13,Земная атмосфера не имеет ничего общего со сти...,M,0.9219,0.0547,0.0156,0.0078,0.4203,1.6183,128
3,18,Жертва вводит данные банковской карты или крип...,M,0.4286,0.2143,0.1429,0.2143,0.0676,1.7483,14
4,26,"В голове в духе 461-го года, а я сижу дома и д...",M,0.6494,0.3384,0.0076,0.0046,0.1923,1.9237,656


## Model

In [15]:
df_new.loc[:,'topk_10':'tokens_size'].head()

Unnamed: 0,topk_10,topk_100,topk_1000,topk_over_1000,frac_p_median,frac_entr_median,tokens_size
0,0.4167,0.3333,0.0833,0.1667,0.1913,1.9226,12
1,0.5789,0.1053,0.1579,0.1579,0.0352,1.6375,19
2,0.9219,0.0547,0.0156,0.0078,0.4203,1.6183,128
3,0.4286,0.2143,0.1429,0.2143,0.0676,1.7483,14
4,0.6494,0.3384,0.0076,0.0046,0.1923,1.9237,656


In [16]:
x_train, x_test, y_train, y_test = train_test_split(df_new.loc[:,'topk_10':'tokens_size'], df_new['Class'], test_size=0.1)

In [17]:
encoder = LabelEncoder()
encoder.fit(y_train)

LabelEncoder()

In [18]:
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)

In [19]:
clf = LogisticRegression()
clf.fit(x_train, y_train_enc)

LogisticRegression()

In [20]:
y_test_pred = clf.predict(x_test)

In [24]:
print(classification_report(y_test_enc, y_test_pred))
print(confusion_matrix(y_test_enc, y_test_pred))

              precision    recall  f1-score   support

           0       0.59      0.48      0.53      1023
           1       0.54      0.65      0.59       977

    accuracy                           0.56      2000
   macro avg       0.57      0.57      0.56      2000
weighted avg       0.57      0.56      0.56      2000

[[489 534]
 [338 639]]


# 2. Feature Extraction

In [11]:
def clf_eval(clf, x, y_true):
    y_pred = clf.predict(x)
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))

In [12]:
x_train = x_train_feats
x_val = x_val_feats
x_test = x_test_feats

In [13]:
ct = ColumnTransformer([
        ('scaler', StandardScaler(), list(range(x_train.shape[1])))
    ], remainder='passthrough')

x_train_scaled = ct.fit_transform(x_train)
x_val_scaled = ct.transform(x_val)
x_test_scaled = ct.transform(x_test)

In [15]:
clf_lr = LogisticRegression(max_iter=1000, random_state=0, verbose=0).fit(x_train_scaled, y_train)
clf_eval(clf_lr, x_val_scaled, y_val)

              precision    recall  f1-score   support

           0       0.74      0.76      0.75     10755
           1       0.75      0.73      0.74     10756

    accuracy                           0.74     21511
   macro avg       0.74      0.74      0.74     21511
weighted avg       0.74      0.74      0.74     21511

[[8132 2623]
 [2915 7841]]


In [None]:
lr_pred = clf_lr.predict(x_val_scaled)
np.save('val_logreg_pred.npy', lr_pred)

In [21]:
clf_hgboost = HistGradientBoostingClassifier(random_state=0).fit(x_train_scaled, y_train)
clf_eval(clf_hgboost, x_val_scaled, y_val)

              precision    recall  f1-score   support

           0       0.76      0.75      0.76     10755
           1       0.75      0.76      0.76     10756

    accuracy                           0.76     21511
   macro avg       0.76      0.76      0.76     21511
weighted avg       0.76      0.76      0.76     21511

[[8079 2676]
 [2540 8216]]


In [22]:
submission(clf_hgboost, x_test_scaled, df_test, 'hgboost')

In [None]:
hgboost_pred = clf_hgboost.predict(x_val_scaled)
np.save('val_hgboost_pred.npy', hgboost_pred)

# 3. BERT

Обучение выполнялось скриптами из каталога ./bert/

In [11]:
model_loaded = AutoModelForSequenceClassification.from_pretrained(cp_path)
tokenizer_loaded = AutoTokenizer.from_pretrained(cp_path)

## Evaluate

In [None]:
ds_val = build_dataset(df_val, tokenizer_loaded, max_length=200, with_label=True)

  0%|          | 0/22 [00:00<?, ?ba/s]

In [None]:
training_args = TrainingArguments("eval_trainer", 
                                per_device_train_batch_size=64, 
                                per_device_eval_batch_size=64,
                                do_train=False,
                                do_eval=False,
                                report_to="none"
                                )

trainer = Trainer(model=model_loaded, 
                    args=training_args)

val_preds = trainer.predict(ds_val).predictions

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: Text.
***** Running Prediction *****
  Num examples = 21511
  Batch size = 64


In [None]:
np.save(os.path.join(ds_path, "val_probs_bert"), val_preds) # сохраняем логиты для использования в soft VotingClassifier 

In [None]:
val_preds = np.argmax(val_preds, axis=1) 
np.save(os.path.join(ds_path, "val_preds_bert"), val_preds)

## Submission

In [None]:
df_test = pd.read_csv(os.path.join(ds_path, 'test.csv'))

In [None]:
ds_test = build_dataset(df_test, tokenizer_loaded, max_length=200, with_label=False)

  0%|          | 0/65 [00:00<?, ?ba/s]

In [None]:
training_args = TrainingArguments("test_trainer", 
                                per_device_train_batch_size=64, 
                                per_device_eval_batch_size=64,
                                do_train=False,
                                do_eval=False,
                                report_to="none"
                                )

trainer = Trainer(model=model_loaded, args=training_args)
test_preds = trainer.predict(ds_test).predictions

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: Text, Id.
***** Running Prediction *****
  Num examples = 64533
  Batch size = 64


In [None]:
np.save(os.path.join(ds_path, "test_bert_prob"), test_preds) # сохраняем логиты для использования в soft VotingClassifier

[ 4.5222025 -4.2354918]


In [None]:
test_preds_labels = np.argmax(test_preds, axis=1)
np.save(os.path.join(ds_path, "test_bert_pred"), test_preds_labels) 

In [None]:
df_test['label'] = test_preds_labels

In [None]:
df_test.loc[df_test['label'] == 0, 'Class'] = 'M'
df_test.loc[df_test['label'] == 1, 'Class'] = 'H'

In [None]:
df_test.to_csv('submission.csv', columns=['Id','Class'], index=False) 

# 4. Ансамбль классификаторов

In [None]:
x_train = x_train_feats
x_val = x_val_feats
x_test = x_test_feats

In [None]:
ct = ColumnTransformer([
        ('scaler', StandardScaler(), list(range(x_train.shape[1])))
    ], remainder='passthrough')

x_train_scaled = ct.fit_transform(x_train)
x_val_scaled = ct.transform(x_val)
x_test_scaled = ct.transform(x_test)

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# По факту класс для классификатора BERT в работе не использовался, вместо него применялся класс заглушка для ускорения эксперимента

class BertClassifiier(BaseEstimator, ClassifierMixin):
    def __init__(
            self,
            bert_tokenizer,
            bert_model,
            max_length: int = 512):
        
        self.tokenizer = bert_tokenizer
        self.model = bert_model
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.eval().to(self.device)
        self.max_length = max_length
   
    def fit(self, X, y=None):
        return self

    def predict_proba(self, X, y=None):
        dataset = build_dataset(X, self.tokenizer, max_length=200, with_label=False)
        training_args = TrainingArguments("eval_trainer", 
                                per_device_train_batch_size=64, 
                                per_device_eval_batch_size=64,
                                do_train=False,
                                do_eval=False,
                                report_to="none"
                                )
        
        trainer = Trainer(model=self.model, args=training_args)
        
        return trainer.predict(dataset).predictions

In [18]:
# Классификатор Bert заглушка, которая работает уже готовыми предсказаниями BERT. Сделано для ускорение эксперимента

class DummyBertClassifiier(BaseEstimator, ClassifierMixin):    
    def __init__(self, dataset: str='val'):
        self.dataset = dataset

    def fit(self, X, y=None):
        """Классификатор не учится, потому что использует готовые предсказания"""
        return self

    def predict_proba(self, X, y=None):
        result=None
        if self.dataset == 'val':
            result = np.load("val_bert_prob.npy")
        elif self.dataset == 'test':
            result = np.load("test_bert_prob.npy")         
        return result

In [19]:
clf_bert_dummy = DummyBertClassifiier(dataset='test')
clf_hgboost = HistGradientBoostingClassifier(random_state=0)
clf_lr = LogisticRegression(max_iter=5000, random_state=0, verbose=0)

eclf_soft = VotingClassifier(estimators=[('hist_gb', clf_hgboost), ('lr', clf_lr), ('bert', clf_bert_dummy)], voting='soft')
eclf_soft.fit(x_train_scaled, y_train)

VotingClassifier(estimators=[('hist_gb',
                              HistGradientBoostingClassifier(random_state=0)),
                             ('lr',
                              LogisticRegression(max_iter=5000,
                                                 random_state=0)),
                             ('bert', DummyBertClassifiier(dataset='test'))],
                 voting='soft')

In [25]:
submission(clf=eclf_soft, X=x_test_scaled, df=df_test, out_suffix='eclf_soft')

# 5. Комбинация признаков

In [26]:
model_loaded = AutoModelForSequenceClassification.from_pretrained(cp_path)
tokenizer_loaded = AutoTokenizer.from_pretrained(cp_path)

In [27]:
ds_train = build_dataset(df_train, tokenizer_loaded, max_length=200, with_label=False)
ds_val = build_dataset(df_val, tokenizer_loaded, max_length=200, with_label=False)
ds_test = build_dataset(df_test, tokenizer_loaded, max_length=200, with_label=False)
train_loader = torch.utils.data.DataLoader(ds_train, shuffle=False, batch_size=32)
val_loader = torch.utils.data.DataLoader(ds_val, shuffle=False, batch_size=32)
test_loader = torch.utils.data.DataLoader(ds_test, shuffle=False, batch_size=32)

100%|██████████| 130/130 [00:18<00:00,  6.99ba/s]
100%|██████████| 22/22 [00:04<00:00,  5.09ba/s]
100%|██████████| 65/65 [00:10<00:00,  6.11ba/s]


In [28]:
# Берутся все слои Bert без выходного слоя классификатора

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

model_nohead = torch.nn.Sequential(list(model_loaded.children())[0])
model_nohead.eval().to(device)

cpu


Sequential(
  (0): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
   

In [None]:
with torch.no_grad():
        x_train_bert = torch.vstack([model_nohead(batch['input_ids'].to(device)).last_hidden_state[:,0].detach().to('cpu') for batch in tqdm(train_loader)])

np.save(os.path.join(ds_path, 'x_train_emb.npy'), x_train_bert.numpy())

100%|██████████| 4034/4034 [27:00<00:00,  2.49it/s]


In [None]:
with torch.no_grad():
        x_val_bert = torch.vstack([model_nohead(batch['input_ids'].to(device)).last_hidden_state[:,0].detach().to('cpu') for batch in tqdm(val_loader)])
np.save(os.path.join(ds_path, 'x_val_emb.npy'), x_val_bert.numpy())

100%|██████████| 673/673 [04:29<00:00,  2.50it/s]


In [16]:
with torch.no_grad():
        x_test_bert = torch.vstack([model_nohead(batch['input_ids'].to(device)).last_hidden_state[:,0].detach().to('cpu') for batch in tqdm(test_loader)])

np.save(os.path.join(ds_path, 'x_test_emb.npy'), x_test_bert.numpy())

100%|██████████| 2017/2017 [25:01<00:00,  1.34it/s]


In [29]:
x_train_emb = np.load(os.path.join(ds_path, 'x_train_emb.npy'))
x_val_emb = np.load(os.path.join(ds_path, 'x_val_emb.npy'))
x_test_emb = np.load(os.path.join(ds_path, 'x_test_emb.npy'))

x_train = np.hstack((x_train_feats, x_train_emb))
print(x_train_emb.shape, x_train_feats.shape, x_train.shape)

x_val = np.hstack((x_val_feats, x_val_emb))
print(x_val_emb.shape, x_val_feats.shape, x_val.shape)

x_test = np.hstack((x_test_feats, x_test_emb))
print(x_test_emb.shape, x_test_feats.shape, x_test.shape)

(129066, 768) (129066, 219) (129066, 987)
(21511, 768) (21511, 219) (21511, 987)
(64533, 768) (64533, 219) (64533, 987)


In [30]:
ct = ColumnTransformer([
        ('scaler', StandardScaler(), list(range(x_train.shape[1])))
    ], remainder='passthrough')

x_train_scaled = ct.fit_transform(x_train)
x_val_scaled = ct.transform(x_val)
x_test_scaled = ct.transform(x_test)

In [31]:
clf_lr = LogisticRegression(max_iter=5000, random_state=0, verbose=0).fit(x_train_scaled, y_train)
clf_eval(clf_lr, x_val_scaled, y_val)

              precision    recall  f1-score   support

           0       0.82      0.82      0.82     10755
           1       0.82      0.82      0.82     10756

    accuracy                           0.82     21511
   macro avg       0.82      0.82      0.82     21511
weighted avg       0.82      0.82      0.82     21511

[[8769 1986]
 [1912 8844]]


In [32]:
dump(clf_lr, 'models/logreg.joblib')

['models/logreg.joblib']

In [35]:
clf_lr= load('models/logreg.joblib')
print(clf_lr.score(x_val_scaled, y_val))

submission(clf_lr, x_test_scaled, df_test, 'logreg2')

0.8187903863139789
