# LM generated text detection

In [None]:
!pip install transformers tokenizers datasets

In [1]:
import pandas as pd
import numpy as np
import torch
import pickle
from tqdm import tqdm
import os
import time
from joblib import dump, load

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

from scipy import stats
from transformers import (AutoModelForSequenceClassification, AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, EarlyStoppingCallback, AutoConfig)
from datasets import Dataset, load_metric, Features, ClassLabel, Value
from tokenizers.decoders import ByteLevel

from typing import Optional, Callable, Tuple, List

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd /content/drive/MyDrive/Colab\ Notebooks/kaggle/ruatd22_competion

/content/drive/MyDrive/Colab Notebooks/kaggle/ruatd22_competion


In [22]:
def submission(clf, X: np.ndarray, df: pd.DataFrame, out_suffix: str=""):
    clf_pred = clf.predict(X)
    df['label'] = clf_pred
    df.loc[df_test['label'] == 0, 'Class'] = 'M'
    df.loc[df_test['label'] == 1, 'Class'] = 'H'
    df_test.to_csv(f'submission_{out_suffix}.csv', columns=['Id','Class'], index=False)  

# RuATD Dataset

In [2]:
df_train = pd.read_csv('train.csv')
df_train.loc[df_train['Class'] == 'M', 'label'] = 0
df_train.loc[df_train['Class'] == 'H', 'label'] = 1
df_train = df_train.convert_dtypes()
df_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129066 entries, 0 to 129065
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Id      129066 non-null  Int64 
 1   Text    129066 non-null  string
 2   Class   129066 non-null  string
 3   label   129066 non-null  Int64 
dtypes: Int64(2), string(2)
memory usage: 4.2 MB


In [3]:
df_val = pd.read_csv('val.csv')
df_val.loc[df_val['Class'] == 'M', 'label'] = 0
df_val.loc[df_val['Class'] == 'H', 'label'] = 1
df_val = df_val.convert_dtypes()
df_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21511 entries, 0 to 21510
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Id      21511 non-null  Int64 
 1   Text    21511 non-null  string
 2   Class   21511 non-null  string
 3   label   21511 non-null  Int64 
dtypes: Int64(2), string(2)
memory usage: 714.4 KB


In [4]:
df_test = pd.read_csv('test.csv')

## Stats Features Dataset

In [5]:
feats_path = './others/Data/features'

In [6]:
# train

y_train = df_train['label'].to_numpy(dtype=np.int8)
with open(os.path.join(feats_path, 'train_feats.pkl'), 'rb') as f:
    train_feats = pickle.load(f)

with open(os.path.join(feats_path, 'train_QFT.pkl'), 'rb') as f:
    train_qfeats = pickle.load(f)

x_train_feats = np.hstack((train_feats, train_qfeats))

In [7]:
x_train_emb = np.load('x_train_emb.npy')
x_val_emb = np.load('x_val_emb.npy')
x_test_emb = np.load('x_test_emb.npy')

In [8]:
# val

y_val = df_val['label'].to_numpy(dtype=np.int8)

with open(os.path.join(feats_path, 'val_feats.pkl'), 'rb') as f:
    val_feats = pickle.load(f)

with open(os.path.join(feats_path, 'val_QFT.pkl'), 'rb') as f:
    val_qfeats = pickle.load(f)

x_val_feats = np.hstack((val_feats, val_qfeats))


In [9]:
# test

with open(os.path.join(feats_path, 'test_feats.pkl'), 'rb') as f:
    test_feats = pickle.load(f)

with open(os.path.join(feats_path, 'test_QFT.pkl'), 'rb') as f:
   test_qfeats = pickle.load(f)

x_test_feats = np.hstack((test_feats, test_qfeats))

In [10]:
x_train = np.hstack((x_train_feats, x_train_emb))
print(x_train_emb.shape, x_train_feats.shape, x_train.shape)

(129066, 768) (129066, 219) (129066, 987)


In [11]:
ct = ColumnTransformer([
        ('scaler', StandardScaler(), list(range(x_train.shape[1])))
    ], remainder='passthrough')

x_train_scaled = ct.fit_transform(x_train)

In [12]:
print(np.max(x_train), np.min(x_train))
print(np.max(x_train_scaled), np.min(x_train_scaled))

600000000.0 -3265.825
359.2561760081403 -60.314761350217026


In [13]:
x_val = np.hstack((x_val_feats, x_val_emb))
print(x_val_emb.shape, x_val_feats.shape, x_val.shape)

(21511, 768) (21511, 219) (21511, 987)


In [14]:
x_val_scaled = ct.transform(x_val)

In [15]:
print(np.max(x_val), np.min(x_val))
print(np.max(x_val_scaled), np.min(x_val_scaled))

2000000000.0 -1741.5025
1122.1494566910153 -32.06115424534341


In [17]:
x_test = np.hstack((x_test_feats, x_test_emb))
print(x_test_emb.shape, x_test_feats.shape, x_test.shape)

x_test_scaled = ct.transform(x_test)

(64533, 768) (64533, 219) (64533, 987)


# 1. GLTR

In [None]:
def top_k_logits(logits, k):
    """
    Filters logits to only the top k choices
    from https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_gpt2.py
    """
    if k == 0:
        return logits
    values, _ = torch.topk(logits, k)
    min_values = values[:, -1]
    return torch.where(logits < min_values,
                       torch.ones_like(logits, dtype=logits.dtype) * -1e10,
                       logits)

In [None]:
class AbstractLanguageChecker:
    """
    Abstract Class that defines the Backend API of GLTR.

    To extend the GLTR interface, you need to inherit this and
    fill in the defined functions.
    """

    def __init__(self):
        """
        In the subclass, you need to load all necessary components
        for the other functions.
        Typically, this will comprise a tokenizer and a model.
        """
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

    def check_probabilities(self, in_text, topk=40):
        """
        Function that GLTR interacts with to check the probabilities of words

        Params:
        - in_text: str -- The text that you want to check
        - topk: int -- Your desired truncation of the head of the distribution

        Output:
        - payload: dict -- The wrapper for results in this function, described below

        Payload values
        ==============
        bpe_strings: list of str -- Each individual token in the text
        real_topk: list of tuples -- (ranking, prob) of each token
        pred_topk: list of list of tuple -- (word, prob) for all topk
        """
        raise NotImplementedError

    def postprocess(self, token):
        """
        clean up the tokens from any special chars and encode
        leading space by UTF-8 code '\u0120', linebreak with UTF-8 code 266 '\u010A'
        :param token:  str -- raw token text
        :return: str -- cleaned and re-encoded token text
        """
        raise NotImplementedError




In [None]:
class RuLM(AbstractLanguageChecker):
    def __init__(self, model_name_or_path="sberbank-ai/rugpt3small_based_on_gpt2"):
        super(RuLM, self).__init__()
        self.enc = AutoTokenizer.from_pretrained(model_name_or_path)
        self.enc.add_special_tokens({'bos_token': '<s>'})

        self.model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
        self.model.resize_token_embeddings(len(self.enc))
        print("Device:", self.device)
        self.model.to(self.device)
        self.model.eval()
        self.start_token = self.enc('<s>', return_tensors='pt').data['input_ids'][0]
        self.decoder = ByteLevel()
        # self.start_token = self.enc.eos_token_id
        print("Loaded GPT-3 model!")

    def check_probabilities(self, in_text, topk=40):
        # Process input
        token_ids = self.enc(in_text, return_tensors='pt').data['input_ids'][0]
        token_ids = torch.concat([self.start_token, token_ids])
        # Forward through the model
        output = self.model(token_ids.to(self.device))
        all_logits = output.logits[:-1].detach().squeeze()
        # construct target and pred
        # yhat = torch.softmax(logits[0, :-1], dim=-1)
        all_probs = torch.softmax(all_logits, dim=1)

        y = token_ids[1:]
        # Sort the predictions for each timestep
        sorted_preds = torch.argsort(all_probs, dim=1, descending=True).cpu()
        # [(pos, prob), ...]
        real_topk_pos = list(
            [int(np.where(sorted_preds[i] == y[i].item())[0][0])
             for i in range(y.shape[0])])
        real_topk_probs = all_probs[np.arange(0, y.shape[0], 1), y].data.cpu().numpy().tolist()
        real_topk_probs = list(map(lambda x: round(x, 5), real_topk_probs))

        real_topk = list(zip(real_topk_pos, real_topk_probs))
        # [str, str, ...]
        bpe_strings = [self.decoder.decode([self.enc.convert_ids_to_tokens(tok.item())]) for tok in token_ids[:]]

        bpe_strings = [self.postprocess(s) for s in bpe_strings]

        topk_prob_values, topk_prob_inds = torch.topk(all_probs, k=topk, dim=1)

        pred_topk = [list(zip([self.decoder.decode(self.enc.convert_ids_to_tokens(tok.item())) for tok in topk_prob_inds[i]] ,
                              topk_prob_values[i].data.cpu().numpy().tolist()
                              )) for i in range(y.shape[0])]
        pred_topk = [[(self.postprocess(t[0]), t[1]) for t in pred] for pred in pred_topk]


        # pred_topk = []
        payload = {'bpe_strings': bpe_strings,
                   'real_topk': real_topk,
                   'pred_topk': pred_topk}
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return payload

    def sample_unconditional(self, length=100, topk=5, temperature=1.0):
        '''
        Sample `length` words from the model.
        Code strongly inspired by
        https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_gpt2.py

        '''
        context = torch.full((1, 1),
                             self.enc.encoder['<s>'],
                             device=self.device,
                             dtype=torch.long)
        prev = context
        output = context
        # Forward through the model
        with torch.no_grad():
            for i in range(length):
                logits = self.model(prev).logits
                logits = logits[:, -1, :] / temperature
                # Filter predictions to topk and softmax
                probs = torch.softmax(top_k_logits(logits, k=topk),
                                      dim=-1)
                # Sample
                prev = torch.multinomial(probs, num_samples=1)
                # Construct output
                output = torch.cat((output, prev), dim=1)

        output_text = self.enc.decode(output[0].tolist())
        return output_text

    def postprocess(self, token):
        with_space = False
        with_break = False
        if token.startswith('Ġ'):
            with_space = True
            token = token[1:]
            # print(token)
        elif token.startswith('â'):
            token = ' '
        elif token.startswith('Ċ'):
            token = ' '
            with_break = True

        token = '-' if token.startswith('â') else token
        token = '“' if token.startswith('ľ') else token
        token = '”' if token.startswith('Ŀ') else token
        token = "'" if token.startswith('Ļ') else token

        if with_space:
            token = '\u0120' + token
        if with_break:
            token = '\u010A' + token

        return token

In [None]:
def real_topk_count(payload: dict) -> dict:    
    ids = np.array([x[0] for x in payload['real_topk']])
    topk_10 = ids[ids < 10]
    topk_100 = ids[np.where(np.logical_and(ids >= 10, ids < 100))]
    topk_1000 = ids[np.where(np.logical_and(ids >= 100, ids < 1000))]
    topk_over_1000 = ids[ids > 1000]

    frac_p = [payload['real_topk'][i][1] / np.max([x[1] for x in payload['pred_topk'][i]]) for i in range(len(payload['real_topk']))]

    threshold = 10
    pred_probs_normal = [[ x[1] for x in payload['pred_topk'][i][:threshold] ] for i in range(len(payload['pred_topk']))] 
    pred_probs_normal = [[y / sum(x) for y in x] for x in pred_probs_normal]  
    frac_entr = stats.entropy(pred_probs_normal, axis=1)

    return {'topk_10': np.round(len(topk_10) / len(ids), 4),
            'topk_100': np.round(len(topk_100) / len(ids), 4),
            'topk_1000': np.round(len(topk_1000) / len(ids), 4),
            'topk_over_1000': np.round(len(topk_over_1000) / len(ids), 4),
            'frac_p_median': np.round(np.median(frac_p), 4),
            'frac_entr_median': np.round(np.median(frac_entr), 4),
            'tokens_size': len(ids)}

In [None]:
idx_limit = 10000
df_train_limited = pd.concat([df_train[df_train['Class'] == 'M'][:idx_limit], df_train[df_train['Class'] == 'H'][:idx_limit]])

In [None]:
lm = RuLM()
train_payload = []
for raw_text in tqdm(df_train_limited['Text']):
    payload = lm.check_probabilities(raw_text, topk=20)
    train_payload.append(payload)

In [None]:
with open('train_payload.pkl', 'wb') as f:    
    pickle.dump(train_payload, f)

In [None]:
with open('train_payload.pkl', 'rb') as f:    
    new_train_payload = pickle.load(f)

In [None]:
print(len(new_train_payload))

In [None]:
df_topk = pd.DataFrame.from_dict([real_topk_count(payload) for payload in tqdm(new_train_payload)])

In [None]:
df_train_limited.reset_index(drop=True, inplace=True)

In [None]:
df_train_topk = pd.DataFrame.join(df_train_limited, df_topk)

In [None]:
df_train_topk.to_pickle('df_train_topk.pkl')

In [None]:
df_new = pd.read_pickle('df_train_topk.pkl') 

In [None]:
df_new.info()

## Models

## LogReg

In [None]:
df_new.loc[:,'topk_10':'tokens_size'].head()

In [None]:
x_train, x_test, y_train, y_test = train_test_split(df_new.loc[:,'topk_10':'tokens_size'], df_new['Class'], test_size=0.1)

In [None]:
encoder = LabelEncoder()
encoder.fit(y_train)

In [None]:
y_train_enc = encoder.transform(y_train)
y_test_enc = encoder.transform(y_test)

In [None]:
clf = LogisticRegression()
clf.fit(x_train, y_train_enc)

In [None]:
y_test_pred = clf.predict(x_test)

In [None]:
y_train_pred = clf.predict(x_train)

In [None]:
print(classification_report(y_train_enc, y_train_pred))

In [None]:
print(classification_report(y_test_enc, y_test_pred))
print(confusion_matrix(y_test_enc, y_test_pred))

## SVC

In [None]:
scaler = StandardScaler()
x_train['tokens_size'] = scaler.fit_transform(x_train['tokens_size'].to_numpy().reshape(-1, 1))

In [None]:
clf_svc = LinearSVC(max_iter=10000)
clf_svc.fit(x_train, y_train_enc)

In [None]:
y_pred = clf_svc.predict(x_train)
print(classification_report(y_train_enc, y_pred))

# 2. Feature Extraction

In [19]:
def clf_eval(clf, x, y_true):
    y_pred = clf.predict(x)
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))

## Linear models

In [None]:
clf = SGDClassifier(shuffle=True, random_state=0)
clf.fit(x_train_scaled, y_train)
clf_eval(clf, x_val_scaled, y_val)

              precision    recall  f1-score   support

           0       0.82      0.79      0.81     10755
           1       0.80      0.83      0.81     10756

    accuracy                           0.81     21511
   macro avg       0.81      0.81      0.81     21511
weighted avg       0.81      0.81      0.81     21511

[[8501 2254]
 [1814 8942]]


In [None]:
clf_lr = LogisticRegression(max_iter=5000, random_state=0, verbose=0).fit(x_train_scaled, y_train)
clf_eval(clf_lr, x_val_scaled, y_val)

              precision    recall  f1-score   support

           0       0.82      0.82      0.82     10755
           1       0.82      0.82      0.82     10756

    accuracy                           0.82     21511
   macro avg       0.82      0.82      0.82     21511
weighted avg       0.82      0.82      0.82     21511

[[8769 1986]
 [1912 8844]]


In [None]:
lr_pred = clf_lr.predict(x_val_scaled)
np.save('val_logreg_pred.npy', lr_pred)

## LightGBM

In [None]:
clf_hgboost = HistGradientBoostingClassifier(random_state=0).fit(x_train_scaled, y_train)
clf_eval(clf_hgboost, x_val_scaled, y_val)

              precision    recall  f1-score   support

           0       0.82      0.81      0.81     10755
           1       0.81      0.82      0.81     10756

    accuracy                           0.81     21511
   macro avg       0.81      0.81      0.81     21511
weighted avg       0.81      0.81      0.81     21511

[[8659 2096]
 [1937 8819]]


In [None]:
clf_hgboost.score(x_val_scaled, y_val)

0.8125145274510716

In [None]:
hgboost_pred = clf_hgboost.predict(x_val_scaled)
np.save('val_hgboost_pred.npy', hgboost_pred)

## Ensemble

In [None]:
clf1 = HistGradientBoostingClassifier(random_state=0)
clf2 = LogisticRegression(max_iter=5000, random_state=0)
clf3 = SGDClassifier(shuffle=True, random_state=0)
# clf3 = RandomForestClassifier(random_state=0)

Accuracy: 0.84 (+/- 0.00) [Hist GradBoost]
Accuracy: 0.86 (+/- 0.00) [LogReg]
Accuracy: 0.83 (+/- 0.00) [Random Forest]
Accuracy: 0.85 (+/- 0.00) [Ensemble]


In [None]:
eclf = VotingClassifier(estimators=[('hist_gb', clf1), ('lr', clf2), ('rf', clf3)], voting='hard')

for clf, label in zip([clf1, clf2, clf3, eclf], ['Hist GradBoost', 'LogReg', 'Random Forest', 'Ensemble']):
    scores = cross_val_score(clf, x_train_scaled, y_train, scoring='accuracy', cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

## Logreg, Ensemble with Bert + Stat Feats

In [None]:
eclf_soft = VotingClassifier(estimators=[('hist_gb', clf1), ('lr', clf2), ('sgd', clf3)], voting='soft')
eclf_soft.fit(x_train_scaled, y_train)
eclf_soft.score(x_val_scaled, y_val)

0.821207754172284

In [None]:
eclf.fit(x_train_scaled, y_train)
eclf.score(x_val_scaled, y_val)

0.8176281902282553

In [None]:
dump(eclf_soft, 'models/eclf_soft.joblib') 
dump(clf_lr, 'models/logreg.joblib')

['models/logreg.joblib']

In [20]:
eclf_soft = load('models/eclf_soft.joblib')

In [31]:
clf_lr= load('models/logreg.joblib')

In [32]:
clf_lr.score(x_val_scaled, y_val)

0.8187903863139789

In [33]:
submission(clf_lr, x_test_scaled, df_test, 'logreg')

## MLP

In [None]:
# parameters (tuning)
layers = [(100), (25, 50, 25)]
activations = ['relu','logistic']
learning_rates = [0.001,0.01]
alphas = [0.00005, 0.0001, 0.0005]
n_combos = len(layers) * len(activations) * len(learning_rates) * len(alphas)

# parameters (fixed)
solver = 'adam'
learning_rate = 'adaptive'
tol = 0.001
n_iter_no_change = 10
max_iter = 250

# find best combination
results_ = {'validation':{}, 'test':{}}
index_ = 0
best_acc_ = 0
best_index_ = 0

for lr in learning_rates:
	for alpha in alphas:
		for layer in layers:
			for activation in activations:

				model = MLPClassifier(hidden_layer_sizes=layer,
								activation=activation,
								solver=solver,
								alpha=alpha,
								learning_rate=learning_rate,
								learning_rate_init=lr,
								max_iter=max_iter,
								early_stopping=True,
								tol=tol,n_iter_no_change=n_iter_no_change,
								validation_fraction=0.02,
								random_state=175)

			# Train
			start_ = time.time()
			model.fit(x_train_scaled, y_train_enc)
			val_acc = model.score(x_val_scaled, y_val_enc)
			end_ = time.time()

			# update best model
			if val_acc > best_acc_:
				best_index_ = index_
				best_acc_ = val_acc

			# validation results results
			results_['validation'][index_] = {'val_acc': val_acc, 'Layers': layer, 'Activation': activation,
											  'LR': lr, 'Alpha': alpha, 'Model': model}

			# track
			print('{:4}/{:4} finished --- time: {:6.2f}\n'.format(index_+1, n_combos, end_-start_),results_['validation'][index_])
			index_ += 1

# results for best combination
best_model = results_['validation'][best_index_]['Model']

test_acc = best_model.score(x_val_scaled, y_val_enc)

results_['test']['test_acc'] = test_acc
results_['test']['best_model'] = best_model

print('Final test accuracy:\n', test_acc, '\nModel Configuration:\n', best_model)

# Final test accuracy:
#  0.755520431407187 
# Model Configuration:
#  MLPClassifier(activation='logistic', alpha=5e-05, early_stopping=True,
#               hidden_layer_sizes=100, learning_rate='adaptive', max_iter=250,
#               random_state=175, tol=0.001, validation_fraction=0.02)

## Результаты:

* SVC, GradientBoosting долго учатся, вместо них в сравнении LineraSVC и LightGBM
* лучшее качество у LightGBM, но до Bert сильно не дотягивает
* вариант многослойного перцептрона из статьи качество чуть меньшее чем LightGBM показал
* Bert + Stat дают на тестовой выборке ниже 80 качество, хотя на валидационной больше 80

# 3. BERT

In [8]:
def build_dataset(data: pd.DataFrame, tokenizer: AutoTokenizer, max_length=512, with_label=True):
    if with_label:
        class_names = ["M", "H"]
        features = Features({'Text': Value('string'), 'label': ClassLabel(names=class_names, num_classes=2)})
        dataset = Dataset.from_pandas(data, preserve_index=False, features=features)
        dataset = dataset.map(lambda e: tokenizer(e['Text'], truncation=True, padding='max_length', max_length=max_length), batched=True)    
        dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
    else:
        dataset = Dataset.from_pandas(data, preserve_index=False)
        dataset = dataset.map(lambda e: tokenizer(e['Text'], truncation=True, padding='max_length', max_length=max_length), batched=True)    
        dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask'])
    return dataset

## Train

In [None]:
del bert
del training_args
del trainer
torch.cuda.empty_cache()

In [None]:
# model_name = "DeepPavlov/rubert-base-cased-sentence"
model_name = "DeepPavlov/rubert-base-cased"
# model_name = 'cointegrated/rubert-tiny'
num_labels = 2
batch_size = 24
epochs = 3
lr=2e-5
max_len=200

In [None]:
tokenizer_bert = AutoTokenizer.from_pretrained(model_name)

In [None]:
df_val['len'] = df_val['Text'].map(lambda x: len(tokenizer_bert(x)['input_ids'])) 
df_train['len'] = df_train['Text'].map(lambda x: len(tokenizer_bert(x)['input_ids'])) 

In [None]:
print(df_val['len'].describe())
print(df_train['len'].describe())

In [None]:
df_val['len'].plot.hist()

In [None]:
df_train['len'].plot.hist()

In [None]:
ds_val = build_dataset(df_val, tokenizer_bert, max_length=max_len)

In [None]:
ds_train = build_dataset(df_train, tokenizer_bert, max_length=max_len)

In [None]:
config_bert = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_labels)

bert = AutoModelForSequenceClassification.from_pretrained(model_name, config=config_bert)

In [None]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
# Validate accuracy on epoch
training_args_epoch = TrainingArguments("test_trainer", 
                                per_device_train_batch_size=batch_size, 
                                per_device_eval_batch_size=batch_size,
                                num_train_epochs=epochs,
                                learning_rate=lr,
                                save_strategy='epoch',
                                evaluation_strategy='epoch',
                                save_total_limit=2,
                                load_best_model_at_end=True,
                                do_train=True,
                                do_eval=True,
                                optim='adamw_torch',
                                report_to="none"
                                )

In [None]:
# Validate accuracy on step
training_args_step = TrainingArguments("test_trainer", 
                                per_device_train_batch_size=batch_size, 
                                per_device_eval_batch_size=batch_size,
                                num_train_epochs=epochs,
                                learning_rate=lr,
                                save_strategy='steps',
                                evaluation_strategy='steps',
                                logging_strategy='steps',
                                save_total_limit=2,
                                load_best_model_at_end=True,
                                do_train=True,
                                do_eval=True,
                                optim='adamw_torch',
                                report_to="none",
                                disable_tqdm=False
                                )

In [None]:
!nvidia-smi

In [None]:
trainer = Trainer(model=bert, 
                args=training_args, 
                train_dataset=ds_train, 
                eval_dataset=ds_val,
                compute_metrics=compute_metrics,
                tokenizer=tokenizer_bert, 
                callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
                )

In [None]:
train_result = trainer.train()

In [None]:
metrics = train_result.metrics

## Evaluate

In [None]:
!cp /content/drive/MyDrive/models/out.7z /content

In [None]:
!7z x /content/out.7z

In [11]:
cp_path = 'test_trainer/checkpoint-8068'

In [12]:
model_loaded = AutoModelForSequenceClassification.from_pretrained(cp_path)
tokenizer_loaded = AutoTokenizer.from_pretrained(cp_path)

## Validate

In [None]:
ds_val = build_dataset(df_val, tokenizer_loaded, max_length=200, with_label=True)

  0%|          | 0/22 [00:00<?, ?ba/s]

In [None]:
training_args = TrainingArguments("eval_trainer", 
                                per_device_train_batch_size=64, 
                                per_device_eval_batch_size=64,
                                do_train=False,
                                do_eval=False,
                                report_to="none"
                                )
trainer = Trainer(model=model_loaded, 
                    args=training_args)

val_preds = trainer.predict(ds_val).predictions

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: Text.
***** Running Prediction *****
  Num examples = 21511
  Batch size = 64


In [None]:
print(val_preds[0])
np.save("val_probs_bert", val_preds)

In [None]:
print(val_preds[0])

[-0.22721739  0.22898251]


In [None]:
torch.sigmoid(torch.Tensor(val_preds[0]))

tensor([0.4434, 0.5570])

In [None]:
val_preds = np.argmax(val_preds, axis=1) 
np.save("val_preds_bert", val_preds)

## Submission

In [None]:
df_test = pd.read_csv('test.csv')

In [None]:
df_test['len'] = df_test['Text'].map(lambda x: len(tokenizer_bert(x)['input_ids'])) 

In [None]:
df_test['len'].plot.hist()

In [None]:
ds_test = build_dataset(df_test, tokenizer_loaded, max_length=200, with_label=False)

  0%|          | 0/65 [00:00<?, ?ba/s]

In [None]:
training_args = TrainingArguments("test_trainer", 
                                per_device_train_batch_size=64, 
                                per_device_eval_batch_size=64,
                                do_train=False,
                                do_eval=False,
                                report_to="none"
                                )

trainer = Trainer(model=model_loaded, 
                    args=training_args)
test_preds = trainer.predict(ds_test).predictions

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: Text, Id.
***** Running Prediction *****
  Num examples = 64533
  Batch size = 64


In [None]:
print(test_preds[0])
np.save("test_bert_prob", test_preds)

[ 4.5222025 -4.2354918]


In [None]:
test_preds_labels = np.argmax(test_preds, axis=1)
np.save("test_bert_pred", test_preds_labels) 

In [None]:
df_test['label'] = test_preds_labels

In [None]:
df_test.loc[df_test['label'] == 0, 'Class'] = 'M'
df_test.loc[df_test['label'] == 1, 'Class'] = 'H'

In [None]:
df_test.to_csv('submission.csv', columns=['Id','Class'], index=False) 

# 4. Analyse models

In [None]:
val_bert_pred = np.load("val_bert_pred.npy")
val_hgboost_pred = np.load("val_hgboost_pred.npy")
val_logreg_pred = np.load("val_logreg_pred.npy")

In [None]:
df_val['bert'] = val_bert_pred
df_val['hgboost'] = val_hgboost_pred
df_val['logreg'] = val_logreg_pred

In [None]:
# swap labels in hgboost
print(df_val['hgboost'][0])
df_val['hgboost'] = df_val['hgboost'] ^ 1
print(df_val['hgboost'][0])

# swap labels in logreg
df_val['logreg'] = df_val['logreg'] ^ 1


0
1


In [None]:
bert_hgboost = df_val[df_val['bert'] != df_val['hgboost']]
bert_label = df_val[df_val['bert'] != df_val['label']]
hgboost_label = df_val[df_val['hgboost'] != df_val['label']]
logreg_label = df_val[df_val['logreg'] != df_val['label']]

In [None]:
bert_label_idxs = set(bert_label.index.to_list())
hgboost_label_idxs = set(hgboost_label.index.to_list()) 
logreg_label_idxs = set(logreg_label.index.to_list())

In [None]:
bert_hgboost_common = bert_label_idxs & hgboost_label_idxs  # сэмплы на которых ошибаются и берт и бустинг
bert_hgboost_diff = bert_label_idxs.symmetric_difference(hgboost_label_idxs)  # все сэмплы на которых ошибаются или берт или бустинг
bert_hgboost_errors = bert_label_idxs - hgboost_label_idxs  # сэмплы на которых ошибается берт, но не ошибается бустинг
hgboost_bert_errors = hgboost_label_idxs - bert_label_idxs  # сэмплы на которых ошибается бустинг, но не ошибается берт

In [None]:
logreg_hgboost_common = logreg_label_idxs & hgboost_label_idxs  # сэмплы на которых ошибаются и логрег и бустинг
logreg_hgboost_diff = hgboost_label_idxs.symmetric_difference(logreg_label_idxs)  # все сэмплы на которых ошибаются или логрег или бустинг
logreg_hgboost_errors = logreg_label_idxs - hgboost_label_idxs  # сэмплы на которых ошибается логрег, но не ошибается бустинг
hgboost_logreg_errors = hgboost_label_idxs - logreg_label_idxs  # сэмплы на которых ошибается бустинг, но не ошибается логрег

In [None]:
print(f"Common errors bert-hgboost {len(bert_hgboost_common) / df_val.shape[0]:%}")
print(f"Diff all errors {len(bert_hgboost_diff) / df_val.shape[0]:%}")
print(f"Bert fail, hgboost complete {len(bert_hgboost_errors) / df_val.shape[0]:%}")
print(f"hgboost fail, Bert complete {len(hgboost_bert_errors) / df_val.shape[0]:%}")

Common errors 7.986612%
Diff all errors 27.041979%
Bert fail, hgboost complete 10.687555%
hgboost fail, Bert complete 16.354423%


In [None]:
print(f"Common errors logreg-hgboost {len(logreg_hgboost_common) / df_val.shape[0]:%}")
print(f"Diff all errors {len(logreg_hgboost_diff) / df_val.shape[0]:%}")
print(f"Logreg fail, hgboost complete {len(logreg_hgboost_errors) / df_val.shape[0]:%}")
print(f"hgboost fail, logreg complete {len(hgboost_logreg_errors) / df_val.shape[0]:%}")

Common errors logreg-hgboost 18.371996%
Diff all errors 28.464507%
Logreg fail, hgboost complete 7.372972%
hgboost fail, logreg complete 5.969039%


In [None]:
print(f"Common errors bert-logreg-hgboost {len(bert_hgboost_common & logreg_hgboost_common) / df_val.shape[0]:%}")
print(f"Diff all errors {len(logreg_hgboost_diff.symmetric_difference(bert_hgboost_diff)) / df_val.shape[0]:%}")


Common errors logreg-hgboost 6.294454%
Diff all errors 28.464507%


In [None]:
df_val['vote_sum'] = df_val.loc[ :,['bert','hgboost', 'logreg'] ].sum(axis=1)

In [None]:
df_val.loc[df_val['vote_sum'] >= 2, 'vote'] = 1
df_val.loc[df_val['vote_sum'] < 2, 'vote'] = 0
df_val = df_val.convert_dtypes()

In [None]:
vote_label = df_val[df_val['vote'] != df_val['label']]

In [None]:
print(f"Vote hard eval accuracy {1 - (vote_label.shape[0] / df_val.shape[0]):%}")
print(f"Bert eval accuracy {1 - (bert_label.shape[0] / df_val.shape[0]):%}")
print(f"Hgboost eval accuracy {1 - (hgboost_label.shape[0] / df_val.shape[0]):%}")
print(f"Logreg eval accuracy {1 - (logreg_label.shape[0] / df_val.shape[0]):%}")

Vote eval accuracy 78.252987%
Bert eval accuracy 81.325833%
Hgboost eval accuracy 75.658965%
Logreg eval accuracy 74.255032%


# 5. Ensemble Bert + Logreg + HGBOOST

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class BertClassifiier(BaseEstimator, ClassifierMixin):
    def __init__(
            self,
            bert_tokenizer,
            bert_model,
            max_length: int = 512):
        
        self.tokenizer = bert_tokenizer
        self.model = bert_model
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.eval().to(self.device)
        self.max_length = max_length
   
    def fit(self, X, y=None):
        """No fitting necessary so we just return ourselves"""
        return self

    def predict_proba(self, X, y=None):
        dataset = build_dataset(X, self.tokenizer, max_length=200, with_label=False)
        training_args = TrainingArguments("eval_trainer", 
                                per_device_train_batch_size=64, 
                                per_device_eval_batch_size=64,
                                do_train=False,
                                do_eval=False,
                                report_to="none"
                                )
        
        trainer = Trainer(model=self.model, args=training_args)
        
        return trainer.predict(dataset).predictions

In [None]:
class DummyBertClassifiier(BaseEstimator, ClassifierMixin):    
    def __init__(self, dataset: str='val'):
        self.dataset = dataset

    def fit(self, X, y=None):
        """No fitting necessary so we just return ourselves"""
        return self

    def predict_proba(self, X, y=None):
        result=None
        if self.dataset == 'val':
            result = np.load("val_bert_prob.npy")
        elif self.dataset == 'test':
            result = np.load("test_bert_prob.npy")         
        return result

In [None]:
# clf_bert = BertClassifiier(bert_tokenizer=tokenizer_loaded , bert_model=model_loaded, max_length=200)
# bert_val_preds = clf_bert.predict_proba(df_val)
# print(bert_val_preds.shape)

In [None]:
clf_bert_dummy = DummyBertClassifiier(dataset='test')
clf_hgboost = HistGradientBoostingClassifier(random_state=0)
clf_lr = LogisticRegression(max_iter=5000, random_state=0, verbose=0)

In [None]:
eclf = VotingClassifier(estimators=[('hist_gb', clf_hgboost), ('lr', clf_lr), ('bert', clf_bert_dummy)], voting='soft')
eclf.fit(x_train_scaled, y_train)
# print(eclf.score(x_val_scaled, y_val))

VotingClassifier(estimators=[('hist_gb',
                              HistGradientBoostingClassifier(random_state=0)),
                             ('lr',
                              LogisticRegression(max_iter=5000,
                                                 random_state=0)),
                             ('bert', DummyBertClassifiier(dataset='test'))],
                 voting='soft')

In [25]:
submission(clf=eclf_soft, X=x_test_scaled, df=df_test, out_suffix='eclf_soft')

In [26]:
eclf_soft.score(x_val_scaled, y_val)

0.821207754172284

In [None]:
eclf_test_preds = eclf.predict(x_test_scaled)

In [None]:
df_test['label'] = eclf_test_preds
df_test.loc[df_test['label'] == 0, 'Class'] = 'M'
df_test.loc[df_test['label'] == 1, 'Class'] = 'H'

In [None]:
df_test.to_csv('submission2.csv', columns=['Id','Class'], index=False) 

In [None]:
df_subm_first = pd.read_csv('submission.csv')
df_subm_second = pd.read_csv('submission2.csv')

In [None]:
df_subm_diff = df_subm_first[df_subm_first['Class'] != df_subm_second['Class']]

In [None]:
df_subm_diff.shape

(1888, 2)

## Результат

* на тестовой выборке на 1% лучше показал ансамбль, хотя на валидационной прирост по сравнению с бертом более значительный был (84%)

# 6. Bert feautures + Stats feats + LGBM

In [13]:
ds_train = build_dataset(df_train, tokenizer_loaded, max_length=200, with_label=False)
ds_val = build_dataset(df_val, tokenizer_loaded, max_length=200, with_label=False)
ds_test = build_dataset(df_test, tokenizer_loaded, max_length=200, with_label=False)
train_loader = torch.utils.data.DataLoader(ds_train, shuffle=False, batch_size=32)
val_loader = torch.utils.data.DataLoader(ds_val, shuffle=False, batch_size=32)
test_loader = torch.utils.data.DataLoader(ds_test, shuffle=False, batch_size=32)

  0%|          | 0/130 [00:00<?, ?ba/s]

  0%|          | 0/22 [00:00<?, ?ba/s]

  0%|          | 0/65 [00:00<?, ?ba/s]

In [14]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [15]:
model_nohead = torch.nn.Sequential(list(model_loaded.children())[0])
model_nohead.eval().to(device)
# print(model_nohead.device)

Sequential(
  (0): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
   

In [None]:
with torch.no_grad():
        x_train_bert = torch.vstack([model_nohead(batch['input_ids'].to(device)).last_hidden_state[:,0].detach().to('cpu') for batch in tqdm(train_loader)])

np.save('x_train_emb', x_train_bert.numpy())

100%|██████████| 4034/4034 [27:00<00:00,  2.49it/s]


In [None]:
with torch.no_grad():
        x_val_bert = torch.vstack([model_nohead(batch['input_ids'].to(device)).last_hidden_state[:,0].detach().to('cpu') for batch in tqdm(val_loader)])
np.save('x_val_emb', x_val_bert.numpy())

100%|██████████| 673/673 [04:29<00:00,  2.50it/s]


In [16]:
with torch.no_grad():
        x_test_bert = torch.vstack([model_nohead(batch['input_ids'].to(device)).last_hidden_state[:,0].detach().to('cpu') for batch in tqdm(test_loader)])

np.save('x_test_emb', x_test_bert.numpy())

100%|██████████| 2017/2017 [25:01<00:00,  1.34it/s]


In [None]:
clf_hgboost = HistGradientBoostingClassifier(random_state=0).fit(x_train, y_train)

In [None]:
clf_hgboost.score(x_val, y_val)

0.7575194086746316

In [None]:
clf_lr = LogisticRegression(max_iter=5000, random_state=0, verbose=0).fit(x_train, y_train)
clf_eval(clf_lr, x_val, y_val)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


NameError: name 'clf_eval' is not defined