In [None]:
import os
from collections import defaultdict, Counter
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
from transformers import BertForSequenceClassification, BertTokenizer
import pytorch_lightning as pl
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder

# Baseline notebook

В данном ноутбуке показан бейзлайн для предсказания искусственности текста.

Бейзлайн работает одинаково для обоих треков — binary и multiclass.

## BERT

Finetune BERT. Используется предобученный DeepPavlov ruBERT, доступный на HuggingFace hub.

In [None]:
# HuggingFace hub identifier of the pretrained checkpoint that gets fine-tuned.
model_name = "DeepPavlov/rubert-base-cased"
# Number of samples per mini-batch for the data loaders.
batch_size = 16

In [None]:
data_dir = './data/'  # directory with the data
# Read the three splits; each file name is appended to data_dir as-is.
train, val, test = (
    pd.read_csv(data_dir + file_name)
    for file_name in ('train.csv', 'val.csv', 'test.csv')
)

# Encode the class labels as integer ids; the encoder is fitted on the training split only.
le = LabelEncoder()
le.fit(train['Class'].values)

In [None]:
def collate_fn(input_data):
    """Assemble a list of ``(text, label)`` samples into one tokenized batch.

    The texts are tokenized with the module-level ``tokenizer``, truncated to
    at most 256 tokens and padded to the longest sequence in the batch. The
    labels are attached to the tokenizer output under the ``'Class'`` key.

    Args:
        input_data: sequence of ``(text, label)`` pairs.

    Returns:
        The tokenizer output with an extra ``'Class'`` entry holding the labels
        as a ``torch.LongTensor``.
    """
    texts, labels = zip(*input_data)
    label_tensor = torch.LongTensor(labels)
    batch = tokenizer(
        texts,
        truncation=True,
        max_length=256,
        padding='longest',
        return_tensors='pt',
    )
    batch['Class'] = label_tensor
    return batch

class TextDataset(torch.utils.data.Dataset):
    """Dataset of texts with optional class labels, built from a dataframe.

    Args:
        data: dataframe with a ``'Text'`` column and, optionally, a ``'Class'`` column.
        sort: accepted for interface compatibility; not used by this class.
        le: optional encoder whose ``transform`` maps the ``'Class'`` column to
            label ids. When omitted, the raw column values are used as labels.
    """

    def __init__(self, data, sort=False, le=None):
        super().__init__()
        self.texts = data['Text'].values
        if 'Class' not in data.columns:
            # No annotation available: the `labels` attribute is deliberately left unset.
            return
        assert not data['Class'].isnull().any(), "Some labels are null"
        self.labels = data['Class'].values if le is None else le.transform(data['Class'])

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(text, label)``; the label is an empty list when there is no annotation."""
        text = self.texts[idx]
        known_labels = getattr(self, 'labels', None)
        if known_labels is None:
            return text, []
        return text, known_labels[idx]

class Metric:
    """Accumulates named scalar metrics (e.g. accuracy, loss) and reports their means."""

    def __init__(self):
        # metric name -> list of values stored since the last reset()
        self.storage = defaultdict(list)

    def store(self, **kwargs):
        """Append each keyword value to the list kept under its metric name."""
        for key, value in kwargs.items():
            self.storage[key].append(value)

    def reset(self):
        """Drop every stored value."""
        self.storage.clear()

    def log(self):
        """Return ``(name, mean)`` pairs for all stored metrics.

        The means are computed into a fresh dict, so the stored lists stay
        intact and ``store()`` keeps working after ``log()`` even when
        ``reset()`` has not been called in between.
        """
        means = {key: np.mean(values) for key, values in self.storage.items()}
        return means.items()
        
class BertClassifier(pl.LightningModule):
    """Lightning module that fine-tunes ``BertForSequenceClassification``.

    Args:
        model_name: checkpoint identifier passed to ``from_pretrained``.
        lr: learning rate handed to AdamW.
        num_labels: number of output classes of the classification head.
        log_every: number of training batches averaged into each logged
            ``train/*`` metric value.

    Raises:
        ValueError: if ``log_every`` is smaller than 1.
    """

    def __init__(self, model_name, lr=1e-5, num_labels=2, log_every=100):
        super().__init__()
        if log_every < 1:
            raise ValueError("log_every must be a positive integer")
        self.bert = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        self.metric = Metric()
        self.learning_rate = lr
        self.log_every = log_every

    def configure_optimizers(self):
        """Optimize all BERT parameters with AdamW at the configured learning rate."""
        optimizer = torch.optim.AdamW(self.bert.parameters(), lr=self.learning_rate)
        return optimizer

    def forward(self, x):
        """Run the wrapped BERT model on a batch of tokenizer outputs (without labels)."""
        return self.bert(**x)

    def _loss_and_accuracy(self, batch):
        """Pop the labels from ``batch``, run BERT and return ``(loss, accuracy)`` tensors."""
        labels = batch.pop('Class')
        logits = self.bert(**batch).logits
        loss = F.cross_entropy(logits, labels)
        predictions = logits.argmax(dim=1)
        accuracy = torch.mean((predictions == labels).double())
        return loss, accuracy

    def training_step(self, batch, batch_idx):
        """Compute the training loss and periodically log averaged train metrics."""
        loss, accuracy = self._loss_and_accuracy(batch)
        self.metric.store(loss=loss.item(), accuracy=accuracy.item())
        # Log once per `log_every` batches (mean over the batches stored since the
        # previous log). The original `if batch_idx % 100:` was inverted and fired
        # on every batch EXCEPT multiples of 100.
        if (batch_idx + 1) % self.log_every == 0:
            for name, value in self.metric.log():
                self.log(f'train/{name}', value)
            self.metric.reset()
        return loss

    def validation_step(self, batch, batch_idx):
        """Log loss and accuracy for one validation batch."""
        loss, accuracy = self._loss_and_accuracy(batch)
        self.log('val/loss', loss)
        self.log('val/accuracy', accuracy)

In [None]:
# Wrap the dataframes in datasets under separate names: the raw `train`/`val`/`test`
# frames stay untouched, so this cell can be re-run without failing.
train_dataset = TextDataset(train, le=le)
val_dataset = TextDataset(val, le=le)
test_dataset = TextDataset(test, le=le)

# Only the training loader shuffles, so every epoch sees the samples in a new order.
# Validation and test keep the file order so predictions line up with the input rows.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, collate_fn=collate_fn)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [None]:
# Tokenizer for the chosen checkpoint; collate_fn reads it as a module-level name.
tokenizer = BertTokenizer.from_pretrained(model_name)
# The classification head gets one output per class seen by the fitted label encoder.
model = BertClassifier(model_name=model_name, num_labels=len(le.classes_))

In [None]:
# TensorBoard run name derived from the checkpoint identifier.
version = model_name + "_binary"
logger = pl.loggers.TensorBoardLogger(
    name='lightning_logs',
    version=version,
    save_dir=os.getcwd(),
)
trainer = pl.Trainer(
    max_epochs=3,
    num_sanity_val_steps=1,
    # NOTE(review): hard-coded device list; presumably selects the GPU with index 1 —
    # confirm this matches the machine the notebook runs on.
    gpus=[1],
    logger=logger,
)
trainer.fit(model, train_loader, val_loader)

In [None]:
# Persist only the state_dict of the wrapped BERT model (model.bert), not the Lightning module itself.
torch.save(model.bert.state_dict(), "./deeppavlov_bert_trained.pt")

In [None]:
def get_accuracy_and_pred(model, loader):  # use this function to obtain accuracy and predictions
    """Run ``model`` over ``loader`` and return ``(accuracy, predictions)``.

    Args:
        model: callable module exposing ``eval()`` and ``device``; calling it on
            a batch must return an object with a ``logits`` attribute.
        loader: iterable of batches, each a mapping of tensors that includes a
            ``'Class'`` entry with the labels.

    Returns:
        A tuple ``(accuracy, preds)``. ``accuracy`` is a Python float: the share
        of correct predictions over the labelled samples, or ``0.0`` when no
        labels are available. ``preds`` is a 1-D numpy array with the predicted
        class index of every sample, in loader order.
    """
    model.eval()
    batch_preds = []
    n_correct = 0
    n_labeled = 0
    for batch in tqdm(loader):
        for key in batch:
            batch[key] = batch[key].to(model.device)
        labels = batch.pop('Class')
        with torch.no_grad():
            pred = model(batch).logits.argmax(dim=1)
        # Labelled batches carry one label per sample, i.e. the same 1-D shape as
        # `pred`. Unlabelled batches (built from empty-list labels) have shape
        # (batch, 0). The original `labels.size()[1]` check raised IndexError on
        # the 1-D labelled case.
        if labels.shape == pred.shape:
            n_correct += int((pred == labels).sum().item())
            n_labeled += labels.numel()
        batch_preds.append(pred.cpu().numpy())
    if not batch_preds:
        # Empty loader: nothing to concatenate and nothing to score.
        return 0.0, np.empty(0, dtype=np.int64)
    accuracy = n_correct / n_labeled if n_labeled else 0.0
    return accuracy, np.concatenate(batch_preds)

acc, preds = get_accuracy_and_pred(model, test_loader)
# np.save does not create missing directories, so make sure ./preds exists first.
os.makedirs('./preds', exist_ok=True)
# Store the predictions as original class labels rather than encoded ids.
np.save('./preds/test_preds_bert.npy', le.inverse_transform(preds))
print(f"Test accuracy: {acc}")

### TF-IDF baseline

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, TruncatedSVD

In [None]:
def encode_labels(data, le):
    """Replace the ``'Class'`` column of ``data`` with its encoded form.

    The dataframe is modified in place; the same object is returned for
    convenience.

    Args:
        data: dataframe with a ``'Class'`` column.
        le: encoder whose ``transform`` maps the column values to label ids.

    Returns:
        ``data`` itself, with ``'Class'`` overwritten by ``le.transform`` output.
    """
    encoded = le.transform(data['Class'])
    data['Class'] = encoded
    return data

In [None]:
# Read the stop-word file: one list entry per line, with surrounding whitespace stripped.
with open('./stopwords_ru.txt', encoding='utf-8') as fp:
    stopwords = [line.strip() for line in fp]

In [None]:
# Reload the raw splits as dataframes from the ruatd-multi directory.
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './ruatd-multi/train.csv',
        './ruatd-multi/val.csv',
        './ruatd-multi/test.csv',
    )
)

# Fresh label encoder, fitted on the training labels only.
le = LabelEncoder()
le.fit(train['Class'].values)

# encode_labels overwrites the 'Class' column in place and returns the same frame;
# the test split is left unencoded.
train = encode_labels(train, le)
val = encode_labels(val, le)

In [None]:
# TF-IDF features -> truncated SVD -> standardization -> logistic regression.
model = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords, ngram_range=(1, 3), max_features=50000)),
    # TruncatedSVD uses a randomized solver by default; a fixed seed makes the
    # reduced features (and therefore the predictions) reproducible across runs.
    ('svd', TruncatedSVD(n_components=5000, random_state=42)),
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(C=0.01, n_jobs=40))
], verbose=True)

# The final model is fitted on the train and validation splits together.
X_train = np.concatenate((train['Text'].values, val['Text'].values))
y_train = np.concatenate((train['Class'].values, val['Class'].values))

In [None]:
# Fit every pipeline stage on the combined train + validation texts and labels.
model.fit(X_train, y_train)

In [None]:
# Predict encoded class ids for the test texts with the fitted pipeline.
preds = model.predict(test['Text'].values)

In [None]:
# np.save does not create missing directories, so make sure preds/ exists first.
os.makedirs('preds', exist_ok=True)
# Store the predictions as original class labels rather than encoded ids.
np.save('preds/test_preds_tfidf.npy', le.inverse_transform(preds))