In [None]:
import warnings
import sklearn.exceptions
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)


from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
import pandas as pd
import numpy as np
import os
import re
import random
import gc
pd.set_option('display.max_columns', None)
np.seterr(divide='ignore', invalid='ignore')
gc.enable()


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import OneCycleLR

from transformers import AutoTokenizer, AutoModel


RANDOM_SEED = 42

def seed_everything(seed=RANDOM_SEED):
    """Seed every random number generator used by this notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices),
    exports PYTHONHASHSEED, and configures cuDNN for repeatable runs.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, which
    # defeats `deterministic = True`; it has to be switched off.
    torch.backends.cudnn.benchmark = False

seed_everything()

# Run on the GPU whenever CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f'Using device: {device}')

Using device: cuda


# Загрузка данных

In [None]:
# Directory and file name of the training pairs that already carry a `kfold` column.
data_dir = '../input/jrstc-train-folds'
train_file_name = 'validation_data_5_folds.csv'
train_file_path = os.path.join(data_dir, train_file_name)
print(f'Train file: {train_file_path}')

Train file: ../input/jrstc-train-folds/validation_data_5_folds.csv


In [None]:
# Load the training pairs; later cells read the `less_toxic`, `more_toxic` and `kfold` columns.
train_df = pd.read_csv(train_file_path)

# Очистка данных

In [None]:
# The patterns are compiled once here instead of on every call of text_cleaning().
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
                            u"\U00002702-\U000027B0"  # dingbats
                            u"\U000024C2-\U0001F251"  # wide range, also spans many non-emoji characters
                            "]+", flags=re.UNICODE)
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z\d]")
_SPACE_RUN_PATTERN = re.compile(' +')


def text_cleaning(text):
    """Normalize a raw comment into space-separated alphanumeric tokens.

    Steps, in order: drop URLs, strip HTML markup with BeautifulSoup, remove
    emoji, replace every character that is not a Latin letter or a digit with
    a space, collapse runs of spaces and trim the ends.

    Args:
        text: The comment. Missing values (None or float NaN, which pandas
            produces for empty CSV cells) are treated as an empty comment;
            any other non-string value is converted with `str()`.

    Returns:
        The cleaned string (possibly empty).
    """
    # `nan != nan` is the only float for which this comparison is true.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = _URL_PATTERN.sub(r'', text)
    text = BeautifulSoup(text, 'lxml').get_text()
    # Emoji are deleted outright (not replaced by a space) before the generic
    # non-alphanumeric replacement below, so words around them are joined.
    text = _EMOJI_PATTERN.sub(r'', text)
    text = _NON_ALNUM_PATTERN.sub(" ", text)
    text = _SPACE_RUN_PATTERN.sub(' ', text)
    return text.strip()

In [None]:
# Enable `progress_apply` on pandas objects, then clean both comment columns in place.
tqdm.pandas()
for column_name in ('less_toxic', 'more_toxic'):
    train_df[column_name] = train_df[column_name].progress_apply(text_cleaning)

  0%|          | 0/30108 [00:00<?, ?it/s]

  0%|          | 0/30108 [00:00<?, ?it/s]

In [None]:
# Show ten random rows to eyeball the result of the cleaning step.
train_df.sample(10)

Unnamed: 0,worker,less_toxic,more_toxic,kfold
20578,56,of course it s nuclear any moron who has read ...,Unblock me now you piece of shit Or I will fin...,3
26465,446,Perspective In the trial Shylock represents th...,So apparently an inability to read can make yo...,4
28404,340,Oh really Well then stop making bullsht edits ...,I have corrected the section on Twin earth for...,4
19714,36,Please refrain from adding nonsense to Wikiped...,this page smells very bad,2
3752,441,Dear SchfiftyThree I would just like to say th...,TAKE YOUR JOB AS SERIOUSLY AS POSSIBLE YOU LOS...,0
20191,4,Wiki is a two way street deleting a post becau...,In actual fact Oh no not again is a four word ...,2
3518,682,Maybe we can get a vote I m flexible with my v...,Fuck off Wolfkeeper Talk,0
24602,200,Jew Marxist Raus Raus Raus,No MORON I am not admitting any of those thing...,4
13730,613,Rochester WIKI Hi Please stop putting that rid...,BITCH Your a bitch Why did you delete the Engl...,2
12554,725,I challenge you to present a shred of evidence...,How about you learn how to read and write duri...,1


In [None]:
# Number of rows assigned to each validation fold.
train_df.groupby(['kfold']).size()

kfold
0    6022
1    6022
2    6022
3    6021
4    6021
dtype: int64

# RoBERTa

In [None]:
# Single configuration dictionary read by the dataset, model, scheduler and training loop.
params = dict(
    # --- runtime ---
    device=device,
    debug=False,                                 # True -> later cell keeps a 1% sample of the data
    # --- encoder / head ---
    checkpoint='roberta-base',
    output_logits=768,                           # feature width fed to LayerNorm and the first Linear
    max_len=256,                                 # tokenizer max_length (pad / truncate)
    dropout=0.2,
    # --- data loading ---
    num_folds=train_df['kfold'].nunique(),
    batch_size=16,                               # validation loader uses twice this value
    num_workers=2,
    # --- optimisation ---
    epochs=3,
    lr=2e-5,                                     # lr handed to AdamW
    margin=0.7,                                  # margin of MarginRankingLoss
    # --- OneCycleLR schedule ---
    scheduler_name='OneCycleLR',
    max_lr=5e-5,
    pct_start=0.1,
    anneal_strategy='cos',
    div_factor=1e3,
    final_div_factor=1e3,
    # --- AdamW parameter groups ---
    no_decay=True,                               # True -> bias / LayerNorm params get weight_decay 0
)

In [None]:
# Debug runs keep only a 1% random sample of the rows to shorten every epoch.
debug_enabled = params['debug']
if debug_enabled:
    train_df = train_df.sample(frac=0.01)
    print('Reduced training Data Size for Debugging purposes')

# Создание датасета

In [None]:
class BERTDataset(Dataset):
    """Map-style dataset of tokenized (more toxic, less toxic) comment pairs.

    Every item is a dict with `ids_*`, `mask_*` and `token_type_ids_*` long
    tensors (each padded/truncated to `max_len`) for the `more_toxic` and
    `less_toxic` comment, plus a scalar float `target` of 1, i.e. the first
    comment of the pair should be ranked above the second one.

    Args:
        more_toxic: Indexable sequence with the more toxic comment of each pair.
        less_toxic: Indexable sequence with the less toxic comment of each pair.
        max_len: Token length every comment is padded/truncated to.
        checkpoint: Name or path handed to `AutoTokenizer.from_pretrained`.

    Raises:
        ValueError: If the two sequences differ in length.
    """

    def __init__(self, more_toxic, less_toxic, max_len=params['max_len'], checkpoint=params['checkpoint']):
        if len(more_toxic) != len(less_toxic):
            raise ValueError(
                f'more_toxic and less_toxic must have the same length, '
                f'got {len(more_toxic)} and {len(less_toxic)}'
            )
        self.more_toxic = more_toxic
        self.less_toxic = less_toxic
        self.max_len = max_len
        self.checkpoint = checkpoint
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.num_examples = len(self.more_toxic)

    def __len__(self):
        return self.num_examples

    def _encode(self, text, suffix):
        """Tokenize one comment and return its tensors keyed with `suffix`."""
        encoded = self.tokenizer(
            str(text),  # str() guards against non-string cells such as NaN
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_attention_mask=True,
            return_token_type_ids=True,
        )
        return {
            f'ids_{suffix}': torch.tensor(encoded['input_ids'], dtype=torch.long),
            f'mask_{suffix}': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            f'token_type_ids_{suffix}': torch.tensor(encoded['token_type_ids'], dtype=torch.long),
        }

    def __getitem__(self, idx):
        item = self._encode(self.more_toxic[idx], 'more_toxic')
        item.update(self._encode(self.less_toxic[idx], 'less_toxic'))
        # Constant label: the "more toxic" side is always the one to rank higher.
        item['target'] = torch.tensor(1, dtype=torch.float)
        return item

# Скедулер

In [None]:
def get_scheduler(optimizer, scheduler_params=params, steps_per_epoch=None):
    """Build the learning-rate scheduler selected by `scheduler_params['scheduler_name']`.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        scheduler_params: Configuration dict. 'CosineAnnealingWarmRestarts'
            reads `T_0` and `min_lr`; 'OneCycleLR' reads `max_lr`, `epochs`,
            `pct_start`, `anneal_strategy`, `div_factor` and `final_div_factor`.
        steps_per_epoch: Number of optimizer steps per epoch for OneCycleLR
            (typically `len(train_dataloader)`). When omitted it is derived
            from the global `df_train` and `params['batch_size']`, which is
            what this function has always done.

    Returns:
        The configured scheduler instance.

    Raises:
        ValueError: If the scheduler name is not one of the two supported ones.
    """
    scheduler_name = scheduler_params['scheduler_name']

    if scheduler_name == 'CosineAnnealingWarmRestarts':
        # Imported here because the notebook's import cell only brings in OneCycleLR.
        from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
        return CosineAnnealingWarmRestarts(
            optimizer,
            T_0=scheduler_params['T_0'],
            eta_min=scheduler_params['min_lr'],
            last_epoch=-1
        )

    if scheduler_name == 'OneCycleLR':
        if steps_per_epoch is None:
            # Ceiling division = number of batches of a DataLoader without
            # drop_last; the former `int(n / bs) + 1` counted one step too
            # many whenever n was an exact multiple of the batch size.
            batch_size = params['batch_size']
            steps_per_epoch = (df_train.shape[0] + batch_size - 1) // batch_size
        return OneCycleLR(
            optimizer,
            max_lr=scheduler_params['max_lr'],
            steps_per_epoch=steps_per_epoch,
            epochs=scheduler_params['epochs'],
            pct_start=scheduler_params['pct_start'],
            anneal_strategy=scheduler_params['anneal_strategy'],
            div_factor=scheduler_params['div_factor'],
            final_div_factor=scheduler_params['final_div_factor'],
        )

    raise ValueError(
        f"Unsupported scheduler_name {scheduler_name!r}; "
        "expected 'CosineAnnealingWarmRestarts' or 'OneCycleLR'"
    )

# Метрика

In [None]:
class MetricMonitor:
    """Keeps a running sum, count and average for any number of named metrics.

    `str(monitor)` renders every metric as ``name: average`` joined by `` | ``,
    with `float_precision` digits after the decimal point.
    """

    def __init__(self, float_precision=4):
        self.float_precision = float_precision
        self.reset()

    @staticmethod
    def _empty_record():
        """Fresh accumulator for a metric that has not been seen yet."""
        return {"val": 0, "count": 0, "avg": 0}

    def reset(self):
        """Forget every metric recorded so far."""
        self.metrics = defaultdict(self._empty_record)

    def update(self, metric_name, val):
        """Add one observation `val` to `metric_name` and refresh its average."""
        record = self.metrics[metric_name]

        record["val"] += val
        record["count"] += 1
        record["avg"] = record["val"] / record["count"]

    def __str__(self):
        rendered = []
        for name, record in self.metrics.items():
            rendered.append(f"{name}: {record['avg']:.{self.float_precision}f}")
        return " | ".join(rendered)

# Модель

In [None]:
class ToxicityModel(nn.Module):
    """Pretrained transformer encoder followed by a small scoring head.

    The encoder's pooled output goes through LayerNorm and dropout, then
    through Linear(output_logits -> 128), SiLU, dropout and Linear(128 -> 1),
    producing one score per input sequence.

    Args:
        checkpoint: Name or path handed to `AutoModel.from_pretrained`.
        params: Configuration dict; `output_logits` (width of the pooled
            output) and `dropout` are read from it.
    """

    def __init__(self, checkpoint=params['checkpoint'], params=params):
        super().__init__()
        feature_dim = params['output_logits']
        drop_prob = params['dropout']

        self.checkpoint = checkpoint
        # return_dict=False makes the encoder return a plain tuple.
        self.bert = AutoModel.from_pretrained(checkpoint, return_dict=False)
        self.layer_norm = nn.LayerNorm(feature_dim)
        self.dropout = nn.Dropout(drop_prob)
        self.dense = nn.Sequential(
            nn.Linear(feature_dim, 128),
            nn.SiLU(),
            nn.Dropout(drop_prob),
            nn.Linear(128, 1),
        )

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Return the head's output for a batch, one value per sequence."""
        _, pooled_output = self.bert(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )
        normalized = self.dropout(self.layer_norm(pooled_output))
        return self.dense(normalized)

# Тренировка и валидация

## 1. Тренировка

In [None]:
def train_fn(train_loader, model, criterion, optimizer, epoch, params, scheduler=None):
    """Train `model` for one epoch on pairs of (more toxic, less toxic) comments.

    Args:
        train_loader: Iterable of batches with the keys produced by BERTDataset.
        model: Module called as `model(input_ids, token_type_ids, attention_mask)`.
        criterion: Pairwise loss called as `criterion(score_more, score_less, target)`.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch number, only used in the progress-bar text.
        params: Configuration dict; `params['device']` is where batches are moved.
        scheduler: Optional LR scheduler stepped once per batch, after the optimizer.
    """
    target_device = params['device']
    metric_monitor = MetricMonitor()
    model.train()
    stream = tqdm(train_loader)

    for batch in stream:
        ids_more_toxic = batch['ids_more_toxic'].to(target_device)
        mask_more_toxic = batch['mask_more_toxic'].to(target_device)
        token_type_ids_more_toxic = batch['token_type_ids_more_toxic'].to(target_device)
        ids_less_toxic = batch['ids_less_toxic'].to(target_device)
        mask_less_toxic = batch['mask_less_toxic'].to(target_device)
        token_type_ids_less_toxic = batch['token_type_ids_less_toxic'].to(target_device)
        # Scores and targets are all flattened to 1-D. The model emits (batch, 1)
        # while the collated targets are (batch,); passing them as-is either
        # broadcasts the loss to (batch, batch) or is rejected outright,
        # depending on the PyTorch version.
        target = batch['target'].to(target_device).reshape(-1)

        optimizer.zero_grad()
        logits_more_toxic = model(ids_more_toxic, token_type_ids_more_toxic, mask_more_toxic).reshape(-1)
        logits_less_toxic = model(ids_less_toxic, token_type_ids_less_toxic, mask_less_toxic).reshape(-1)
        loss = criterion(logits_more_toxic, logits_less_toxic, target)
        metric_monitor.update('Loss', loss.item())
        loss.backward()
        optimizer.step()

        if scheduler is not None:
            scheduler.step()

        stream.set_description(f"Epoch: {epoch:02}. Train. {metric_monitor}")

## 2. Валидация

In [None]:
def validate_fn(val_loader, model, criterion, epoch, params):
    """Evaluate `model` on the validation pairs without updating it.

    Args:
        val_loader: Iterable of batches with the keys produced by BERTDataset.
        model: Module called as `model(input_ids, token_type_ids, attention_mask)`.
        criterion: Pairwise loss called as `criterion(score_more, score_less, target)`.
        epoch: Epoch number, only used in the progress-bar text.
        params: Configuration dict; `params['device']` is where batches are moved.

    Returns:
        The unweighted mean of the per-batch loss values.
    """
    target_device = params['device']
    metric_monitor = MetricMonitor()
    model.eval()
    stream = tqdm(val_loader)
    all_loss = []
    with torch.no_grad():
        for batch in stream:
            ids_more_toxic = batch['ids_more_toxic'].to(target_device)
            mask_more_toxic = batch['mask_more_toxic'].to(target_device)
            token_type_ids_more_toxic = batch['token_type_ids_more_toxic'].to(target_device)
            ids_less_toxic = batch['ids_less_toxic'].to(target_device)
            mask_less_toxic = batch['mask_less_toxic'].to(target_device)
            token_type_ids_less_toxic = batch['token_type_ids_less_toxic'].to(target_device)
            # Scores and targets are all flattened to 1-D. The model emits
            # (batch, 1) while the collated targets are (batch,); passing them
            # as-is either broadcasts the loss to (batch, batch) or is rejected
            # outright, depending on the PyTorch version.
            target = batch['target'].to(target_device).reshape(-1)

            logits_more_toxic = model(ids_more_toxic, token_type_ids_more_toxic, mask_more_toxic).reshape(-1)
            logits_less_toxic = model(ids_less_toxic, token_type_ids_less_toxic, mask_less_toxic).reshape(-1)
            loss = criterion(logits_more_toxic, logits_less_toxic, target)
            batch_loss = loss.item()
            all_loss.append(batch_loss)
            metric_monitor.update('Loss', batch_loss)
            stream.set_description(f"Epoch: {epoch:02}. Valid. {metric_monitor}")

    return np.mean(all_loss)

# Запуск

In [None]:
# Checkpoint file name of the best epoch of every fold, appended to by the training loop below.
best_models_of_each_fold = []

In [None]:
# K-fold training: for every fold, train on the other folds, validate on this
# one after each epoch and keep only the checkpoint with the lowest validation loss.
gc.collect()
for fold in range(params['num_folds']):
    print(f'******************** Training Fold: {fold+1} ********************')
    # `df_train` has to keep this exact global name: get_scheduler() reads it
    # to size the OneCycleLR schedule.
    df_train = train_df[train_df['kfold'] != fold].copy()
    df_valid = train_df[train_df['kfold'] == fold].copy()

    train_dataset = BERTDataset(
        df_train.more_toxic.values,
        df_train.less_toxic.values
    )
    valid_dataset = BERTDataset(
        df_valid.more_toxic.values,
        df_valid.less_toxic.values
    )

    train_dataloader = DataLoader(
        train_dataset, batch_size=params['batch_size'], shuffle=True,
        num_workers=params['num_workers'], pin_memory=True
    )
    # No gradients are stored during validation, so a doubled batch size is used.
    valid_dataloader = DataLoader(
        valid_dataset, batch_size=params['batch_size']*2, shuffle=False,
        num_workers=params['num_workers'], pin_memory=True
    )

    model = ToxicityModel()
    model = model.to(params['device'])
    criterion = nn.MarginRankingLoss(margin=params['margin'])
    if params['no_decay']:
        # Parameters whose name contains one of these substrings get no weight decay.
        # NOTE(review): the head's `layer_norm.weight` does not contain
        # 'LayerNorm.weight', so it stays in the decayed group - confirm that
        # this is intended.
        no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']
        named_parameters = list(model.named_parameters())
        optimizer_grouped_parameters = [
            {'params': [p for n, p in named_parameters if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in named_parameters if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = optim.AdamW(optimizer_grouped_parameters, lr=params['lr'])
    else:
        optimizer = optim.AdamW(model.parameters(), lr=params['lr'])
    scheduler = get_scheduler(optimizer)

    best_loss = np.inf
    best_epoch = 0
    best_model_name = None
    for epoch in range(1, params['epochs'] + 1):
        train_fn(train_dataloader, model, criterion, optimizer, epoch, params, scheduler)
        valid_loss = validate_fn(valid_dataloader, model, criterion, epoch, params)
        if valid_loss <= best_loss:
            best_loss = valid_loss
            best_epoch = epoch
            checkpoint_name = f"{params['checkpoint']}_{epoch}_epoch_f{fold+1}.pth"
            # Write the new checkpoint before deleting the previous best one,
            # so a failed save never leaves the fold without a checkpoint.
            torch.save(model.state_dict(), checkpoint_name)
            if best_model_name is not None:
                os.remove(best_model_name)
            best_model_name = checkpoint_name


    print('')
    print(f'The best LOSS: {best_loss} for fold {fold+1} was achieved on epoch: {best_epoch}.')
    print(f'The Best saved model is: {best_model_name}')
    best_models_of_each_fold.append(best_model_name)
    # The optimizer (and the scheduler through it) references every model
    # parameter and holds the Adam state; unless they are dropped together with
    # the model, the GPU memory of this fold stays allocated while the next
    # fold's model is being created.
    del df_train, df_valid, train_dataset, valid_dataset, train_dataloader, valid_dataloader
    del scheduler, optimizer, criterion, model
    _ = gc.collect()
    torch.cuda.empty_cache()

******************** Training Fold: 1 ********************


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

  0%|          | 0/1506 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/1506 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/1506 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]


The best LOSS: 0.48714220381918405 for fold 1 was achieved on epoch: 2.
The Best saved model is: roberta-base_2_epoch_f1.pth
******************** Training Fold: 2 ********************


  0%|          | 0/1506 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/1506 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/1506 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]


The best LOSS: 0.4678449454919371 for fold 2 was achieved on epoch: 1.
The Best saved model is: roberta-base_1_epoch_f2.pth
******************** Training Fold: 3 ********************


  0%|          | 0/1506 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/1506 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/1506 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]


The best LOSS: 0.47333391034413896 for fold 3 was achieved on epoch: 2.
The Best saved model is: roberta-base_2_epoch_f3.pth
******************** Training Fold: 4 ********************


  0%|          | 0/1506 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/1506 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/1506 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]


The best LOSS: 0.4785001954388997 for fold 4 was achieved on epoch: 2.
The Best saved model is: roberta-base_2_epoch_f4.pth
******************** Training Fold: 5 ********************


  0%|          | 0/1506 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/1506 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/1506 [00:00<?, ?it/s]

  0%|          | 0/189 [00:00<?, ?it/s]


The best LOSS: 0.47197202943935596 for fold 5 was achieved on epoch: 2.
The Best saved model is: roberta-base_2_epoch_f5.pth


In [None]:
# Recap of the checkpoint kept for every fold (folds are numbered from 1).
for fold_number, checkpoint_file in enumerate(best_models_of_each_fold, start=1):
    print(f'Best model of fold {fold_number}: {checkpoint_file}')

Best model of fold 1: roberta-base_2_epoch_f1.pth
Best model of fold 2: roberta-base_1_epoch_f2.pth
Best model of fold 3: roberta-base_2_epoch_f3.pth
Best model of fold 4: roberta-base_2_epoch_f4.pth
Best model of fold 5: roberta-base_2_epoch_f5.pth


In [None]:
# Inference: score the test comments with the best checkpoint of every fold
# and average the scores.
#
# NOTE(review): `test_df` is not created anywhere in the visible notebook. It
# must be loaded before this cell and provide `text` and `comment_id` columns.
if not best_models_of_each_fold:
    raise ValueError('best_models_of_each_fold is empty - run the training loop first')

# Apply the same cleaning the training comments went through. On text that
# contains only letters, digits and single spaces the function changes
# nothing, so this is harmless if `test_df` was already cleaned.
test_texts = [text_cleaning(str(text)) for text in test_df['text'].values]

# Tokenize once with the settings BERTDataset uses and reuse the tensors for
# every fold. The former `BERTDataset(text=...)` call and the
# 'ids' / 'mask' / 'token_type_ids' batch keys are not provided by BERTDataset.
test_tokenizer = AutoTokenizer.from_pretrained(params['checkpoint'])
test_encoded = test_tokenizer(
    test_texts,
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    max_length=params['max_len'],
    return_attention_mask=True,
    return_token_type_ids=True,
    return_tensors='pt',
)

predictions_nn = None
# The list already holds the checkpoint file names, so it is iterated directly
# (the former glob pattern concatenated a list with a string).
for model_name in best_models_of_each_fold:
    model = ToxicityModel()
    # map_location lets checkpoints saved on a GPU load on a CPU-only machine.
    model.load_state_dict(torch.load(model_name, map_location=params['device']))
    model = model.to(params['device'])
    model.eval()

    batch_predictions = []
    with torch.no_grad():
        for start in tqdm(range(0, len(test_texts), params['batch_size']), desc=f'Predicting. '):
            stop = start + params['batch_size']
            ids = test_encoded['input_ids'][start:stop].to(params['device'])
            mask = test_encoded['attention_mask'][start:stop].to(params['device'])
            token_type_ids = test_encoded['token_type_ids'][start:stop].to(params['device'])
            batch_predictions.append(model(ids, token_type_ids, mask).to('cpu').numpy())

    # One stack per model instead of re-stacking after every batch.
    temp_preds = np.vstack(batch_predictions)

    if predictions_nn is None:
        predictions_nn = temp_preds
    else:
        predictions_nn += temp_preds

    # Release this fold's weights before the next checkpoint is loaded.
    del model
    torch.cuda.empty_cache()

# Average over the fold models; the result keeps the (num_comments, 1) shape.
predictions_nn /= len(best_models_of_each_fold)

Predicting. :   0%|          | 0/236 [00:00<?, ?it/s]

Predicting. :   0%|          | 0/236 [00:00<?, ?it/s]

Predicting. :   0%|          | 0/236 [00:00<?, ?it/s]

Predicting. :   0%|          | 0/236 [00:00<?, ?it/s]

Predicting. :   0%|          | 0/236 [00:00<?, ?it/s]

In [None]:
# Submission frame: the comment ids plus the averaged score converted to a rank
# (ties are broken by order of appearance).
sub_df = test_df[['comment_id']].copy()
sub_df['score'] = predictions_nn
sub_df['score'] = sub_df['score'].rank(method='first')

In [None]:
# Write the submission file without the DataFrame index column.
sub_df.to_csv('submission.csv', index=False)