In [None]:
!pip install transformers sentencepiece sacrebleu


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
from tqdm.auto import tqdm
from sklearn.utils import shuffle
import pandas as pd
import torch
from tqdm import tqdm
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    #T5Tokenizer,
    BertForMaskedLM,
    AutoTokenizer
)

In [None]:
# Mount Google Drive at /content/drive (Colab-only import). A later cell
# saves and reloads model weights under the relative path 'drive/MyDrive/'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load train.tsv from the s-nlp/russe_detox_2022 repository. Each row holds one
# toxic comment and up to three neutral rewrites (see the preview below).
df = pd.read_csv(
    'https://raw.githubusercontent.com/s-nlp/russe_detox_2022/main/data/input/train.tsv',
    sep='\t',  # explicit escape: a literal tab inside the quotes is invisible and easily mangled by editors
    header=0,
)
# Missing rewrites become empty strings so later cells can test them with len().
df = df.fillna('')
df.head()

Unnamed: 0,index,toxic_comment,neutral_comment1,neutral_comment2,neutral_comment3
0,0,"и,чё,блядь где этот херой был до этого со свои...","Ну и где этот герой был,со своими доказательст...",Где этот герой был до этого со своими доказате...,"и,где этот герой был до этого со своими доказа..."
1,1,"О, а есть деанон этого петуха?","О, а есть деанон",,
2,2,"херну всякую пишут,из-за этого лайка.долбоебизм.","Чушь всякую пишут, из- за этого лайка.","Ерунду всякую пишут,из-за этого лайка.",
3,3,из за таких пидоров мы и страдаем,из за таких плохих людей мы и страдаем,Из-за таких людей мы и страдаем,из за таких как он мы и страдаем
4,4,гондон путинский он а не артист,"Человек Путина он, а не артист",,


In [None]:
# Flatten the wide frame (one toxic comment + up to three neutral rewrites)
# into one (toxic, neutral) pair per non-empty rewrite.
# NOTE: this cell rebinds `df`, so it is not re-runnable without re-running
# the loading cell first.
df_train_toxic = []
df_train_neutral = []

reference_columns = ['neutral_comment1', 'neutral_comment2', 'neutral_comment3']
pair_source = df[['toxic_comment'] + reference_columns]

for toxic, *references in pair_source.itertuples(index=False, name=None):
    for reference in references:
        # Stop at the first empty rewrite: later columns are only consulted
        # while every earlier one is filled.
        if len(reference) == 0:
            break
        df_train_toxic.append(toxic)
        df_train_neutral.append(reference)


df = pd.DataFrame({
    'toxic_comment': df_train_toxic,
    'neutral_comment': df_train_neutral
})

# Seeded shuffle: the train/validation split taken from this frame later uses
# random_state=42, which is only reproducible if the row order is fixed too.
df = shuffle(df, random_state=42)

In [None]:
class PairsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized sources (`x`) with tokenized targets (`y`).

    Both `x` and `y` are mappings that expose per-example sequences under
    'input_ids' / 'attention_mask'. An item carries every field of `x` for
    that example, plus the target ids as 'labels' and the target mask as
    'decoder_attention_mask'.
    """

    def __init__(self, x, y):
        self.x, self.y = x, y

    @property
    def n(self):
        """Number of examples, taken from the source 'input_ids'."""
        return len(self.x['input_ids'])

    def __len__(self):
        return self.n

    def __getitem__(self, idx):
        # Guard against reading past the end of the source encodings.
        assert idx < self.n
        sample = {}
        for field, column in self.x.items():
            sample[field] = column[idx]
        sample.update(
            decoder_attention_mask=self.y['attention_mask'][idx],
            labels=self.y['input_ids'][idx],
        )
        return sample



In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
from transformers.file_utils import cached_property
from typing import Tuple
from sklearn.model_selection import train_test_split
import gc
from tqdm.auto import tqdm, trange

In [None]:
from typing import List, Dict, Union

class DataCollatorWithPadding:
    """Collate PairsDataset items into one padded batch of tensors.

    The source side and the target side are padded independently through
    `tokenizer.pad`, so each side is only as wide as its own longest sequence
    in the batch.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # First pass: pad the source-side fields of the features.
        padded = self.tokenizer.pad(features, padding=True)

        # Second pass: present the target side under the key names that
        # `tokenizer.pad` handles, then copy the padded result back.
        target_side = {
            'input_ids': padded['labels'],
            'attention_mask': padded['decoder_attention_mask'],
        }
        padded_targets = self.tokenizer.pad(target_side, padding=True)
        padded['labels'] = padded_targets['input_ids']
        padded['decoder_attention_mask'] = padded_targets['attention_mask']

        tensors = {}
        for name, values in padded.items():
            tensors[name] = torch.tensor(values)
        return tensors

In [None]:
def cleanup():
    """Run a Python garbage-collection pass, then call `torch.cuda.empty_cache()`."""
    gc.collect()
    torch.cuda.empty_cache()

# Free memory left over from any earlier runs before training starts.
cleanup()

In [None]:
def evaluate_model(model, test_dataloader):
    """Return the mean loss of `model` over `test_dataloader`, weighted by batch size.

    Args:
        model: callable that accepts the batch tensors as keyword arguments,
            returns an object with a scalar `.loss`, and exposes `.device`.
        test_dataloader: iterable of batches, each a dict of tensors whose
            leading dimension is the batch size.

    Returns:
        The per-example average loss as a float, or NaN when the loader
        yields no examples (instead of raising ZeroDivisionError).

    The function does not switch the model to eval mode; the caller is
    expected to do that.
    """
    weighted_loss_sum = 0.0
    example_count = 0

    with torch.no_grad():
        for batch in test_dataloader:
            loss = model(**{k: v.to(model.device) for k, v in batch.items()}).loss
            # `batch` is a dict, so len(batch) would count its keys; the
            # number of examples is the leading dimension of any tensor in it.
            batch_size = len(next(iter(batch.values())))
            weighted_loss_sum += batch_size * loss.item()
            example_count += batch_size

    if example_count == 0:
        return float('nan')
    return weighted_loss_sum / example_count

In [None]:
def train_loop(
    model, train_dataloader, val_dataloader,
    max_epochs=30,
    lr=3e-5,
    gradient_accumulation_steps=1,
    cleanup_step=100,
    report_step=400,
    window=100,
    save_step=1000,
):
    """Fine-tune `model` in place with Adam, reporting validation loss and checkpointing.

    Args:
        model: model whose forward pass returns an object with `.loss`; must
            expose `.device`, `.parameters()` and `.save_pretrained()`.
        train_dataloader: sized iterable of dict-of-tensor batches with a
            'labels' entry.
        val_dataloader: loader passed to `evaluate_model`, or None to skip
            validation reports.
        max_epochs: number of passes over `train_dataloader`.
        lr: Adam learning rate.
        gradient_accumulation_steps: number of batches whose gradients are
            accumulated before each optimizer step (1 = step on every batch).
        cleanup_step: call `cleanup()` every this many batches within an epoch.
        report_step: evaluate and print every this many batches within an
            epoch, and always on the last batch of an epoch.
        window: smoothing window of the running training loss.
        save_step: save a checkpoint every this many batches, counted across
            all epochs; a falsy value disables periodic saving.
    """
    cleanup()
    optimizer = torch.optim.Adam(
        params=[p for p in model.parameters() if p.requires_grad], lr=lr
    )
    accumulation = max(1, int(gradient_accumulation_steps))

    ewm_loss = 0
    # Global batch counter. It must persist across epochs: when it was reset
    # per epoch, the periodic checkpoint never fired for epochs shorter than
    # the save interval.
    step = 0
    model.train()
    optimizer.zero_grad()

    for epoch in trange(max_epochs):
        last_index = len(train_dataloader) - 1
        tq = tqdm(train_dataloader)
        for i, batch in enumerate(tq):
            # Exclude label positions equal to 0 from the loss.
            # NOTE(review): this assumes the tokenizer's pad id is 0 — confirm
            # for every checkpoint this loop is used with.
            batch['labels'][batch['labels'] == 0] = -100
            loss = model(**{k: v.to(model.device) for k, v in batch.items()}).loss
            # Scale so the accumulated gradient is an average over the group.
            (loss / accumulation).backward()
            if (i + 1) % accumulation == 0 or i == last_index:
                optimizer.step()
                optimizer.zero_grad()
            step += 1

            if i % cleanup_step == 0:
                cleanup()

            # Plain running mean for the first `window` batches of an epoch,
            # exponential moving average with weight 1/window afterwards.
            w = 1 / min(i + 1, window)
            ewm_loss = ewm_loss * (1 - w) + loss.item() * w
            tq.set_description(f'loss: {ewm_loss:4.4f}')

            report_due = (i > 0 and i % report_step == 0) or i == last_index
            if report_due and val_dataloader is not None:
                model.eval()
                eval_loss = evaluate_model(model, val_dataloader)
                model.train()
                print(f'epoch {epoch}, step {i}/{step}: train loss: {ewm_loss:4.4f}  val loss: {eval_loss:4.4f}')

            if save_step and step % save_step == 0:
                # One directory per epoch; later saves within the same epoch
                # overwrite earlier ones.
                model.save_pretrained(f't5_base__{epoch}')

    cleanup()

In [None]:
def train_model(x, y, model_type, model_name, test_size=0.1, batch_size=32, **kwargs):
    """Load a pretrained model, split the pairs into train/validation, and fine-tune.

    Args:
        x: list of source texts.
        y: list of target texts, aligned with `x`.
        model_type: one of 't5', 'gpt' or 'bert'; selects the model class.
        model_name: checkpoint identifier passed to `from_pretrained` for both
            the model and the tokenizer.
        test_size: fraction of pairs held out for validation.
        batch_size: batch size of both data loaders.
        **kwargs: forwarded unchanged to `train_loop`.

    Returns:
        The fine-tuned model.

    Raises:
        ValueError: if `model_type` is not one of the supported values.

    NOTE(review): the batches built here carry 'labels' and
    'decoder_attention_mask' (an encoder-decoder layout). Confirm that the
    'gpt' and 'bert' model classes accept them before relying on those
    branches.
    """
    supported_types = ('t5', 'gpt', 'bert')
    # Fail early with a clear message; previously an unknown type fell through
    # the if/elif chain and surfaced later as an unbound `model`.
    if model_type not in supported_types:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported_types}"
        )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if model_type == 't5':
        model_class = T5ForConditionalGeneration
    elif model_type == 'gpt':
        model_class = GPT2LMHeadModel
    else:
        model_class = BertForMaskedLM
    model = model_class.from_pretrained(model_name).to(device)
    # The tokenizer is loaded the same way for every model type.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    x1, x2, y1, y2 = train_test_split(x, y, test_size=test_size, random_state=42)
    train_dataset = PairsDataset(tokenizer(x1), tokenizer(y1))
    test_dataset = PairsDataset(tokenizer(x2), tokenizer(y2))

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    train_dataloader = DataLoader(
        train_dataset, batch_size=batch_size, drop_last=False, shuffle=True, collate_fn=data_collator
    )
    # Validation needs no shuffling; a fixed order keeps batch composition
    # (and therefore padding) identical between evaluations.
    val_dataloader = DataLoader(
        test_dataset, batch_size=batch_size, drop_last=False, shuffle=False, collate_fn=data_collator
    )

    train_loop(model, train_dataloader, val_dataloader, **kwargs)
    return model

In [None]:
# Free memory before the training run below.
cleanup()

In [None]:
# Checkpoint to fine-tune and the model family used to load it. The two must
# agree: pairing the GPT-2 checkpoint with model_type='t5' is what produced the
# GPT2TokenizerFast / missing pad_token ValueError in the recorded output.
model_name = 'ai-forever/ruT5-base'                     # use with model_type = 't5'
#model_name = 'sberbank-ai/rugpt3small_based_on_gpt2'   # use with model_type = 'gpt'
#model_name = 'sberbank-ai/ruBert-base'                 # use with model_type = 'bert'
model_type = 't5'
datasets = {
    'train': df
}

In [None]:
# Fine-tune on the (toxic, neutral) pairs, then persist the result with
# save_pretrained.
train_sources = df['toxic_comment'].tolist()
train_targets = df['neutral_comment'].tolist()
model = train_model(
    train_sources,
    train_targets,
    model_type=model_type,
    model_name=model_name,
    batch_size=16,
    max_epochs=5,
)
model.save_pretrained('t5_base_trained')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/5 [00:00<?, ?it/s]

0


  0%|          | 0/624 [00:00<?, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Using pad_token, but it is not set yet.


ValueError: ignored

In [None]:
def paraphrase(text, model, n=None, max_length='auto', temperature=0.0, beams=3):
    """Generate rewrites of `text` with `model`, using the module-level `tokenizer`.

    Args:
        text: a single string or a list of strings.
        model: model exposing `.device` and `.generate()`.
        n: number of sequences to return per input; None is treated as 1.
        max_length: generation length limit, or 'auto' to derive it from the
            padded input length (1.2x plus 10 tokens).
        temperature: forwarded to `generate` as-is.
        beams: beam-search width.

    Returns:
        A single string when `text` is a string and `n` is falsy; otherwise
        the list of all decoded sequences.
    """
    single_input = isinstance(text, str)
    batch = [text] if single_input else text
    input_ids = tokenizer(batch, return_tensors='pt', padding=True)['input_ids'].to(model.device)

    if max_length == 'auto':
        max_length = int(input_ids.shape[1] * 1.2) + 10

    generation_options = dict(
        num_return_sequences=n or 1,
        do_sample=False,
        temperature=temperature,
        repetition_penalty=3.0,
        max_length=max_length,
        bad_words_ids=[[2]],  # token id 2 is banned; labelled "unk" by the original author
        num_beams=beams,
    )
    generated = model.generate(input_ids, **generation_options)
    decoded = [tokenizer.decode(seq, skip_special_tokens=True) for seq in generated]

    return decoded[0] if (not n and single_input) else decoded

In [None]:
# Module-level tokenizer for the checkpoint named in `model_name`; the
# two-argument `paraphrase` defined above reads this global rather than taking
# a tokenizer parameter.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Demo on a single toxic sentence, passed as a one-element list so a list comes back.
# NOTE(review): temperature=50.0 is forwarded to generate() together with
# do_sample=False (set inside paraphrase); presumably it does not influence
# beam search — confirm.
print(paraphrase(['Дмитрий вы ебанулись, уже все выложено'], model, temperature=50.0, beams=10))

['Дмитрий, уже всё выложено']


In [None]:
class DetoxDataset(torch.utils.data.Dataset):
    """Dataset of (toxic, neutral) text pairs, tokenized on access to a fixed length.

    Args:
        data: pandas DataFrame holding the source and target text columns.
        tokenizer: callable tokenizer returning a mapping of tensors with a
            leading batch dimension; its `pad_token_id` is used to mask labels.
        max_length: length every sequence is padded or truncated to.
        source_column: name of the column with the input text.
        target_column: name of the column with the reference rewrite.

    Each item is a dict of 1-D tensors: the tokenizer's fields for the source
    text plus 'labels' for the target, with padding positions set to -100 so
    the loss skips them.
    """

    def __init__(
        self,
        data,
        tokenizer,
        max_length=150,
        source_column="toxic_comment",
        target_column="neutral_comment1",
    ):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.source_column = source_column
        self.target_column = target_column
        # Language codes are set on the shared tokenizer object (this mutates
        # the caller's tokenizer).
        # NOTE(review): these look like mBART-style codes; presumably unused
        # by a T5 tokenizer — confirm.
        self.tokenizer.src_lang = "ru_XX"
        self.tokenizer.tgt_lang = "ru_XX"

    def _encode(self, text):
        """Tokenize one text, padded/truncated to `max_length`, as tensors."""
        # `padding="max_length"` alone is enough; the deprecated
        # `pad_to_max_length` flag duplicated it.
        return self.tokenizer(
            text,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        source = self._encode(row[self.source_column])
        target = self._encode(row[self.target_column])

        labels = target["input_ids"]
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is not None:
            # Padding must not contribute to the loss: with every target padded
            # to `max_length`, unmasked pad tokens would dominate it.
            labels = labels.masked_fill(labels == pad_id, -100)
        source["labels"] = labels

        # Drop the batch dimension added by return_tensors="pt".
        return {k: v.squeeze(0) for k, v in source.items()}

    def __len__(self):
        return self.data.shape[0]

In [None]:
def paraphrase(
    text,
    model,
    tokenizer,
    n=None,
    max_length="auto",
    beams=5,
):
    """Generate rewrites of `text` with beam search.

    Args:
        text: a single string or a list of strings.
        model: model exposing `.device` and `.generate()`.
        tokenizer: tokenizer used both to encode the input and decode the output.
        n: number of sequences to return per input; None is treated as 1.
        max_length: generation length limit, or "auto" for the padded input
            length plus 10 tokens.
        beams: beam-search width.

    Returns:
        A single string when `text` is a string and `n` is falsy; otherwise
        the list of all decoded sequences (previously only the first one was
        returned in every case).

    NOTE(review): inputs are encoded with add_special_tokens=False; confirm
    this matches how the training data was tokenized.
    """
    single_input = isinstance(text, str)
    texts = [text] if single_input else text

    # padding=True lets a list of differently sized texts form one tensor.
    encoded = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        add_special_tokens=False,
    )
    inputs = encoded["input_ids"].to(model.device)
    attention_mask = encoded["attention_mask"].to(model.device)

    if max_length == "auto":
        max_length = inputs.shape[1] + 10

    result = model.generate(
        input_ids=inputs,
        attention_mask=attention_mask,  # explicit mask so padded positions are ignored
        num_return_sequences=n or 1,
        do_sample=False,
        temperature=1.0,
        repetition_penalty=10.0,
        max_length=max_length,
        min_length=int(0.5 * max_length),
        num_beams=beams,
    )
    decoded = [tokenizer.decode(r, skip_special_tokens=True) for r in result]

    if not n and single_input:
        return decoded[0]
    return decoded

In [None]:

# Load a fresh pretrained ruT5-base tokenizer and model; this rebinds the
# `tokenizer` and `model` names used by earlier cells.
# `T5Tokenizer` is commented out in the import cell, so the tokenizer is loaded
# through the already-imported AutoTokenizer; use_fast=False requests the slow
# SentencePiece-based tokenizer class.
tokenizer = AutoTokenizer.from_pretrained('sberbank-ai/ruT5-base', use_fast=False)
model = T5ForConditionalGeneration.from_pretrained('sberbank-ai/ruT5-base')

Downloading spiece.model:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
from sklearn.model_selection import train_test_split

# `dataset` was not defined in any visible cell, so this cell failed on a fresh
# kernel. It is rebuilt here from the raw training file.
# NOTE(review): assumed to be the wide (un-reshaped) frame, because
# DetoxDataset reads the `neutral_comment1` column, which the reshaped `df`
# no longer has — confirm.
dataset = pd.read_csv(
    'https://raw.githubusercontent.com/s-nlp/russe_detox_2022/main/data/input/train.tsv',
    sep='\t',
    header=0,
).fillna('')

# Hold out 1% of the rows for validation, with a fixed seed.
train, val = train_test_split(dataset, random_state=42, test_size=0.01)
trainset = DetoxDataset(train, tokenizer)
valset = DetoxDataset(val, tokenizer)

In [None]:
# Hyper-parameters for the Seq2SeqTrainer run below.
# NOTE(review): output_dir is named "bart_detox" although the model loaded in
# this notebook is a T5 checkpoint. With save_strategy="no", save_total_limit
# presumably has no effect — confirm.
training_args = Seq2SeqTrainingArguments(
    output_dir="bart_detox",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,  # NOTE(review): the original note said "8 is too much", yet 8 is the value used
    weight_decay=1e-5,
    num_train_epochs=8, # NOTE(review): the original note suggested 3 or 5 epochs; 8 is the value used
    learning_rate=1e-5,
    evaluation_strategy="steps",
    save_strategy="no",
    save_total_limit=1,
    logging_steps=500,  # the recorded run logs training and validation loss every 500 steps
    gradient_accumulation_steps=1,
)

In [None]:

# Wire the model, the training arguments and the two DetoxDataset splits into
# a Seq2SeqTrainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=trainset,
    eval_dataset=valset,
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning; the table below records training and validation loss per logging step.
trainer.train()



Step,Training Loss,Validation Loss
500,0.193,0.156554
1000,0.175,0.149435
1500,0.1736,0.143371
2000,0.1662,0.14079
2500,0.1624,0.138624
3000,0.1587,0.137684


TrainOutput(global_step=3440, training_loss=0.16966283709503885, metrics={'train_runtime': 4008.0717, 'train_samples_per_second': 13.728, 'train_steps_per_second': 0.858, 'total_flos': 9816591495168000.0, 'train_loss': 0.16966283709503885, 'epoch': 8.0})

In [None]:
# Persist the fine-tuned weights to the mounted Google Drive.
# NOTE(review): the path is relative, so it presumably relies on the working
# directory being /content (where Drive was mounted) — confirm.
torch.save(model.state_dict(), 'drive/MyDrive/model.pt')

In [None]:
# Restore the saved weights into the existing `model` object; the model must
# already be constructed with the same architecture.
model.load_state_dict(torch.load('drive/MyDrive/model.pt'))

<All keys matched successfully>

In [None]:
# Smoke-test the fine-tuned model on one toxic comment from the training data.
# In the recorded output the text comes back unchanged.
paraphrase('О, а есть деанон этого петуха?', model, tokenizer)

'О, а есть деанон этого петуха?'

In [None]:
# Detoxify the first 20 lines of the test file and write one prediction per line.
# Files are opened in `with` blocks so the handles are closed, and with an
# explicit UTF-8 encoding so the Cyrillic text does not depend on the platform
# default.
with open("test_toxic_parallel.txt", "r", encoding="utf-8") as source_file:
    test_inputs = source_file.read().split("\n")[:20]

preds = []
for text in tqdm(test_inputs):
    preds.append(paraphrase(text, model, tokenizer))

with open("predictions.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(preds))

In [None]:
# Show each test input next to its prediction.
pd.DataFrame({'text': test_inputs, 'preds': preds})