In [None]:
!pip install sentencepiece
!pip install transformers
!pip install rich[jupyter]
!pip install corus
!pip install pymorphy2[fast]
!pip install razdel

In [None]:
!wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/tass-001.csv.gz

In [None]:
from tqdm import tqdm, trange
import random
tqdm.pandas()
import pandas as pd
from razdel import tokenize
from corus import load_ods_tass
from string import punctuation
from tqdm.auto import tqdm
import pymorphy2
m = pymorphy2.MorphAnalyzer()

In [None]:
# Stream the TASS dump through corus and keep only the title of each record.
path = './tass-001.csv.gz'
records = load_ods_tass(path)

# Walking the whole archive takes about five minutes.
titles = [record.title for record in records]

In [None]:
def my_tokenize(x):
    """Split the string ``x`` with razdel's ``tokenize`` and return the token texts as a list."""
    token_texts = []
    for tok in tokenize(x):
        token_texts.append(tok.text)
    return token_texts

In [None]:
# One row per title (first 100 000 only), rows without a title dropped.
df = pd.DataFrame({'text': titles[:100000]}).dropna()
# Token list for every title.
df['tokens'] = df['text'].progress_apply(my_tokenize)
# Starts out as the very same token lists.
df['corrupted_tokens'] = df['tokens']

In [None]:
def corrupt_tokens(tokens):
    """Put nouns that directly follow a preposition into a wrong grammatical case.

    Every token is replaced by the ``word`` of its first pymorphy2 parse. A
    token tagged ``NOUN`` whose preceding token is tagged ``PREP`` is instead
    re-inflected into a randomly drawn case different from its current one.

    Args:
        tokens: list of token strings of one sentence.

    Returns:
        The list of output tokens, or ``None`` (after printing ``tokens``) when
        an ``AttributeError`` occurs, e.g. because ``inflect`` returned ``None``
        for the drawn case.
    """
    cases = ['nomn', 'gent', 'datv', 'accs', 'ablt', 'loct']
    try:
        # Parse every token once instead of re-parsing it for each attribute.
        parses = [m.parse(t)[0] for t in tokens]
        corrupted = []
        for i, parsed in enumerate(parses):
            # `i > 0` is essential: for the first token, index `i - 1` would
            # wrap around and inspect the LAST token of the sentence.
            follows_prep = i > 0 and parses[i - 1].tag.POS == 'PREP'
            if parsed.tag.POS == 'NOUN' and follows_prep:
                old_case = parsed.tag.case
                new_case = old_case
                # Redraw until the case actually changes.
                while new_case == old_case:
                    new_case = random.choice(cases)
                token = parsed.inflect({new_case}).word
            else:
                token = parsed.word
            corrupted.append(token)
        return corrupted
    except AttributeError:
        print(tokens)
        return None

In [None]:
def mask_tokens(tokens):
    """Replace nouns that directly follow a preposition with ``[MASK]``.

    Every other token is replaced by the ``word`` of its first pymorphy2 parse.

    Args:
        tokens: list of token strings of one sentence, or ``None`` for a
            sentence that an earlier step already rejected.

    Returns:
        The list of output tokens, or ``None`` when ``tokens`` is ``None`` or
        an ``AttributeError`` occurs while analysing it (``tokens`` is printed
        in the latter case).
    """
    # Rows rejected upstream arrive as None; pass them through instead of
    # failing with a TypeError when iterating.
    if tokens is None:
        return None
    try:
        parses = [m.parse(t)[0] for t in tokens]
        masked = []
        for i, parsed in enumerate(parses):
            # `i > 0` prevents index `i - 1` from wrapping around to the last
            # token when the sentence starts with a noun.
            follows_prep = i > 0 and parses[i - 1].tag.POS == 'PREP'
            # No replacement case is drawn here: the noun is masked whatever
            # its case, so the random draw was dead code.
            if parsed.tag.POS == 'NOUN' and follows_prep:
                masked.append('[MASK]')
            else:
                masked.append(parsed.word)
        return masked
    except AttributeError:
        print(tokens)
        return None

In [None]:
# Read from the untouched `tokens` column so that re-running this cell does
# not corrupt tokens that were already corrupted by a previous run.
df['corrupted_tokens'] = df['tokens'].progress_apply(corrupt_tokens)

In [None]:
df['masked_tokens'] = df['corrupted_tokens'].progress_apply(mask_tokens)

In [None]:
def tokens_to_text(x):
    """Join the token strings in ``x`` into one string, separated by single spaces."""
    separator = ' '
    return separator.join(x)

def text_preproc(x):
    """Normalise a sentence so that two sentences can be compared verbatim.

    Steps: strip all ASCII punctuation except ``[`` and ``]``, lower-case,
    map ``ё`` to ``е`` and collapse every run of whitespace into one space
    (leading and trailing whitespace is removed as well).

    Args:
        x: input string.

    Returns:
        The normalised string.
    """
    # The square brackets are kept so that a `[MASK]` placeholder survives.
    removable = ''.join(p for p in punctuation if p not in '[]')
    x = x.translate(str.maketrans('', '', removable))
    # Lower-case BEFORE replacing `ё`, otherwise an upper-case `Ё` slips through.
    x = x.lower().replace('ё', 'е')
    # split/join collapses runs of any length; a single replace('  ', ' ')
    # leaves double spaces behind wherever three or more were adjacent.
    return ' '.join(x.split())

In [None]:
df = df.dropna()

In [None]:
df['corrupted_text'] = df['corrupted_tokens'].apply(tokens_to_text)
df['masked_text'] = df['masked_tokens'].apply(tokens_to_text)

In [None]:
df['text'] = df['text'].apply(text_preproc)
df['corrupted_text'] = df['corrupted_text'].apply(text_preproc)
df['masked_text'] = df['masked_text'].apply(text_preproc)

In [None]:
def check_prep(x):
    """Return ``x`` if any of its tokens is tagged ``PREP`` by pymorphy2, otherwise ``None``."""
    has_preposition = any(m.parse(token)[0].tag.POS == 'PREP' for token in x)
    return x if has_preposition else None

In [None]:
df['check_prep'] = df['corrupted_tokens'].progress_apply(check_prep)

In [None]:
df = df.dropna()

In [None]:
df

In [None]:
df.sample(10)

In [None]:
!pip install simplet5

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Fixed seed: without it the 80/20 split, and every score computed on it,
# changes on each kernel restart.
train, test = train_test_split(df[['corrupted_text', 'text']], test_size=0.2, random_state=42)

In [None]:
train.columns = ['source_text', 'target_text']
test.columns = ['source_text', 'target_text']

In [None]:
train

In [None]:
test

# T5

## cointegrated/rut5-base-multitask

In [None]:
from simplet5 import SimpleT5

In [None]:
model = SimpleT5()
hf_name = 'cointegrated/rut5-base-multitask'
model.from_pretrained('t5', hf_name)

In [None]:
model.train(train_df=train, # pandas dataframe with 2 columns: source_text & target_text
            eval_df=test, # pandas dataframe with 2 columns: source_text & target_text
            source_max_token_len = 100, 
            target_max_token_len = 100,
            batch_size = 16,
            max_epochs = 3,
            use_gpu = True,
            outputdir = "cointegrated_rut5-base-multitask",
            early_stopping_patience_epochs = 0,
            precision = 32,
            )

In [None]:
# NOTE(review): this checkpoint path is hard-coded and does not sit under the
# `outputdir` passed to `model.train` above — confirm it is the intended run.
model.load_model("t5","./outputs/simplet5-epoch-3-train-loss-0.0475-val-loss-0.0368", use_gpu=True)

# Pass the WHOLE source sentence to the model; indexing the string with `[0]`
# would hand over only its first character.
preds = []
for source_text in tqdm(test['source_text'].values.tolist()):
    preds.append(model.predict(source_text))

# Each prediction is a sequence; its first element is kept as the answer.
test['preds'] = [x[0] for x in preds]

# Exact-match accuracy: share of sentences reproduced verbatim.
score = 0
for true, pred in test[['target_text', 'preds']].values.tolist():
    if true == pred:
        score += 1

score / len(test)

In [None]:
# Count the rows whose prediction equals the reference, then turn the count into a share.
score = sum(1 for true, pred in test[['target_text', 'preds']].values.tolist() if true == pred)

score / len(test)

In [None]:
# Always raises AssertionError. Presumably a deliberate stop so that "Run All"
# does not continue into the cells below — confirm before removing.
assert False

In [None]:
test = test.drop(columns='preds')

## sberbank-ai/ruT5-base

In [None]:
model = SimpleT5()
hf_name = 'sberbank-ai/ruT5-base'
model.from_pretrained('t5', hf_name)

In [None]:
model.train(train_df=train, # pandas dataframe with 2 columns: source_text & target_text
            eval_df=test, # pandas dataframe with 2 columns: source_text & target_text
            source_max_token_len = 100, 
            target_max_token_len = 100,
            batch_size = 16,
            max_epochs = 3,
            use_gpu = True,
            outputdir = "outputs_sberbank-ai_ruT5-base",
            early_stopping_patience_epochs = 0,
            precision = 32,
            )

In [None]:
# NOTE(review): the checkpoint directory name is hard-coded and embeds loss
# values — presumably it has to be updated after every re-training.
model.load_model("t5","./outputs_sberbank-ai_ruT5-base/simplet5-epoch-2-train-loss-0.0707-val-loss-0.0326", use_gpu=True)

# Pass the WHOLE source sentence to the model; indexing the string with `[0]`
# would hand over only its first character.
preds = []
for source_text in tqdm(test['source_text'].values.tolist()):
    preds.append(model.predict(source_text))

# Each prediction is a sequence; its first element is kept as the answer.
test['preds'] = [x[0] for x in preds]

# Exact-match accuracy: share of sentences reproduced verbatim.
score = 0
for true, pred in test[['target_text', 'preds']].values.tolist():
    if true == pred:
        score += 1

score / len(test)

# RuBERT (masked language model)

In [None]:
from datasets import Dataset

In [None]:
train

In [None]:
train

In [None]:
train_dataset = Dataset.from_pandas(train)
val_dataset = Dataset.from_pandas(test)

In [None]:
train_dataset

In [None]:
val_dataset

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/ruBert-large")

In [None]:
def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over ``examples["target_text"]`` with truncation enabled."""
    target_texts = examples["target_text"]
    return tokenizer(target_texts, truncation=True)

In [None]:
tokenized_train = train_dataset.map(
    tokenize_function,
    batched =True
)

In [None]:
tokenized_val = val_dataset.map(
    tokenize_function,
    batched =True
)

In [None]:
tokenized_train

In [None]:
tokenized_val

In [None]:
from transformers import DataCollatorForLanguageModeling


# The tokenizer's own padding token is left untouched. Overwriting it with
# `tokenizer.eos_token` (a GPT-style recipe) would set it to None for a
# tokenizer that defines no EOS token, as BERT-style tokenizers normally do.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
from transformers import AutoModelForMaskedLM, TrainingArguments, Trainer

model = AutoModelForMaskedLM.from_pretrained("sberbank-ai/ruBert-large")

In [None]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to = 'none',
    save_strategy = 'no'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
)

trainer.train()

In [None]:
import torch

In [None]:

def predict_mask(s):
    """Fill every ``[MASK]`` placeholder of ``s`` with the model's top-scoring token.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        s: sentence whose whitespace-separated tokens may include ``[MASK]``.

    Returns:
        The lower-cased sentence with each ``[MASK]`` token replaced by the
        predicted token, or ``None`` when an ``IndexError`` occurs (e.g. fewer
        predictions than placeholders).
    """
    try:
        # Follow the model's device instead of hard-coding 'cuda'.
        inputs = tokenizer(s, return_tensors="pt").to(model.device)
        with torch.no_grad():
            logits = model(**inputs).logits

        mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
        predicted_token_ids = logits[0, mask_token_index].argmax(axis=-1)
        # Decode every id on its own: decoding them jointly can glue a `##`
        # word-piece onto the previous prediction, which leaves fewer strings
        # than masks and shifts all following predictions.
        predictions = [tokenizer.decode([token_id]).strip()
                       for token_id in predicted_token_ids.tolist()]

        out_s = []
        # Index of the next unused prediction (named so that it does not
        # shadow the module-level morph analyzer `m`).
        next_prediction = 0
        for token in s.split():
            if token == '[MASK]':
                out_s.append(predictions[next_prediction])
                next_prediction += 1
            else:
                out_s.append(token)
        return ' '.join(out_s).lower()
    except IndexError:
        return None


In [None]:
predict_mask('сбербанк на следующей [MASK] снизит ставки по [mask]')

In [None]:
# `.copy()` detaches the sample from `df`, so the column assignments in later
# cells write to an independent frame rather than to a slice of `df`.
# NOTE(review): these rows come from the same `df` the train split was drawn
# from, so part of this sample was probably seen during fine-tuning — confirm
# whether a held-out subset should be used instead.
test_mask = df[['text', 'masked_text']][-1000:].copy()

In [None]:
def rubert_mask(x):
    """Restore the upper-case ``[MASK]`` placeholder in a lower-cased sentence.

    Only the bracketed placeholder is rewritten; an ordinary word that merely
    contains the letters ``mask`` is left as it is.

    Args:
        x: sentence in which the placeholder appears as ``[mask]``.

    Returns:
        The sentence with every ``[mask]`` turned into ``[MASK]``.
    """
    return x.replace('[mask]', '[MASK]')

In [None]:
test_mask['masked_text'] = test_mask['masked_text'].apply(rubert_mask)

In [None]:
test_mask

In [None]:
test_mask['mask_pred'] = test_mask['masked_text'].progress_apply(predict_mask)
    

In [None]:
# Exact-match accuracy: the prediction is compared with the reference `text`.
# Comparing it with `masked_text` would match the model's INPUT, which still
# contains the `[MASK]` placeholders.
len(test_mask[test_mask['text'] == test_mask['mask_pred']]) / len(test_mask)

In [None]:
def mask_pos(df, target_col='text', pred_col='mask_pred'):
    """Tag every token of the reference and predicted sentences with its grammatical case.

    Args:
        df: frame holding the sentences as whitespace-separated strings.
        target_col: name of the column with the reference sentences.
        pred_col: name of the column with the predicted sentences.

    Returns:
        ``(target_pos, pred_pos)``: two lists with one entry per row; each
        entry is the list of ``tag.case`` values taken from the first
        pymorphy2 parse of every token of that sentence.
    """
    def case_tags(sentences):
        # One list of case tags per sentence.
        return [[m.parse(tok)[0].tag.case for tok in sentence.split()]
                for sentence in sentences]

    return case_tags(df[target_col]), case_tags(df[pred_col])
            

In [None]:
test_mask = test_mask.dropna()

In [None]:
target_pos, pred_pos = mask_pos(test_mask)

In [None]:
test_mask['target_pos'] = target_pos
test_mask['pred_pos'] = pred_pos

In [None]:
len(test_mask[test_mask['target_pos'] == test_mask['pred_pos']]) / len(test_mask)

In [None]:
test_mask

In [None]:
for i in test_mask[test_mask['target_pos'] != test_mask['pred_pos']].values.tolist():
    print(f'TRUE: \t {i[0]}\nPRED: \t {i[2]}\nINPT: \t {i[1]}')
    print()

# RULEC

In [None]:
def parse(lines):
    """Parse annotation lines into source sentences and their edits.

    ``lines`` is split into paragraphs by ``paragraphs``. Inside a paragraph,
    lines starting with ``S `` hold a source sentence, lines starting with
    ``A `` hold one annotation and lines starting with ``I `` are skipped.
    An annotation line is split on ``|||``: field 0 holds the start and end
    offsets, field 1 the error type, field 2 the ``||``-separated corrections
    and field 5 the annotator id.

    Args:
        lines: iterable of lines without trailing newlines.

    Returns:
        ``(source_sentences, gold_edits)``. ``gold_edits[i]`` is a dict mapping
        an annotator id to the list of
        ``(start_offset, end_offset, original, corrections, etype)`` tuples
        kept for ``source_sentences[i]``.
    """
    source_sentences = []
    gold_edits = []
    for item in paragraphs(lines):
        # All 'S ' lines of the paragraph, with the prefix removed.
        sentence = [line[2:].strip() for line in item if line.startswith('S ')]
        assert sentence != []
        annotations = {}
        for line in item[1:]:
            if line.startswith('I ') or line.startswith('S '):
                continue
            assert line.startswith('A ')
            line = line[2:]
            fields = line.split('|||')
            start_offset = int(fields[0].split()[0])
            end_offset = int(fields[0].split()[1])
            etype = fields[1]
            
            
            # 'noop' annotations get negative offsets, which the filter
            # further down uses to discard them.
            if etype == 'noop':
                start_offset = -1
                end_offset = -1
            
            # The literal '-NONE-' stands for an empty correction.
            corrections = [c.strip() if c != '-NONE-' else ''
                           for c in fields[2].split('||')]
    
            # NOTE: start and end are *token* offsets
            original = ' '.join(
                    ' '.join(sentence).split()[start_offset:end_offset])
            annotator = int(fields[5])
            if annotator not in annotations.keys():
                annotations[annotator] = []
            annotations[annotator].append((start_offset, end_offset,
                                           original, corrections, etype))
        # Running count of tokens seen so far across the paragraph's sentences.
        tok_offset = 0
        for this_sentence in sentence:
            tok_offset += len(this_sentence.split())
            source_sentences.append(this_sentence)
            this_edits = {}
            for annotator, annotation in annotations.items():
                # Keep edits with non-negative offsets that end no later than
                # the last token counted so far.
                this_edits[annotator] = [edit for edit in annotation if edit[0] <= tok_offset and edit[1] <= tok_offset and edit[0] >= 0 and edit[1] >= 0]
            # A paragraph without annotations still gets an (empty) entry for annotator 0.
            if len(this_edits) == 0:
                this_edits[0] = []
            gold_edits.append(this_edits)
    return (source_sentences, gold_edits)


def paragraphs(lines):
    """Group ``lines`` into paragraphs separated by empty lines.

    Args:
        lines: iterable of strings; an empty string marks a paragraph boundary.

    Yields:
        One non-empty list of consecutive non-empty lines per paragraph.
        Consecutive empty lines never produce an empty paragraph.
    """
    paragraph = []
    for line in lines:
        if line == '':
            if paragraph:
                yield paragraph
                paragraph = []
        else:
            paragraph.append(line)
    # Flush the final paragraph: without this, input that does not end with
    # an empty line silently loses its last paragraph.
    if paragraph:
        yield paragraph

In [None]:
def apply_corrections(sentence, corrections):
    """Return a new sentence with corrections applied.

    Args:
        sentence: tokenised string whose tokens are separated by single spaces.
        corrections: list of ``(start, end, original, alternatives, etype)``
            tuples. ``start``/``end`` are token offsets into the uncorrected
            sentence; the list is assumed to be ordered by position.

    Returns:
        The sentence with the first alternative of every correction applied.
    """
    tokens = sentence.split(' ')
    offset = 0

    for c in corrections:
        tokens, offset = _apply_correction(tokens, c, offset)

    return ' '.join(tokens)


def apply_bad_corrections(sentence, corrections):
    """Return a new sentence with corrections applied and noun-case fixes masked.

    Works like ``apply_corrections``, except that every correction whose type
    is ``'Сущ.:Падеж'`` is replaced by a single ``[MASK]`` token instead of
    its suggested text.
    """
    tokens = sentence.split(' ')
    offset = 0

    for c in corrections:
        tokens, offset = _apply_bad_correction(tokens, c, offset)

    return ' '.join(tokens)


def _replacement_tokens(insertion):
    """Tokens of the first suggested alternative; an empty suggestion yields no tokens."""
    return [t for t in insertion[0].split(' ') if t != '']


def _splice(tokens, start_token_offset, end_token_offset, replacement, offset):
    """Replace the span ``[start, end)`` of the original sentence with ``replacement``.

    ``offset`` is the shift accumulated by earlier corrections. Returns the
    new token list and the updated shift.
    """
    head = tokens[:start_token_offset + offset]
    # The span length depends on the SOURCE offsets only. Widening the end
    # offset by the replacement length would swallow the source tokens that
    # follow whenever the replacement has more than one token.
    tail = tokens[end_token_offset + offset:]
    new_offset = offset + len(replacement) - (end_token_offset - start_token_offset)
    return head + replacement + tail, new_offset


def _apply_correction(tokens, correction, offset):
    """Apply a single correction to a list of tokens."""
    start_token_offset, end_token_offset, bad_token, insertion, etype = correction
    return _splice(tokens, start_token_offset, end_token_offset,
                   _replacement_tokens(insertion), offset)


def _apply_bad_correction(tokens, correction, offset):
    """Apply a single correction, masking it when it is a noun-case correction."""
    start_token_offset, end_token_offset, bad_token, insertion, etype = correction
    if etype == 'Сущ.:Падеж':
        replacement = ['[MASK]']
    else:
        replacement = _replacement_tokens(insertion)
    return _splice(tokens, start_token_offset, end_token_offset, replacement, offset)

In [None]:
def get_corrections(path):
    """Read the annotation file at ``path`` and return its corrected sentences.

    Every element of the returned list is a one-element list holding a
    sentence with the edits stored under key ``0`` applied.
    """
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

        sentences, corrections = parse(lines)
        return [[apply_corrections(sent, edits[0])]
                for sent, edits in zip(sentences, corrections)]

def get_corruptions(path):
    """Read the annotation file at ``path`` and return its masked sentences.

    Every element of the returned list is a one-element list holding a
    sentence processed by ``apply_bad_corrections`` with the edits stored
    under key ``0``.
    """
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

        sentences, corrections = parse(lines)
        return [[apply_bad_corrections(sent, edits[0])]
                for sent, edits in zip(sentences, corrections)]


In [None]:
target1 = get_corrections('../input/diplom/RULEC-GEC.train.M2')
source1 = get_corruptions('../input/diplom/RULEC-GEC.train.M2')

target2 = get_corrections('../input/diplom/RULEC-GEC.dev.M2')
source2 = get_corruptions('../input/diplom/RULEC-GEC.dev.M2')

In [None]:
test_rulec1 = pd.DataFrame()
test_rulec2 = pd.DataFrame()

test_rulec1['target'] = [x[0] for x in target1]
test_rulec1['source'] = [x[0] for x in source1]

test_rulec2['target'] = [x[0] for x in target2]
test_rulec2['source'] = [x[0] for x in source2]

In [None]:
import numpy as np

In [None]:
test_rulec = pd.concat([test_rulec1, test_rulec2])

In [None]:
test_rulec

In [None]:
# Flag the rows whose first two columns differ; identical rows get NaN.
case = [np.nan if row[0] == row[1] else True for row in test_rulec.values.tolist()]

test_rulec['case'] = case

In [None]:
test_rulec = test_rulec.dropna()

In [None]:
test_rulec

In [None]:
test_rulec['target'] = test_rulec['target'].apply(text_preproc)
test_rulec['source'] = test_rulec['source'].apply(text_preproc)

In [None]:
test_rulec['source'] = test_rulec['source'].apply(rubert_mask)

In [None]:
test_rulec

In [None]:
test_rulec['mask_pred'] = test_rulec['source'].progress_apply(predict_mask)

In [None]:
# Exact-match accuracy on RULEC: divide by the size of the RULEC sample
# itself, not by the size of the unrelated `test_mask` frame.
len(test_rulec[test_rulec['target'] == test_rulec['mask_pred']]) / len(test_rulec)

In [None]:
def mask_pos(df, target_col='target', pred_col='mask_pred'):
    """Tag every token of the reference and predicted sentences with its grammatical case.

    Args:
        df: frame holding the sentences as whitespace-separated strings.
        target_col: name of the column with the reference sentences.
        pred_col: name of the column with the predicted sentences.

    Returns:
        ``(target_pos, pred_pos)``: two lists with one entry per row; each
        entry is the list of ``tag.case`` values taken from the first
        pymorphy2 parse of every token of that sentence.
    """
    def case_tags(sentences):
        # One list of case tags per sentence.
        return [[m.parse(tok)[0].tag.case for tok in sentence.split()]
                for sentence in sentences]

    return case_tags(df[target_col]), case_tags(df[pred_col])
            

In [None]:
test_rulec = test_rulec.dropna() 

In [None]:
pos_target, pos_pred = mask_pos(test_rulec.dropna())

In [None]:
test_rulec['pos_target'] = pos_target
test_rulec['pos_pred'] = pos_pred

In [None]:
test_rulec

In [None]:
len(test_rulec[test_rulec['pos_target'] == test_rulec['pos_pred']]) / len(test_rulec)

In [None]:
for i in test_rulec[test_rulec['pos_target'] != test_rulec['pos_pred']].values.tolist():
    print(f'TRUE: \t {i[0]}\nPRED: \t {i[3]}\nINPT: \t {i[1]}')
    print()

In [None]:
predict_mask('Нас это тоже вполне устраивает, но не нужно останавливаться на полпути. Мы готовы показать вам, что значит для Украины настоящая [MASK]')