In [None]:
!pip install corus

In [None]:
!wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/tass-001.csv.gz

In [None]:
!pip install pymorphy2[fast]

In [None]:
!pip install razdel

In [None]:
from tqdm import tqdm
import random
tqdm.pandas()
import pandas as pd
from razdel import tokenize
from corus import load_ods_tass
from string import punctuation
import pymorphy2
m = pymorphy2.MorphAnalyzer()

In [None]:
# Stream the TASS news dump and keep only the headline of every record.
path = './tass-001.csv.gz'
records = load_ods_tass(path)

# Collecting the titles takes about 5 minutes.
titles = [record.title for record in records]

In [None]:
def my_tokenize(x):
    """Tokenize the string ``x`` with razdel and return the token texts as a list."""
    token_texts = []
    for token in tokenize(x):
        token_texts.append(token.text)
    return token_texts

In [None]:
# Working frame: the first 100000 headlines, with missing titles removed.
df = pd.DataFrame({'text': titles[:100000]}).dropna()
df['tokens'] = df['text'].progress_apply(my_tokenize)
# Starts out as the clean tokens; a later cell replaces it with the corrupted ones.
df['corrupted_tokens'] = df['tokens']

In [None]:
df

In [None]:
def corrupt_tokens(tokens):
    """Return ``tokens`` with every noun that follows a preposition put into a wrong case.

    Each token is replaced by the ``word`` attribute of its first pymorphy2
    parse. When that parse is a ``NOUN`` and the token directly before it
    parses as a ``PREP``, the noun is re-inflected into a case chosen at random
    among the six listed below, excluding its current case.

    Args:
        tokens: list of token strings of one sentence.

    Returns:
        The list of resulting token strings, or ``None`` (after printing
        ``tokens``) when an ``AttributeError`` is raised while processing --
        presumably because ``inflect`` returned ``None`` for a form that does
        not exist.
    """
    cases = ['nomn', 'gent', 'datv', 'accs', 'ablt', 'loct']
    try:
        corrupted = []
        # The first token has no predecessor. Indexing tokens[i - 1] at i == 0
        # would wrap around to the LAST token, so track the previous POS instead.
        prev_is_prep = False
        for t in tokens:
            parsed = m.parse(t)[0]  # parse each token exactly once
            if parsed.tag.POS == 'NOUN' and prev_is_prep:
                old_case = parsed.tag.case
                # Uniform choice among the cases that differ from the current one.
                new_case = random.choice([c for c in cases if c != old_case])
                token = parsed.inflect({new_case}).word
            else:
                token = parsed.word
            corrupted.append(token)
            prev_is_prep = parsed.tag.POS == 'PREP'
        return corrupted
    except AttributeError:
        print(tokens)
        return None

In [None]:
df

In [None]:
# Always corrupt the pristine 'tokens' column: reading from 'corrupted_tokens'
# itself would corrupt already corrupted (or None) values when the cell is re-run.
df['corrupted_tokens'] = df['tokens'].progress_apply(corrupt_tokens)

In [None]:
df.to_csv('data_corrupted.csv')

In [None]:
def tokens_to_text(x):
    """Glue the tokens in ``x`` back into one string separated by single spaces."""
    separator = ' '
    return separator.join(x)

In [None]:
def text_preproc(x):
    """Normalise a sentence so that two versions can be compared by exact match.

    Steps, in order: lower-case the text, delete every ``string.punctuation``
    character and the guillemets ``«»``, replace 'ё' with 'е', and collapse
    runs of spaces into a single space. Leading and trailing spaces are NOT
    stripped.

    Args:
        x: input string.

    Returns:
        The normalised string.
    """
    # Lower-case first: replacing 'ё' before lower-casing left an upper-case
    # 'Ё' behind as 'ё' in the result.
    x = x.lower()
    x = x.translate(str.maketrans('', '', punctuation + '«»'))
    x = x.replace('ё', 'е')
    while '  ' in x:
        x = x.replace('  ', ' ')
    return x

In [None]:
df = df.dropna()

In [None]:
df['corrupted_text'] = df['corrupted_tokens'].apply(tokens_to_text)

In [None]:
df['text'] = df['text'].apply(text_preproc)
df['corrupted_text'] = df['corrupted_text'].apply(text_preproc)

In [None]:
def check_prep(x):
    """Return ``x`` unchanged if at least one token's first parse is a ``PREP``, else ``None``."""
    prep_count = sum(1 for token in x if m.parse(token)[0].tag.POS == 'PREP')
    return x if prep_count else None

In [None]:
df['check_prep'] = df['corrupted_tokens'].progress_apply(check_prep)

In [None]:
df = df.dropna()

In [None]:
df[['corrupted_text', 'text']].to_csv('df_10k.csv')

In [None]:
df[['corrupted_text', 'text']]

# T5

In [None]:
!pip install transformers sentencepiece

In [None]:
# Load the pretrained T5 checkpoint and its tokenizer, then build the optimizer.
import torch 
from transformers import T5ForConditionalGeneration, T5Tokenizer
# Checkpoint identifier passed to both from_pretrained() calls below.
raw_model = 'cointegrated/rut5-base-multitask' 
# .cuda() moves the model to the GPU; the trailing ';' keeps the cell from echoing the model repr.
model = T5ForConditionalGeneration.from_pretrained(raw_model).cuda();
tokenizer = T5Tokenizer.from_pretrained(raw_model)
# Adam over all model parameters, created after the model has been moved to the GPU.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train, test = train_test_split(df[['corrupted_text', 'text']])

In [None]:
pairs = train.values.tolist()

In [None]:
from tqdm.auto import trange
import random
import numpy as np

batch_size = 16  # number of examples shown to the model per optimisation step
report_steps = 200  # print the running loss every this many steps
epochs = 3  # number of passes over the training pairs (author's note: keep 2 epochs)

model.train()
losses = []
for epoch in range(epochs):
    print('EPOCH', epoch)
    random.shuffle(pairs)
    # Only full batches are used; a trailing partial batch is skipped.
    for i in trange(0, int(len(pairs) / batch_size)):
        batch = pairs[i * batch_size: (i + 1) * batch_size]
        # Encode the corrupted inputs (p[0]) and the reference outputs (p[1]).
        # truncation=True guarantees every row is exactly max_length long;
        # without it an over-long example makes the batch ragged and the
        # conversion to a tensor fails.
        x = tokenizer([p[0] for p in batch], return_tensors='pt', padding="max_length", max_length=100, truncation=True).to(model.device)
        y = tokenizer([p[1] for p in batch], return_tensors='pt', padding="max_length", max_length=100, truncation=True).to(model.device)
        # -100 is the special label value that excludes a position from the loss;
        # id 0 is assumed to be the padding token of this tokenizer.
        y.input_ids[y.input_ids == 0] = -100
        # Compute the loss.
        loss = model(
            input_ids=x.input_ids,
            attention_mask=x.attention_mask,
            labels=y.input_ids,
            decoder_attention_mask=y.attention_mask,
            return_dict=True
        ).loss
        # Take one gradient-descent step.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # Print the mean of the most recent losses.
        losses.append(loss.item())
        if i % report_steps == 0:
            print('step', i, 'loss', np.mean(losses[-report_steps:]))

In [None]:
model.eval()

def answer(x, **kwargs):
    """Generate the model's output text for the input ``x``.

    Args:
        x: text passed to the tokenizer (padded and truncated to 100 tokens).
        **kwargs: extra keyword arguments forwarded to ``model.generate``.
            ``max_length`` defaults to 100 and may be overridden by the caller.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    # Use a default instead of a hard-coded keyword: passing max_length both
    # explicitly and through **kwargs raises a duplicate-keyword TypeError.
    kwargs.setdefault('max_length', 100)
    inputs = tokenizer(x, return_tensors='pt', padding="max_length", max_length=100, truncation=True).to(model.device)
    with torch.no_grad():
        hypotheses = model.generate(**inputs, **kwargs)
    return tokenizer.decode(hypotheses[0], skip_special_tokens=True)

In [None]:
# Take an explicit copy: columns are added to test_df afterwards, and assigning
# to a slice of `test` triggers pandas' SettingWithCopyWarning.
test_df = test[:5000].copy()

In [None]:
test_df['preds'] = test_df['corrupted_text'].progress_apply(answer)

In [None]:
# Exact-match accuracy: one flag per row telling whether the prediction equals the reference.
hits = [target == pred for target, pred in test_df[['text', 'preds']].values.tolist()]
c = sum(hits)

c / len(hits)

In [None]:
test_df['hit'] = hits

In [None]:
test_df.to_csv('preds_cointegrated_ruT5-base.csv')

In [None]:
# Print every reference/prediction pair the model did not reproduce exactly.
for target, pred in test_df[test_df['hit'] == False][['text', 'preds']].values.tolist():
    print(f'TRUE: \t {target}\nPRED: \t {pred}', end='\n\n')

# RULEC-GEC test dataset

In [None]:
def parse(lines):
    """Read M2-formatted ``lines`` into source sentences and their gold edits.

    ``lines`` is grouped by ``paragraphs``. Within a group, lines starting with
    ``S `` are source sentences and lines starting with ``A `` are annotations
    of the form ``start end|||type|||corr1||corr2|||...``; the sixth
    ``|||``-separated field is read as the annotator id.

    Returns:
        ``(source_sentences, gold_edits)``. ``gold_edits[k]`` maps an annotator
        id to a list of ``(start, end, original, corrections, etype)`` tuples
        for ``source_sentences[k]``. ``noop`` annotations contribute no edits,
        and a sentence without any annotator gets ``{0: []}``.
    """
    source_sentences, gold_edits = [], []
    for item in paragraphs(lines):
        sentence = [entry[2:].strip() for entry in item if entry.startswith('S ')]
        assert sentence != []
        annotations = {}
        for entry in item[1:]:
            if entry.startswith(('I ', 'S ')):
                continue
            assert entry.startswith('A ')
            fields = entry[2:].split('|||')
            span = fields[0].split()
            start_offset = int(span[0])
            end_offset = int(span[1])
            etype = fields[1]

            if etype == 'noop':
                # Negative offsets make the per-sentence filter below discard the edit.
                start_offset = end_offset = -1

            corrections = ['' if alt == '-NONE-' else alt.strip()
                           for alt in fields[2].split('||')]

            # NOTE: start and end are *token* offsets
            source_tokens = ' '.join(sentence).split()
            original = ' '.join(source_tokens[start_offset:end_offset])
            annotator = int(fields[5])
            annotations.setdefault(annotator, []).append(
                (start_offset, end_offset, original, corrections, etype))
        tok_offset = 0
        for this_sentence in sentence:
            tok_offset += len(this_sentence.split())
            source_sentences.append(this_sentence)
            this_edits = {
                annotator: [edit for edit in annotation
                            if 0 <= edit[0] <= tok_offset and 0 <= edit[1] <= tok_offset]
                for annotator, annotation in annotations.items()
            }
            if not this_edits:
                this_edits[0] = []
            gold_edits.append(this_edits)
    return source_sentences, gold_edits


def paragraphs(lines):
    """Yield groups of consecutive non-empty lines from ``lines``.

    An empty string acts as a separator; consecutive separators produce no
    empty groups. Each yielded group is a list of the lines it contains.
    """
    paragraph = []
    for line in lines:
        if line == '':
            if paragraph:
                yield paragraph
                paragraph = []
        else:
            paragraph.append(line)
    # Flush the last group: without this, input that does not end with an
    # empty line silently loses its final paragraph.
    if paragraph:
        yield paragraph

In [None]:
def apply_corrections(sentence, corrections):
    """Apply every correction in ``corrections`` to ``sentence``, in order.

    ``sentence`` is split on single spaces; each correction is handed to
    ``_apply_correction`` together with the running offset it returned for the
    previous one. The resulting tokens are joined back with single spaces.
    """
    state = (sentence.split(' '), 0)
    for correction in corrections:
        state = _apply_correction(state[0], correction, state[1])
    return ' '.join(state[0])

def apply_bad_corrections(sentence, corrections):
    """Apply ``corrections`` to ``sentence`` through ``_apply_bad_correction``.

    ``sentence`` is split on single spaces; each correction is handed to
    ``_apply_bad_correction`` together with the running offset it returned for
    the previous one. The resulting tokens are joined back with single spaces.
    """
    state = (sentence.split(' '), 0)
    for correction in corrections:
        state = _apply_bad_correction(state[0], correction, state[1])
    return ' '.join(state[0])


def _apply_correction(tokens, correction, offset):
    """Apply a single correction to a list of tokens."""
    start_token_offset, end_token_offset, bad_token, insertion, etype = correction
    to_insert = insertion[0].split(' ')
    end_token_offset += (len(to_insert) - 1)
    
    
    to_insert_filtered = [t for t in to_insert if t != '']

    head = tokens[:start_token_offset + offset]
    tail = tokens[end_token_offset + offset:]

    new_tokens = head + to_insert_filtered + tail

    new_offset = len(to_insert_filtered) - (end_token_offset - start_token_offset) + offset

    return new_tokens, new_offset


def _apply_bad_correction(tokens, correction, offset):
    """Apply a single correction to a list of tokens."""
    start_token_offset, end_token_offset, bad_token, insertion, etype = correction
    to_insert = insertion[0].split(' ')
    end_token_offset += (len(to_insert) - 1)
    
    
    if etype == 'Сущ.:Падеж':
        to_insert_filtered = [bad_token]
    else:
        to_insert_filtered = [t for t in to_insert if t != '']

    head = tokens[:start_token_offset + offset]
    tail = tokens[end_token_offset + offset:]

    new_tokens = head + to_insert_filtered + tail

    new_offset = len(to_insert_filtered) - (end_token_offset - start_token_offset) + offset

    return new_tokens, new_offset

In [None]:
def get_corrections(path):
    """Read the M2 file at ``path`` and return its sentences with the edits applied.

    The file is parsed with ``parse`` and only the edits stored under
    annotator key ``0`` are applied (via ``apply_corrections``). Each result
    is wrapped in a one-element list.
    """
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    sentences, corrections = parse(lines)
    return [[apply_corrections(s, c[0])] for s, c in zip(sentences, corrections)]

def get_corruptions(path):
    """Read the M2 file at ``path`` and return its sentences passed through ``apply_bad_corrections``.

    The file is parsed with ``parse`` and only the edits stored under
    annotator key ``0`` are used. Each result is wrapped in a one-element list.
    """
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    sentences, corrections = parse(lines)
    return [[apply_bad_corrections(s, c[0])] for s, c in zip(sentences, corrections)]


In [None]:
target1 = get_corrections('../input/diplom/RULEC-GEC.train.M2')
source1 = get_corruptions('../input/diplom/RULEC-GEC.train.M2')

target2 = get_corrections('../input/diplom/RULEC-GEC.dev.M2')
source2 = get_corruptions('../input/diplom/RULEC-GEC.dev.M2')

In [None]:
test_rulec1 = pd.DataFrame()
test_rulec2 = pd.DataFrame()

In [None]:
test_rulec1['target'] = [x[0] for x in target1]
test_rulec1['source'] = [x[0] for x in source1]

test_rulec2['target'] = [x[0] for x in target2]
test_rulec2['source'] = [x[0] for x in source2]

In [None]:
test_rulec = pd.concat([test_rulec1, test_rulec2])

In [None]:
test_rulec

In [None]:
# Flag rows whose first two columns differ with True; identical rows get NaN.
case = [np.nan if row[0] == row[1] else True for row in test_rulec.values.tolist()]

In [None]:
test_rulec['case'] = case

In [None]:
test_rulec = test_rulec.dropna()

In [None]:
test_rulec

In [None]:
test_rulec['target'] = test_rulec['target'].apply(text_preproc)
test_rulec['source'] = test_rulec['source'].apply(text_preproc)

In [None]:
test_rulec

In [None]:
test_rulec['preds'] = test_rulec['source'].progress_apply(answer)

In [None]:
# Exact-match accuracy on the RULEC-GEC pairs.
# Compare with surrounding whitespace stripped: chopping the last character of
# the target unconditionally removes a real letter whenever the target does
# not end with a space.
hits = [target.strip() == pred.strip()
        for target, pred in test_rulec[['target', 'preds']].values.tolist()]
c = sum(hits)

c / len(hits)

In [None]:
test_rulec['hit'] = hits

In [None]:
# Print reference, prediction and model input for every row that was not an exact match.
for target, pred, source in test_rulec[test_rulec['hit'] == False][['target', 'preds', 'source']].values.tolist():
    print(f'TRUE: \t {target}\nPRED: \t {pred}\nINPT: \t {source}', end='\n\n')

In [None]:
answer('текст включает в себя этимология и история употребления слова')

# MLM: BERT & RoBERTa