In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 35.4 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 60.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 67.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
import itertools
import re, random
import sys
import time
from collections import Counter, defaultdict
from multiprocessing.pool import ThreadPool

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, AutoModelWithLMHead
from transformers import AutoTokenizer
from transformers import set_seed
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [None]:
SEED = 314
set_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

In [None]:
data = pd.read_csv("./dataset.csv")

In [None]:
class spell:
    """Dictionary-based generator of spelling-correction candidates.

    The vocabulary is collected from the lower-cased reference texts. Candidates
    for a word are vocabulary entries reachable from it by one or two edits
    (transposition, replacement, insertion) over a Cyrillic alphabet.
    """

    # Letters used to generate replacements and insertions.
    LETTERS = 'абвгдежзийклмнопрстуфхцчшщъыьэюяё'

    def __init__(self, data, corrupted_text, correct_text):
        """Build the vocabulary statistics from ``data[correct_text]``.

        Args:
            data: object indexable by column name (e.g. a DataFrame or a dict
                of lists) whose ``correct_text`` entry iterates over texts.
            corrupted_text: name of the noisy-text column; not used here.
            correct_text: name of the reference-text column.
        """
        print("starting")
        # Skip non-string cells (such as the NaN pandas produces for empty CSV
        # cells); joining them would raise TypeError.
        texts = [text for text in data[correct_text] if isinstance(text, str)]
        words = " ".join(texts).lower()
        print("extracting tokens")
        words = re.findall(r'[\w]+', words)
        print("creating set of syms")
        # Sorted so the string is identical from run to run.
        self.d_sym = "".join(sorted(set("".join(words))))
        print("creating set of words")
        self.d_set = set(words)
        print("creating dict")
        self.d_dict = dict(Counter(words))
        print("init done")
        print("")

    def create_symspell(self, arr, limit=10000):
        """Fill ``self.symdict`` (variant -> list of source words) using 10 threads.

        Args:
            arr: sequence of words.
            limit: only the first ``limit`` words are processed
                (``None`` processes all of them).
        """
        batch = arr[:limit]
        # The bar counts the words that are actually processed.
        self.pbar = tqdm(total=len(batch))
        self.symdict = defaultdict(list)
        try:
            # The context manager releases the worker threads when done.
            with ThreadPool(10) as pool:
                pool.map(self.symspell, batch)
        finally:
            self.pbar.close()

    def symspell(self, word):
        """Register ``word`` under every string two edits away, then tick the bar."""
        for variant in self.away_2(word):
            self.symdict[variant].append(word)
        self.pbar.update(1)

    def away_1(self, word):
        """Return the set of strings one edit away from ``word``.

        NOTE(review): deletions are not generated, so a misspelling that
        contains an extra letter can never reach the shorter correct word --
        confirm this is intended.
        """
        letters = self.LETTERS
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(transposes + replaces + inserts)

    def away_2(self, word):
        """Return the set of strings obtained by applying ``away_1`` twice."""
        return {e2 for e1 in self.away_1(word) for e2 in self.away_1(e1)}

    def known(self, words):
        """Return the subset of ``words`` that is present in the vocabulary."""
        return set(w for w in words if w in self.d_set)

    def edit_candidates(self, word):
        """Return the vocabulary words within one or two edits of ``word`` as a list."""
        one_away = self.away_1(word)
        found = self.known(one_away)
        # Filter each distance-2 batch as it is produced instead of building
        # the whole distance-2 set first; the result is the same.
        for variant in one_away:
            found |= self.known(self.away_1(variant))
        return list(found)

    def most_freq_edits(self, word):
        """Return the candidates for ``word``, most frequent first.

        Equal frequencies are ordered alphabetically so that the ranking does
        not depend on set iteration order.
        """
        return sorted(self.edit_candidates(word),
                      key=lambda w: (-self.d_dict[w], w))

    def token(self, sent):
        """Split ``sent`` into runs of word characters."""
        return re.findall(r'[\w]+', sent)



In [None]:
a = spell(data, "source", "target")

starting
extracting tokens
creating set of syms
creating set of words
creating dict
init done



In [None]:
a.most_freq_edits("человен")

['человек', 'человека', 'человеку', 'человеке']

# Fine-tuning ruGPT-3 small (GPT-2 architecture)

In [None]:
#!g1.1
train_data = data.target.copy()

In [None]:
#!g1.1
def build_text_files(data_arr, dest_path):
    """Write every text of ``data_arr`` to ``dest_path``, each followed by two spaces.

    Args:
        data_arr: iterable of strings.
        dest_path: path of the file to create or overwrite.
    """
    # The texts are Cyrillic and the platform default encoding is not
    # guaranteed to be UTF-8, so the encoding is set explicitly.
    with open(dest_path, 'w', encoding='utf-8') as f:
        # A single join avoids rebuilding an ever-growing string in the loop.
        f.write(''.join(texts + "  " for texts in data_arr))

# Hold out 10% of the texts. random_state is fixed so that re-running this cell
# always yields the same split instead of depending on the global NumPy RNG state.
train, test = train_test_split(train_data, test_size=0.1, random_state=SEED)

build_text_files(train,'train_dataset.txt')
build_text_files(test,'test_dataset.txt')

In [None]:
#!g1.1
tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2")

train_path = 'train_dataset.txt'
test_path = 'test_dataset.txt'

Downloading:   0%|          | 0.00/608 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
#!g1.1
def load_dataset(train_path,test_path,tokenizer, block_size=128):
    """Build the train/test datasets and the collator used for fine-tuning.

    Args:
        train_path: path of the training text file.
        test_path: path of the evaluation text file.
        tokenizer: tokenizer passed to both datasets and to the collator.
        block_size: ``block_size`` given to each ``TextDataset`` (default 128).

    Returns:
        Tuple ``(train_dataset, test_dataset, data_collator)``.
    """
    def _text_dataset(path):
        # Both splits are built with identical settings.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=path,
            block_size=block_size)

    train_dataset = _text_dataset(train_path)
    test_dataset = _text_dataset(test_path)

    # mlm=False: the collator is configured without masked-language-model masking.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)




In [None]:
#!g1.1

# Pretrained checkpoint that is fine-tuned below.
model = AutoModelWithLMHead.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2")

training_args = TrainingArguments(
    output_dir="./gpt2-ru",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Evaluate on the held-out split after each epoch. Setting only eval_steps
    # has no effect while the evaluation strategy stays at its default ("no"),
    # which left eval_dataset unused.
    evaluation_strategy="epoch",
    save_steps=1000,
    # Warm up over the first 10% of the run. A fixed 500 warm-up steps is longer
    # than the whole run recorded for this notebook (102 optimisation steps), so
    # the learning rate never reached its configured value.
    warmup_ratio=0.1,
    # Trainer seeds the RNGs from this argument when it is created; reuse the
    # notebook-wide seed instead of the library default.
    seed=SEED,
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)



Downloading:   0%|          | 0.00/551M [00:00<?, ?B/s]

In [None]:
#!g1.1
trainer.train()

***** Running training *****
  Num examples = 3242
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 102
  Number of trainable parameters = 125231616


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=102, training_loss=4.1814551259957105, metrics={'train_runtime': 118.3526, 'train_samples_per_second': 27.393, 'train_steps_per_second': 0.862, 'total_flos': 211777191936000.0, 'train_loss': 4.1814551259957105, 'epoch': 1.0})

In [None]:
#!g1.1
trainer.save_model()

Saving model checkpoint to ./gpt2-ru
Configuration saved in ./gpt2-ru/config.json
Model weights saved in ./gpt2-ru/pytorch_model.bin


# Поиск и замена ошибок

In [None]:
#!g2.mig
# Use the first GPU when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned weights saved to ./gpt2-ru and switch to inference mode.
# Gradient tracking is disabled inside score(), where the forward passes happen;
# wrapping only the loading code in no_grad() does not affect later calls.
model = GPT2LMHeadModel.from_pretrained('./gpt2-ru') #finetuned rugpt-2
model.to(device)
model.eval()

tokenizer = GPT2Tokenizer.from_pretrained('sberbank-ai/rugpt3small_based_on_gpt2')

def score(sentence):
    """Return exp(language-model loss) of ``sentence``; lower means more plausible.

    Sentences that encode to fewer than two tokens leave nothing to predict;
    they get ``inf`` so that a caller minimising the score never selects them.
    """
    tokenize_input = tokenizer.encode(sentence)
    if len(tokenize_input) < 2:
        return float("inf")
    tensor_input = torch.tensor([tokenize_input]).to(device)
    # No autograd graph is needed for scoring.
    with torch.no_grad():
        loss = model(tensor_input, labels=tensor_input)[0]
    return np.exp(loss.cpu().numpy())

loading configuration file ./gpt2-ru/config.json
Model config GPT2Config {
  "_name_or_path": "sberbank-ai/rugpt3small_based_on_gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 2048,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 2048,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "use_cache": true,
  "vocab_size": 50264
}

loading weights file ./gpt2-ru/pytorch_model.bin
All model chec

In [None]:
#!g2.mig
def find_best(sent, strategy = None, top = 6, att_score = 500, att_freq = 3):
    """Return ``sent`` with unknown words replaced by the best-scoring candidates.

    Every token that is not in the dictionary of the global ``a`` and has at
    least one candidate from ``a.most_freq_edits`` is treated as a mistake.
    All combinations of candidates are substituted into the sentence and the
    variant with the lowest ``score`` is returned.

    Args:
        sent: sentence to correct.
        strategy: not used by the function body; kept for signature compatibility.
        top: maximum number of candidates kept per word (``None`` keeps all).
        att_score: not used by the function body; kept for signature compatibility.
        att_freq: not used by the function body; kept for signature compatibility.

    Returns:
        The corrected sentence. Text between tokens is preserved, and a
        replacement gets a capital first letter when the token it replaces had one.
    """
    tokens = a.token(sent)

    # Remember the literal text in front of every token so that replacements
    # can be applied by position. str.replace on the whole sentence would also
    # rewrite the same characters inside other words.
    gaps = []
    cursor = 0
    for tok in tokens:
        start = sent.find(tok, cursor)
        gaps.append(sent[cursor:start])
        cursor = start + len(tok)
    tail = sent[cursor:]

    # Find the mistakes and their dictionary candidates.
    mistakes = []          # unique lower-cased words, in order of first appearance
    edit_suggestions = []  # candidate list for each entry of `mistakes`
    seen = set()
    for tok in tokens:
        word = tok.lower()
        if word in seen:
            continue
        seen.add(word)
        # A pure number is never a misspelled word; without this check the
        # Cyrillic-only edits can turn short numbers into short words.
        if word.isdigit() or a.known([word]):
            continue
        candidates = a.most_freq_edits(word)
        if top is not None:
            candidates = candidates[:top]
        # A word without candidates is left exactly as written.
        if candidates:
            mistakes.append(word)
            edit_suggestions.append(candidates)

    if not mistakes:
        return sent

    def rebuild(combo):
        # Substitute one candidate per mistake into the original sentence.
        chosen = dict(zip(mistakes, combo))
        parts = []
        for gap, tok in zip(gaps, tokens):
            parts.append(gap)
            fixed = chosen.get(tok.lower())
            if fixed is None:
                parts.append(tok)
            elif tok[0].isupper():
                parts.append(fixed[0].upper() + fixed[1:])
            else:
                parts.append(fixed)
        parts.append(tail)
        return "".join(parts)

    combos = itertools.product(*edit_suggestions)

    # With a single possible sentence there is nothing for the model to rank.
    if all(len(candidates) == 1 for candidates in edit_suggestions):
        return rebuild(next(combos))

    # Score the variants one at a time and keep the first one with the lowest score.
    best_sent, best_score = None, None
    for combo in combos:
        candidate_sent = rebuild(combo)
        candidate_score = score(candidate_sent)
        if best_score is None or np.isnan(best_score) or candidate_score < best_score:
            best_sent, best_score = candidate_sent, candidate_score
    return best_sent

In [None]:
#!g2.mig
find_best("Об этом чернз минуту.")

'Об этом через минуту.'

# Тесты

## Тест всего билда

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import nltk
import string


def clear (s):
  """Normalise a sentence for comparison.

  The sentence is tokenised with nltk.word_tokenize, tokens that occur as a
  substring of string.punctuation are dropped, the rest are joined with single
  spaces, the characters listed in `punct` are removed and the result is
  lower-cased.
  """
  # The backslash is escaped explicitly: "\]" is an invalid escape sequence.
  # The set of removed characters is unchanged.
  punct = "!\"#$%&'()*+,./:;<=>?@[\\]^_`{|}~'"
  tokens = [token for token in nltk.word_tokenize(s)
            if token not in string.punctuation]
  joined = ' '.join(tokens)
  return joined.translate(str.maketrans('', '', punct)).lower()


In [None]:
#!g2.mig
def validate(data, verbose = False, private_size = 56000):
  """Correct every `data.source` sentence and report accuracy and timing.

  A sentence counts as correct when clear(prediction) equals clear(target).

  Args:
    data: frame with `source` and `target` columns of equal length.
    verbose: when true, print every mismatch.
    private_size: number of sentences used to extrapolate the total run time
      (default 56000).

  Raises:
    ValueError: if `data` is empty (no accuracy can be computed).
  """
  total = len(data)
  if total == 0:
    raise ValueError("validate() needs at least one row of data")

  time1 = time.time()
  res = [find_best(elm) for elm in tqdm(data.source, total = total)]
  time2 = time.time()

  hits = 0
  for i, (elm, fixed) in enumerate(zip(data.target, res)):
    if clear(elm) == clear(fixed):
      hits += 1
    elif verbose:
      print("FAILED || ", data.source.iloc[i], '==>', clear(fixed), '!!!===', clear(elm))

  per_item = (time2 - time1) / total
  print("TOTAL ACU: ", hits / total)
  print("SECONDS PER ITER :", np.round(per_item, 4))
  print("TOTAL HOURS FOR ALL PRiVATE: ", np.round(per_item * private_size / (3600), 1))

In [None]:
#!g2.mig
data_val = data[0:200]
# validate(data_val, verbose = True)

In [None]:
from  evaluate_spell import *

to_output_differences = False


# Keep only the rows where both the source and the target are non-blank, and
# filter them together: filtering each column on its own would shift the lists
# against each other as soon as one side of a pair is blank.
eval_pairs = [
    (src.strip().strip('\ufeff'), tgt.strip().strip('\ufeff'))
    for src, tgt in zip(data_val.source, data_val.target)
    if src.strip().strip('\ufeff') != "" and tgt.strip().strip('\ufeff') != ""
]

source_sents = [extract_words(src) for src, _ in eval_pairs]
correct_sents = [extract_words(tgt) for _, tgt in eval_pairs]
answer_sents = [extract_words(find_best(src).strip().strip('\ufeff'))
                for src, _ in eval_pairs]
etalon_corrections, answer_corrections =\
    make_corrections_data(source_sents, correct_sents, answer_sents)
TP, precision, recall, f_measure = measure_quality(etalon_corrections, answer_corrections)
print("Precision={0:.2f} Recall={1:.2f} FMeasure={2:.2f}".format(
    100 * precision, 100 * recall, 100 * f_measure))
print(TP, len(answer_corrections), len(etalon_corrections))
if to_output_differences:
    output_differences(source_sents, correct_sents, answer_sents,
                       etalon_corrections, answer_corrections)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
Precision=60.15 Recall=40.82 FMeasure=48.63
80 133 196


# Private submit

In [None]:
#!g2.mig
data_sub = pd.read_csv("./private_test.csv")

In [None]:
#!g2.mig
data_sub.head(5)

Unnamed: 0,corrupted_text
0,мясыне блюда говядина
1,- А можно я пойд?
2,Бордюры обонй ассортименте
3,Вместо союса кетчуп
4,"Не прдесталя, как она могла туда папаст."


In [None]:
#!g2.mig
len(data_sub)

56526

In [None]:
#!g2.mig
def submit(data, outpath):
    """Write the corrected form of every `data.corrupted_text` entry to `outpath`.

    One sentence is written per line, in input order.

    Args:
        data: frame with a `corrupted_text` column.
        outpath: path of the file to create or overwrite.
    """
    # The sentences are Cyrillic and the platform default encoding is not
    # guaranteed to be UTF-8, so the encoding is set explicitly.
    with open(outpath, 'w', encoding='utf-8') as file:
        for elm in tqdm(data.corrupted_text, total = len(data)):
            file.write(find_best(elm) + "\n")
