In [1]:
import warnings
import pandas as pd

from augmenter import Augmenter
from synonimizer import Synonimizer
from pathlib import Path

In [2]:
# Notebook-wide setup: silence every warning and fix the data directory.
# simplefilter('ignore') installs the same catch-all filter entry as
# filterwarnings('ignore') called with its default message/module arguments.
warnings.simplefilter('ignore')
data_path = Path('data')  # relative path; resolved against the current working directory

In [3]:
# Read the input CSV from the data directory and drop two columns:
# 'Unnamed: 0' (presumably an index column written by an earlier to_csv -- confirm)
# and 'syn_mcc_description'.
dataset = (
    pd.read_csv(data_path.joinpath('dataset_with_synonimized_desc_005.csv'))
    .drop(columns=['Unnamed: 0', 'syn_mcc_description'])
)

In [4]:
# Load the word list: one entry per line of the text file.
# The encoding is pinned so the Cyrillic content is decoded the same way on
# every platform instead of depending on the locale default
# (assumes the file is UTF-8 encoded -- confirm against the data source).
with open(data_path / '10000-russian-words-cyrillic-only.txt', 'r', encoding='utf-8') as f:
    # Strip only the line terminator. Slicing with [:-1] would also cut the
    # last letter of a final line that has no trailing newline.
    wordlist = [line.rstrip('\n') for line in f]

In [5]:
# Instantiate the two text-transformation helpers used by make_augment below.
# Augmenter receives the word list loaded above (how it uses the words is
# defined in the augmenter module -- not visible here).
augmenter = Augmenter(wordlist)
# Synonimizer is built with its default constructor arguments.
synonimizer = Synonimizer()

In [6]:
def make_augment(synonimizer, p_synonimize, augmenter, p_skip, p_swap, p_replace, p_add, p_abb):
    """Build a one-argument text transform chaining synonimization and augmentation.

    The returned callable first passes the text through
    ``synonimizer.synonimize_text`` (with ``word_types=['NOUN']`` and
    ``word_change_prob=p_synonimize``). It then feeds the result through the
    ``augmenter`` methods ``skip_symbol``, ``swap_symbols``,
    ``replace_symbol``, ``add_entity`` and ``word_abb``, in that order, each
    invoked as ``method(text, p)`` with its matching parameter.

    Args:
        synonimizer: object exposing ``synonimize_text(text, word_types=..., word_change_prob=...)``.
        p_synonimize: value forwarded as ``word_change_prob``.
        augmenter: object exposing the five augmentation methods listed above.
        p_skip: second argument for ``augmenter.skip_symbol``.
        p_swap: second argument for ``augmenter.swap_symbols``.
        p_replace: second argument for ``augmenter.replace_symbol``.
        p_add: second argument for ``augmenter.add_entity``.
        p_abb: second argument for ``augmenter.word_abb``.

    Returns:
        A function ``text -> text`` suitable for ``Series.apply``.
    """
    # Augmenter method names paired with their parameter, in application order.
    augmenter_steps = (
        ('skip_symbol', p_skip),
        ('swap_symbols', p_swap),
        ('replace_symbol', p_replace),
        ('add_entity', p_add),
        ('word_abb', p_abb),
    )

    def augment_text(text):
        result = synonimizer.synonimize_text(
            text,
            word_types=['NOUN'],
            word_change_prob=p_synonimize,
        )
        # Methods are looked up by name right before each call, one at a time.
        for method_name, probability in augmenter_steps:
            result = getattr(augmenter, method_name)(result, probability)
        return result

    return augment_text
    

In [7]:
# Derive the 'aug_mcc_description' column from 'mcc_description' by applying
# the transform built by make_augment; every probability argument is 0.3.
dataset['aug_mcc_description'] = dataset['mcc_description'].apply(
    make_augment(
        synonimizer=synonimizer,
        p_synonimize=0.3,
        augmenter=augmenter,
        p_skip=0.3,
        p_swap=0.3,
        p_replace=0.3,
        p_add=0.3,
        p_abb=0.3,
    )
)

In [8]:
# Write the dataset, including the new augmented column, into the data
# directory; index=False leaves the row index out of the file.
dataset.to_csv(
    data_path.joinpath('dataset_aug_syn_03.csv'),
    index=False,
)