In [1]:
!pip install datasets -q
!pip install happytransformer -q

In [2]:
from happytransformer import HappyTextToText

In [3]:
happy_tt = HappyTextToText("T5", "t5-large")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [4]:
from datasets import load_dataset

train_dataset = load_dataset("ai-forever/spellcheck_benchmark", "RUSpellRU", split='train[:]')
eval_dataset = load_dataset("ai-forever/spellcheck_benchmark", "RUSpellRU", split='test[:]')

In [5]:
import csv
def generate_csv(csv_path, dataset):
    with open(csv_path, 'w', newline='') as csvfile:
        writter = csv.writer(csvfile)
        writter.writerow(["input", "target"])
        for case in dataset:
            input_text = "grammar: " + case["source"]
            for correction in case["correction"]:
                # a few of the cases are blank strings. So we'll skip them
                if input_text and correction:
                    writter.writerow([input_text, correction])

In [6]:
generate_csv("/content/sample_data/train.csv", train_dataset)
generate_csv("/content/sample_data/eval.csv", eval_dataset)

In [7]:
from happytransformer import TTTrainArgs

In [8]:
args = TTTrainArgs(batch_size=8, fp16=True, eval_steps=.8)

In [None]:
happy_tt.train("/content/sample_data/train.csv", args=args)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/135236 [00:00<?, ? examples/s]

In [None]:
example = "grammar: Привит, расзкажи чем зонимаешся в свободное время?"
result = happy_tt.generate_text(example_1, args=beam_settings)
print(result.text)

In [None]:
happy_tt.save('/content/sample_data/transformer/')

In [None]:
import zipfile

In [None]:
files_to_zip = ['/content/sample_data/transformer/config.json',
                '/content/sample_data/transformer/generation_config.json',
                '/content/sample_data/transformer/special_tokens_map.json',
                '/content/sample_data/transformer/model.safetensors',
                '/content/sample_data/transformer/spiece.model',
                '/content/sample_data/transformer/tokenizer.json',
                '/content/sample_data/transformer/tokenizer_config.json'
]

zip_filename = 'archive.zip'

with zipfile.ZipFile(zip_filename, 'w') as zipf:
    for file in files_to_zip:
        zipf.write(file)

print('Архивация завершена!')

Архивация завершена!


In [None]:
!mv /content/archive.zip /content/drive/MyDrive/spell-checker

print('Архив сохранен на Google Drive!')

Архив сохранен на Google Drive!
