In [None]:
import torch
import re
import argparse
import logging
import os
import nlpaug.augmenter.word as naw

import totto

In [None]:
def get_parser():
    parser = argparse.ArgumentParser()

    # I/O
    parser.add_argument("--train_json", type=str, default="data/pretrain/totto/totto_train_data.jsonl")
    parser.add_argument("--output_dir", type=str, default="output/pretrain/0_demo")
    parser.add_argument("--device", type=str, default="cuda:0")
    parser.add_argument("--report_step", type=int, default=10)
    parser.add_argument("--save_step", type=int, default=1000)

    # data
    parser.add_argument("--max_title_length", type=int, default=128)  # todo? table max len?

    parser.add_argument("--table_model", type=str, default="google/tapas-small")
    parser.add_argument("--text_model", type=str, default="bert-base-uncased")

    # model
    parser.add_argument("--uni_dim", type=int, default=512, help="projection dim for both modality")

    # training
    parser.add_argument("--epochs", type=int, default=100)
    parser.add_argument("--batch_size", type=int, default=320)
    parser.add_argument("--shuffle", action="store_true")
    parser.add_argument("--lr", type=float, default=1e-5)
    parser.add_argument("--seed", type=int, default=1107)

    parser.add_argument("--debug", action="store_true")
    return parser

In [None]:
logging.basicConfig(level=logging.INFO)
parser = get_parser()
args = parser.parse_args(["--debug", "--train_json", "../data/pretrain/totto/totto_dev_data.jsonl"])
args

In [None]:
totto_dataset = totto.ToTToDataset(args.train_json, args)
data0 = totto_dataset.data[0]
data0.title

bert context

In [None]:
aug_ctx = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")
augmented_text = aug_ctx.augment(data0.title)
print(f"original: {data0.title}")
print(f"augmented: {augmented_text}")

word2vec

In [None]:
# from nlpaug.util.file.download import DownloadUtil
# DownloadUtil.download_fasttext(model_name='wiki-news-300d-1M', dest_dir='../data/model') # Download fasttext model
# DownloadUtil.download_glove(model_name='glove.6B', dest_dir='../data/model')

In [None]:
# model_type: word2vec, glove or fasttext
aug_w2v = naw.WordEmbsAug(
    model_type='glove',
    model_path='../data/model/glove.6B.300d.txt',
    action="substitute"
)
augmented_text = aug_w2v.augment(data0.title)
print(f"original: {data0.title}")
print(f"augmented: {augmented_text}")

In [None]:
aug_w2v.augment("Governors under the Constitution of 1868.")

translation

In [None]:
aug_trans = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de', 
    to_model_name='facebook/wmt19-de-en'
)
print(f"original: {data0.title}")
aug_trans.augment(data0.title)

In [2]:
def get_parser():
    parser = argparse.ArgumentParser()

    # I/O
    parser.add_argument("--train_json", type=str, default="data/pretrain/totto/totto_train_data.jsonl")
    parser.add_argument("--output_dir", type=str, default="output/pretrain/0_demo")
    parser.add_argument("--device", type=str, default="cuda:0")
    parser.add_argument("--report_step", type=int, default=10)
    parser.add_argument("--save_step", type=int, default=1000)

    # data
    parser.add_argument("--max_title_length", type=int, default=128)  # todo? table max len?

    parser.add_argument("--table_model", type=str, default="google/tapas-small")
    parser.add_argument("--text_model", type=str, default="bert-base-uncased")

    # model
    parser.add_argument("--uni_dim", type=int, default=512, help="projection dim for both modality")

    # training
    parser.add_argument("--epochs", type=int, default=100)
    parser.add_argument("--batch_size", type=int, default=320)
    parser.add_argument("--shuffle", action="store_true")
    parser.add_argument("--lr", type=float, default=1e-5)
    parser.add_argument("--seed", type=int, default=1107)

    parser.add_argument("--debug", action="store_true")
    return parser

In [3]:
logging.basicConfig(level=logging.INFO)
parser = get_parser()
args = parser.parse_args(["--debug", "--train_json", "../data/pretrain/totto/totto_dev_data.jsonl"])
args

Namespace(batch_size=320, debug=True, device='cuda:0', epochs=100, lr=1e-05, max_title_length=128, output_dir='output/pretrain/0_demo', report_step=10, save_step=1000, seed=1107, shuffle=False, table_model='google/tapas-small', text_model='bert-base-uncased', train_json='../data/pretrain/totto/totto_dev_data.jsonl', uni_dim=512)

In [4]:
totto_dataset = totto.ToTToDataset(args.train_json, args)
data0 = totto_dataset.data[0]
data0.title

INFO:root:read data from ../data/pretrain/totto/totto_dev_data.jsonl
INFO:root:[('success', 1000), ('rpt set error', 110), ('row/col size > 256', 9), ('table length > 1280', 6)]


'List of Governors of South Carolina. Governors under the Constitution of 1868.'

bert context

In [5]:
aug_ctx = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")
augmented_text = aug_ctx.augment(data0.title)
print(f"original: {data0.title}")
print(f"augmented: {augmented_text}")

original: List of Governors of South Carolina. Governors under the Constitution of 1868.
augmented: and general governors of south sudanese. governors under state ordinance of 1868.


word2vec

In [6]:
# from nlpaug.util.file.download import DownloadUtil
# DownloadUtil.download_fasttext(model_name='wiki-news-300d-1M', dest_dir='../data/model') # Download fasttext model
# DownloadUtil.download_glove(model_name='glove.6B', dest_dir='../data/model')

In [7]:
# model_type: word2vec, glove or fasttext
aug_w2v = naw.WordEmbsAug(
    model_type='glove',
    model_path='../data/model/glove.6B.300d.txt',
    action="substitute"
)
augmented_text = aug_w2v.augment(data0.title)
print(f"original: {data0.title}")
print(f"augmented: {augmented_text}")

INFO:gensim.models.keyedvectors:loading projection weights from ../data/model/glove.6B.300d.txt
INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (400000, 300) matrix of type float32 from ../data/model/glove.6B.300d.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2022-05-19T03:33:25.868601', 'gensim': '4.2.0', 'python': '3.8.13 (default, Mar 28 2022, 11:38:47) \n[GCC 7.5.0]', 'platform': 'Linux-4.18.0-15-generic-x86_64-with-glibc2.17', 'event': 'load_word2vec_format'}


original: List of Governors of South Carolina. Governors under the Constitution of 1868.
augmented: List which Governors well South Carolina. Governors administration three Constitution of 1904.


In [10]:
aug_w2v.augment("Governors under the Constitution of 1868.")

'Governors part last Constitution of 1875.'

translation

In [13]:
aug_trans = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de', 
    to_model_name='facebook/wmt19-de-en'
)
print(f"original: {data0.title}")
aug_trans.augment(data0.title)

original: List of Governors of South Carolina. Governors under the Constitution of 1868.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'List of governors of South Carolina. Governors under the 1868 Constitution.'