In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!pip install sacremoses
!pip install ftfy



In [5]:
# Code for infilling and adding masks to sequence (used to randomly mask sequences in BART training)

import numpy as np
import nltk.tokenize.casual
import bisect
from IPython import embed

np.random.seed(0)

def max_span(num_mask, tokenized_len, thresh):
    i = 1
    while True:
        if (num_mask + i)/(tokenized_len) >= thresh:
            return i
        i += 1

def list_diffs(arr, max_len):
    if len(arr) == 0:
        return max_len
    else:
        # print(np.diff(np.array([0] + arr + [max_len - 1]) - 2))
        return np.max(np.diff(np.array([0] + arr + [max_len - 1]) - 2))

def collapse_contig(arr, token):
    output = []
    seen_prev = False
    for i in arr:
        if i == token:
            if seen_prev:
                seen_prev = True
                continue
            seen_prev = True
        else:
            seen_prev = False
        output.append(i)
    return output

def text_infill(sentence, mask_token, lam = 3, thresh = 0.3):
    tokenized = np.array(nltk.tokenize.casual.casual_tokenize(sentence), dtype = "object")
    masked_idcs = []

    while (len(masked_idcs) / len(tokenized)) < thresh:
        span_length = np.random.poisson(lam = lam)

        while ((span_length > list_diffs(masked_idcs, len(tokenized))) or \
            (span_length > max_span(len(masked_idcs), len(tokenized), thresh))):
            span_length = np.random.poisson(lam = lam)
            # print("Span length is too long, it is currently:", span_length)

        # print("tokenized is", tokenized)
        # print("masked idcs are", masked_idcs)
        # print("span length is", span_length)

        if span_length == 0:
            start_idx = np.random.randint(0, len(tokenized) + 1)
            while ((start_idx in masked_idcs) or (start_idx in (np.array(masked_idcs) + 1))):
                # print("bad, start_idx is", start_idx)
                start_idx = np.random.randint(0, len(tokenized) + 1)

            # print("start idx is", start_idx)
            tokenized = np.insert(tokenized, start_idx, mask_token)
            bisect.insort(masked_idcs, start_idx)

        else:
            while True:
                start_idx = np.random.randint(0, len(tokenized) - span_length + 1)
                idcs = np.arange(start_idx, start_idx + span_length)

                for i in idcs:
                    if i in masked_idcs or i in (np.array(masked_idcs) + 1):
                        # print("bad i" , i)
                        continue
                break

            for i in idcs:
                bisect.insort(masked_idcs, i)
                tokenized[i] = mask_token
            #print("idcs are", idcs)
    # print("final mask ratio:",len(masked_idcs)/len(tokenized))
    return collapse_contig(tokenized, mask_token)

# Masks tokens from idx to idx + span_length with mask_token
# If idx > length of sequence, does not mask anything
def span_mask(sentence, idx, mask_token, span_length = 1):
    tokenized = np.array(nltk.tokenize.casual.casual_tokenize(sentence), dtype = "object")
    max_len = len(tokenized)
    if idx < max_len:
        end_span = min(idx+span_length, max_len)
        tokenized[idx:end_span] = [mask_token for i in range(end_span - idx)]
    return tokenized

# embed()
# print(text_infill("I'm gonna go, do you want anything Mom?", "<mask>"))
# print(text_infill("Hey I'm going to the store, do you want anything?", "<mask>"))

In [6]:
from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
from sacremoses import MosesDetokenizer
import numpy as np
import torch
import random
import html
import re
import ftfy
from nltk.tokenize.casual import casual_tokenize

nl_tok = "[<NEW>]"
md = MosesDetokenizer(lang='ko')

def detokenize(input):
    # return TreebankWordDetokenizer().detokenize(input)
    return md.detokenize(input)

def set_seed(seed, n_gpu):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

def bool2str(cand):
    if cand:
        return "T"
    return "F"

def seed_everything(seed = 0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Only useful for convolution
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False

def preprocess(text, preserve_lines = False):
    if preserve_lines:
        return ftfy.fix_text(html.unescape(text))
    # Remove linee break and excess spaces
    return ftfy.fix_text(html.unescape(re.sub(r'\s+', ' ', text).strip()))

# Quick test
# TreebankWordDetokenizer.detokenize(TreebankWordTokenizer.tokenize("sh*t"))

In [7]:
# Finetuning the toxic and nontoxic language models
import pandas as pd
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments, AdamW, EarlyStoppingCallback, PreTrainedTokenizerFast
import torch
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch import nn
import argparse
import random
from IPython import embed

In [8]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
if not torch.cuda.is_available():
    print("No GPUs found!")
else:
    print("Found", str(torch.cuda.device_count()), "GPUS!")

Found 1 GPUS!


In [10]:
# Load in the tokenizer
tokenizer = PreTrainedTokenizerFast.from_pretrained('gogamza/kobart-base-v1')
model = BartForConditionalGeneration.from_pretrained('gogamza/kobart-base-v1', forced_bos_token_id = tokenizer.bos_token_id).to(device)
model.train()
mask = tokenizer.mask_token

model_dir = '/content/drive/MyDrive/졸작/model/toxic'

if not os.path.exists(model_dir):
    print(model_dir)
    os.mkdir(model_dir)

output_dir = model_dir
print(output_dir)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


/content/drive/MyDrive/졸작/model/toxic


In [11]:
train_texts = []
val_texts = []

# Read/process the data based on which dataset we're using: Jigsaw or Dynabench
# If you want to load your own data, put the data loading logic here
train = pd.read_csv('/content/unsmile_train_v1.0.tsv',delimiter='\t')
val = pd.read_csv('/content/unsmile_valid_v1.0.tsv',delimiter='\t')

train_texts =  train["문장"].tolist()
val_texts = val["문장"].tolist()

print(len(train_texts), len(val_texts))
print(train_texts[0], val_texts[0])

15005 3737
일안하는 시간은 쉬고싶어서 그런게 아닐까 ㅇㄱㄹㅇ 진짜 죽어도 상관없다는 마인드로 싸웠더니 지금 서열 상타취노 식칼들고 니가 나 안찌르면 내가 너 찌른다 했더니 애비충 냄동충 알아서기노 ^됫^ 한번 서열 잡고 그 담에 개길때마다 더 세게나가면 확실하게 짓누를수있다익이


In [12]:
# # Test percentiles of tokenized lengths
# src_lengths = [len(tokenizer(x).input_ids) for x in train_texts]
# # tgt_lengths = [len(tokenizer(x).input_ids) for x in train_labels]
# print(np.percentile(src_lengths, 99))
# embed()

# Tokenize everything
tokenized_labs_train = tokenizer.batch_encode_plus(
    train_texts,
    max_length = 232, # args.max_target_length
    padding="max_length",
    truncation=True,
    return_tensors = "pt").input_ids

tokenized_labs_val = tokenizer.batch_encode_plus(
    val_texts,
    max_length = 232, # args.max_target_length,
    padding="max_length",
    truncation=True,
    return_tensors = "pt").input_ids

tokenized_labs_val[tokenized_labs_val == tokenizer.pad_token_id] = tokenizer.eos_token_id
tokenized_labs_train[tokenized_labs_train == tokenizer.pad_token_id] = tokenizer.eos_token_id

In [13]:
 # Convert this so that examples are dynamically masked
class JigsawDataset(torch.utils.data.Dataset):
    def __init__(self, labels, rawdata):
        self.rawdata = rawdata
        self.labels = labels

    def __getitem__(self, idx):
        # Dynamically generate sources
        # item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # item['labels'] = cur_lab

        # Do detokenization to ensure that the tokenization matches up later
        item = tokenizer.encode_plus(
            detokenize(text_infill(self.rawdata[idx], tokenizer.mask_token)),
            max_length = 182, # args.max_source_length,
            padding="max_length",
            truncation=True,
            return_tensors = "pt")

        # Necessary for squeezing purposes
        item = {key: torch.tensor(val[0]) for key, val in item.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = JigsawDataset(tokenized_labs_train, train_texts)
val_dataset = JigsawDataset(tokenized_labs_val, val_texts)

In [30]:
# Print out what the model looks like before we start training
print("Before training:")
inputs_masked = ["한국인들은 <mask>"]
print("\n Generations: ", tokenizer.batch_decode(model.generate(tokenizer(inputs_masked, return_tensors = "pt").to('cuda').input_ids, do_sample=True, num_return_sequences = 3)),"\n")

Before training:

 Generations:  ['</s><s>          게       </s>', '</s><s> 이날      대부분   이       </s>', '</s><s>               출국  </s>'] 



In [39]:
print(tokenizer.decode(train_dataset[100]['input_ids']))
print(tokenizer.decode(train_dataset[100]['labels']))

이거 생각해보니 진짜 투신자살일까 <mask> 밀어서 떨어트린거 아니노...?<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
이거 생각해보니 진짜 투신자살일까 하는 생각이 드노.. 강간하려다가 밀어서 떨어트린거 아니노...?</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s><

  item = {key: torch.tensor(val[0]) for key, val in item.items()}


In [40]:
class BartTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False):
            outputs = model(
                input_ids = inputs.get("input_ids"),
                attention_mask = inputs.get("attention_mask"),
                labels = inputs.get("labels")
                )
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
            return (loss, outputs) if return_outputs else loss

        def evaluate(self, **kwargs):
            metrics = super().evaluate(**kwargs)
            # Example to see what the model outputs for mask token (see if it outputs something biased)
            inputs_masked = ["한국인들은 <mask>"]
            print("\n Generations: ", tokenizer.batch_decode(model.generate(tokenizer(inputs_masked, return_tensors = "pt").to('cuda').input_ids, do_sample=True, num_return_sequences = 5)),"\n")
            return metrics

In [41]:
training_args = TrainingArguments(
    output_dir=output_dir,          # output directory
    max_steps= 30000, # args.max_steps,              # total number of training steps
    per_device_train_batch_size=8, # args.train_batch_size,  # batch size per device during training
    per_device_eval_batch_size=64, # args.eval_batch_size,   # batch size for evaluation
    learning_rate =2.5e-5, # args.lr,
    evaluation_strategy = "steps",
    save_strategy = "steps",
    save_steps = 1000, # args.save_steps,
    eval_steps = 500, # args.save_steps,
    fp16 = True, # args.fp16,
    metric_for_best_model = "eval_loss",
    greater_is_better = False,
    load_best_model_at_end = True,
    save_total_limit = 2, # args.save_total_limit,
    logging_dir='/content/logs', # args.logging_dir,            # directory for storing logs
    logging_steps= 500, #args.logging_steps,
    seed = 0, # args.seed,
    save_safetensors=False
)

trainer = BartTrainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    callbacks = [EarlyStoppingCallback(5)] # args.early_stopping_steps
)

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
  item = {key: torch.tensor(val[0]) for key, val in item.items()}


Step,Training Loss,Validation Loss
500,0.2548,0.172361
1000,0.1771,0.166343
1500,0.1731,0.162991
2000,0.1682,0.162381
2500,0.158,0.158967
3000,0.1571,0.158048
3500,0.1605,0.15697
4000,0.1551,0.161789
4500,0.1515,0.155781
5000,0.1512,0.156322





 Generations:  ['</s><s>인들은 저게뭐야?</s><pad><pad><pad><pad>', '</s><s>인들은 한국인들은 애교심 많음</s><pad><pad>', '</s><s>인들은 한국인들은 잘 안되네!</s><pad><pad>', '</s><s>인들은 한국인들은 왜?</s><pad><pad><pad><pad><pad>', '</s><s>인들은 그놈들은 아플거아닌가</s>'] 



Non-default generation parameters: {'forced_bos_token_id': 0, 'forced_eos_token_id': 1}



 Generations:  ['</s><s>인들은 애들도 다닌다</s><pad>', '</s><s>인들은 한국인한테 어떻게 하면하지</s><pad>', '</s><s>인들은 지들끼리...</s><pad><pad>', '</s><s>인들은 한국인들은 저게 뭐냐?</s>', '</s><s>인들은 한국인들은 좆나다.</s>'] 



  item = {key: torch.tensor(val[0]) for key, val in item.items()}



 Generations:  ['</s><s>인들은 한국사람이나 되는줄</s><pad><pad><pad><pad>', '</s><s>인들은 한국인</s><pad><pad><pad><pad><pad><pad><pad><pad>', '</s><s>인들은 일본이나 갔다와</s><pad><pad><pad><pad>', '</s><s>인들은 여자임</s><pad><pad><pad><pad><pad><pad><pad>', '</s><s>인들은 전라도고 지들 ᄏᄏᄏ</s>'] 



Non-default generation parameters: {'forced_bos_token_id': 0, 'forced_eos_token_id': 1}



 Generations:  ['</s><s>인들은 짱깨새끼들임</s>', '</s><s>인들은 한국인을 혐오한다</s><pad><pad>', '</s><s>인들은 개슬람새끼들이</s><pad>', '</s><s>인들은 한국만 ᄏᄏᄏ</s><pad>', '</s><s>인들은 한국인이다...</s><pad><pad><pad>'] 



  item = {key: torch.tensor(val[0]) for key, val in item.items()}



 Generations:  ['</s><s>인들은 좆같은 개독</s><pad><pad><pad>', '</s><s>인들은 병신같노 짱개들</s>', '</s><s>인들은 무지막지했네</s><pad><pad><pad>', '</s><s>인들은 자댕이 새끼</s><pad><pad><pad><pad>', '</s><s>인들은 시발 ᄏᄏ</s><pad><pad><pad>'] 



Non-default generation parameters: {'forced_bos_token_id': 0, 'forced_eos_token_id': 1}



 Generations:  ['</s><s>인들은 어이없네</s><pad>', '</s><s>인들은 재기나올까?</s>', '</s><s>인들은 왜 한국가냐</s><pad>', '</s><s>인들은 페미인데</s><pad><pad>', '</s><s>인들은 일베가 아니다</s><pad>'] 



  item = {key: torch.tensor(val[0]) for key, val in item.items()}



 Generations:  ['</s><s>인들은 개슬람이다.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>', '</s><s>인들은 ᄏᄏᄏ</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>', '</s><s>인들은 ᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏ</s>', '</s><s>인들은 ᄏᄏᄏᄏ</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>', '</s><s>인들은 왜 뒈졌노ᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏ</s>'] 



Non-default generation parameters: {'forced_bos_token_id': 0, 'forced_eos_token_id': 1}



 Generations:  ['</s><s>인들은 자지가 동성애자일뿐</s>', '</s><s>인들은 걍 믿어야한다</s><pad><pad>', '</s><s>인들은 더럽네</s><pad><pad><pad><pad>', '</s><s>인들은 지가 좋아한다</s><pad><pad><pad>', '</s><s>인들은 못생겼어</s><pad><pad><pad>'] 



  item = {key: torch.tensor(val[0]) for key, val in item.items()}



 Generations:  ['</s><s>인들은 왜이러지 ᄏᄏ</s>', '</s><s>인들은 어딨냐?</s><pad><pad><pad><pad>', '</s><s>인들은 한국보다 낫노</s><pad><pad><pad>', '</s><s>인들은 ᄏᄏᄏᄏ</s><pad><pad>', '</s><s>인들은 ᄅᄋ</s><pad><pad><pad><pad>'] 



Non-default generation parameters: {'forced_bos_token_id': 0, 'forced_eos_token_id': 1}



 Generations:  ['</s><s>인들은 ᄏᄏᄏᄏ</s>', '</s><s>인들은 한남답어라</s>', '</s><s>인들은 지옥</s><pad><pad><pad>', '</s><s>인들은 전라도야</s><pad><pad>', '</s><s>인들은 좆나보네</s>'] 



  item = {key: torch.tensor(val[0]) for key, val in item.items()}



 Generations:  ['</s><s>인들은 ᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏ</s>', '</s><s>인들은 ᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏ</s>', '</s><s>인들은 쟤네들이아</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>', '</s><s>인들은 ᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏ</s>', '</s><s>인들은 좆물같아요</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'] 



Non-default generation parameters: {'forced_bos_token_id': 0, 'forced_eos_token_id': 1}



 Generations:  ['</s><s>인들은 망혼합합니다</s>', '</s><s>인들은 전라도아니라</s><pad>', '</s><s>인들은 한국땅</s><pad><pad><pad>', '</s><s>인들은 죽겠노</s><pad><pad>', '</s><s>인들은 ᄏᄏ</s><pad><pad>'] 



  item = {key: torch.tensor(val[0]) for key, val in item.items()}



 Generations:  ['</s><s>인들은 짱깨같은새키</s>', '</s><s>인들은 젠신병자</s><pad>', '</s><s>인들은 개독인가보네</s><pad>', '</s><s>인들은 한국인</s><pad><pad><pad><pad><pad>', '</s><s>인들은 병신이노</s><pad><pad><pad>'] 



Non-default generation parameters: {'forced_bos_token_id': 0, 'forced_eos_token_id': 1}



 Generations:  ['</s><s>인들은 미쳤다</s><pad>', '</s><s>인들은 조선족</s><pad><pad>', '</s><s>인들은 뭐냐ᄀᄀ</s>', '</s><s>인들은 한국은 사나이임</s>', '</s><s>인들은 교회를가노</s>'] 



  item = {key: torch.tensor(val[0]) for key, val in item.items()}



 Generations:  ['</s><s>인들은 ᄌᄂ</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>', '</s><s>인들은 저능아니라ᄀᄏ</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>', '</s><s>인들은 왜 ᄏ</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>', '</s><s>인들은 무슬림이냐?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>', '</s><s>인들은 한국인이잖아ᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏ</s>'] 



Non-default generation parameters: {'forced_bos_token_id': 0, 'forced_eos_token_id': 1}



 Generations:  ['</s><s>인들은 정신병자임</s><pad><pad><pad>', '</s><s>인들은 이런병신이네</s><pad><pad><pad>', '</s><s>인들은 저건 뭐지</s><pad><pad><pad>', '</s><s>인들은 일본으로 보내주세요</s><pad><pad>', '</s><s>인들은 똥꼬충새끼임</s>'] 



  item = {key: torch.tensor(val[0]) for key, val in item.items()}



 Generations:  ['</s><s>인들은 패악질..</s><pad>', '</s><s>인들은 한국지들이었잖아</s>', '</s><s>인들은 빡지다</s><pad>', '</s><s>인들은 어거진</s><pad><pad>', '</s><s>인들은 대깨문임</s><pad>'] 



Non-default generation parameters: {'forced_bos_token_id': 0, 'forced_eos_token_id': 1}



 Generations:  ['</s><s>인들은 ᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏ</s><pad><pad><pad><pad>', '</s><s>인들은 죽여버리고싶다</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>', '</s><s>인들은 ᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏᄏ</s>', '</s><s>인들은 조선족들이구나</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>', '</s><s>인들은 이기는것도없음</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'] 



  item = {key: torch.tensor(val[0]) for key, val in item.items()}



 Generations:  ['</s><s>인들은 무슨 인종차별</s><pad><pad><pad><pad>', '</s><s>인들은 무슨</s><pad><pad><pad><pad><pad><pad>', '</s><s>인들은 한국인이냐?</s><pad><pad><pad><pad>', '</s><s>인들은 좆족새끼들임</s>', '</s><s>인들은 ᄃᄃ</s><pad><pad><pad><pad>'] 



TrainOutput(global_step=9500, training_loss=0.15670610126696136, metrics={'train_runtime': 2043.5143, 'train_samples_per_second': 117.445, 'train_steps_per_second': 14.681, 'total_flos': 8234580185395200.0, 'train_loss': 0.15670610126696136, 'epoch': 5.063965884861407})

In [44]:
# Print out what the model looks like before we start training
print("After training:")
inputs_masked = ["한국인들은 <mask>"]
print("\n Generations: ", tokenizer.batch_decode(model.generate(tokenizer(inputs_masked, return_tensors = "pt").to('cuda').input_ids, do_sample=True, num_return_sequences = 3)),"\n")

After training:

 Generations:  ['</s><s>인들은 이슬람교구나</s><pad><pad><pad>', '</s><s>인들은 좆같이생겻노</s>', '</s><s>인들은 뭐하냐</s><pad><pad><pad>'] 



In [45]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your ter

In [46]:
MODEL_SAVE_HUB_PATH = 'koBART-toxic' # ex) 'my-bert-fine-tuned'
HUGGINGFACE_AUTH_TOKEN = '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' # https://huggingface.co/settings/token

## Push to huggingface-hub
model.push_to_hub(
			MODEL_SAVE_HUB_PATH,
			use_temp_dir=True,
			use_auth_token=HUGGINGFACE_AUTH_TOKEN
)
tokenizer.push_to_hub(
			MODEL_SAVE_HUB_PATH,
			use_temp_dir=True,
			use_auth_token=HUGGINGFACE_AUTH_TOKEN
)

Non-default generation parameters: {'forced_bos_token_id': 0, 'forced_eos_token_id': 1}


model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]



README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/cho-to/koBART-toxic/commit/e129e7cc8ef16b472d5294a14a1e9b711cf8cac3', commit_message='Upload tokenizer', commit_description='', oid='e129e7cc8ef16b472d5294a14a1e9b711cf8cac3', pr_url=None, pr_revision=None, pr_num=None)