In [1]:
!pip install transformers
!pip install datasets

from IPython.display import clear_output 
clear_output()

In [2]:
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


In [3]:
import pandas as pd
from pathlib import Path
import re
import shutil
import zipfile

import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPTNeoForCausalLM
import numpy as np
import random


class MyConfig(dict):
  def __getattr__(self, name): return self[name]
  def __setattr__(self, name, value): self[name] = value


configs = MyConfig({'path':{
    'path_drive' : Path('/content/gdrive/MyDrive/toxic_nlp/dataset'),
    'path_colab_data' : Path('./dataset')
    },
    'seeds' : 42,
    'model_to_use' : "EleutherAI/gpt-neo-125M"
                      #1.3B Won't work on 24GB or less cards
                      # model_to_use = "EleutherAI/gpt-neo-1.3B"
})

configs.path['gc'] = [f for f in configs.path['path_drive'].glob('*') if 'gc_clean_ner' in str(f)][0]
configs.path['gr'] = [f for f in configs.path['path_drive'].glob('*') if 'gab_reddit_ner' in str(f)][0]


if configs.seeds:
  random.seed(configs.seeds)
  np.random.seed(configs.seeds)
  torch.manual_seed(configs.seeds)

# 학습에 사용할 CPU나 GPU 장치를 얻습니다.
if torch.cuda.is_available():
  configs.device = "cuda"
else: configs.device="cpu"

print("Using {} device".format(configs.device))

configs

Using cuda device


{'device': 'cuda',
 'model_to_use': 'EleutherAI/gpt-neo-125M',
 'path': {'gc': PosixPath('/content/gdrive/MyDrive/toxic_nlp/dataset/gc_clean_ner.csv'),
  'gr': PosixPath('/content/gdrive/MyDrive/toxic_nlp/dataset/gab_reddit_ner.csv'),
  'path_colab_data': PosixPath('dataset'),
  'path_drive': PosixPath('/content/gdrive/MyDrive/toxic_nlp/dataset')},
 'seeds': 42}

In [4]:
df_gc = pd.read_csv(configs.path['gc'], index_col=0)
df_gr = pd.read_csv(configs.path['gr'], index_col=0)
df = pd.concat([df_gc, df_gr], axis=0)



parent    0
text      0
dtype: int64

In [13]:
# first drop null & empty row
df_prt_text = df[['parent','text','label']].dropna(axis=0)
df_prt_text.isnull().sum(), df_prt_text.shape

(parent    0
 text      0
 label     0
 dtype: int64, (23674, 3))

In [5]:
[i for i in df_prt_text.text if len(i) < 1] # check if ''

[]

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained(configs.model_to_use, bos_token='<|endoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPTNeoForCausalLM.from_pretrained(configs.model_to_use).to(configs.device)
model.resize_token_embeddings(len(tokenizer))

len(tokenizer) # 5만 vocab

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898669.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=357.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=560.0, style=ProgressStyle(description_…




Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1007.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=526017373.0, style=ProgressStyle(descri…




50258

In [7]:
# configs.max_length_parent = max([len(tokenizer.encode(text)) for text in df_prt_text['parent']])
# configs.max_length_text = max([len(tokenizer.encode(text)) for text in df_prt_text['text']])
# print(configs.max_length_parent) # 362
# print(configs.max_length_text)

configs.max_length = 524

In [8]:
class TOXICDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        # batch 단위로 로드하기 전에 몽땅 tokenizer 해둠
        for txt in txt_list:
            encodings_dict = tokenizer('<|endoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [9]:
dataset = TOXICDataset(df_prt_text['text'], tokenizer, configs.max_length)
train_size = int(0.9*len(dataset))

In [10]:
configs.path['output_dir'] = configs.path['path_drive'] / 'gtp-neo-text-results'

configs.logging_steps = 5000
configs.save_steps = 10000
configs.batch_size = 8
configs.EPOCHS = 10

train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])
training_args = TrainingArguments(output_dir=configs.path['output_dir'], 
                                  num_train_epochs=configs.EPOCHS, 
                                  logging_steps=configs.logging_steps, 
                                  save_steps=configs.save_steps,
                                  per_device_train_batch_size=configs.batch_size, 
                                  per_device_eval_batch_size=configs.batch_size,
                                  warmup_steps=100, 
                                  weight_decay=0.01, 
                                  logging_dir='./logs')

Trainer(model=model, 
        args=training_args, 
        train_dataset=train_dataset,
        eval_dataset=val_dataset, 
        data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                    'attention_mask': torch.stack([f[1] for f in data]),
                                    'labels': torch.stack([f[0] for f in data])}).train()

***** Running training *****
  Num examples = 21306
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 26640


Step,Training Loss
5000,0.3142
10000,0.2549
15000,0.2093
20000,0.1698
25000,0.1392


Saving model checkpoint to /content/gdrive/MyDrive/toxic_nlp/dataset/gtp-neo-text-results/checkpoint-10000
Configuration saved in /content/gdrive/MyDrive/toxic_nlp/dataset/gtp-neo-text-results/checkpoint-10000/config.json
Model weights saved in /content/gdrive/MyDrive/toxic_nlp/dataset/gtp-neo-text-results/checkpoint-10000/pytorch_model.bin
Saving model checkpoint to /content/gdrive/MyDrive/toxic_nlp/dataset/gtp-neo-text-results/checkpoint-20000
Configuration saved in /content/gdrive/MyDrive/toxic_nlp/dataset/gtp-neo-text-results/checkpoint-20000/config.json
Model weights saved in /content/gdrive/MyDrive/toxic_nlp/dataset/gtp-neo-text-results/checkpoint-20000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=26640, training_loss=0.21203501360552446, metrics={'train_runtime': 10107.3586, 'train_samples_per_second': 21.08, 'train_steps_per_second': 2.636, 'total_flos': 8.38661234171904e+16, 'train_loss': 0.21203501360552446, 'epoch': 10.0})

In [None]:
def generate_tokens(text, ratio_of_gen_token):
  number_of_tokens = len(text.split()) // ratio_of_gen_token
  if number_of_tokens < 1:
    return ' '.join(text.split())
  else :
    return ' '.join(text.split()[:number_of_tokens])

configs.gen_token_ratio = 4
configs.temperature = 0.8

gen_output = []

for text in df_prt_text['']:
  gen_text = generate_tokens(text, configs.gen_token_ratio)

  generated = tokenizer("<|endoftext|> "+ gen_text, return_tensors="pt").input_ids.cuda()
  sample_outputs = model.generate(generated, do_sample=True, 
                                top_k=50,
                                max_length=configs.max_length, 
                                top_p=0.95, 
                                temperature=configs.temperature, 
                                num_return_sequences=1)
  
  for sample_output in sample_outputs:
    gen_output.append(tokenizer.decode(sample_output, skip_special_tokens=True))





In [16]:
df_prt_text['text_gen'] = gen_output

In [17]:
configs.path['text_gen_data'] = configs.path['gc'].parent / 'text_gen_data_v2.csv'
df_prt_text.to_csv(configs.path['text_gen_data'])

In [18]:
for i,j in zip(df_prt_text['text'],df_prt_text['text_gen']):
  print(f'ori={i}\ngen={j}\n')

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m

ori=What information would you cut out? I'd like some kind of positive reinforcement, too - as a start, what if the pointer displayed as the click-icon, rather than as, well, the pointer?
gen= What information would you cut out? I'd like some more sources, preferably from the US military or intelligence community.

ori=, yes, correct as linked in my request the actual edit would be made on that page.
gen=, yes, correct as requested. We should now be clear what "official English" means.

ori=Thanks and noted. (Didn't realise Tim was an admin)
gen= Thanks and a note.

ori=Given the marketshare of Belgacom ADSL (one of the biggest in Belgium), I'm afraid that this will not help us far.
gen= Given the marketshare of Belgacom and their overpopulated but empty office are too incompetent, it could be argued that they are more like Soros than antibodies. That argument would be more in line with what you're saying about how the Soros anti-arent

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



ori=How much soap does it take to get the goat poop off your dick?
gen= How much soap??? that is not necessary and sufficient. I dont use non-nontaxitable skinnytard like you do.

ori=Jewish woman are dirty, rabbi. I know you know.
gen= Jewish woman raped me! How do you expect to get away with murder?

ori=My mamma used to always chase them off with a broom. https://youtu.be/BMchvKOAkDo e: unintentional house nigger joke realized. e I have some amazing pics of em
gen= My mamma used to always chase them off with a broom. I never knew what to do when she was going behind the Often.

ori=I scrolled up to check if I was on r/okbuddyretard
gen= I scrolled up and down. People were yelling at me for cryingarijuana companies are in the business of supplying or outright selling meat, for instance. If you want to make a killing out of a woman, you'd have to provide meat and meat a little later in the day. If you want to provide meat and meat a little later in the day, you'd have to provide it a 