In [1]:
!pip install transformers
!pip install datasets

from IPython.display import clear_output 
clear_output()

In [2]:
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


In [3]:
import pandas as pd
from pathlib import Path
import re
import shutil
import zipfile

import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPTNeoForCausalLM
import numpy as np
import random


class MyConfig(dict):
  def __getattr__(self, name): return self[name]
  def __setattr__(self, name, value): self[name] = value

In [4]:
configs = MyConfig({'path':{
    'path_drive' : Path('/content/gdrive/MyDrive/toxic_nlp/dataset'),
    'path_colab_data' : Path('./dataset')
    },
    'seeds' : 42,
    'model_to_use' : "EleutherAI/gpt-neo-125M"
                      #1.3B Won't work on 24GB or less cards
                      # model_to_use = "EleutherAI/gpt-neo-1.3B"
})

configs.path['gc'] = [f for f in configs.path['path_drive'].glob('*') if 'gc_clean_ner' in str(f)][0]
configs.path['gr'] = [f for f in configs.path['path_drive'].glob('*') if 'gab_reddit_ner' in str(f)][0]


if configs.seeds:
  random.seed(configs.seeds)
  np.random.seed(configs.seeds)
  torch.manual_seed(configs.seeds)

# 학습에 사용할 CPU나 GPU 장치를 얻습니다.
if torch.cuda.is_available():
  configs.device = "cuda"
else: configs.device="cpu"

print("Using {} device".format(configs.device))

configs


Using cuda device


{'device': 'cuda',
 'model_to_use': 'EleutherAI/gpt-neo-125M',
 'path': {'gc': PosixPath('/content/gdrive/MyDrive/toxic_nlp/dataset/gc_clean_ner.csv'),
  'gr': PosixPath('/content/gdrive/MyDrive/toxic_nlp/dataset/gab_reddit_ner.csv'),
  'path_colab_data': PosixPath('dataset'),
  'path_drive': PosixPath('/content/gdrive/MyDrive/toxic_nlp/dataset')},
 'seeds': 42}

In [5]:
df_gc = pd.read_csv(configs.path['gc'], index_col=0)
df_gr = pd.read_csv(configs.path['gr'], index_col=0)
df = pd.concat([df_gc, df_gr], axis=0)

In [6]:
df_gr.isnull().sum()
df_gc.isnull().sum()

parent           4
text             3
label            0
parent_ner    5020
text_ner      5370
dtype: int64

In [7]:
df_gc[df_gc.isnull()]

Unnamed: 0,parent,text,label,parent_ner,text_ner
0,,,,,
1,,,,,
2,,,,,
3,,,,,
4,,,,,
...,...,...,...,...,...
9995,,,,,
9996,,,,,
9997,,,,,
9998,,,,,


In [8]:
print(df.shape)
df.label.value_counts()
df.isnull().sum()

(24146, 5)


parent          455
text             18
label             0
parent_ner    11272
text_ner      13475
dtype: int64

In [58]:
# first drop null & empty row
df_prt_text = df[['parent','text','label']].dropna(axis=0)
df_prt_text.isnull().sum()

parent    0
text      0
label     0
dtype: int64

In [63]:
df_prt_text.duplicated().sum()

92

In [10]:
[i for i in df_prt_text.parent if len(i) < 1] # check if ''

[]

## Set Model & Tokenizer

In [None]:
configs

In [11]:
tokenizer = GPT2Tokenizer.from_pretrained(configs.model_to_use, bos_token='<|endoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPTNeoForCausalLM.from_pretrained(configs.model_to_use).to(configs.device)
model.resize_token_embeddings(len(tokenizer))

len(tokenizer) # 5만 vocab

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898669.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=357.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=560.0, style=ProgressStyle(description_…




Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1007.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=526017373.0, style=ProgressStyle(descri…




50258

In [54]:
# configs.max_length_parent = max([len(tokenizer.encode(text)) for text in df_prt_text['parent']])
# configs.max_length_text = max([len(tokenizer.encode(text)) for text in df_prt_text['text']])
# print(configs.max_length_parent) # 362
# print(configs.max_length_text)

1559
1459


In [12]:
configs.max_length = 524

class TOXICDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        # batch 단위로 로드하기 전에 몽땅 tokenizer 해둠
        for txt in txt_list:
            encodings_dict = tokenizer('<|endoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]



In [14]:
dataset = TOXICDataset(df_prt_text['parent'], tokenizer, configs.max_length)
train_size = int(0.9*len(dataset))

In [105]:
configs.path

{'gc': PosixPath('/content/gdrive/MyDrive/toxic_nlp/dataset/gc_clean_ner.csv'),
 'gr': PosixPath('/content/gdrive/MyDrive/toxic_nlp/dataset/gab_reddit_ner.csv'),
 'parent_gen_data': PosixPath('/content/gdrive/MyDrive/toxic_nlp/dataset/parent_gen_data.csv'),
 'path_colab_data': PosixPath('dataset'),
 'path_drive': PosixPath('/content/gdrive/MyDrive/toxic_nlp/dataset')}

In [15]:
configs.path['output_dir'] = configs.path['path_drive'] / 'gtp-neo-parent-results'

configs.logging_steps = 5000
configs.save_steps = 10000
configs.batch_size = 8
configs.EPOCHS = 10

train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])
training_args = TrainingArguments(output_dir=configs.path['output_dir'], 
                                  num_train_epochs=configs.EPOCHS, 
                                  logging_steps=configs.logging_steps, 
                                  save_steps=configs.save_steps,
                                  per_device_train_batch_size=configs.batch_size, 
                                  per_device_eval_batch_size=configs.batch_size,
                                  warmup_steps=100, 
                                  weight_decay=0.01, 
                                  logging_dir='./logs')

Trainer(model=model, 
        args=training_args, 
        train_dataset=train_dataset,
        eval_dataset=val_dataset, 
        data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                    'attention_mask': torch.stack([f[1] for f in data]),
                                    'labels': torch.stack([f[0] for f in data])}).train()

***** Running training *****
  Num examples = 21306
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 26640


Step,Training Loss
5000,0.3537
10000,0.2802
15000,0.2264
20000,0.1816
25000,0.149


Saving model checkpoint to /content/gdrive/MyDrive/toxic_nlp/dataset/gtp-neo-parent-results/checkpoint-10000
Configuration saved in /content/gdrive/MyDrive/toxic_nlp/dataset/gtp-neo-parent-results/checkpoint-10000/config.json
Model weights saved in /content/gdrive/MyDrive/toxic_nlp/dataset/gtp-neo-parent-results/checkpoint-10000/pytorch_model.bin
Saving model checkpoint to /content/gdrive/MyDrive/toxic_nlp/dataset/gtp-neo-parent-results/checkpoint-20000
Configuration saved in /content/gdrive/MyDrive/toxic_nlp/dataset/gtp-neo-parent-results/checkpoint-20000/config.json
Model weights saved in /content/gdrive/MyDrive/toxic_nlp/dataset/gtp-neo-parent-results/checkpoint-20000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=26640, training_loss=0.2319482322211738, metrics={'train_runtime': 10112.3745, 'train_samples_per_second': 21.069, 'train_steps_per_second': 2.634, 'total_flos': 8.38661234171904e+16, 'train_loss': 0.2319482322211738, 'epoch': 10.0})

In [90]:
# Try sample 
gen_text = generate_tokens(df_prt_text.iloc[0,0], 4)

generated = tokenizer("<|endoftext|> "+ gen_text, return_tensors="pt").input_ids.cuda()
sample_outputs = model.generate(generated, do_sample=True, top_k=50,
                                max_length=300, top_p=0.95, temperature=1.9, num_return_sequences=1)
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  I also feel its important for StuRat to have his proper say on the proposed guidelines. He also should provide some context as such.


In [16]:
df_prt_text.iloc[0,0]

"I also feel its important for StuRat to have his proper say on the proposed guidelines. He has not been able to so far as hes busy defending himself. Im sure that if Stu Rat has a part to play in the guidelines then he will have some ownerhip of them and is likely to resplct them (as I will have to since I have rewritten a large part of them)'"

In [None]:
def generate_tokens(text, ratio_of_gen_token):
  number_of_tokens = len(text.split()) // ratio_of_gen_token
  if number_of_tokens < 1:
    return ' '.join(text.split())
  else :
    return ' '.join(text.split()[:number_of_tokens])

configs.gen_token_ratio = 4
configs.temperature = 0.8

gen_output = []

for text in df_prt_text['parent']:
  gen_text = generate_tokens(text, configs.gen_token_ratio)

  generated = tokenizer("<|endoftext|> "+ gen_text, return_tensors="pt").input_ids.cuda()
  sample_outputs = model.generate(generated, do_sample=True, 
                                top_k=50,
                                max_length=configs.max_length, 
                                top_p=0.95, 
                                temperature=configs.temperature, 
                                num_return_sequences=1)
  
  for sample_output in sample_outputs:
    gen_output.append(tokenizer.decode(sample_output, skip_special_tokens=True))




In [59]:
df_prt_text['parent_gen'] = gen_output
df_prt_text

In [61]:
# gen_output
# df_prt_text['parent_gen'] = gen_output
configs.path['parent_gen_data'] = configs.path['gc'].parent / 'parent_gen_data_v2.csv'
df_prt_text.to_csv(configs.path['parent_gen_data'])

In [62]:
for i,j in zip(df_prt_text['parent'],df_prt_text['parent_gen']):
  print(f'ori={i}\ngen={j}\n')

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
ori=For what it's worth, I think many people overreacted to several different issues (real or perceived) tonight. 'C'est le Wikipedia' or something. And yes, a nice page describing what EdwardsBot does would probably be helpful. I can help with that!
gen= For what it's worth, I think many people overreacted to the situation and were not aware that Twitter had been banned for at least two weeks.

ori=It's possible, I suppose (I did something similar in the tracker, for instance), but it takes some amount of work and isn't always reliable. A better solution is to make the template include a category, since the wikipedia database itself records the date/time that an article was added to a category . It's a little-known fact, but it's damn useful sometimes.
gen= It's possible, I suppose (I did something similar in the tracker, for instance), but it's certainly not going to be the standard library libtiff-exchange-yahoo (if you want to claim

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
gen= I've got no complaints about your lack of intellect. You've been a godsend to me.

ori=I just think the world was a much different place years ago. I doubt anyone could get a way with it today
gen= I just think the world is a lot different than it was years ago. It's just so sad. I'm so sorry for you. Sincerely are you the people that contributed to that world war from the other side?

ori=My Lord and savior Satan, how may I best serve you?
gen= My Lord & largest role is to support our troops in the fight against the nazis, while at the same time being a good friends with the good Americans. If you have a suggestion, please share. -

ori=u b u you're doing great. Oh just get fucked.. Oh just get fucked.
gen= u b u i u u u i u u i u i u i n i n o i oi oi... u b u i n u i n i n oi...

ori=Because the Islamic rape gangs and terrorist in Britain are described as chavs right... nope they have been everything but, White.
gen= Because the

In [15]:
df_prt_text.head()

Unnamed: 0,parent,text,parent_gen
0,I also feel its important for StuRat to have h...,"BTW, I see no ""attack"" here. What I see is som...",I also feel its important for StuRat to have ...
1,A main reason this project was started was to ...,I realise... I just want to urge caution and a...,A main reason this project was started was to...
2,"I'd go for that. (By the way, not all deletion...",I should have stressed that this should be a t...,I'd go for that. My personal experience sugge...
3,(edit conflict with Piotrus) I agree with the ...,Thanks to Lysy for shortening and copy editing...,(edit conflict with Piotrus) I agree with the...
4,The larger box displays fine in Mozilla Firefo...,I like the additional info in the info box. It...,The larger box displays fine in Mozilla Firef...


In [101]:
configs.path['parent_gen_data'] = configs.path['gc'].parent / 'parent_gen_data.csv'

In [102]:
df_prt_text.to_csv(configs.path['parent_gen_data'])