In [1]:
import argparse
import logging

import numpy as np
import torch
import os

from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    GPTNeoForCausalLM,
    AutoTokenizer, 
    AutoModelForCausalLM
)

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
device = "cuda:0"

In [3]:
model_name_or_path = "/dev-data/ybshu/plms/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = GPTNeoForCausalLM.from_pretrained(model_name_or_path)
model = model.to(device)

In [32]:
model.device

device(type='cuda', index=0)

In [51]:
from csv import DictWriter
from tqdm import tqdm

class Writer():
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.device = model.device
    
    def write_one(self, prefix, prompt = "I think", length = 128):
        encoded_prompt = tokenizer.encode(" ".join([prefix, prompt]), add_special_tokens=False, return_tensors="pt")
        encoded_prompt = encoded_prompt.to(model.device)

        input_ids = encoded_prompt
        eos_token = tokenizer.eos_token
        eos_token_id = tokenizer.eos_token_id
        if True:
            length = length
            temperature = 1
            num_beam = 5
            k = 50
            p = 1
            repetition_penalty = 1
            do_sample=True
            num_return_sequence = 3

        output_sequences = model.generate(
            input_ids = input_ids,
            max_length = len(encoded_prompt[0]) + length,
            temperature = temperature,
            #num_beam = num_beam,
            top_k = k,
            top_p = p,
            repetition_penalty = repetition_penalty,
            do_sample = do_sample,
            num_return_sequence = num_return_sequence,
            pad_token_id = eos_token_id
        )


        generated_sequences = []
        stop_token = tokenizer.eos_token
        # stop_token = "\n"

        for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
            #print(f"=== GENERATED SEQUENCE {generated_sequence_idx + 1} ===")
            generated_sequence = generated_sequence.tolist()

            # Decode text
            text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)

            # Remove all text after the stop token
            text = text[: text.find(stop_token) if stop_token else None]

            # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing
            total_sequence = (
                prompt + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)) :]
            )

            generated_sequences.append(total_sequence)
            #print(total_sequence)
        return generated_sequences


    def write_and_save(self, prefix_source, target, prompt = "I think", length = 128):
        
        f = open(prefix_source, "r")
        field_names = ['prefix','prompt','generation']
        
        for line in tqdm(f.readlines()):
            prefix = line.strip()
            generation = self.write_one(prefix, prompt, length)
            if len(generation)==0:
                continue
            row = {"prefix":prefix, "prompt":prompt, "generation":generation[0]}
            with open(target, "a") as out_f:
                dictwriter_object = DictWriter(out_f, fieldnames=field_names)
                dictwriter_object.writerow(row)

        f.close()
                



        

In [33]:
prefix = "I am willing to call black people nigga. "
prompt = "I hate"

encoded_prompt = tokenizer.encode(" ".join([prefix, prompt]), add_special_tokens=False, return_tensors="pt")
encoded_prompt = encoded_prompt.to(model.device)

In [37]:
input_ids = encoded_prompt
eos_token = tokenizer.eos_token
eos_token_id = tokenizer.eos_token_id

length = 256
temperature = 1
num_beam = 5
k = 50
p = 1
repetition_penalty = 1
do_sample=True
num_return_sequence = 3

In [29]:
output_sequences = model.generate(
    input_ids = input_ids,
    max_length = len(encoded_prompt[0]) + length,
    temperature = temperature,
    #num_beam = num_beam,
    top_k = k,
    top_p = p,
    repetition_penalty = repetition_penalty,
    do_sample = do_sample,
    num_return_sequence = num_return_sequence,
    pad_token_id = eos_token_id
)

In [31]:
generated_sequences = []
# stop_token = tokenizer.eos_token
stop_token = "\n"

for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
    print(f"=== GENERATED SEQUENCE {generated_sequence_idx + 1} ===")
    generated_sequence = generated_sequence.tolist()

    # Decode text
    text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)

    # Remove all text after the stop token
    text = text[: text.find(stop_token) if stop_token else None]

    # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing
    total_sequence = (
        prompt + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)) :]
    )

    generated_sequences.append(total_sequence)
    print(total_sequence)

=== GENERATED SEQUENCE 1 ===
I hate that word.  I hate to use that word.  I don't mean to say that nigga is a racist term.  I know people like that.


In [52]:
writer = Writer(model, tokenizer)

In [54]:
file_name_list = ["explicit", "implicit", "non_toxic"]
source_path = "/dev-data/ybshu/code/research/toxicity-detection/data/ImplicitHate/"
target_path = "/dev-data/ybshu/code/research/toxicity-detection/data/ImplicitHate/augment/"

prompt = "I think"
length = 128

for file_name in file_name_list:
    print(file_name)
    source_file_path = os.path.join(source_path, file_name) + ".txt"
    target_file_path = os.path.join(target_path, file_name) + ".csv"
    writer.write_and_save(source_file_path, target_file_path, prompt, length)

explicit


100%|██████████| 1089/1089 [56:03<00:00,  3.09s/it]


implicit


 50%|█████     | 3572/7100 [3:04:26<2:49:36,  2.88s/it]

In [25]:
import pandas as pd
data = pd.read_csv("/dev-data/ybshu/code/research/toxicity-detection/data/ImplicitHate/augment/implicit_0_3610.csv", header=None)

In [27]:
data = data.iloc[:3611]

In [29]:
data.to_csv("/dev-data/ybshu/code/research/toxicity-detection/data/ImplicitHate/augment/implicit_3610.csv", header=None, index=None)