In [13]:
!pip install -q "transformers==4.31.0" "datasets==2.13.0" "accelerate==0.21.0" "trl==0.4.7" "safetensors>=0.3.1" --upgrade

In [14]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPTNeoForCausalLM, AutoTokenizer,OPTForCausalLM, AutoModel

In [15]:
torch.manual_seed(42)
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B", bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B").cuda()
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 2048)

In [16]:
df = pd.read_csv('/content/instruct_data_chatgpt_generate.csv')
df.head()

Unnamed: 0,Instruction,Context,Response,Category
0,Use the Input and generate the suitable catego...,"Lions, Tigers, Cheetahs, Leopards",Big Cats,category_generation
1,Use the Input and generate the suitable catego...,"Piano, Guitar, Violin, Trumpet",Musical Instruments,category_generation
2,Use the Input and generate the suitable catego...,"Apple, Banana, Cherry, Orange",Fruits,category_generation
3,Use the Input and generate the suitable catego...,"Toyota, Ford, BMW, Tesla",Car Brands,category_generation
4,Use the Input and generate the suitable catego...,"Python, Java, C++, Ruby",Programming Languages,category_generation


In [17]:
cat_df = df[df['Category']=='category_generation']

In [18]:
for i in cat_df.iterrows():
  text = f""" <|startoftext|>\n### Instruction:\n{i[1]['Instruction']}\n\nInput:\n{i[1]['Context']}\n\n### Response:\n{i[1]['Response']}\n<|endoftext|>"""
  print(text)
  break

 <|startoftext|>
### Instruction:
Use the Input and generate the suitable category name

Input:
Lions, Tigers, Cheetahs, Leopards

### Response:
Big Cats
<|endoftext|>


In [19]:
class InstructDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []

        for i in cat_df.iterrows():
            text = f""" <|startoftext|>\n### Instruction:\n{i[1]['Instruction']}\n\nInput:\n{i[1]['Context']}\n\n### Response:\n{i[1]['Response']}\n<|endoftext|>"""
            encodings_dict = tokenizer(text,
                                       truncation=True,
                                       max_length=max_length,
                                       padding="max_length"
                                       )
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [20]:
max_length = 128

In [21]:
dataset = InstructDataset(df, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [22]:
training_args = TrainingArguments(
                                  output_dir='./results',
                                  num_train_epochs=15,
                                  per_device_train_batch_size=4,
                                  per_device_eval_batch_size=4,
                                  warmup_steps=10,
                                  logging_steps = 10,
                                  weight_decay=0.1,
                                  logging_dir='./logs'
                                  )

In [23]:
Trainer(model=model, args=training_args, train_dataset=train_dataset,
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()



Step,Training Loss
10,3.0674
20,0.6892
30,0.1774
40,0.082
50,0.0799
60,0.0397
70,0.0309


TrainOutput(global_step=75, training_loss=0.5572239051262537, metrics={'train_runtime': 25.6832, 'train_samples_per_second': 10.513, 'train_steps_per_second': 2.92, 'total_flos': 250585263636480.0, 'train_loss': 0.5572239051262537, 'epoch': 15.0})

In [25]:
prompt = f"""
 <|startoftext|>
### Instruction:
Use the Input and generate the suitable category name

Input:
google chrome, mozila firebox, safari

### Response:
"""

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to('cuda')

print(generated)

sample_outputs = model.generate(
                                generated,
                                #bos_token_id=random.randint(1,30000),
                                do_sample=True,
                                top_k=40,
                                max_length = 300,
                                top_p=0.9,
                                num_return_sequences=5
                                )

for i, sample_output in enumerate(sample_outputs):
  print("{}: {}\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
  print("===============================")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[  198,   220, 50257,   198, 21017, 46486,    25,   198, 11041,   262,
         23412,   290,  7716,   262, 11080,  6536,  1438,   198,   198, 20560,
            25,   198, 13297, 32030,    11,  6941,    89, 10102,  2046,  3524,
            11,  1932,  2743,   198,   198, 21017, 18261,    25,   198]],
       device='cuda:0')
0: 
 
### Instruction:
Use the Input and generate the suitable category name

Input:
google chrome, mozila firebox, safari

### Response:
Android, iOS, Windows, Linux


1: 
 
### Instruction:
Use the Input and generate the suitable category name

Input:
google chrome, mozila firebox, safari

### Response:
Android, Windows, Linux


2: 
 
### Instruction:
Use the Input and generate the suitable category name

Input:
google chrome, mozila firebox, safari

### Response:
Operating Systems


3: 
 
### Instruction:
Use the Input and generate the suitable category name

Input:
google chrome, mozila firebox, safari

### Response:
Big Brands


4: 
 
### Instruction:
