In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel,AutoTokenizer,Trainer,TrainingArguments,GPT2Config
from torch.utils.data.dataloader import DataLoader, Dataset
import os
import tqdm
from random import randint
import glob

In [2]:
files_name =[file for file in glob.glob("../input/tho-7-chu/7 chu/*.txt")]
files_name = sorted(files_name)
files_name = files_name[:22000]

In [None]:
sentences = []

for file in files_name:
    with open(file, 'r') as fr:
        data = fr.read()
        sentences.append(data)
        
sentences = "\n".join(sentences)
with open('data.txt', 'w') as f:
    f.write(sentences)

In [3]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
tokenizer.add_tokens('\n')

Downloading:   0%|          | 0.00/557 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/874k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1

In [4]:
tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

1

In [5]:
tokenizer

PreTrainedTokenizer(name_or_path='vinai/phobert-base', vocab_size=64000, model_max_len=256, is_fast=False, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'})

In [6]:
tokenizer.convert_tokens_to_ids(tokenizer.bos_token)

0

In [7]:
tokenizer.convert_tokens_to_ids(tokenizer.eos_token)

2

In [8]:
class CustomerDataset(Dataset):
    def __init__(self,tokenizer,file_path,block_size: int):
        block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)
#         print(f"block_size = {block_size}")
        self.examples = []
        self.mask = []
        for file in file_path:
            with open(file, encoding="utf-8") as f:
                text = f.read()
            # text --> token-->number
            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

            if len(tokenized_text) < block_size:
                inds = [tokenizer.convert_tokens_to_ids(tokenizer.bos_token)] + tokenized_text + [tokenizer.convert_tokens_to_ids(tokenizer.eos_token)] + \
            (block_size - len(tokenized_text)) * [tokenizer.convert_tokens_to_ids(tokenizer.pad_token)]
                mask = [x != tokenizer.convert_tokens_to_ids(tokenizer.pad_token) for x in inds]
#                 print(mask)
            else:
                inds = [tokenizer.convert_tokens_to_ids(tokenizer.bos_token)] + tokenized_text[:block_size] + [tokenizer.convert_tokens_to_ids(tokenizer.eos_token)]
                mask = [1] * len(inds)

            self.examples.append(inds) 
            self.mask.append(mask)
            
    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i) -> torch.Tensor:
        return {
            "input_ids":torch.tensor(self.examples[i], dtype=torch.long), 
            "attention_mask":torch.tensor(self.mask[i], dtype = torch.long)
        }


In [9]:

from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling, LineByLineWithSOPTextDataset

def load_dataset(train_path, tokenizer):
    train_dataset = CustomerDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size= 256)#256
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,data_collator

In [10]:

train_path = ["../input/tho-7-chu/7 chu/100018.txt"]
train_dataset,data_collator = load_dataset(train_path,tokenizer)

In [11]:
files_name =[file for file in glob.glob("../input/tho-7-chu/7 chu/*.txt")]
files_name = sorted(files_name)
files_name = files_name[:22000]

In [12]:
train_dataset,data_collator = load_dataset(files_name,tokenizer)

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [14]:
model = GPT2LMHeadModel.from_pretrained('danghuy1999/gpt2-viwiki')
rand_weight = torch.rand(model.lm_head.weight.shape)
model.lm_head.weight = torch.nn.parameter.Parameter(rand_weight)
task_gpt2 = {"text-generation": {"do_sample": True, "max_length": 256}} 
configuration = GPT2Config(vocab_size=64002, n_positions=260, n_ctx=260,
                           task_specific_params=task_gpt2,
                           eos_token_id = 2,
                           bos_token_id = 0,
                           pad_token_id = 1,
                           sep_token_id = 2,)
model = GPT2LMHeadModel(configuration)
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)

Downloading:   0%|          | 0.00/916 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/487M [00:00<?, ?B/s]

Some weights of the model checkpoint at danghuy1999/gpt2-viwiki were not used when initializing GPT2LMHeadModel: ['multiple_choice_head.summary.bias', 'multiple_choice_head.summary.weight']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# model = GPT2LMHeadModel.from_pretrained('../input/7chucheckpoint/checkpoint-30000/checkpoint-30000')

In [16]:
from transformers.trainer_callback import TrainerCallback
from transformers import pipeline

training_args = TrainingArguments(
    output_dir="./output_dir", 
    overwrite_output_dir=True,
    num_train_epochs=40,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16, 
    save_steps=1000,
    save_total_limit = 2,
    warmup_steps=1000, 
    logging_steps=100,
    report_to="wandb"
    )

In [17]:
device = torch.device('cuda')
trainer = Trainer(
    model=model, 
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

In [18]:
trainer.train()


***** Running training *****
  Num examples = 22000
  Num Epochs = 40
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 110000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
100,10.1957
200,9.1795
300,8.0934
400,7.0917
500,6.6499
600,6.4847
700,6.3007
800,6.2114
900,6.1114
1000,6.01


Saving model checkpoint to ./output_dir/checkpoint-1000
Configuration saved in ./output_dir/checkpoint-1000/config.json
Model weights saved in ./output_dir/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./output_dir/checkpoint-2000
Configuration saved in ./output_dir/checkpoint-2000/config.json
Model weights saved in ./output_dir/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./output_dir/checkpoint-3000
Configuration saved in ./output_dir/checkpoint-3000/config.json
Model weights saved in ./output_dir/checkpoint-3000/pytorch_model.bin
Deleting older checkpoint [output_dir/checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to ./output_dir/checkpoint-4000
Configuration saved in ./output_dir/checkpoint-4000/config.json
Model weights saved in ./output_dir/checkpoint-4000/pytorch_model.bin
Deleting older checkpoint [output_dir/checkpoint-2000] due to args.save_total_limit
Saving model checkpoint to ./output_dir/checkpoint-5000
Configuration sa

KeyboardInterrupt: 

In [19]:
trainer.save_model()

Saving model checkpoint to ./output_dir
Configuration saved in ./output_dir/config.json
Model weights saved in ./output_dir/pytorch_model.bin


In [20]:

from transformers import pipeline
poem = pipeline('text-generation', model="./output_dir", tokenizer=tokenizer, max_length = 50)
#Test


loading configuration file ./output_dir/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 2,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 260,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 260,
  "pad_token_id": 1,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "sep_token_id": 2,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 256
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 64002
}

loading co

In [22]:
a = poem('<s>yêu')
print(a[0]['generated_text'])

<s>yêu nhau thắm thiết áng thơ xinh 
 cách trở xa xôi kết nghĩa mình 
 duyên phận ngàn năm đời luyến nhớ 
 trăng sương vạn chuỗi ánh rung rinh 
 mộng lòng say đắm hồn phu phụ 
 chăn gối yêu thương bóng bạn tình 

