In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 58.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 13.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 64.8 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling P

In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

In [3]:
torch.manual_seed(1)

<torch._C.Generator at 0x7fafb18ecad0>

In [4]:
torch.cuda.is_available()

True

In [5]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').cuda()
model.resize_token_embeddings(len(tokenizer))

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Embedding(50259, 1024)

In [120]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [121]:
%cd /content/drive/My Drive/Colab Notebooks/DeepLearningVision/Data

/content/drive/My Drive/Colab Notebooks/DeepLearningVision/Data


In [122]:
# Load clean text to avoid running above codes

my_file = open("/content/drive/My Drive/Colab Notebooks/DeepLearningVision/Data/clean_text_tech_news.txt", "r")
content = my_file.read()

corpus = content.split(",")
my_file.close()

In [124]:
import pandas as pd
descriptions = pd.Series (corpus)

In [125]:
type(descriptions)

pandas.core.series.Series

In [126]:
descriptions

0          madden espn football score in different way ...
1        group to propose new high speed wireless forma...
2        aol to sell cheap pcs to minority and senior a...
3        company approve new high capacity disc format ...
4        miss june deal slow to return for software cos...
                               ...                        
29996    digitize and bring to life digital technology ...
29997    new computer six step to safe surfing to see t...
29998    video file present search challenge indexing w...
29999    compromise seal climate meeting climate confer...
30000                                                     
Length: 30001, dtype: object

In [127]:
max_length = max([len(tokenizer.encode(description)) for description in descriptions])

In [128]:
class NetflixDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [129]:
dataset = NetflixDataset(descriptions, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [67]:
import gc
gc.collect()

159

In [68]:
torch.cuda.empty_cache()

In [69]:
training_args = TrainingArguments(output_dir='/content/drive/My Drive/Colab Notebooks/DeepLearningVision/results', num_train_epochs=1, logging_steps=100, save_steps=5000,
                                  per_device_train_batch_size=1, per_device_eval_batch_size=1,
                                  warmup_steps=10, weight_decay=0.05, logging_dir='/content/drive/My Drive/Colab Notebooks/DeepLearningVision/results/logs', report_to = 'none')

PyTorch: setting up devices


In [70]:
Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

***** Running training *****
  Num examples = 7926
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 7926


Step,Training Loss
100,5.8525
200,1.8749
300,1.8874
400,1.8891
500,1.9182
600,1.8745
700,1.884
800,1.8565
900,1.8405
1000,1.8602


Saving model checkpoint to /content/drive/My Drive/Colab Notebooks/DeepLearningVision/results/checkpoint-5000
Configuration saved in /content/drive/My Drive/Colab Notebooks/DeepLearningVision/results/checkpoint-5000/config.json
Model weights saved in /content/drive/My Drive/Colab Notebooks/DeepLearningVision/results/checkpoint-5000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=7926, training_loss=1.826281685916998, metrics={'train_runtime': 1074.9869, 'train_samples_per_second': 7.373, 'train_steps_per_second': 7.373, 'total_flos': 891356768944128.0, 'train_loss': 1.826281685916998, 'epoch': 1.0})

In [130]:
generated = tokenizer(descriptions[10], return_tensors="pt").input_ids.cuda()

In [131]:
generated

tensor([[18040,   284,  1280,  1218,   474,  2674,  2771,  6308,  3650,   428,
          1227,  8352, 31463,  8352, 31463, 17180,  3644,   753,   481,  1280,
           663,  1218,   474,  2674,  2771,  6308,  3650,  1568,   428,  1227,
           287,   262,  8830,   474,  2674,  2771,  1748,   286, 28686,  8130,
           340,   910,   294,  3479]], device='cuda:0')

In [132]:
sample_outputs = model.generate(generated, do_sample=True, top_k=50, 
                                max_length=3000, top_p=0.95, temperature=0.1, num_return_sequences=20)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [139]:
text = tokenizer.decode(sample_output, skip_special_tokens=True)
print(text)

apple to open second japanese retail store this month maccentral maccentral apple computer inc will open its second japanese retail store later this month in the western japanese city of osaka it say thursday.
