In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

In [2]:
# read in data
article = []
# titles
titles = []

import os
for root, dirs, files in os.walk("../mao/"):
    for file in files:
        if file.endswith(".txt"):
            print(os.path.join(root, file))
            with open(os.path.join(root, file), "r") as input:
                sentences = input.read()
            article.append(sentences)
            titles.append(file)
            
print(len(article))
print(len(titles))

../mao/REPORT ON AN INVESTIGATION OF THE PEASANT MOVEMENT IN HUNAN.txt
../mao/ON CORRECTING MISTAKEN IDEAS IN THE PARTY.txt
../mao/BE CONCERNED WITH THE WELL-BEING OF THE MASSES, PAY ATTENTION TO METHODS OF WORK.txt
../mao/WHY IS IT THAT RED POLITICAL POWER CAN EXIST IN CHINA.txt
../mao/A SINGLE SPARK CAN START A PRAIRIE FIRE.txt
../mao/ANALYSIS OF THE CLASSES IN CHINESE SOCIETY.txt
6
6


In [3]:
# concatenate text
text = ''.join(article)

len(text)
# remove reference mark, e.g. '[1]'

import re

text = re.sub(r'\[\d{1,3}\]' ,'', text)
len(text)
# split inot sentences
from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(text)

len(sentences)

1250

In [4]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').cuda()
model.resize_token_embeddings(len(tokenizer))

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1042301.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=456318.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1355256.0), HTML(value='')))

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.





HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=718.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1520013706.0), HTML(value='')))




Embedding(50259, 1024)

In [5]:
max_length = max([len(tokenizer.encode(s)) for s in sentences])
print(max_length)

216


In [6]:
class MyDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [7]:
dataset = MyDataset(sentences, tokenizer, max_length=max_length)
# train test split
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [12]:
training_args = TrainingArguments(output_dir='./results', num_train_epochs=1, logging_steps=100, save_steps=5000,
                                  per_device_train_batch_size=1, per_device_eval_batch_size=1,
                                  warmup_steps=10, weight_decay=0.05, logging_dir='./logs', report_to = 'none')

In [13]:
Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

Step,Training Loss
100,0.3827
200,0.3865
300,0.4668
400,0.4997
500,0.5468
600,0.423
700,0.4117
800,0.438
900,0.4743
1000,0.3998


TrainOutput(global_step=1125, training_loss=0.43550369177924264, metrics={'train_runtime': 387.2959, 'train_samples_per_second': 2.905, 'total_flos': 517335164928000.0, 'epoch': 1.0, 'init_mem_cpu_alloc_delta': 20480, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 1486848, 'train_mem_gpu_alloc_delta': 2838608384, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 1101722624})

In [14]:
# test model 
generated = tokenizer("<|startoftext|> ", return_tensors="pt").input_ids.cuda()
sample_outputs = model.generate(generated, do_sample=True, top_k=50, 
                                max_length=300, top_p=0.95, temperature=1.9, num_return_sequences=10)
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  --the whole economic structure in Japan now resembles to that  of the revolutionary countries, and even it does not take into account that a revolutionary revolution may exist where not two revolutionary units do or what is generally believed to be wrong.
1: , and were also killed and left the carcasses outside the houses on their feet like their cattle  of meat,  a symbol being the method of punishment by making up what we call this difference--so far from slaughter, there being equal conditions inside.
2:    a few days hence  I will ask them whether their objections are adequate.
3:  When your work is only "finished", you must pay its duties for the sake only- that to please your comrades. When you have a work out, you do not take it to such an origin of importance for pleasure.
4:  
5: .
6:    This also makes "puttings for the county head " unnecessary.
7:  But some local tyrants and evil gentry and evil-minded landlords have also decided to commit such atrocities.
8:    The pea