In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

In [2]:
# read in data
article = []
# titles
titles = []

import os
for root, dirs, files in os.walk("../mao/"):
    for file in files:
        if file.endswith(".txt"):
            print(os.path.join(root, file))
            with open(os.path.join(root, file), "r") as input:
                sentences = input.read()
            article.append(sentences)
            titles.append(file)
            
print(len(article))
print(len(titles))

../mao/REPORT ON AN INVESTIGATION OF THE PEASANT MOVEMENT IN HUNAN.txt
../mao/ON CORRECTING MISTAKEN IDEAS IN THE PARTY.txt
../mao/BE CONCERNED WITH THE WELL-BEING OF THE MASSES, PAY ATTENTION TO METHODS OF WORK.txt
../mao/WHY IS IT THAT RED POLITICAL POWER CAN EXIST IN CHINA.txt
../mao/A SINGLE SPARK CAN START A PRAIRIE FIRE.txt
../mao/ANALYSIS OF THE CLASSES IN CHINESE SOCIETY.txt
6
6


In [3]:
# concatenate text
text = ''.join(article)

len(text)
# remove reference mark, e.g. '[1]'

import re

text = re.sub(r'\[\d{1,3}\]' ,'', text)
len(text)
# split inot sentences
from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(text)

len(sentences)

1250

### Before fine tuning

In [4]:
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').cuda()
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

In [5]:
# generate 10 sentences for testing
generated = tokenizer("<|endoftext|>", return_tensors="pt").input_ids.cuda()
sample_outputs = model.generate(generated, do_sample=True, top_k=50, 
                                max_length=300, top_p=0.95, temperature=1.9, num_return_sequences=10)
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: Here's his first full trailer!

This clip of the upcoming episode of "Star Trek" which I have posted over will show him from time to time until he drops another teaser for the episode which will take the action scene right in our lives as soon as the movie makes this trip right around the corner which doesn't take toooo long! Hope this post makes your weekend much more enjoyable, I'm totally looking forward to it (especially after my week at a comedy show!) If there's something new for "Archer Season 3"—maybe new CGI or even something new that is going over well with both us readers or Trekkies—"Fever" fans—and that something else that does "good" with Star Trek is being pitched, it's definitely going to do fine. This doesn't mean we have to stay glued watching the Trek channel for every new film, but let's hope Star Trek season 8 is something very worthwhile to be looking forward to all those years for fans! Let The Klingons Have An Apportionment for the Uplift! And while everythin

## Fine tuning

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').cuda()
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


Embedding(50259, 1024)

In [7]:
max_length = max([len(tokenizer.encode(s)) for s in sentences])
print(max_length)

216


In [8]:
class MyDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [9]:
dataset = MyDataset(sentences, tokenizer, max_length=max_length)
# train test split
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [10]:
training_args = TrainingArguments(output_dir='./results', num_train_epochs=2, logging_steps=100, save_steps=5000,
                                  per_device_train_batch_size=1, per_device_eval_batch_size=1,
                                  warmup_steps=10, weight_decay=0.05, logging_dir='./logs', report_to = 'none')

In [11]:
Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

Step,Training Loss
100,1.9891
200,0.5011
300,0.4854
400,0.4365
500,0.456
600,0.4342
700,0.4569
800,0.4327
900,0.4095
1000,0.4575


TrainOutput(global_step=2250, training_loss=0.4445423355102539, metrics={'train_runtime': 670.2388, 'train_samples_per_second': 3.357, 'total_flos': 1034670329856000.0, 'epoch': 2.0, 'init_mem_cpu_alloc_delta': 8192, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 4395008, 'train_mem_gpu_alloc_delta': 4265118208, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 1096393728})

In [12]:
# test model 
generated = tokenizer("<|startoftext|> ", return_tensors="pt").input_ids.cuda()
sample_outputs = model.generate(generated, do_sample=True, top_k=50, 
                                max_length=300, top_p=0.95, temperature=1.9, num_return_sequences=10)
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  为弙遊丝 一本大一乳啊亚充俘 中區行勝克?
1:   The three months' worth per capita rent are five denarii ten denars.
2:  When we call for donations to be carried out on a production, our purpose would very rarely be any other than to be supplied to our Red Army from Red funds, that is, money and hard work.
3: .
4: ?
5:   The rural women refuse to yield to the city gentry  which puts forward the demand, "Give us women's right!" 
6:  The most notorious crimes were being carried out in rural areas and other places, when rural areas could not cope with the population.
7:  **************************** B. ROWL
Though these revolutionary elements now under Kiangsi military control also number comparatively small and comprise only 30-40 per cent of the proletariat., in the old regime the overwhelming majority, in places as far as Pao Tsuochua from Kao'o County to the Wuen Chuan were peasant revolutionary fronts; the remaining 1 per cent tended to join the "road work" or petty-military workers  of the Kiangsi g