In [8]:
!pip install transformers

from transformers.tokenization_utils_base import BatchEncoding
from transformers import AutoTokenizer, AutoModelForCausalLM, GPT2Config
import torch


class TextDataset(torch.utils.data.Dataset):
    def __init__(self, inputs: BatchEncoding, labels: BatchEncoding):
        """
        Inputs: encoded sequences of words.
        Labels: the next word in the sequence (also encoded)
        Usage: you can access them like lists, i.e. self.inputs[i]
        this will give you a tokenizers.Encoding object; among its
        useful attributes are ids (words encoded as numbers) and
        attention_mask (to feed into the model). Examples:
        >> self.inputs[i].ids
        >> self.inputs[i].attention_mask
        """
        self.inputs = inputs
        self.labels = labels

    def __getitem__(self, idx):
        input_ids = torch.LongTensor(self.inputs[idx].ids)
        input_masks = torch.LongTensor(self.inputs[idx].attention_mask)
        label_ids = torch.LongTensor(self.labels[idx].ids)
        # print(f'INDIV. SHAPES: {input_ids.shape}, {input_masks.shape}, {label_ids.shape}')
        data = {'inputs_ids': input_ids,
                'inputs_mask': input_masks,
                'labels_ids': label_ids}
        return data

    def __len__(self):
        return len(self.labels)


def parse_corpus_text(text, seq_len, skip_size=1):
    """
    Takes as input a corpus of text as a single string.
    
    Parameters:
        - text: str
        - seq_len: (int), the number of characters you want per sequence in 
                        your dataset
        - skip_size: (int) the size of the jump between sequences.
    
    Returns:
        - X: list of sentences, each of length seq_len
        - Y: next word after each sentence
        - vocabulary: set of unique words
    """
    assert seq_len > 0, 'Training sequences must be of length at least 1'

    text = text.replace('\n', ' ')

    X = []
    Y = []
    vocabulary = set()

    words = text.split()

    i = 0
    while i < len(words) - 2*seq_len - 1:
        sentence = ' '.join(words[i:i+seq_len])
        next_word = ' '.join(words[i+seq_len: i+2*seq_len])
        vocabulary.update(words[i:i+seq_len])
        vocabulary.update(next_word)
        # vocabulary.add(next_word)
        X.append(sentence)
        if len(next_word) == 0:
            print(f'FOUND WORD OF LENGTH ZERO')
        Y.append(next_word)
        i += skip_size

    return X, Y, vocabulary


def encode_inputs_labels(X, Y, tokenizer):
    assert len(X) == len(Y), 'Lengths of sentences and targets are different'

    X_enc = []
    Y_enc = []
    for i in range(len(X)):
        sentence = X[i]
        next_word = Y[i]
        enc_input = tokenizer.encode(sentence)
        enc_target = tokenizer.encode(next_word)
        X_enc.append(enc_input)
        Y_enc.append(enc_target)
    X_enc = torch.tensor(X_enc)
    Y_enc = torch.tensor(Y_enc)
    return X_enc, Y_enc


def generate_poem(model, seed_str, num_stanzas=20, num_poems=5):
    for i in range(num_poems):
        print(f'POEM #{i}')
        poem = seed_str
        curr_seed = seed_str
        for j in range(num_stanzas):
            tokens = tokenizer.encode(curr_seed, return_tensors="pt").to(device)
            prediction = model.generate(tokens, min_length=40, max_length=48, do_sample=True, repetition_penalty=1.2)
            poem += f'({j}): \t{tokenizer.decode(prediction[0])[len(curr_seed):]}\n'
            curr_seed = tokenizer.decode(prediction[0])[len(curr_seed):]
        print(poem)


if __name__ == '__main__':
    # Make dataset: raw text -> sentences -> encoded inputs & targets
    raw_text = open('shakespeare.txt').read()
    seq_len = 20
    X, Y, vocabulary = parse_corpus_text(raw_text, seq_len)
    # config = GPT2Config(vocab_size=len(vocabulary))
    tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
    tokenizer.pad_token = tokenizer.eos_token
    (X_enc, Y_enc) = (tokenizer(X, truncation=True, padding=True), tokenizer(Y, truncation=True, padding=True))

    dataset = TextDataset(X_enc, Y_enc)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    model = AutoModelForCausalLM.from_pretrained('distilgpt2')
    model.to(device)

    # Hyper-parameters
    bs = 512
    lr = 0.0005
    num_epochs = 50

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=bs,
                                               shuffle=False)
    print('POEM BEFORE TRAINING')
    generate_poem(model, "I love deep learning", num_poems=1)





Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


POEM BEFORE TRAINING
POEM #0


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

I love deep learning(0): 	, but also it›​ Â  has its own unique challenge (and not a complete lack of understanding). I think this is an easy way to take advantage and find resources on your hands in the most highly
(1): 	 competitive market outside of
(2): 	 Japan, so as to make sure that we are good enough for the U.S. and China respectively."
The announcement comes just weeks after a similar plan from Germany was approved by Congress in 2012 which would see Apple
(3): 	 adopt an online shopping
(4): 	 service and have access to all the information they need.
“The company makes some of its products with our customers, which are so far from their core value or experience – it seems that customer privacy is no longer
(5): 	 a main concern when
(6): 	 we are on the field at full swing. We're looking forward to every single day and making sure that everything goes smoothly in our direction."
The Nationals will go up 1-0 overall against them next season, with
(7): 	 one win over Carolina


In [9]:
    for curr_epoch in range(num_epochs):
        print(f'EPOCH #{curr_epoch}')
        tokens = tokenizer.encode("Nature's bequest gives nothing but doth lend",
                                  return_tensors="pt").to(device)
        prediction = model.generate(tokens,
                                    min_length=20,
                                    max_length=40,
                                    do_sample=True,
                                    repetition_penalty=1.2,
                                    temperature=1.0)
        print(tokenizer.decode(prediction[0]))
        for i, batch in enumerate(train_loader):
            optimizer.zero_grad()
            
            # from the tokenizers.Encoding objects, get what is needed
            sentences_ids = batch['inputs_ids']
            sentences_att_mask = batch['inputs_mask']
            target_ids = batch['labels_ids']

            # print(f'SHAPES: {sentences_ids.shape, sentences_att_mask.shape, target_ids.shape}')
            outputs = model(sentences_ids,
                            attention_mask=sentences_att_mask,
                            labels=target_ids)

            loss = outputs[0]
            loss.backward()
            optimizer.step()

    print()
    print('POEMS AFTER TRAINING')
    generate_poem(model, "I love deep learning",num_poems=1)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #0
Nature's bequest gives nothing but doth lend itself to a very particular, almost mythical sense of meaning. If you look at Dany in his book The World is Always Beautiful: His Work as


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #1
Nature's bequest gives nothing but doth lend th m' a w lh, of
O thee thou shall thyest thate<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #2
Nature's bequest gives nothing but doth lendt- to n that,
est thou the<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #3
Nature's bequest gives nothing but doth lend eyes to my son and its own tender' bright<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #4
Nature's bequest gives nothing but doth lend Thy eyes bright light that thy own, flame tender toine the his th beartlight faint thou soul my gaze Light His sweet memory her flesh shall


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #5
Nature's bequest gives nothing but doth lend th own in, heir thy tot tender light thou might contracted memory bear not his a eyes willine father myth. bright he His brother'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #6
Nature's bequest gives nothing but doth lend his heir bear to of, memory: His contracted<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #7
Nature's bequest gives nothing but doth lend thou: hist memory tender thy will th His heir Feed<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #8
Nature's bequest gives nothing but doth lendt memory, heir: his might bear light to<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #9
Nature's bequest gives nothing but doth lend to tender memory But his might Feed: thy contracted th thou,<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #10
Nature's bequest gives nothing but doth lend thy thout own might, his bear th to:<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #11
Nature's bequest gives nothing but doth lend tot might thy memory, His own thou eyes his light But bright th:<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #12
Nature's bequest gives nothing but doth lend thou, th own to memory thy might contracted heir: But bright light eyes flame tendert and Feed his His<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #13
Nature's bequest gives nothing but doth lend to his memory th Feed His eyes But own,: might thy thou bright flame lightt the contractedine would bear tender.<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #14
Nature's bequest gives nothing but doth lend memory to thy own bear Feed His thou But, its eyes: bright light contracted th isine flame might theet the his heirst can.<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #15
Nature's bequest gives nothing but doth lend to bear: flame contracted, his eyes own light might Feed thine thy thou<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #16
Nature's bequest gives nothing but doth lend his eyes to th heir memory, thy thou light: bright Feed His own flame But might contractedine.<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #17
Nature's bequest gives nothing but doth lend, flame to Feed light But bright: thy thou contractedine th own let his eyes bear might should feed His<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #18
Nature's bequest gives nothing but doth lend th, his own: flame But bright thou to thy memory light Feedt<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #19
Nature's bequest gives nothing but doth lend his But bright, flame thou: Light thy tot<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #20
Nature's bequest gives nothing but doth lend eyes to, his memory Feed: light flame But bright thou contracted tht thy own Light<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #21
Nature's bequest gives nothing but doth lend: his own Feed, light thou But bright contracted might to thy His eyes flame bear andine tht<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #22
Nature's bequest gives nothing but doth lend to bear: flame, eyes thouine contracted light thy Feed But bright memory th his own His mightt<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #23
Nature's bequest gives nothing but doth lend heir, his flame: light contracted to thy thout eyes bright But own His memory Feed is might thine or<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #24
Nature's bequest gives nothing but doth lend eyes, his own: Feed thy might contracted thout His light But bright flame to thine<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #25
Nature's bequest gives nothing but doth lend, his own eyes thou� flame thy contracted Feed But bright: light tot His;<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #26
Nature's bequest gives nothing but doth lend But bright, light thou: thy own flame Feed his eyes contractedine tot His torch might th and<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #27
Nature's bequest gives nothing but doth lend, his memory thou Feed: light But bright thy tot eyes contracted thine flame And own<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #28
Nature's bequest gives nothing but doth lend, his But bright thou: light Feed His flame to thy�<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #29
Nature's bequest gives nothing but doth lend, his heir thou But bright: thy own flame Feed might contracted eyes to light His memory thine andt<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #30
Nature's bequest gives nothing but doth lend, his eyes But bright: light thou Feed thy tot flame th own.<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #31
Nature's bequest gives nothing but doth lend, his eyes But bright: light thou contracted to thy Feed His flame own<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #32
Nature's bequest gives nothing but doth lend, his flame contracted thou bright But own eyes tot thy' light Feed:<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #33
Nature's bequest gives nothing but doth lend But bright: eyes thou, his own light Feed thy contractedine feed flame tot fedthy�<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #34
Nature's bequest gives nothing but doth lend, his memory But light thou Feed thy: flame contracted tot thine eyes bright and own
<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #35
Nature's bequest gives nothing but doth lend, his heir thou contracted thy Feed But bright: eyes flame tot light His own thine<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #36
Nature's bequest gives nothing but doth lend, his heir But bright: eyes thou contractedine Feed thy memory tot light flame And own<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #37
Nature's bequest gives nothing but doth lend But bright, his eyes thou contractedine Feed feeds thy tht light: flame to own feed<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #38
Nature's bequest gives nothing but doth lend, his eyes thou contracted tot thy Feed th own light But bright: flame�<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #39
Nature's bequest gives nothing but doth lend But bright, light thou Feed thy contracted toine eyes: flame his own<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #40
Nature's bequest gives nothing but doth lendst, his memory But bright thou contracted to light thy Feed His own eyes flame th:<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #41
Nature's bequest gives nothing but doth lend, his But bright Feed: light thou thy contracted eyes tot flame own thinest<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #42
Nature's bequest gives nothing but doth lend, his But bright thou: light thy Feed th own eyes tot flame contractedine His eye might feed<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #43
Nature's bequest gives nothing but doth lendt, bear thou contracted to light But bright: eyes Feed thy contract his flame own thine mightst His shining Thou and<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #44
Nature's bequest gives nothing but doth lend, his heir thou But bright Feed thy own eyes contracted light His flame:<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #45
Nature's bequest gives nothing but doth lend contracted thou, his eyes But bright: light Feed thy own tot flame His memory<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #46
Nature's bequest gives nothing but doth lend, light thou Feed But bright: eyes contracted thy own flame tot his<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #47
Nature's bequest gives nothing but doth lend But bright, light thou contracted thy Feed th own eyes tot his flame:<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #48
Nature's bequest gives nothing but doth lend, his heir thout Feed But bright: light thy might to flame contracted thine eyes own His memory Thy�<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EPOCH #49
Nature's bequest gives nothing but doth lend, his eyes thou contracted to thy But bright: light Feed th own flame's<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



POEMS AFTER TRAINING
POEM #0


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

I love deep learning(0): 	 memory But his eyes thou's heir, bright Feed thy contracted to light: flame thine own might tot His eye shall feed thy' but bright be lamp incease bear.<|endoftext|>
(1): 	 flame Thy andtst of lights your candle'slight
(2): 	 thou contracted, thy But bright Feed His light: his eyes to own memory shalt's might to thine but bright or lamp Thou on the Light thee's.<|endoftext|>
(3): 	 flame ist and dim off-light Thy" moon feed; light your
(4): 	 Feed's thou, his bright But own thy: eyes contracted to thine His Light Thou' lamp but shines thou might.<|endoftext|>
(5): 	 light thee flame thout deceiving memory thou shalt bestainsth from thy heir tender w bear'slight
(6): 	, his eyes contracted tot own But bright Feed: His light turned on lamp but lamps might be suninecease<|endoftext|>
(7): 	 flame thou's thy tot th it brightness Thou' light. Light Bright and light heir's Thy memory thou in lights
(8): 	 But bright, eyes contracted tot own might his eye Feed His l