In [42]:
import pandas as pd
import tensorflow as tf
import json
from pathlib import Path
import re
import inflect
import numpy as np


In [168]:
# Load Phonemes

# Standard Dict
WORDS = {}
f = open("INPUT/cmudict.dict.txt", "r")
#with (/'cmudict.dict.txt').open('r') as f:
for line in f.readlines():
    word, phonemes = line.strip().split(' ', 1)
    word = re.match(r'([^\(\)]*)(\(\d\))*', word).groups()[0]
    phonemes = phonemes.split(' ')
    syllables = sum([re.match(r'.*\d', p) is not None for p in phonemes])
    #print(word, phonemes, syllables)
    if word not in WORDS:
        WORDS[word] = []
    WORDS[word].append({
        'phonemes': phonemes,
        'syllables': syllables
    })
f.close()
f = open("INPUT/cmudict.dict.txt", "r")
# Load custom phonemes
CUSTOM_WORDS = {}
vowels = ['AA', 'AE', 'AH', 'AO', 'AW', 'AX', 'AXR', 'AY', 'EH', 'ER', 'EY', 'IH', 'IX', 'IY', 'OW', 'OY', 'UH', 'UW', 'UX']
#with (/'custom.dict.txt').open('r') as f:
for line in f.readlines():
    try:
        word, phonemes = line.strip().split('\t', 1)
    except:
        #print(line)
        continue
    word = re.match(r'([^\(\)]*)(\(\d\))*', word).groups()[0].lower()
    phonemes = phonemes.split(' ')
    syllables = sum([(p in vowels) for p in phonemes])

    if word not in CUSTOM_WORDS:
        CUSTOM_WORDS[word] = []
    CUSTOM_WORDS[word].append({
        'phonemes': phonemes,
        'syllables': syllables
    })

In [44]:
inflect_engine = inflect.engine()

# Dictionary of words not found, must go get the phonemes
# http://www.speech.cs.cmu.edu/tools/lextool.html
NOT_FOUND = set()

def get_words(line):
    """
    Get a list of the words in a line
    """
    line = line.lower()
    # Replace numeric words with the words written out
    ws = []
    for word in line.split(' '):
        if re.search(r'\d', word):
            x = inflect_engine.number_to_words(word).replace('-', ' ')
            ws = ws + x.split(' ')
        else:
            ws.append(word)

    line = ' '.join(ws)

    words = []
    for word in line.split(' '):
        word = re.match(r'[\'"]*([\w\']*)[\'"]*(.*)', word).groups()[0]
        word = word.replace('_', '')
        words.append(word)
        
    return words

def count_non_standard_words(line):
    """
    Count the number of words on the line that don't appear in the default CMU Dictionary.
    """
    count = 0
    for word in get_words(line):
        if word and (word not in WORDS):
            count += 1
    return count

def get_syllable_count(line):
    """
    Get the possible syllable counts for the line
    """
    counts = [0]
    return_none = False
    for word in get_words(line):
        try:
            if word:
                if (word not in WORDS) and (word not in CUSTOM_WORDS):
                    word = word.strip('\'')
                    
                if word in WORDS:
                    syllables = set(p['syllables'] for p in WORDS[word])
                else:
                    syllables = set(p['syllables'] for p in CUSTOM_WORDS[word])
                #print(syllables)
                new_counts = []
                for c in counts:
                    for s in syllables:
                        new_counts.append(c+s)

                counts = new_counts
        except:
            NOT_FOUND.add(word)
            return_none = True

    if return_none:
        return None
    #return counts
    #print(counts)
    temp=[]
    temp.append(counts[0])
    #print(temp)
    return ','.join([str(i) for i in set(temp)])

In [45]:
nRowsRead = None # specify 'None' if want to read whole file
df = pd.read_csv('INPUT/all_haiku.csv', delimiter=',', nrows = nRowsRead)
df.dataframeName = 'all_haiku.csv'
nRow, nCol = df.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 144123 rows and 6 columns


In [46]:
df=df.drop('Unnamed: 0',axis=1)
df=df.drop('source',axis=1)
df=df.drop('hash',axis=1)

In [47]:
df["2"] = df["2"].astype(str)
df["2"] = [x.replace('-','') for x in df["2"]]

In [48]:
df['2']

0                     the rainbow
1                        my dream
2                 of black coffee
3                     in the oven
4                behind the house
                   ...           
144118      what you said neither
144119     inclined to think both
144120           like Theresa May
144121             into Democrats
144122    blood is loud Talk soon
Name: 2, Length: 144123, dtype: object

In [49]:
df = df.replace('[^\w\s]', '')
df = df.replace('-','',regex=True)
df.rename(columns = {'0':0,'1':1,'2':2}, inplace = True)

In [40]:
for i in range(3):
    df['%s_syllables' % i] = df[i].apply(get_syllable_count)

In [69]:
df.head(20)

Unnamed: 0,0,1,2
0,fishing boats,colors of,the rainbow
1,ash wednesday,trying to remember,my dream
2,snowy morn,pouring another cup,of black coffee
3,shortest day,flames dance,in the oven
4,haze,half the horse hidden,behind the house
5,low sun,the lady in red,on high heels
6,advent,the passing stranger,farts
7,tarn,a bubble in,the ice
8,snowflakes,new asphalt,in the holes
9,Crystal Night',gusts of rain,outside


In [65]:
df[0][0]+df[1][0]+df[2][0]

'fishing boatscolors ofthe rainbow'

In [108]:
input_list=[]
for i in range(100):
    input_list.append(df[0][i]+'\n'+df[1][i]+'\n'+df[2][i]+'.')

In [109]:
def flatten(A):
    rt = []
    for i in A:
        if isinstance(i,list): rt.extend(flatten(i))
        else: rt.append(i)
    return rt
  
input1 = flatten(input_list)

In [110]:
print(input1)



In [None]:
df1=df

In [None]:
nRowsRead = None # specify 'None' if want to read whole file
df = pd.read_csv('INPUT/lines.txt', delimiter='/', nrows = nRowsRead,header=None)
df.dataframeName = 'lines.txt'
nRow, nCol = df.shape
print(f'There are {nRow} rows and {nCol} columns')


In [None]:
df[2]

In [None]:
df[2] = df[2].str.replace('[^\w\s]', '')

In [None]:
df.head()

In [None]:
df2=df

In [None]:
df.append(df1,ignore_index = True)

In [129]:
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel, GPT2TokenizerFast
from fastai.text.all import *

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2LMHeadModel.from_pretrained('gpt2')


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [95]:
class TransformersTokenizer(Transform):
    def __init__(self, tokenizer): self.tokenizer = tokenizer
    def encodes(self, x): 
        toks = self.tokenizer.tokenize(x)
        return tensor(self.tokenizer.convert_tokens_to_ids(toks))
    def decodes(self, x): return TitledStr(self.tokenizer.decode(x.cpu().numpy()))

In [130]:
splits = [range_of(70), range(100)] # use a 70/30 split
tls = TfmdLists(input1, TransformersTokenizer(tokenizer), splits=splits, dl_type=LMDataLoader)
pretrained_weights = 'gpt2'
tokenizer = GPT2TokenizerFast.from_pretrained(pretrained_weights)
model = TFGPT2LMHeadModel.from_pretrained(pretrained_weights)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [99]:
model.save('INPUT/model/GPT2MODEL1')



INFO:tensorflow:Assets written to: INPUT/model/GPT2MODEL1\assets


INFO:tensorflow:Assets written to: INPUT/model/GPT2MODEL1\assets


In [None]:
model=model.load('INPUT/model/GPT2MODEL')

In [112]:
show_at(tls.train, 0)

fishing boats
colors of
the rainbow.


In [117]:
#bs,sl = 4,256
dls = tls.dataloaders()
dls.show_batch(max_n)

Unnamed: 0,text,text_
0,glowing embers\nI start my story\nfrom the end.,owing embers\nI start my story\nfrom the end.three
1,three petals fall\nfrom the purple coneflower...\nalmost,petals fall\nfrom the purple coneflower...\nalmost summer
2,summer.snowflakes\nnew asphalt\nin the holes.,.snowflakes\nnew asphalt\nin the holes.my
3,my hand\n on her hip\n,hand\n on her hip\n
4,full moon.dachau\na blue sky,full moon.dachau\na blue sky above
5,above\nthe chimneys.visiting the graves\nstronger the,\nthe chimneys.visiting the graves\nstronger the October
6,October wind\nat my grandparents'.the last light of day ~\n,wind\nat my grandparents'.the last light of day ~\npur
7,purple rhododendrons\ndissolve in the dark,ple rhododendrons\ndissolve in the dark.
8,.summer break\nthe sun scatters\nmy freckles,summer break\nthe sun scatters\nmy freckles.
9,.advent\nthe passing stranger\nfarts.learning to eat,advent\nthe passing stranger\nfarts.learning to eat\n


In [118]:
class DropOutput(Callback):
    def after_pred(self): self.learn.pred = self.pred[0]

In [125]:
learn = Learner(dls, model, loss_func=CrossEntropyLossFlat(), cbs=[DropOutput], metrics=Perplexity()).to_fp16()

In [126]:
learn.validate()

AttributeError: Exception occured in `TrainEvalCallback` when calling event `before_fit`:
	'TFGPT2LMHeadModel' object has no attribute 'to'

In [163]:
prompt = 'Moon' # create an initial text prompt to start your generated text
#prompt_ids = tokenizer.encode(prompt)
#inp = tensor(prompt_ids)[None].cuda()
#inp = tokenizer(prompt, return_tensors="tf")[0]
input_ids = tf.constant(tokenizer.encode(prompt, add_special_tokens=True))[None, :]  # Batch size 1
#inp.shape

In [164]:
#learn.fit_one_cycle(1, 1e-4)
preds = learn.model.generate(input_ids, max_length=20, num_beams=5, no_repeat_ngram_size=2,num_return_sequences=3, early_stopping=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [165]:
outputs=preds

In [145]:
#input_ids = tf.constant(tokenizer.encode(prompt, add_special_tokens=True))[None, :]  # Batch size 1
outputs = model.generate(input_ids=input_ids,
                         max_length=20,
                         num_beams=5,
                         no_repeat_ngram_size=2,
                         num_return_sequences=3,
                         early_stopping=True)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [143]:
#input_ids = tf.constant(tokenizer.encode(input_list, add_special_tokens=True))[None, :]  # Batch size 1
outputs = model.generate(input_ids=input_ids,
                         max_length=20,
                         num_beams=5,
                         no_repeat_ngram_size=2,
                         num_return_sequences=3,
                         early_stopping=True)


TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [166]:
print(outputs[0])
result = tokenizer.decode(outputs[0])
print(result)

tf.Tensor(
[31640    13   198   198     1    40   836   470   760   644   284   910
   284   326   553   339   531    13   366    40], shape=(20,), dtype=int32)
Moon.

"I don't know what to say to that," he said. "I


In [167]:
print("Output:\n" + 100 * '-')
for i, output in enumerate(outputs):
  print("{}: {}".format(i, tokenizer.decode(output)))

Output:
----------------------------------------------------------------------------------------------------
0: Moon.

"I don't know what to say to that," he said. "I
1: Moon.

"I don't know what to do," he said. "I'm just
2: Moon.

"I don't know what to say," he said. "I'm just
